diff --git a/python/llm/src/ipex_llm/transformers/models/aquila.py b/python/llm/src/ipex_llm/transformers/models/aquila.py
index 02054dcc..088c0fa5 100644
--- a/python/llm/src/ipex_llm/transformers/models/aquila.py
+++ b/python/llm/src/ipex_llm/transformers/models/aquila.py
@@ -50,7 +50,7 @@ from ipex_llm.utils.common import log4Error

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def aquila_attention_forward(
diff --git a/python/llm/src/ipex_llm/transformers/models/baichuan.py b/python/llm/src/ipex_llm/transformers/models/baichuan.py
index 0fef9131..134ca1a9 100644
--- a/python/llm/src/ipex_llm/transformers/models/baichuan.py
+++ b/python/llm/src/ipex_llm/transformers/models/baichuan.py
@@ -37,7 +37,7 @@ from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def baichuan_attention_forward_7b(
diff --git a/python/llm/src/ipex_llm/transformers/models/baichuan2.py b/python/llm/src/ipex_llm/transformers/models/baichuan2.py
index 309972d2..ea0a7718 100644
--- a/python/llm/src/ipex_llm/transformers/models/baichuan2.py
+++ b/python/llm/src/ipex_llm/transformers/models/baichuan2.py
@@ -46,7 +46,7 @@ except ImportError:

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def baichuan_13b_rms_norm_forward(self, hidden_states):
diff --git a/python/llm/src/ipex_llm/transformers/models/bloom.py b/python/llm/src/ipex_llm/transformers/models/bloom.py
index 5c2e658a..10010b72 100644
--- a/python/llm/src/ipex_llm/transformers/models/bloom.py
+++ b/python/llm/src/ipex_llm/transformers/models/bloom.py
@@ -42,7 +42,7 @@ from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, a

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool):
diff --git a/python/llm/src/ipex_llm/transformers/models/chatglm.py b/python/llm/src/ipex_llm/transformers/models/chatglm.py
index 0cd1cc94..b17ff131 100644
--- a/python/llm/src/ipex_llm/transformers/models/chatglm.py
+++ b/python/llm/src/ipex_llm/transformers/models/chatglm.py
@@ -40,7 +40,7 @@ def apply_rotary_pos_emb_index(q, k, cos, sin, position_id):

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))
 KV_CACHE_ALLOC_MIN_LENGTH = 512


diff --git a/python/llm/src/ipex_llm/transformers/models/chatglm2.py b/python/llm/src/ipex_llm/transformers/models/chatglm2.py
index 9812926f..ce4216dd 100644
--- a/python/llm/src/ipex_llm/transformers/models/chatglm2.py
+++ b/python/llm/src/ipex_llm/transformers/models/chatglm2.py
@@ -30,7 +30,7 @@ from ipex_llm.transformers.models.utils import use_esimd_sdp

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))
 KV_CACHE_ALLOC_MIN_LENGTH = 512


diff --git a/python/llm/src/ipex_llm/transformers/models/chatglm2_32k.py b/python/llm/src/ipex_llm/transformers/models/chatglm2_32k.py
index 38357e44..ea53aac3 100644
--- a/python/llm/src/ipex_llm/transformers/models/chatglm2_32k.py
+++ b/python/llm/src/ipex_llm/transformers/models/chatglm2_32k.py
@@ -25,7 +25,7 @@ from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, a

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))
 KV_CACHE_ALLOC_MIN_LENGTH = 512


diff --git a/python/llm/src/ipex_llm/transformers/models/decilm.py b/python/llm/src/ipex_llm/transformers/models/decilm.py
index 771cf8b9..99a8c8f4 100644
--- a/python/llm/src/ipex_llm/transformers/models/decilm.py
+++ b/python/llm/src/ipex_llm/transformers/models/decilm.py
@@ -43,7 +43,7 @@ from ipex_llm.utils.common import invalidInputError

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def decilm_attention_forward_4_35_2(
diff --git a/python/llm/src/ipex_llm/transformers/models/falcon.py b/python/llm/src/ipex_llm/transformers/models/falcon.py
index 14d08d09..9f74b0dd 100644
--- a/python/llm/src/ipex_llm/transformers/models/falcon.py
+++ b/python/llm/src/ipex_llm/transformers/models/falcon.py
@@ -43,7 +43,7 @@ import warnings

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 # Copied from transformers.models.llama.modeling_llama.rotate_half
diff --git a/python/llm/src/ipex_llm/transformers/models/gemma.py b/python/llm/src/ipex_llm/transformers/models/gemma.py
index 4eb6f5fe..585fdb89 100644
--- a/python/llm/src/ipex_llm/transformers/models/gemma.py
+++ b/python/llm/src/ipex_llm/transformers/models/gemma.py
@@ -45,7 +45,7 @@ from ipex_llm.transformers.models.utils import decoding_fast_path_qtype_check

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
diff --git a/python/llm/src/ipex_llm/transformers/models/gptj.py b/python/llm/src/ipex_llm/transformers/models/gptj.py
index 71bd4f7d..20af48c8 100644
--- a/python/llm/src/ipex_llm/transformers/models/gptj.py
+++ b/python/llm/src/ipex_llm/transformers/models/gptj.py
@@ -28,7 +28,7 @@ from ipex_llm.utils.common import invalidInputError

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def _get_embed_positions(self, position_ids):
diff --git a/python/llm/src/ipex_llm/transformers/models/gptneox.py b/python/llm/src/ipex_llm/transformers/models/gptneox.py
index 4e0129c9..54320300 100644
--- a/python/llm/src/ipex_llm/transformers/models/gptneox.py
+++ b/python/llm/src/ipex_llm/transformers/models/gptneox.py
@@ -40,7 +40,7 @@ from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def gptneox_attention_forward(
diff --git a/python/llm/src/ipex_llm/transformers/models/internlm.py b/python/llm/src/ipex_llm/transformers/models/internlm.py
index fe9f708c..f23f40b1 100644
--- a/python/llm/src/ipex_llm/transformers/models/internlm.py
+++ b/python/llm/src/ipex_llm/transformers/models/internlm.py
@@ -50,7 +50,7 @@ from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def internlm_attention_forward(
diff --git a/python/llm/src/ipex_llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py
index 6649c180..589d7149 100644
--- a/python/llm/src/ipex_llm/transformers/models/llama.py
+++ b/python/llm/src/ipex_llm/transformers/models/llama.py
@@ -83,7 +83,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
                                                            n_rep, slen, head_dim)
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 _ipex_version = None
diff --git a/python/llm/src/ipex_llm/transformers/models/mistral.py b/python/llm/src/ipex_llm/transformers/models/mistral.py
index c287ec0f..c43de710 100644
--- a/python/llm/src/ipex_llm/transformers/models/mistral.py
+++ b/python/llm/src/ipex_llm/transformers/models/mistral.py
@@ -63,7 +63,7 @@ except ImportError:

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
diff --git a/python/llm/src/ipex_llm/transformers/models/mixtral.py b/python/llm/src/ipex_llm/transformers/models/mixtral.py
index 80ddd785..4f069d90 100644
--- a/python/llm/src/ipex_llm/transformers/models/mixtral.py
+++ b/python/llm/src/ipex_llm/transformers/models/mixtral.py
@@ -60,7 +60,7 @@ from ipex_llm.transformers.low_bit_linear import IQ2_XXS

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
diff --git a/python/llm/src/ipex_llm/transformers/models/mpt.py b/python/llm/src/ipex_llm/transformers/models/mpt.py
index f6603d73..c3b11734 100644
--- a/python/llm/src/ipex_llm/transformers/models/mpt.py
+++ b/python/llm/src/ipex_llm/transformers/models/mpt.py
@@ -27,7 +27,7 @@ from ipex_llm.transformers.models.utils import extend_kv_cache, init_kv_cache, a

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def mpt_multihead_attention_forward(self, x, past_key_value=None, attn_bias=None,
diff --git a/python/llm/src/ipex_llm/transformers/models/phixtral.py b/python/llm/src/ipex_llm/transformers/models/phixtral.py
index 8feaabe8..b79c37f4 100644
--- a/python/llm/src/ipex_llm/transformers/models/phixtral.py
+++ b/python/llm/src/ipex_llm/transformers/models/phixtral.py
@@ -54,7 +54,7 @@ from ipex_llm.transformers.models.utils import mlp_fusion_check

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
diff --git a/python/llm/src/ipex_llm/transformers/models/qwen.py b/python/llm/src/ipex_llm/transformers/models/qwen.py
index 271607ef..a02db2bd 100644
--- a/python/llm/src/ipex_llm/transformers/models/qwen.py
+++ b/python/llm/src/ipex_llm/transformers/models/qwen.py
@@ -56,7 +56,7 @@ logger = logging.get_logger(__name__)

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))

 SUPPORT_TORCH2 = hasattr(torch, '__version__') and int(torch.__version__.split(".")[0]) >= 2

diff --git a/python/llm/src/ipex_llm/transformers/models/qwen2.py b/python/llm/src/ipex_llm/transformers/models/qwen2.py
index 2369c5a7..e411f691 100644
--- a/python/llm/src/ipex_llm/transformers/models/qwen2.py
+++ b/python/llm/src/ipex_llm/transformers/models/qwen2.py
@@ -71,7 +71,7 @@ logger = logging.get_logger(__name__)

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def should_use_fuse_rope(self, query_states, position_ids):
diff --git a/python/llm/src/ipex_llm/transformers/models/qwen_vl.py b/python/llm/src/ipex_llm/transformers/models/qwen_vl.py
index cfc390b7..869dc052 100644
--- a/python/llm/src/ipex_llm/transformers/models/qwen_vl.py
+++ b/python/llm/src/ipex_llm/transformers/models/qwen_vl.py
@@ -37,7 +37,7 @@ from ipex_llm.transformers.models.utils import decoding_fast_path_qtype_check

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def apply_rotary_pos_emb(t, freqs):
diff --git a/python/llm/src/ipex_llm/transformers/models/stablelm.py b/python/llm/src/ipex_llm/transformers/models/stablelm.py
index a6cd1bfb..4c8a6904 100644
--- a/python/llm/src/ipex_llm/transformers/models/stablelm.py
+++ b/python/llm/src/ipex_llm/transformers/models/stablelm.py
@@ -62,7 +62,7 @@ except ImportError:

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def merge_qkv(module: torch.nn.Module):
diff --git a/python/llm/src/ipex_llm/transformers/models/yuan.py b/python/llm/src/ipex_llm/transformers/models/yuan.py
index 43f86732..f0e8f9cc 100644
--- a/python/llm/src/ipex_llm/transformers/models/yuan.py
+++ b/python/llm/src/ipex_llm/transformers/models/yuan.py
@@ -40,7 +40,7 @@ from ipex_llm.transformers.models.utils import decoding_fast_path_qtype_check

 import os

-KV_CACHE_ALLOC_BLOCK_LENGTH = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)
+KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


 def use_decoding_fast_path(proj, use_fuse_rope, enough_kv_room, bs):
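The same one-line fix is applied in all 23 files. os.environ.get always returns a str when the variable is set, so the old code yielded an int (the 256 default) only when KV_CACHE_ALLOC_BLOCK_LENGTH was unset; as soon as a user exported it, the "constant" became a string and the cache-sizing arithmetic built on it (via helpers such as init_kv_cache and extend_kv_cache, imported in several of the files above) would raise a TypeError or silently misbehave. A minimal, self-contained sketch of the failure mode and the fix; the variable name matches the patch, everything else is illustrative:

    import os

    # Simulate a user overriding the block length, as they might with
    #   export KV_CACHE_ALLOC_BLOCK_LENGTH=512
    os.environ["KV_CACHE_ALLOC_BLOCK_LENGTH"] = "512"

    old = os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)         # "512", a str
    new = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))    # 512, an int

    print(type(old).__name__, type(new).__name__)  # str int
    print(new + 100)                               # 612: cache-size arithmetic works
    # old + 100  # TypeError: can only concatenate str (not "int") to str

Note that the patched code still raises (a ValueError from int()) if the variable is set to a non-numeric value, which is arguably the right behavior; tolerating garbage input would require wrapping the int() call in a try/except and falling back to 256.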