diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py b/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py
index e5c099ba..7e1e2d0d 100644
--- a/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py
+++ b/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py
@@ -42,7 +42,6 @@ if __name__ == '__main__':
     # which convert the relevant layers in the model into INT4 format
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
-                                                 optimize_model=False,
                                                  trust_remote_code=True,
                                                  use_cache=True)
     model = model.to('xpu')
diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan2/generate.py b/python/llm/example/gpu/hf-transformers-models/baichuan2/generate.py
index 16a7b9d8..ebb87ad9 100644
--- a/python/llm/example/gpu/hf-transformers-models/baichuan2/generate.py
+++ b/python/llm/example/gpu/hf-transformers-models/baichuan2/generate.py
@@ -46,7 +46,6 @@ if __name__ == '__main__':
     # to obtain optimal performance with BigDL-LLM INT4 optimizations
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  load_in_4bit=True,
-                                                 optimize_model=False,
                                                  trust_remote_code=True,
                                                  use_cache=True)
     model = model.to('xpu')
diff --git a/python/llm/src/bigdl/llm/transformers/models/baichuan.py b/python/llm/src/bigdl/llm/transformers/models/baichuan.py
index 5d2d735c..71a4e9de 100644
--- a/python/llm/src/bigdl/llm/transformers/models/baichuan.py
+++ b/python/llm/src/bigdl/llm/transformers/models/baichuan.py
@@ -70,6 +70,8 @@ def baichuan_attention_forward_7b(
         cache_k = past_key_value[0]
         cache_v = past_key_value[1]
         if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
+            if device.type == 'xpu':
+                torch.xpu.empty_cache()
             # allocate new
             new_cache_k, new_cache_v = create_kv_cache(bsz,
                                                        self.num_heads,
@@ -168,6 +170,8 @@ def baichuan_attention_forward_13b(
         cache_k = past_key_value[0]
         cache_v = past_key_value[1]
         if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
+            if device.type == 'xpu':
+                torch.xpu.empty_cache()
             # allocate new
             new_cache_k, new_cache_v = create_kv_cache(bsz,
                                                        self.num_heads,
diff --git a/python/llm/src/bigdl/llm/transformers/models/baichuan2.py b/python/llm/src/bigdl/llm/transformers/models/baichuan2.py
index b1179c55..64dc2532 100644
--- a/python/llm/src/bigdl/llm/transformers/models/baichuan2.py
+++ b/python/llm/src/bigdl/llm/transformers/models/baichuan2.py
@@ -82,6 +82,8 @@ def baichuan_attention_forward_7b(
         cache_k = past_key_value[0]
         cache_v = past_key_value[1]
         if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
+            if device.type == 'xpu':
+                torch.xpu.empty_cache()
             # allocate new
             new_cache_k, new_cache_v = create_kv_cache(bsz,
                                                        self.num_heads,
@@ -177,6 +179,8 @@ def baichuan_attention_forward_13b(
         cache_k = past_key_value[0]
         cache_v = past_key_value[1]
         if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
+            if device.type == 'xpu':
+                torch.xpu.empty_cache()
             # allocate new
             new_cache_k, new_cache_v = create_kv_cache(bsz,
                                                        self.num_heads,
diff --git a/python/llm/src/bigdl/llm/transformers/models/bloom.py b/python/llm/src/bigdl/llm/transformers/models/bloom.py
index a6d42920..f3e08cba 100644
--- a/python/llm/src/bigdl/llm/transformers/models/bloom.py
+++ b/python/llm/src/bigdl/llm/transformers/models/bloom.py
@@ -105,6 +105,8 @@ def bloom_attention_forward(
         cache_k = layer_past[0].transpose(1, 2).view(batch_size, self.num_heads, -1, self.head_dim)
         cache_v = layer_past[1].view(batch_size, self.num_heads, -1, self.head_dim)
         if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
+            if device.type == 'xpu':
+                torch.xpu.empty_cache()
             # allocate new
             new_cache_k, new_cache_v = create_kv_cache(
                 batch_size,
diff --git a/python/llm/src/bigdl/llm/transformers/models/chatglm.py b/python/llm/src/bigdl/llm/transformers/models/chatglm.py
index 6c1a0a8a..89525697 100644
--- a/python/llm/src/bigdl/llm/transformers/models/chatglm.py
+++ b/python/llm/src/bigdl/llm/transformers/models/chatglm.py
@@ -67,6 +67,8 @@ def attention_fn(
         cache_v = cache_v.permute(1, 2, 0, 3)
         past_length = cache_k.size(2)
         if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
+            if device.type == 'xpu':
+                torch.xpu.empty_cache()
             max_cache_length = past_length + cur_length + KV_CACHE_ALLOC_BLOCK_LENGTH
             new_cache_k, new_cache_v = create_kv_cache(batch_size,
                                                        self.num_attention_heads_per_partition,
diff --git a/python/llm/src/bigdl/llm/transformers/models/chatglm2.py b/python/llm/src/bigdl/llm/transformers/models/chatglm2.py
index d43452cb..5de558e9 100644
--- a/python/llm/src/bigdl/llm/transformers/models/chatglm2.py
+++ b/python/llm/src/bigdl/llm/transformers/models/chatglm2.py
@@ -151,6 +151,8 @@ def chatglm2_attention_forward_8eb45c(
         past_length = cache_k.size(2)
 
         if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
+            if device.type == 'xpu':
+                torch.xpu.empty_cache()
             max_cache_length = past_length + cur_length + KV_CACHE_ALLOC_BLOCK_LENGTH
             new_cache_k, new_cache_v = create_kv_cache(batch_size,
                                                        self.num_attention_heads_per_partition,
diff --git a/python/llm/src/bigdl/llm/transformers/models/falcon.py b/python/llm/src/bigdl/llm/transformers/models/falcon.py
index c8b1dcf1..0b8ef9c4 100644
--- a/python/llm/src/bigdl/llm/transformers/models/falcon.py
+++ b/python/llm/src/bigdl/llm/transformers/models/falcon.py
@@ -97,6 +97,8 @@ def rw_attention_forward_7b(
         cache_k = layer_past[0].view(batch_size, self.num_kv, -1, self.head_dim)
         cache_v = layer_past[1].view(batch_size, self.num_kv, -1, self.head_dim)
         if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
+            if device.type == 'xpu':
+                torch.xpu.empty_cache()
             # allocate new
             new_cache_k, new_cache_v = create_kv_cache(
                 batch_size,
diff --git a/python/llm/src/bigdl/llm/transformers/models/gptj.py b/python/llm/src/bigdl/llm/transformers/models/gptj.py
index 65674360..8e390fca 100644
--- a/python/llm/src/bigdl/llm/transformers/models/gptj.py
+++ b/python/llm/src/bigdl/llm/transformers/models/gptj.py
@@ -144,6 +144,8 @@ def gptj_attention_forward(
         past_length = cache_k.size(2)
 
         if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
+            if device.type == 'xpu':
+                torch.xpu.empty_cache()
             new_cache_k, new_cache_v = create_kv_cache(batch_size,
                                                        self.num_attention_heads,
                                                        self.head_dim,
diff --git a/python/llm/src/bigdl/llm/transformers/models/gptneox.py b/python/llm/src/bigdl/llm/transformers/models/gptneox.py
index a0e3edde..0d0c16c6 100644
--- a/python/llm/src/bigdl/llm/transformers/models/gptneox.py
+++ b/python/llm/src/bigdl/llm/transformers/models/gptneox.py
@@ -90,6 +90,8 @@ def gptneox_attention_forward(
         past_key = layer_past[0]
         past_value = layer_past[1]
         if past_key.stride()[1] <= past_key.size(2) * past_key.size(3):
+            if device.type == 'xpu':
+                torch.xpu.empty_cache()
             # allocate new
             new_past_key, new_past_value = create_kv_cache(bsz,
                                                            self.num_attention_heads,
diff --git a/python/llm/src/bigdl/llm/transformers/models/llama.py b/python/llm/src/bigdl/llm/transformers/models/llama.py
index 212abc2a..c8b07f63 100644
--- a/python/llm/src/bigdl/llm/transformers/models/llama.py
+++ b/python/llm/src/bigdl/llm/transformers/models/llama.py
@@ -112,6 +112,8 @@ def llama_attention_forward_4_31(
         cache_k = past_key_value[0]
         cache_v = past_key_value[1]
         if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
+            if device.type == 'xpu':
+                torch.xpu.empty_cache()
             # allocate new
             new_cache_k, new_cache_v = create_kv_cache(bsz,
                                                        self.num_key_value_heads,  # Support GQA
diff --git a/python/llm/src/bigdl/llm/transformers/models/utils.py b/python/llm/src/bigdl/llm/transformers/models/utils.py
index 8890de1a..8d85db74 100644
--- a/python/llm/src/bigdl/llm/transformers/models/utils.py
+++ b/python/llm/src/bigdl/llm/transformers/models/utils.py
@@ -19,8 +19,6 @@ from bigdl.llm.utils.common import invalidInputError
 
 
 def create_kv_cache(batch_size, num_heads, head_dim, current_length, max_length,
                     dtype, device):
-    if device.type == 'xpu':
-        torch.xpu.empty_cache()
     key_cache_storage = torch.empty(batch_size, num_heads, max_length, head_dim,
                                     dtype=dtype, device=device)
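Note (illustration only, not part of the patch): the sketch below shows the call-site pattern this diff installs in each attention forward, assuming create_kv_cache keeps the signature shown above; the wrapper function name and the KV_CACHE_ALLOC_BLOCK_LENGTH value are hypothetical placeholders, and cache_k.device stands in for the local device variable used in the real forward functions.

import torch

from bigdl.llm.transformers.models.utils import create_kv_cache

KV_CACHE_ALLOC_BLOCK_LENGTH = 256  # placeholder; the real constant is defined in the model files


def grow_kv_cache_if_needed(cache_k, cache_v, bsz, num_heads, head_dim, cur_length):
    # The views returned by create_kv_cache are narrower than their backing storage,
    # so stride(1) > size(2) * size(3) while reserved head-room remains.
    past_length = cache_k.size(2)
    if cache_k.stride()[1] <= cache_k.size(2) * cache_k.size(3):
        # What this patch moves to the call site: release cached XPU allocator blocks
        # right before the larger allocation, instead of unconditionally inside
        # create_kv_cache.
        if cache_k.device.type == 'xpu':
            torch.xpu.empty_cache()
        max_cache_length = past_length + cur_length + KV_CACHE_ALLOC_BLOCK_LENGTH
        new_cache_k, new_cache_v = create_kv_cache(bsz, num_heads, head_dim,
                                                   past_length, max_cache_length,
                                                   dtype=cache_k.dtype,
                                                   device=cache_k.device)
        new_cache_k[:] = cache_k
        new_cache_v[:] = cache_v
        cache_k, cache_v = new_cache_k, new_cache_v
    return cache_k, cache_v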