From e08c6bd018fd2750daed1ebc573de71f111dccd9 Mon Sep 17 00:00:00 2001
From: Ruonan Wang
Date: Tue, 15 Apr 2025 11:13:12 +0800
Subject: [PATCH] Fix several models based on sdp api change (#13075)

* fix baichuan based on sdp api change

* fix several models based on api change

* fix style
---
 python/llm/src/ipex_llm/transformers/models/baichuan.py | 7 +++++--
 python/llm/src/ipex_llm/transformers/models/chatglm.py  | 4 +++-
 python/llm/src/ipex_llm/transformers/models/qwen_vl.py  | 4 +++-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/python/llm/src/ipex_llm/transformers/models/baichuan.py b/python/llm/src/ipex_llm/transformers/models/baichuan.py
index c74ad68f..3a6ea10b 100644
--- a/python/llm/src/ipex_llm/transformers/models/baichuan.py
+++ b/python/llm/src/ipex_llm/transformers/models/baichuan.py
@@ -326,14 +326,17 @@ def baichuan_attention_forward_13b(
         else:
             attention_mask = attention_mask[None, :, -q_len:, :]
 
+    head_dim = query_states.shape[-1]
+    scale = 1 / math.sqrt(head_dim)
+
     if use_sdp(q_len, kv_seq_len, self.head_dim, query_states):
         import xe_addons
         if use_quantize_kv:
             attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states,
-                                            attention_mask)
+                                            attention_mask, scale)
         else:
             attn_output = xe_addons.sdp(query_states, key_states, value_states,
-                                        attention_mask)
+                                        attention_mask, scale)
         attn_weights = None
     else:
         if use_quantize_kv:
diff --git a/python/llm/src/ipex_llm/transformers/models/chatglm.py b/python/llm/src/ipex_llm/transformers/models/chatglm.py
index 34241d89..c4f49aeb 100644
--- a/python/llm/src/ipex_llm/transformers/models/chatglm.py
+++ b/python/llm/src/ipex_llm/transformers/models/chatglm.py
@@ -68,7 +68,9 @@ def glm_sdpa(query, key, value, attention_mask=None, is_causal=False):
         if use_sdp(query.shape[2], key.shape[2],
                    query.shape[-1], query):
             import xe_addons
-            attn_output = xe_addons.sdp(query, key, value, attn_bias)
+            head_dim = query.shape[-1]
+            scale = 1 / math.sqrt(head_dim)
+            attn_output = xe_addons.sdp(query, key, value, attn_bias, scale)
             context_layer = attn_output.view(query.shape)
         else:
             head_dim = query.size(-1)
diff --git a/python/llm/src/ipex_llm/transformers/models/qwen_vl.py b/python/llm/src/ipex_llm/transformers/models/qwen_vl.py
index a2f6e948..0c15a096 100644
--- a/python/llm/src/ipex_llm/transformers/models/qwen_vl.py
+++ b/python/llm/src/ipex_llm/transformers/models/qwen_vl.py
@@ -164,7 +164,9 @@ def qwen_attention_forward_vl(
     if not self.training and not hidden_states.requires_grad and \
             use_sdp(q_len, key.shape[2], self.head_dim, query):
         import xe_addons
-        attn_output = xe_addons.sdp(query, key, value, attention_mask)
+        head_dim = query.shape[-1]
+        scale = 1 / math.sqrt(head_dim)
+        attn_output = xe_addons.sdp(query, key, value, attention_mask, scale)
         attn_output = attn_output.view(query.shape)
         attn_output = attn_output.transpose(1, 2)
         attn_weight = None
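
As a quick illustration of the calling convention these hunks move to, the sketch below shows a caller computing the softmax scale as 1 / sqrt(head_dim) and passing it as the new fifth argument. It is not part of the patch: the helper name and the tensor layout are assumed for the example, and it presumes the 5-argument xe_addons.sdp(query, key, value, mask, scale) signature used above, with xe_addons available only in IPEX-LLM XPU builds.

    # Illustrative sketch only; `call_sdp_with_scale` is a hypothetical helper,
    # not a function introduced by this patch.
    import math

    def call_sdp_with_scale(query, key, value, attention_mask):
        # query/key/value assumed shaped [batch, num_heads, seq_len, head_dim]
        head_dim = query.shape[-1]
        scale = 1 / math.sqrt(head_dim)  # standard attention softmax scaling
        import xe_addons                 # XPU-only extension module
        return xe_addons.sdp(query, key, value, attention_mask, scale)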