From e08c6bd018fd2750daed1ebc573de71f111dccd9 Mon Sep 17 00:00:00 2001
From: Ruonan Wang
Date: Tue, 15 Apr 2025 11:13:12 +0800
Subject: [PATCH] Fix several models based on sdp api change (#13075)

* fix baichuan based on sdp api change

* fix several models based on api change

* fix style
---
 python/llm/src/ipex_llm/transformers/models/baichuan.py | 7 +++++--
 python/llm/src/ipex_llm/transformers/models/chatglm.py  | 4 +++-
 python/llm/src/ipex_llm/transformers/models/qwen_vl.py  | 4 +++-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/python/llm/src/ipex_llm/transformers/models/baichuan.py b/python/llm/src/ipex_llm/transformers/models/baichuan.py
index c74ad68f..3a6ea10b 100644
--- a/python/llm/src/ipex_llm/transformers/models/baichuan.py
+++ b/python/llm/src/ipex_llm/transformers/models/baichuan.py
@@ -326,14 +326,17 @@ def baichuan_attention_forward_13b(
         else:
             attention_mask = attention_mask[None, :, -q_len:, :]
 
+    head_dim = query_states.shape[-1]
+    scale = 1 / math.sqrt(head_dim)
+
     if use_sdp(q_len, kv_seq_len, self.head_dim, query_states):
         import xe_addons
         if use_quantize_kv:
             attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states,
-                                            attention_mask)
+                                            attention_mask, scale)
         else:
             attn_output = xe_addons.sdp(query_states, key_states, value_states,
-                                        attention_mask)
+                                        attention_mask, scale)
         attn_weights = None
     else:
         if use_quantize_kv:
diff --git a/python/llm/src/ipex_llm/transformers/models/chatglm.py b/python/llm/src/ipex_llm/transformers/models/chatglm.py
index 34241d89..c4f49aeb 100644
--- a/python/llm/src/ipex_llm/transformers/models/chatglm.py
+++ b/python/llm/src/ipex_llm/transformers/models/chatglm.py
@@ -68,7 +68,9 @@ def glm_sdpa(query, key, value, attention_mask=None, is_causal=False):
         if use_sdp(query.shape[2], key.shape[2],
                    query.shape[-1], query):
             import xe_addons
-            attn_output = xe_addons.sdp(query, key, value, attn_bias)
+            head_dim = query.shape[-1]
+            scale = 1 / math.sqrt(head_dim)
+            attn_output = xe_addons.sdp(query, key, value, attn_bias, scale)
             context_layer = attn_output.view(query.shape)
         else:
             head_dim = query.size(-1)
diff --git a/python/llm/src/ipex_llm/transformers/models/qwen_vl.py b/python/llm/src/ipex_llm/transformers/models/qwen_vl.py
index a2f6e948..0c15a096 100644
--- a/python/llm/src/ipex_llm/transformers/models/qwen_vl.py
+++ b/python/llm/src/ipex_llm/transformers/models/qwen_vl.py
@@ -164,7 +164,9 @@ def qwen_attention_forward_vl(
     if not self.training and not hidden_states.requires_grad and \
             use_sdp(q_len, key.shape[2], self.head_dim, query):
         import xe_addons
-        attn_output = xe_addons.sdp(query, key, value, attention_mask)
+        head_dim = query.shape[-1]
+        scale = 1 / math.sqrt(head_dim)
+        attn_output = xe_addons.sdp(query, key, value, attention_mask, scale)
         attn_output = attn_output.view(query.shape)
         attn_output = attn_output.transpose(1, 2)
         attn_weight = None
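
As a quick illustration of the calling convention these hunks move to, the sketch below shows a caller computing the softmax scale as 1 / sqrt(head_dim) and passing it as the new fifth argument. It is not part of the patch: the helper name and the tensor layout are assumed for the example, and it presumes the 5-argument xe_addons.sdp(query, key, value, mask, scale) signature used above, with xe_addons available only in IPEX-LLM XPU builds.

    # Illustrative sketch only; `call_sdp_with_scale` is a hypothetical helper,
    # not a function introduced by this patch.
    import math

    def call_sdp_with_scale(query, key, value, attention_mask):
        # query/key/value assumed shaped [batch, num_heads, seq_len, head_dim]
        head_dim = query.shape[-1]
        scale = 1 / math.sqrt(head_dim)  # standard attention softmax scaling
        import xe_addons                 # XPU-only extension module
        return xe_addons.sdp(query, key, value, attention_mask, scale)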