[NPU] Qwen prefill attn_mask type hotfix (#12395)

* qwen prefill attn_mask type fp16

* update: skip fp16 conversion of attention_mask in prefill mode (convert only for decode)
This commit is contained in:
Yina Chen 2024-11-13 11:51:34 +02:00 committed by GitHub
parent 9220babaab
commit d6d63d6b84
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -247,7 +247,8 @@ class LLMBaseNNFactory(NNFactory):
attn_weight = self.matmul(query_states, key_states, False, True) / (
math.sqrt(head_dim)
)
attention_mask = self.convert_to_fp16(attention_mask)
if mode != "prefill":
attention_mask = self.convert_to_fp16(attention_mask)
attn_weight = self.eltwise_add(attn_weight, attention_mask)
attn_weight = self.convert_to_fp32(attn_weight)
attn_weight = self.softmax(attn_weight, -1)