optimize npu llama2 first token performance (#11451)
This commit is contained in:

parent 4e4ecd5095
commit 029ff15d28

1 changed file with 19 additions and 7 deletions
@@ -219,13 +219,25 @@ def llama_attention_forward(
     else:
         causal_mask = None
 
-    attn_output = torch.nn.functional.scaled_dot_product_attention(
-        query_states,
-        key_states,
-        value_states,
-        attn_mask=causal_mask,
-        is_causal=self.is_causal and attention_mask is None and q_len > 1,
-    )
+    if query_states.size(2) == key_states.size(2):
+        # first token
+        from intel_npu_acceleration_library.functional import scaled_dot_product_attention
+        attn_output = scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            is_causal=self.is_causal and causal_mask is None and q_len > 1,
+        )
+    else:
+        # second+ token
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            is_causal=self.is_causal and causal_mask is None and q_len > 1,
+        )
 
     attn_output = attn_output.transpose(1, 2).contiguous()
 
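For context on the dispatch this patch introduces: query_states.size(2) == key_states.size(2) holds only on the prefill (first-token) pass, where the queries and the cached keys span the same sequence length, so that pass is routed to the NPU-accelerated scaled_dot_product_attention from intel_npu_acceleration_library; on later decode steps a single new query attends to a longer KV cache and the stock torch.nn.functional path is kept. The shape-only sketch below illustrates the check; the tensor sizes and the choose_sdpa_path helper are illustrative assumptions, not part of the commit.

import torch

def choose_sdpa_path(query_states: torch.Tensor, key_states: torch.Tensor) -> str:
    # Dim 2 is the sequence-length axis of [batch, num_heads, seq_len, head_dim] tensors.
    # Prefill (first token): every prompt position is queried at once, so q_len == kv_len.
    # Decode (second+ token): one new query attends to the full KV cache, so q_len < kv_len.
    if query_states.size(2) == key_states.size(2):
        return "first token: intel_npu_acceleration_library scaled_dot_product_attention"
    return "second+ token: torch.nn.functional.scaled_dot_product_attention"

# Prefill step: 8 prompt tokens against 8 cached positions.
q = torch.randn(1, 32, 8, 128)
k = torch.randn(1, 32, 8, 128)
print(choose_sdpa_path(q, k))

# Decode step: 1 new token against 9 cached positions.
q = torch.randn(1, 32, 1, 128)
k = torch.randn(1, 32, 9, 128)
print(choose_sdpa_path(q, k))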