use new fp32 softmax kernel (#11776)

2024-08-13 14:48:11 +08:00 · 2024-08-13 14:48:11 +08:00 · aa861df066
commit aa861df066
parent 23d3acdc77
2 changed files with 6 additions and 5 deletions
--- a/python/llm/src/ipex_llm/transformers/models/minicpmv.py
+++ b/python/llm/src/ipex_llm/transformers/models/minicpmv.py
@@ -42,8 +42,9 @@ def siglip_attention_forward(
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

-    # upcast attention to fp32
-    attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1)
+    import xe_addons
+    xe_addons.attn_softmax_inplaced(attn_weights)
+
    attn_weights = torch.nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
    attn_output = torch.matmul(attn_weights, value_states)

--- a/python/llm/src/ipex_llm/transformers/models/phi3.py
+++ b/python/llm/src/ipex_llm/transformers/models/phi3.py
@@ -184,9 +184,9 @@ def attention_forward(
        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

-        # upcast attention to fp32
-        attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1,
-                                                   dtype=torch.float32).to(value_states.dtype)
+        import xe_addons
+        xe_addons.attn_softmax_inplaced(attn_weights)
+
        attn_weights = torch.nn.functional.dropout(attn_weights, p=self.attention_dropout,
                                                   training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)