Fix Llama 3.2 & 3.1 on LNL (#12196)

Yuwen Hu 2024-10-14 17:39:20 +08:00 committed by GitHub
parent 516b578104
commit f8d1adc573
2 changed files with 4 additions and 1 deletion


@@ -1268,7 +1268,7 @@ def _optimize_post(model, lightweight_bmm=False):
     from ipex_llm.transformers.models.llama import llama_mlp_forward
     if model.config.model_type == "llama" and model.config.rope_scaling is not None:
-        # llama 3.2
+        # llama 3.2 & llama 3.1
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
         from ipex_llm.transformers.models.common import rms_norm_forward
@@ -1279,6 +1279,7 @@ def _optimize_post(model, lightweight_bmm=False):
         convert_forward(model, module.LlamaMLP, mlp_silu_forward)
         convert_forward(model, module.LlamaModel, llama_model_forward)
         convert_forward(model, module.LlamaAttention, llama_attention_forward)
+        convert_forward(model, module.LlamaSdpaAttention, llama_attention_forward)
     elif model.config.model_type == "mllama":
         # llama 3.2 vision
         modeling_module_name = model.__class__.__module__
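
The first change, in _optimize_post, registers the optimized llama_attention_forward for module.LlamaSdpaAttention in addition to module.LlamaAttention. Recent transformers releases instantiate the SDPA attention class by default when SDPA is available, so a Llama 3.1/3.2 checkpoint loaded without an explicit attn_implementation would otherwise bypass the IPEX-LLM attention path. The sketch below is illustrative only (toy classes, not the ipex-llm implementation); it assumes the patching helper matches on the exact module class, which the separate conversion of both classes implies:

# Minimal sketch, not the ipex-llm implementation: forward patching keyed on
# the exact module class. Class and function names here are illustrative.
import types

import torch
import torch.nn as nn


class ToyAttention(nn.Module):          # stands in for LlamaAttention
    def forward(self, x):
        return x                        # unoptimized eager path


class ToySdpaAttention(ToyAttention):   # stands in for LlamaSdpaAttention
    pass


def optimized_forward(self, x):
    return x * 2                        # pretend this is the fused XPU path


def convert_forward(model, target_cls, new_forward):
    # Rebind `forward` on every submodule whose class is exactly `target_cls`.
    for module in model.modules():
        if module.__class__ is target_cls:
            module.forward = types.MethodType(new_forward, module)


model = nn.Sequential(ToySdpaAttention())        # default load -> SDPA class
convert_forward(model, ToyAttention, optimized_forward)
print(model(torch.ones(1)))                      # tensor([1.]) - not patched
convert_forward(model, ToySdpaAttention, optimized_forward)
print(model(torch.ones(1)))                      # tensor([2.]) - patched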


@@ -204,6 +204,8 @@ def llama_attention_forward(
     kv_seq_len = key_states.size(2)
     if attention_mask is not None:  # no matter the length, we just slice it
         causal_mask = attention_mask[:, :, :, :kv_seq_len]
+    else:
+        causal_mask = None
     attn_weights = None
     if use_sdp(q_len, kv_seq_len, self.head_dim, query_states):
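
The second change, in llama_attention_forward, gives causal_mask a None default. Previously the variable was only assigned inside the `if attention_mask is not None:` branch, so any call without an attention mask hit an UnboundLocalError when the mask was used further down. A minimal sketch of the failure mode follows; the function bodies are simplified stand-ins, not the actual ipex-llm attention code:

# Minimal sketch of the failure mode fixed by the `else: causal_mask = None`
# branch; names and bodies are illustrative only.
def slice_mask_buggy(attention_mask, kv_seq_len):
    if attention_mask is not None:      # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, :kv_seq_len]
    return causal_mask                  # UnboundLocalError when mask is None


def slice_mask_fixed(attention_mask, kv_seq_len):
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, :kv_seq_len]
    else:
        causal_mask = None              # SDPA treats None as "no explicit mask"
    return causal_mask


print(slice_mask_fixed(None, 16))       # None
try:
    slice_mask_buggy(None, 16)
except UnboundLocalError as err:
    print("buggy path:", err)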