From ae57e23e4f7cb58b936e8d58f47599f8c04b88de Mon Sep 17 00:00:00 2001
From: Ruonan Wang
Date: Fri, 25 Oct 2024 10:31:44 +0800
Subject: [PATCH] fix incompatibility between llama GW & llama pipeline
 (#12267)

* fix

* fix
---
 .../LLM/Pipeline-Models/llama.py            |  5 ++--
 .../npu_pipeline_model/convert_pipeline.py  | 24 ++++++++++++-------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py
index f843e488..469081c3 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/llama.py
@@ -59,7 +59,8 @@ if __name__ == "__main__":
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  optimize_model=True,
                                                  pipeline=True,
-                                                 max_output_len=args.max_output_len)
+                                                 max_output_len=args.max_output_len,
+                                                 attn_implementation="eager")
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
@@ -69,8 +70,8 @@ if __name__ == "__main__":
     print("-" * 80)
     print("done")
     with torch.inference_mode():
+        print("finish to load")
         for i in range(5):
-            print("finish to load")
             prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT)
             _input_ids = tokenizer.encode(prompt, return_tensors="pt")
             print("input length:", len(_input_ids[0]))
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
index e620e3d9..883616c8 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
@@ -246,15 +246,21 @@ def convert_llm(model: torch.nn.Module,
                 attn_layer = curr_layer.self_attn
                 mlp_layer = curr_layer.mlp
 
-                weights = [
-                    (attn_layer.q_proj.weight, attn_layer.q_proj.scale),
-                    (attn_layer.k_proj.weight, attn_layer.k_proj.scale),
-                    (attn_layer.v_proj.weight, attn_layer.v_proj.scale),
-                    (attn_layer.o_proj.weight, attn_layer.o_proj.scale),
-                    (mlp_layer.gate_proj.weight, mlp_layer.gate_proj.scale),
-                    (mlp_layer.up_proj.weight, mlp_layer.up_proj.scale),
-                    (mlp_layer.down_proj.weight, mlp_layer.down_proj.scale),
-                ]
+                weights = []
+                for q, k, v, o, g, u, d in zip(attn_layer.q_proj_dq_list,
+                                               attn_layer.k_proj_dq_list,
+                                               attn_layer.v_proj_dq_list,
+                                               attn_layer.o_proj_dq_list,
+                                               mlp_layer.gate_proj_dq_list,
+                                               mlp_layer.up_proj_dq_list,
+                                               mlp_layer.down_proj_dq_list):
+                    weights.append((q.weight, q.scale))
+                    weights.append((k.weight, k.scale))
+                    weights.append((v.weight, v.scale))
+                    weights.append((o.weight, o.scale))
+                    weights.append((g.weight, g.scale))
+                    weights.append((u.weight, u.scale))
+                    weights.append((d.weight, d.scale))
 
                 cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
                 cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
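
Note on the convert_pipeline.py hunk: the old code read a single q_proj/k_proj/... module per layer, while the new code reads per-split lists (q_proj_dq_list and friends), which appears to be the incompatibility between the group-wise ("GW") llama path and the pipeline converter that the subject refers to. The following is a minimal, self-contained sketch in plain Python, not ipex-llm code, of the same weight-gathering pattern; it assumes each *_dq_list entry exposes .weight and .scale, and all class and field values below are stand-ins for illustration only.

    # Stand-in objects: each dq "split" exposes .weight and .scale,
    # mirroring what the new loop in convert_pipeline.py expects.
    from collections import namedtuple

    DQLinear = namedtuple("DQLinear", ["weight", "scale"])

    def dq_list(name, n_splits):
        return [DQLinear(f"{name}_w{i}", f"{name}_s{i}") for i in range(n_splits)]

    class FakeAttn:
        def __init__(self, n):
            self.q_proj_dq_list = dq_list("q", n)
            self.k_proj_dq_list = dq_list("k", n)
            self.v_proj_dq_list = dq_list("v", n)
            self.o_proj_dq_list = dq_list("o", n)

    class FakeMLP:
        def __init__(self, n):
            self.gate_proj_dq_list = dq_list("gate", n)
            self.up_proj_dq_list = dq_list("up", n)
            self.down_proj_dq_list = dq_list("down", n)

    attn_layer, mlp_layer = FakeAttn(2), FakeMLP(2)

    # Same shape as the patched code: walk the splits in lockstep and emit
    # one (weight, scale) pair per projection per split.
    weights = []
    for q, k, v, o, g, u, d in zip(attn_layer.q_proj_dq_list,
                                   attn_layer.k_proj_dq_list,
                                   attn_layer.v_proj_dq_list,
                                   attn_layer.o_proj_dq_list,
                                   mlp_layer.gate_proj_dq_list,
                                   mlp_layer.up_proj_dq_list,
                                   mlp_layer.down_proj_dq_list):
        for proj in (q, k, v, o, g, u, d):
            weights.append((proj.weight, proj.scale))

    print(len(weights))  # 7 projections x 2 splits = 14 (weight, scale) pairs

With a single split per projection this produces the same seven (weight, scale) pairs in the same order as the old hard-coded list, so the pipeline converter keeps working while also covering the multi-split case.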