diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py
index 867527be..e0811a5c 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py
@@ -24,6 +24,8 @@ from transformers.utils import logging
 from packaging import version
 import os
 import shutil
+import time
+

 logger = logging.get_logger(__name__)

@@ -55,6 +57,7 @@ if __name__ == "__main__":
     model_path = args.repo_id_or_model_path
     save_dir = args.save_directory

+    t0 = time.perf_counter()
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  optimize_model=True,
                                                  pipeline=True,
@@ -69,6 +72,7 @@ if __name__ == "__main__":
                                                  trust_remote_code=True,
                                                  convert_model=True,
                                                  save_directory=save_dir)
+    t1 = time.perf_counter()

     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

@@ -81,5 +85,6 @@ if __name__ == "__main__":
     tokenizer.save_pretrained(save_dir)

     print("-" * 80)
+    print(f"Convert model cost {t1 - t0}s.")
     print(f"finish save model to {save_dir}")
     print("success shut down")
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
index 38520827..5760e7fb 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -135,9 +135,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     if mode == "decode":
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
+        npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
+        npu_dpu_groups = 6
+
     single_decoder = LowBitQwenMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
         input_layernorm_weights=None,
@@ -162,7 +165,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
-                                                        temp_dir, True, False)
+                                                        temp_dir, True, False,
+                                                        npu_dpu_groups=npu_dpu_groups)

     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
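
Note on the timing change in convert.py: time.perf_counter() is a monotonic, high-resolution clock, so the t1 - t0 delta measures elapsed wall time and is robust against system clock adjustments. A minimal standalone sketch of the same pattern, with a hypothetical do_conversion() standing in for the AutoModelForCausalLM.from_pretrained(...) call in the diff:

    import time

    def do_conversion():
        # Hypothetical stand-in for the actual NPU model conversion work
        # performed by from_pretrained(..., convert_model=True).
        pass

    t0 = time.perf_counter()
    do_conversion()
    t1 = time.perf_counter()
    # Same reporting style as the diff's added print statement.
    print(f"Convert model cost {t1 - t0}s.")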