diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
index a59b9ec2..5ad0627b 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
@@ -8,6 +8,7 @@ In this directory, you will find examples on how to directly run HuggingFace `tr
 |------------|----------------------------------------------------------------|
 | Llama2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) |
 | Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
+| Qwen2 | [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
 | Qwen2.5 | [Qwen/Qwen2.5-7b-Instruct](https://huggingface.co/Qwen/Qwen2.5-7b-Instruct) |
 | Baichuan2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan-7B-Chat) |
 | MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16) |
@@ -50,6 +51,9 @@ python llama3.py
 :: to run Qwen2.5-7b-Instruct
 python qwen.py
 
+:: to run Qwen2-1.5b-Instruct
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --load_in_low_bit "sym_int8"
+
 :: to run Baichuan2-7B-Chat
 python baichuan2.py
 
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py
index c8fd4038..54338da6 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py
@@ -32,8 +32,8 @@ if __name__ == "__main__":
     parser.add_argument(
         "--repo-id-or-model-path",
         type=str,
-        default="Qwen/Qwen2.5-7B-Instruct", # Or Qwen2-7B-Instruct
-        help="The huggingface repo id for the Baichuan2 model to be downloaded"
+        default="Qwen/Qwen2.5-7B-Instruct", # Or Qwen2-7B-Instruct, Qwen2-1.5B-Instruct
+        help="The huggingface repo id for the Qwen model to be downloaded"
         ", or the path to the huggingface checkpoint folder",
     )
     parser.add_argument("--lowbit-path", type=str,
@@ -47,6 +47,8 @@ if __name__ == "__main__":
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument('--load_in_low_bit', type=str, default="sym_int4",
+                        help='Load in low bit to use')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
@@ -56,6 +58,7 @@ if __name__ == "__main__":
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  optimize_model=True,
                                                  pipeline=True,
+                                                 load_in_low_bit=args.load_in_low_bit,
                                                  max_context_len=args.max_context_len,
                                                  max_prompt_len=args.max_prompt_len,
                                                  torch_dtype=torch.float16,
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
index 1d514835..c151ac93 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -27,7 +27,10 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir):
     rms_norm_eps = model.config.rms_norm_eps
     vocab_size = model.config.vocab_size
     model_norm = model.model.norm
-    lm_heads = model.lm_head.lm_heads # Qwen2 is always SlicedLMHead
+    if model.config.intermediate_size == 18944:
+        lm_heads = model.lm_head.lm_heads # Qwen2-7B is always SlicedLMHead
+    else:
+        lm_heads = [model.lm_head]
     if n_splits_linear == 1:
         weights = [(lm_heads[0].weight, lm_heads[0].scale)]
     else:
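
Note on the `intermediate_size == 18944` check in `convert_lm_head_and_embedding` above: in the published Qwen2 configs, 18944 is the FFN width of the 7B-class checkpoints (Qwen2-7B / Qwen2.5-7B), while Qwen2-1.5B uses a smaller value, so the branch separates the 7B models, whose head is wrapped in `SlicedLMHead` and exposes `.lm_heads`, from the 1.5B model, which keeps a plain `lm_head`. A minimal sketch of that dispatch follows; the config values are assumptions to verify against the actual checkpoints:

```python
# Sketch only: mirrors the dispatch added in convert_lm_head_and_embedding.
# Assumption: Qwen2-7B / Qwen2.5-7B report intermediate_size = 18944 in their
# config.json, while Qwen2-1.5B reports a smaller value (8960); verify against
# the checkpoints actually used.
def pick_lm_heads(model):
    if model.config.intermediate_size == 18944:
        # 7B-class Qwen2 models: lm_head is split into slices (SlicedLMHead)
        return model.lm_head.lm_heads
    # Smaller checkpoints such as Qwen2-1.5B: a single, unsliced lm_head
    return [model.lm_head]
```

Separately, the new `--load_in_low_bit` flag defaults to `sym_int4`; the README command for Qwen2-1.5B overrides it with `sym_int8`.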