From 73a4360f3f821146808561012de021f3531b2431 Mon Sep 17 00:00:00 2001
From: Ch1y0q
Date: Tue, 10 Sep 2024 15:35:24 +0800
Subject: [PATCH] update lowbit path for baichuan2, qwen2, `generate.py`
 (#12051)

* update lowbit path for baichuan2, qwen2, `generate.py`

* update readme
---
 .../HF-Transformers-AutoModels/LLM/README.md |  4 ++
 .../LLM/baichuan2.py                         | 49 ++++++++++++++-----
 .../LLM/generate.py                          | 26 ++++++++--
 .../HF-Transformers-AutoModels/LLM/qwen2.py  | 48 +++++++++++++-----
 4 files changed, 98 insertions(+), 29 deletions(-)

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
index 08f0f23c..1445c561 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -61,6 +61,7 @@ python ./generate.py
 
 Arguments info:
 - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Llama2 model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'meta-llama/Llama-2-7b-chat-hf'`, and more verified models please see the list in [Verified Models](#verified-models).
+- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load the lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
 - `--load_in_low_bit`: argument defining the `load_in_low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used.
@@ -131,6 +132,9 @@ Arguments info:
 
 ### Troubleshooting
 
+#### `TypeError: can't convert meta device type tensor to numpy.` Error
+If you encounter the error `TypeError: can't convert meta device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.` when loading a lowbit model, please try re-saving the lowbit model with the example script you are currently using. Please note that lowbit models saved by `qwen2.py`, `llama.py`, etc. cannot be loaded by `generate.py`.
+
 #### Output Problem
 If you encounter output problem, please try to disable the optimization of transposing value cache with following command:
 ```bash
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py
index f3f4cb10..d9af25df 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py
@@ -50,6 +50,12 @@ if __name__ == "__main__":
         help="The huggingface repo id for the Baichuan2 model to be downloaded"
         ", or the path to the huggingface checkpoint folder",
     )
+    parser.add_argument("--lowbit-path", type=str,
+        default="",
+        help="The path to the lowbit model folder. Leave it blank if you do not want to save or load a lowbit model. \
+            If the path does not exist, the lowbit model will be saved there after conversion. \
+            Otherwise, the lowbit model will be loaded from it.",
+    )
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -62,22 +68,39 @@ if __name__ == "__main__":
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
 
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        torch_dtype=torch.bfloat16,
-        trust_remote_code=True,
-        attn_implementation="eager",
-        load_in_low_bit="sym_int4",
-        optimize_model=True,
-        max_output_len=args.max_output_len,
-        max_prompt_len=args.max_prompt_len,
-        intra_pp=args.intra_pp,
-        inter_pp=args.inter_pp,
-        transpose_value_cache=not args.disable_transpose_value_cache,
-    )
+    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+            attn_implementation="eager",
+            load_in_low_bit="sym_int4",
+            optimize_model=True,
+            max_output_len=args.max_output_len,
+            max_prompt_len=args.max_prompt_len,
+            intra_pp=args.intra_pp,
+            inter_pp=args.inter_pp,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+        )
+    else:
+        model = AutoModelForCausalLM.load_low_bit(
+            args.lowbit_path,
+            attn_implementation="eager",
+            torch_dtype=torch.bfloat16,
+            optimize_model=True,
+            max_output_len=args.max_output_len,
+            max_prompt_len=args.max_prompt_len,
+            intra_pp=args.intra_pp,
+            inter_pp=args.inter_pp,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+            trust_remote_code=True,
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    if args.lowbit_path and not os.path.exists(args.lowbit_path):
+        model.save_low_bit(args.lowbit_path)
+
     DEFAULT_SYSTEM_PROMPT = """\
 """
 
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py
index a3536ccc..6c1da3d8 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py
@@ -17,6 +17,7 @@
 import torch
 import time
 import argparse
+import os
 
 from ipex_llm.transformers.npu_model import AutoModelForCausalLM
 from transformers import AutoTokenizer
@@ -27,6 +28,11 @@ if __name__ == '__main__':
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                         help='The huggingface repo id for the Llama2 model to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
+    parser.add_argument("--lowbit-path", type=str,
+        default="",
+        help='The path to the lowbit model folder. Leave it blank if you do not want to save or load a lowbit model. \
+            If the path does not exist, the lowbit model will be saved there after conversion. \
+            Otherwise, the lowbit model will be loaded from it.')
     parser.add_argument('--prompt', type=str, default="Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun",
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
@@ -39,12 +45,26 @@ if __name__ == '__main__':
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
-    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
-                                                 load_in_low_bit=args.load_in_low_bit,
-                                                 attn_implementation="eager")
+    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            load_in_low_bit=args.load_in_low_bit,
+            attn_implementation="eager"
+        )
+    else:
+        model = AutoModelForCausalLM.load_low_bit(
+            args.lowbit_path,
+            trust_remote_code=True,
+            bigdl_transformers_low_bit=args.load_in_low_bit,
+            attn_implementation="eager"
+        )
     print(model)
 
+    if args.lowbit_path and not os.path.exists(args.lowbit_path):
+        model.save_low_bit(args.lowbit_path)
+
     with torch.inference_mode():
         prompt = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun"
         input_ids = tokenizer.encode(prompt, return_tensors="pt")
 
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen2.py
index 2e4d195f..465a9910 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen2.py
@@ -37,6 +37,12 @@ if __name__ == "__main__":
         help="The huggingface repo id for the Qwen2 model to be downloaded"
         ", or the path to the huggingface checkpoint folder",
     )
+    parser.add_argument("--lowbit-path", type=str,
+        default="",
+        help="The path to the lowbit model folder. Leave it blank if you do not want to save or load a lowbit model. \
+            If the path does not exist, the lowbit model will be saved there after conversion. \
+            Otherwise, the lowbit model will be loaded from it.",
+    )
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -49,22 +55,38 @@ if __name__ == "__main__":
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
 
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
-        attn_implementation="eager",
-        load_in_low_bit="sym_int4",
-        optimize_model=True,
-        max_output_len=args.max_output_len,
-        max_prompt_len=args.max_prompt_len,
-        intra_pp=args.intra_pp,
-        inter_pp=args.inter_pp,
-        transpose_value_cache=not args.disable_transpose_value_cache,
-    )
+    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+            attn_implementation="eager",
+            load_in_low_bit="sym_int4",
+            optimize_model=True,
+            max_output_len=args.max_output_len,
+            max_prompt_len=args.max_prompt_len,
+            intra_pp=args.intra_pp,
+            inter_pp=args.inter_pp,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+        )
+    else:
+        model = AutoModelForCausalLM.load_low_bit(
+            args.lowbit_path,
+            attn_implementation="eager",
+            torch_dtype=torch.float16,
+            optimize_model=True,
+            max_output_len=args.max_output_len,
+            max_prompt_len=args.max_prompt_len,
+            intra_pp=args.intra_pp,
+            inter_pp=args.inter_pp,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    if args.lowbit_path and not os.path.exists(args.lowbit_path):
+        model.save_low_bit(args.lowbit_path)
+
     print("-" * 80)
     print("done")
     messages = [{"role": "system", "content": "You are a helpful assistant."},