From 820f8a45543abdc5dfc2b288f47f7de9077a50cd Mon Sep 17 00:00:00 2001
From: Ch1y0q
Date: Thu, 5 Sep 2024 15:31:01 +0800
Subject: [PATCH] add `--lowbit-path` option for NPU llama example (#12020)

* add option `--lowbit-path`

* add descriptions in `README.md` and formatting

* Update llama.py
---
 .../HF-Transformers-AutoModels/LLM/README.md |  1 +
 .../HF-Transformers-AutoModels/LLM/llama.py  | 48 ++++++++++++++-----
 2 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
index efc5aaf2..10415966 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -118,6 +118,7 @@ python baichuan2.py
 
 Arguments info:
 - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Llama2 model (i.e. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'meta-llama/Llama-2-7b-chat-hf'`.
+- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `What is AI?`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
 - `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py
index a808a551..19138da5 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py
@@ -50,6 +50,12 @@ if __name__ == "__main__":
         help="The huggingface repo id for the Llama2 model to be downloaded"
         ", or the path to the huggingface checkpoint folder",
     )
+    parser.add_argument("--lowbit-path", type=str,
+        default="",
+        help="The path to the lowbit model folder, leave blank if you do not want to save. \
+            If path not exists, lowbit model will be saved there. \
+            Else, lowbit model will be loaded.",
+    )
     parser.add_argument('--prompt', type=str, default="What is AI?",
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
@@ -62,22 +68,38 @@ if __name__ == "__main__":
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
 
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
-        attn_implementation="eager",
-        load_in_low_bit="sym_int4",
-        optimize_model=True,
-        max_output_len=args.max_output_len,
-        max_prompt_len=args.max_prompt_len,
-        intra_pp=args.intra_pp,
-        inter_pp=args.inter_pp,
-        transpose_value_cache=not args.disable_transpose_value_cache,
-    )
+    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+            attn_implementation="eager",
+            load_in_low_bit="sym_int4",
+            optimize_model=True,
+            max_output_len=args.max_output_len,
+            max_prompt_len=args.max_prompt_len,
+            intra_pp=args.intra_pp,
+            inter_pp=args.inter_pp,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+        )
+    else:
+        model = AutoModelForCausalLM.load_low_bit(
+            args.lowbit_path,
+            attn_implementation="eager",
+            torch_dtype=torch.float16,
+            optimize_model=True,
+            max_output_len=args.max_output_len,
+            max_prompt_len=args.max_prompt_len,
+            intra_pp=args.intra_pp,
+            inter_pp=args.inter_pp,
+            transpose_value_cache=not args.disable_transpose_value_cache,
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
+    if args.lowbit_path and not os.path.exists(args.lowbit_path):
+        model.save_low_bit(args.lowbit_path)
+
     DEFAULT_SYSTEM_PROMPT = """\
 """
 
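For illustration only, a minimal sketch of how the new option might be exercised from the example folder, following the README's usage style above; the lowbit output directory is hypothetical and any writable path can be used:

    # First run: the path does not exist yet, so the original checkpoint is
    # loaded, converted to sym_int4, and the lowbit copy is saved to the folder.
    python llama.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --lowbit-path ./llama2-7b-lowbit

    # Later runs: the path now exists, so the saved lowbit model is loaded
    # directly via load_low_bit, skipping the conversion step.
    python llama.py --lowbit-path ./llama2-7b-lowbit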