From a81a329a5f14d5098ba80f5a1f69110f450bda92 Mon Sep 17 00:00:00 2001
From: SONG Ge <38711238+sgwhat@users.noreply.github.com>
Date: Tue, 27 Aug 2024 14:57:46 +0800
Subject: [PATCH] [NPU] Add example for NPU multi-processing minicpm-1b model
 (#11935)

* add minicpm example
---
 .../HF-Transformers-AutoModels/LLM/README.md  |  7 ++
 .../HF-Transformers-AutoModels/LLM/minicpm.py | 91 +++++++++++++++++++
 2 files changed, 98 insertions(+)
 create mode 100644 python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
index 80c82880..e7f84ab4 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -81,6 +81,7 @@ The example below shows how to run the **_optimized model implementations_** on
 - [Llama2-7B](./llama.py)
 - [Llama3-8B](./llama.py)
 - [Qwen2-1.5B](./qwen2.py)
+- [MiniCPM-1B](./minicpm.py)
 
 ```bash
 # to run Llama-2-7b-chat-hf
@@ -91,6 +92,9 @@ python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 # to run Qwen2-1.5B-Instruct
 python qwen2.py
+
+# to run MiniCPM-1B-sft-bf16
+python minicpm.py
 ```
 
 Arguments info:
@@ -113,6 +117,9 @@ python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --d
 
 # to run Qwen2-1.5B-Instruct
 python qwen2.py --disable-transpose-value-cache
+
+# to run MiniCPM-1B-sft-bf16
+python minicpm.py --disable-transpose-value-cache
 ```
 
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
new file mode 100644
index 00000000..a0d92a38
--- /dev/null
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
@@ -0,0 +1,91 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import torch
+import time
+import argparse
+
+from ipex_llm.transformers.npu_model import AutoModelForCausalLM
+from transformers import AutoTokenizer
+
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Predict Tokens using `generate()` API for NPU model"
+    )
+    parser.add_argument(
+        "--repo-id-or-model-path",
+        type=str,
+        default="openbmb/MiniCPM-1B-sft-bf16",
+        help="The huggingface repo id for the MiniCPM model to be downloaded"
+        ", or the path to the huggingface checkpoint folder",
+    )
+    parser.add_argument('--prompt', type=str, default="What is AI?",
+                        help='Prompt to infer')
+    parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
+    parser.add_argument("--max-output-len", type=int, default=1024)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
+    parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
+    parser.add_argument("--intra-pp", type=int, default=2)
+    parser.add_argument("--inter-pp", type=int, default=2)
+
+    args = parser.parse_args()
+    model_path = args.repo_id_or_model_path
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        torch_dtype=torch.float16,
+        trust_remote_code=True,
+        attn_implementation="eager",
+        load_in_low_bit="sym_int4",  # quantize weights to symmetric INT4
+        optimize_model=True,
+        max_output_len=args.max_output_len,
+        max_prompt_len=args.max_prompt_len,
+        intra_pp=args.intra_pp,
+        inter_pp=args.inter_pp,
+        transpose_value_cache=not args.disable_transpose_value_cache,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+    print("-" * 80)
+    print("done")
+    with torch.inference_mode():
+
+        print("finished loading")
+        for i in range(5):
+            _input_ids = tokenizer.encode("<用户>{}".format(args.prompt), return_tensors="pt")  # "<用户>" is MiniCPM's user tag
+            print("input length:", len(_input_ids[0]))
+            st = time.time()
+            output = model.generate(
+                _input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict
+            )
+            end = time.time()
+            print(f"Inference time: {end-st} s")
+            input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False)
+            print("-" * 20, "Input", "-" * 20)
+            print(input_str)
+            output_str = tokenizer.decode(output[0], skip_special_tokens=False)
+            print("-" * 20, "Output", "-" * 20)
+            print(output_str)
+
+    print("-" * 80)
+    print("done")
+    print("successfully shut down")
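
Usage sketch for the new example: every flag and value below comes straight from the argparse setup in
minicpm.py above (the values are simply the script's defaults written out explicitly). Substitute a local
checkpoint folder for --repo-id-or-model-path if the model is already downloaded, and append
--disable-transpose-value-cache only when the transposed value cache should be turned off, as in the
README section above.

    python minicpm.py \
        --repo-id-or-model-path openbmb/MiniCPM-1B-sft-bf16 \
        --prompt "What is AI?" \
        --n-predict 32 \
        --max-output-len 1024 \
        --max-prompt-len 512 \
        --intra-pp 2 \
        --inter-pp 2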