From 158289d2054ecdf21978717ebb865ec264c54b25 Mon Sep 17 00:00:00 2001
From: SONG Ge <38711238+sgwhat@users.noreply.github.com>
Date: Fri, 30 Aug 2024 16:00:33 +0800
Subject: [PATCH] [NPU] Add initial support for minicpm-llama-v2.5 (#11962)

* add initial support for minicpm-llama-v2.5

* update impl

* add minicpm-llama3-v2.5 example
---
 .../Multimodal/minicpm-llama3-v2.5.py         | 103 ++++++++++++++++++
 .../src/ipex_llm/transformers/npu_model.py    |  16 ++-
 2 files changed, 115 insertions(+), 4 deletions(-)
 create mode 100644 python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py
new file mode 100644
index 00000000..4d223ee3
--- /dev/null
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py
@@ -0,0 +1,103 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import torch
+import time
+import argparse
+
+from ipex_llm.transformers.npu_model import AutoModel, AutoModelForCausalLM
+from transformers import AutoTokenizer
+from transformers.utils import logging
+
+import requests
+from PIL import Image
+
+logger = logging.get_logger(__name__)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Predict Tokens using `chat()` API for npu model"
+    )
+    parser.add_argument(
+        "--repo-id-or-model-path",
+        type=str,
+        default="openbmb/MiniCPM-Llama3-V-2_5",
+        help="The huggingface repo id for the MiniCPM-Llama3-V-2_5 model to be downloaded"
+        ", or the path to the huggingface checkpoint folder",
+    )
+    parser.add_argument('--image-url-or-path', type=str,
+                        default='http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg',
+                        help='The URL or path to the image to infer')
+    parser.add_argument('--prompt', type=str, default="What is in the image?",
+                        help='Prompt to infer')
+    parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
+    parser.add_argument("--max-output-len", type=int, default=1024)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
+    parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
+    parser.add_argument("--intra-pp", type=int, default=2)
+    parser.add_argument("--inter-pp", type=int, default=2)
+
+    args = parser.parse_args()
+    model_path = args.repo_id_or_model_path
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        torch_dtype=torch.float32,
+        trust_remote_code=True,
+        attn_implementation="eager",
+        load_in_low_bit="sym_int4",
+        optimize_model=True,
+        max_output_len=args.max_output_len,
+        max_prompt_len=args.max_prompt_len,
+        intra_pp=args.intra_pp,
+        inter_pp=args.inter_pp,
+        transpose_value_cache=not args.disable_transpose_value_cache,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+    print("-" * 80)
+    print("done")
+
+    msgs = [{'role': 'user', 'content': args.prompt}]
+    image_path = args.image_url_or_path
+    if os.path.exists(image_path):
+        image = Image.open(image_path).convert('RGB')
+    else:
+        image = Image.open(requests.get(image_path, stream=True).raw).convert('RGB')
+
+    st = time.time()
+    res = model.chat(
+        image=image,
+        msgs=msgs,
+        tokenizer=tokenizer,
+        sampling=True,
+        temperature=0.7,
+        # system_prompt='' # pass system_prompt if needed
+    )
+    end = time.time()
+
+    print(f'Inference time: {end-st} s')
+    print('-'*20, 'Input', '-'*20)
+    print(image_path)
+    print('-'*20, 'Prompt', '-'*20)
+    print(args.prompt)
+    output_str = res
+    print('-'*20, 'Output', '-'*20)
+    print(output_str)
+
+    print("done")
+    print("success shut down")
diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index cc273dcd..b5f72f33 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -152,17 +152,25 @@ class _BaseAutoModelClass:
         )
         from ipex_llm.transformers.npu_models.convert_mp import optimize_llm, optimize_llm_pre
+        if model.config.model_type == "minicpmv":
+            llm = model.llm
+            if llm.config.hidden_size == 4096 and llm.config.vocab_size == 128256:
+                # MiniCPM-llama3-V2.5
+                llm.config.model_type = "llama"
+        else:
+            llm = model
+
         with torch.no_grad():
-            optimize_llm_pre(model, qtype)
-            cls.load_convert(qtype, model, "cpu", *args, **kwargs)
-            create_npu_kernels(model)
+            optimize_llm_pre(llm, qtype)
+            cls.load_convert(qtype, llm, "cpu", *args, **kwargs)
+            create_npu_kernels(llm)
         model = model.eval()
         logger.info(f"Finish to convert model")
 
         model.config.update({"bigdl_transformers_low_bit": qtype})
         model.share_memory()
 
         optimize_llm(
-            model,
+            llm,
             max_output_len=max_output_len,
             max_prompt_len=max_prompt_len,
             inter_pp=inter_pp,
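
Note on the npu_model.py change: for "minicpmv" checkpoints the NPU conversion targets the inner language model (model.llm) rather than the full multimodal wrapper, and MiniCPM-Llama3-V-2.5 is recognized by its Llama-3-8B-shaped config (hidden_size 4096, vocab_size 128256) so it can reuse the existing "llama" optimization path. The sketch below restates that dispatch as a standalone helper for readability only; the function name select_llm_for_npu_conversion is illustrative and is not part of the patch, where the logic lives inline in _BaseAutoModelClass.from_pretrained.

# Illustrative sketch (not part of the patch): the module selection that the
# npu_model.py hunk adds inline, restated as a standalone helper.

def select_llm_for_npu_conversion(model):
    """Return the submodule that NPU quantization and kernel creation should target."""
    if model.config.model_type == "minicpmv":
        # Multimodal MiniCPM-V: only the inner language model is converted;
        # the vision encoder is not routed through this conversion path.
        llm = model.llm
        if llm.config.hidden_size == 4096 and llm.config.vocab_size == 128256:
            # MiniCPM-Llama3-V-2.5 wraps a Llama-3-8B backbone, so relabel it
            # as "llama" to reuse the existing llama optimization path.
            llm.config.model_type = "llama"
    else:
        # Plain text-only models are converted as a whole.
        llm = model
    return llm

In the patched code, the selected llm is then what gets passed to optimize_llm_pre, cls.load_convert, create_npu_kernels, and optimize_llm, while chat() is still called on the outer multimodal model, as the new example script does.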