From 60aa1a2c0ff91681f149fa3c24e6527e20186ea7 Mon Sep 17 00:00:00 2001
From: Ruonan Wang
Date: Fri, 30 Aug 2024 01:34:35 -0700
Subject: [PATCH] Initial NPU support for MiniCPM-V-2_6 (#11966)

* initial pr

* update npu model

* fix

* fix kv cache type

* fix

* small fix

* fix style

* fix model id

* change inter_pp=4

* address comment

* fix

* fix style

* fix

* rebase
---
 .../Multimodal/minicpm_v_2_6.py               | 92 +++++++++++++++++++
 .../src/ipex_llm/transformers/npu_model.py    | 23 ++---
 .../transformers/npu_models/convert.py        | 17 ++--
 .../transformers/npu_models/convert_mp.py     | 10 ++
 .../ipex_llm/transformers/npu_models/kv.py    |  3 +-
 .../transformers/npu_models/qwen2_mp.py       |  4 +
 6 files changed, 129 insertions(+), 20 deletions(-)
 create mode 100644 python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py
new file mode 100644
index 00000000..8c80231b
--- /dev/null
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py
@@ -0,0 +1,92 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import torch
+import os
+import time
+import argparse
+import requests
+from PIL import Image
+from ipex_llm.transformers.npu_model import AutoModel
+from transformers import AutoTokenizer
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for openbmb/MiniCPM-V-2_6 model')
+    parser.add_argument('--repo-id-or-model-path', type=str, default="openbmb/MiniCPM-V-2_6",
+                        help='The huggingface repo id for the openbmb/MiniCPM-V-2_6 model to be downloaded'
+                             ', or the path to the huggingface checkpoint folder')
+    parser.add_argument('--image-url-or-path', type=str,
+                        default='http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg',
+                        help='The URL or path to the image to infer')
+    parser.add_argument('--prompt', type=str, default="What is in this image?",
+                        help='Prompt to infer')
+    parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
+    parser.add_argument("--max-output-len", type=int, default=1024)
+    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
+    parser.add_argument("--intra-pp", type=int, default=2)
+    parser.add_argument("--inter-pp", type=int, default=2)
+
+    args = parser.parse_args()
+    model_path = args.repo_id_or_model_path
+    image_path = args.image_url_or_path
+
+    model = AutoModel.from_pretrained(model_path,
+                                      torch_dtype=torch.float32,
+                                      trust_remote_code=True,
+                                      attn_implementation="eager",
+                                      load_in_low_bit="sym_int4",
+                                      optimize_model=True,
+                                      max_output_len=args.max_output_len,
+                                      max_prompt_len=args.max_prompt_len,
+                                      intra_pp=args.intra_pp,
+                                      inter_pp=args.inter_pp,
+                                      transpose_value_cache=not args.disable_transpose_value_cache,
+                                      modules_to_not_convert=['vpm', 'resampler']
+                                      )
+    tokenizer = AutoTokenizer.from_pretrained(model_path,
+                                              trust_remote_code=True)
+    model.eval()
+
+    query = args.prompt
+    if os.path.exists(image_path):
+        image = Image.open(image_path).convert('RGB')
+    else:
+        image = Image.open(requests.get(image_path, stream=True).raw).convert('RGB')
+
+    # Generate predicted tokens
+    # here the prompt tuning refers to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/README.md
+    msg = [{'role': 'user', 'content': args.prompt}]
+    st = time.time()
+    with torch.inference_mode():
+        res = model.chat(
+            image=image,
+            msgs=msg,
+            context=None,
+            tokenizer=tokenizer,
+            sampling=True,
+        )
+    end = time.time()
+    print(f'Inference time: {end-st} s')
+    print('-'*20, 'Input', '-'*20)
+    print(image_path)
+    print('-'*20, 'Prompt', '-'*20)
+    print(args.prompt)
+    output_str = res
+    print('-'*20, 'Output', '-'*20)
+    print(output_str)
diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index b5f72f33..df18d597 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -113,7 +113,6 @@ class _BaseAutoModelClass:
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")
         ignore_argument(kwargs, "enable_mp")
-        ignore_argument(kwargs, "modules_to_not_convert")
         ignore_argument(kwargs, "quantization_config")
         ignore_argument(kwargs, "speculative")
         ignore_argument(kwargs, "pipeline_parallel_stages")
@@ -123,6 +122,7 @@ class _BaseAutoModelClass:
         inter_pp = kwargs.pop("inter_pp", None)
         intra_pp = kwargs.pop("intra_pp", None)
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
+        modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
 
         _args = copy.deepcopy(args)
         _kwargs = copy.deepcopy(kwargs)
@@ -152,17 +152,14 @@ class _BaseAutoModelClass:
             )
             from ipex_llm.transformers.npu_models.convert_mp import optimize_llm, optimize_llm_pre
 
-            if model.config.model_type == "minicpmv":
+            if hasattr(model, "llm"):
                 llm = model.llm
-                if llm.config.hidden_size == 4096 and llm.config.vocab_size == 128256:
-                    # MiniCPM-llama3-V2.5
-                    llm.config.model_type = "llama"
             else:
                 llm = model
 
             with torch.no_grad():
-                optimize_llm_pre(llm, qtype)
-                cls.load_convert(qtype, llm, "cpu", *args, **kwargs)
+                optimize_llm_pre(model, qtype)
+                cls.load_convert(qtype, model, "cpu", modules_to_not_convert, *args, **kwargs)
                 create_npu_kernels(llm)
             model = model.eval()
             logger.info(f"Finish to convert model")
@@ -181,8 +178,11 @@ class _BaseAutoModelClass:
             from ipex_llm.transformers.npu_models.convert import optimize_llm
             optimize_llm(model)
             with torch.no_grad():
-                cls.load_convert(qtype, model, "cpu", *args, **kwargs)
-                create_npu_kernels(model)
+                cls.load_convert(qtype, model, "cpu", modules_to_not_convert, *args, **kwargs)
+                if hasattr(model, "llm"):
+                    create_npu_kernels(model.llm)
+                else:
+                    create_npu_kernels(model)
             model = model.eval()
             logger.info(f"Finish to convert model")
             model.config.update({"bigdl_transformers_low_bit": qtype})
@@ -192,10 +192,11 @@ class _BaseAutoModelClass:
         return model
 
     @classmethod
-    def load_convert(cls, q_k, optimize_model, device, *arg, **kwarg):
+    def load_convert(cls, q_k, optimize_model, device, modules_to_not_convert, *arg, **kwarg):
         from ipex_llm.transformers.npu_models.convert import replace_with_QuantizedLinear
 
-        replace_with_QuantizedLinear(optimize_model, q_k, device=device)
+        replace_with_QuantizedLinear(optimize_model, q_k, device=device,
+                                     modules_to_not_convert=modules_to_not_convert)
 
     @classmethod
     @patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
index 95c02fdb..d2df2977 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
@@ -31,7 +31,7 @@ def module_optimization(func) -> torch.nn.Module:
         torch.nn.Module: optimized module
     """
 
-    def wrapper(model: torch.nn.Module, qtype, device, *args, **kwargs):
+    def wrapper(model: torch.nn.Module, qtype, device, modules_to_not_convert, *args, **kwargs):
         """Recursively apply the optimization function.
 
         Args:
@@ -41,18 +41,19 @@ def module_optimization(func) -> torch.nn.Module:
         """
 
         for name, layer in model.named_children():
-            new_layer = func(layer, qtype, device, *args, **kwargs)
-            if new_layer:
-                model.add_module(name, new_layer)
-                wrapper(new_layer, qtype, device, *args, **kwargs)
-            else:
-                wrapper(layer, qtype, device, *args, **kwargs)
+            if name not in modules_to_not_convert:
+                new_layer = func(layer, qtype, device, modules_to_not_convert, *args, **kwargs)
+                if new_layer:
+                    model.add_module(name, new_layer)
+                    wrapper(new_layer, qtype, device, modules_to_not_convert, *args, **kwargs)
+                else:
+                    wrapper(layer, qtype, device, modules_to_not_convert, *args, **kwargs)
 
     return wrapper
 
 
 @module_optimization
-def replace_with_QuantizedLinear(layer, qtype, device):
+def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert):
     from ipex_llm.transformers.low_bit_linear import ggml_convert_qtype
     from ipex_llm.ggml.quantize import ggml_tensor_qtype
     iqtype = ggml_tensor_qtype[qtype]
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
index ba40729a..1aac9e32 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
@@ -42,6 +42,16 @@ def optimize_llm_pre(model: torch.nn.Module, qtype):
         from ipex_llm.transformers.models.baichuan import pre_compute_inv_freq
         model.apply(pre_compute_inv_freq)
 
+    if model.config.model_type == "minicpmv" and hasattr(model, "llm"):
+        # MiniCPM-V
+        if model.config.hidden_size == 2304 and model.config.vocab_size == 122753:
+            model.llm.config.model_type = "minicpm"
+        elif model.config.hidden_size == 3584 and model.config.vocab_size == 151666:
+            model.llm.config.model_type = "qwen2"
+        elif model.config.hidden_size == 4096 and model.config.vocab_size == 128256:
+            model.llm.config.model_type = "llama"
+        model = model.llm
+
     # lm_head to cpu optimization
     if os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0":
         # disable the optimization by default
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/kv.py b/python/llm/src/ipex_llm/transformers/npu_models/kv.py
index 31c52c36..4f112a1c 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/kv.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/kv.py
@@ -173,7 +173,8 @@ class DynamicFusedNormalCache(DynamicCache):
                 head_dim,
                 0,
                 max_len,
-                key_states.dtype,
+                # key_states.dtype,
+                torch.float16,
                 key_states.device,
                 tranpose_value=transpose_value,
             )
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
index 60f8e2ba..61bff6e7 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
@@ -197,7 +197,9 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
             new_key_states = self.convert_to_fp16(curr_key_values[i][0])
             new_value_states = self.convert_to_fp16(curr_key_values[i][1])
 
+        print("start compiling")
         self.compile()
+        print("end compiling")
 
     def mlp(self, hidden_states):
         mm1 = self.linear(
@@ -862,6 +864,8 @@ class PrefillRunner:
         self.p.daemon = True
         self.p.start()
         output = self.prefill_result_queue.get()
+        print(Fore.GREEN + f"prefill process output: {output}")
+        print(Style.RESET_ALL)
 
     def forward(
         self,
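
Note (illustration only, not part of the diff above): the behavioral core of this patch is that module_optimization now skips any child module whose attribute name appears in modules_to_not_convert, which is how the example keeps MiniCPM-V-2_6's vision tower ('vpm') and 'resampler' out of the sym_int4 conversion while the language model is lowered to NPU kernels. The sketch below only mirrors that name-based skip rule on a toy model; ToyMultimodal, convert_skipping, and the 'converted' flag are hypothetical stand-ins, not the library's replace_with_QuantizedLinear pass.

    import torch


    def convert_skipping(model: torch.nn.Module, modules_to_not_convert):
        # Mirror of the new skip rule: children whose attribute name is listed
        # are neither converted nor recursed into; everything else is visited.
        for name, layer in model.named_children():
            if name in modules_to_not_convert:
                continue  # e.g. 'vpm' and 'resampler' keep their original precision
            if isinstance(layer, torch.nn.Linear):
                layer.converted = True  # hypothetical stand-in for the low-bit replacement
            else:
                convert_skipping(layer, modules_to_not_convert)


    class ToyMultimodal(torch.nn.Module):
        # Hypothetical model with the same top-level child names as MiniCPM-V-2_6.
        def __init__(self):
            super().__init__()
            self.vpm = torch.nn.Sequential(torch.nn.Linear(8, 8))       # vision tower
            self.resampler = torch.nn.Linear(8, 8)
            self.llm = torch.nn.Sequential(torch.nn.Linear(8, 8))       # language model


    m = ToyMultimodal()
    convert_skipping(m, ['vpm', 'resampler'])
    assert not getattr(m.vpm[0], 'converted', False)     # vision tower untouched
    assert not getattr(m.resampler, 'converted', False)  # resampler untouched
    assert getattr(m.llm[0], 'converted', False)         # only the LLM linear is tagged

Because the check is on the child's name, a matched subtree is left alone in its entirety, including its own Linear layers, so passing just the top-level names 'vpm' and 'resampler' is enough to keep the whole vision path in its original precision.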