From 534566e2902f9d1007db18f0948dd4e36c6b8b13 Mon Sep 17 00:00:00 2001
From: binbin Deng <108676127+plusbang@users.noreply.github.com>
Date: Thu, 2 Jan 2025 11:13:15 +0800
Subject: [PATCH] [NPU] Support minicpm-v with python cpp backend (#12637)

---
 python/llm/dev/benchmark/all-in-one/run.py    |  2 +-
 .../src/ipex_llm/transformers/npu_model.py    |  6 +-
 .../transformers/npu_models/convert.py        | 58 +++++++++++++------
 .../transformers/npu_models/npu_llm_cpp.py    | 27 ++++++---
 .../transformers/npu_pipeline_model/qwen.py   |  4 ++
 5 files changed, 66 insertions(+), 31 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index be02721e..4ea7d898 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -650,7 +650,7 @@ def transformers_int4_npu_win(repo_id,
     load_time = end - st
     print(">> loading of model costs {}s".format(load_time))
 
-    if not hasattr(model, "model_ptr"):
+    if not hasattr(model, "model_ptr") or repo_id in MINICPM_V_IDS:
         model = BenchmarkWrapper(model)
 
     result = {}
diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 9220c0cb..cb67b157 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -301,8 +301,7 @@ class _BaseAutoModelClass:
         model.share_memory()
 
         if not pipeline:
-            if (not hasattr(model, 'llm') and
-                    model.config.model_type in ["qwen2", "llama", "minicpm"]):
+            if model.config.model_type in ["qwen2", "llama", "minicpm"]:
                 from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
                 optimize_llm_single_process(
                     llm,
@@ -312,7 +311,8 @@ class _BaseAutoModelClass:
                     group_size=quantization_group_size,
                     qtype=qtype,
                     save_directory=save_directory,
-                    fuse_layers=fuse_layers
+                    fuse_layers=fuse_layers,
+                    has_llm=hasattr(model, "llm")
                 )
             else:
                 optimize_llm(
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
index 2ae1f264..3dece12f 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
@@ -449,7 +449,8 @@ def optimize_llm_single_process(
     group_size: int,
     qtype: str,
     save_directory: str,
-    fuse_layers: int=None
+    fuse_layers: int=None,
+    has_llm: bool=False
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -468,8 +469,13 @@ def optimize_llm_single_process(
         model.kv_len = kv_len
         model.model_ptr = model_ptr
         model.save_directory = save_directory
-        model.vocab_size = model.config.vocab_size
+        if model.config.vocab_size == 151666:
+            # for MiniCPM-V 2.6, 152064 is vocab_size of Qwen2-7B
+            model.vocab_size = 152064
+        else:
+            model.vocab_size = model.config.vocab_size
         model.logits_buffer = torch.empty(1, 1, model.vocab_size, dtype=torch.float32)
+        model.max_prompt_len = max_prompt_len
     except:
         invalidInputError(False, "False to InitLLMPipeline.")
 
@@ -478,9 +484,10 @@ def optimize_llm_single_process(
     general_convert(model, PreTrainedModel, prepare_input_ids, "prepare_inputs_for_generation")
     general_convert(model, PreTrainedModel, causal_lm_forward)
     # patch generate function
-    import types
-    model.original_generate = model.generate
-    model.generate = types.MethodType(generate, model)
+    if not has_llm:
+        import types
+        model.original_generate = model.generate
+        model.generate = types.MethodType(generate, model)
 
     return model
 
@@ -491,9 +498,10 @@ def prepare_input_ids(
     else:  # prefill, reset the model here
         from .npu_llm_cpp import reset
         reset(self.model_ptr)
-    model_inputs = {
-        "input_ids": input_ids
-    }
+    if inputs_embeds is not None and past_key_values is None:
+        model_inputs = {"inputs_embeds": inputs_embeds}
+    else:
+        model_inputs = {"input_ids": input_ids}
     return model_inputs
 
 
@@ -511,17 +519,31 @@ def causal_lm_forward(
     return_dict: Optional[bool] = None,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     from .npu_llm_cpp import run_prefill_with_logits, run_decode_with_logits
-    if isinstance(input_ids[0], torch.Tensor):
-        input_list = input_ids[0].flatten().tolist()
+    if input_ids is not None:
+        if isinstance(input_ids[0], torch.Tensor):
+            input_list = input_ids[0].flatten().tolist()
+        else:
+            input_list = input_ids[0]
+        input_length = len(input_list)
+        if input_length > 1:
+            logits = run_prefill_with_logits(self.model_ptr, input_list,
+                                             self.logits_buffer, self.vocab_size)
+        else:
+            logits = run_decode_with_logits(self.model_ptr, input_list[0],
+                                            self.logits_buffer, self.vocab_size)
+    elif inputs_embeds is not None:
+        seq_len = inputs_embeds.shape[1]
+        pad_len = self.max_prompt_len - seq_len
+        inputs_embeds = torch.nn.functional.pad(inputs_embeds.to(torch.float16),
+                                                (0, 0, 0, pad_len), value=0.0)
+        logits = run_prefill_with_logits(self.model_ptr, None, self.logits_buffer,
+                                         self.vocab_size, inputs_embeds, seq_len)
     else:
-        input_list = input_ids[0]
-    input_length = len(input_list)
-    if input_length > 1:
-        logits = run_prefill_with_logits(self.model_ptr, input_list,
-                                         self.logits_buffer, self.vocab_size)
-    else:
-        logits = run_decode_with_logits(self.model_ptr, input_list[0],
-                                        self.logits_buffer, self.vocab_size)
+        invalidInputError(False, "Please specify either input_ids or inputs_embeds.")
+
+    if self.config.vocab_size == 151666:
+        # for MiniCPM-V 2.6
+        logits = logits[:, :, :151666]
 
     return CausalLMOutputWithPast(
         loss=None,
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/npu_llm_cpp.py b/python/llm/src/ipex_llm/transformers/npu_models/npu_llm_cpp.py
index dc97852d..2abdb9cc 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/npu_llm_cpp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/npu_llm_cpp.py
@@ -48,8 +48,8 @@ _lib = ctypes.cdll.LoadLibrary(_lib_path)
 _lib.load_model_from_file.argtypes = [ctypes.c_char_p]
 _lib.load_model_from_file.restype = ctypes.c_void_p
 
-_lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), ctypes.c_int,
-                             ctypes.c_float]
+_lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
+                             ctypes.c_float, ctypes.c_bool]
 _lib.run_prefill.restype = ctypes.POINTER(ctypes.c_float)
 
 _lib.run_decode.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_float]
@@ -61,8 +61,10 @@ _lib.llm_sample_token.restype = ctypes.c_int
 _lib.reset.argtypes = [ctypes.c_void_p]
 _lib.reset.restype = None
 
-_lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int),
-                                         ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.c_int]
+_lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_void_p,
+                                         ctypes.c_int, ctypes.POINTER(ctypes.c_float),
+                                         ctypes.c_int, ctypes.c_bool]
+
 _lib.run_prefill_with_logits.restype = None
 
 _lib.run_decode_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_int,
@@ -77,7 +79,7 @@ def load_model_from_file(model_dir: str):
 def run_prefill(model_ptr, input_ids, vocab_size, repetition_penalty=1.0):
     input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
     input_len = len(input_ids)
-    plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty)
+    plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty, False)
     new_token = _lib.llm_sample_token(plogits, True, vocab_size)
     return new_token
 
@@ -88,12 +90,19 @@ def run_decode(model_ptr, input_id, vocab_size, repetition_penalty=1.0):
     new_token = _lib.llm_sample_token(plogits, True, vocab_size)
     return new_token
 
 
-def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size):
-    input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
-    input_len = len(input_ids)
+def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size,
+                            inputs_embeds=None, seq_len=None):
+    if input_ids is not None:
+        input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
+        input_len = len(input_ids)
+    else:
+        input_ptr = inputs_embeds.contiguous().data.data_ptr()
+        input_ptr = ctypes.cast(input_ptr, ctypes.c_void_p)
+        input_len = seq_len
     logits_ptr = logits.data.data_ptr()
     logits_ptr = ctypes.cast(logits_ptr, ctypes.POINTER(ctypes.c_float))
-    _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr, vocab_size)
+    _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr,
+                                 vocab_size, (input_ids is None))
     return logits
 
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
index ffe2707d..5137fd4a 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -34,6 +34,10 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         lm_head_n_splits = 1
     asym = getattr(model.config, "asym", False)
 
+    if vocab_size == 151666:
+        # for MiniCPM-V 2.6 lm_head on NPU
+        vocab_size = 152064
+
     if not isinstance(lm_head, SlicedLMHead):
         asym = lm_head.qtype == "asym_int4_rtn"
         if asym:
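
A word on the 151666/152064 constants used throughout this patch: MiniCPM-V 2.6 reports vocab_size = 151666, while its NPU lm_head is built at 152064, the vocab_size of Qwen2-7B, so the logits buffer is allocated at the padded width and sliced back to 151666 before sampling. Below is a minimal standalone sketch of that round trip in plain PyTorch; the constants come from the patch, but the helper name and buffer are illustrative only, not part of the ipex-llm API.

import torch

MINICPM_V_VOCAB = 151666   # vocab_size reported by MiniCPM-V 2.6's config
PADDED_VOCAB = 152064      # vocab_size of Qwen2-7B, used to build the NPU lm_head


def slice_padded_logits(raw_logits: torch.Tensor,
                        real_vocab: int = MINICPM_V_VOCAB) -> torch.Tensor:
    # The C++ backend fills a buffer sized for the padded lm_head;
    # only the first `real_vocab` entries are meaningful for sampling.
    return raw_logits[:, :, :real_vocab]


# stand-in for the buffer that run_prefill_with_logits()/run_decode_with_logits() fill
logits_buffer = torch.randn(1, 1, PADDED_VOCAB, dtype=torch.float32)
logits = slice_padded_logits(logits_buffer)
assert logits.shape == (1, 1, MINICPM_V_VOCAB)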