[NPU] Support minicpm-v with python cpp backend (#12637)

binbin Deng authored on 2025-01-02 11:13:15 +08:00; committed by GitHub
parent f289f68d57
commit 534566e290
5 changed files with 66 additions and 31 deletions


@@ -650,7 +650,7 @@ def transformers_int4_npu_win(repo_id,
     load_time = end - st
     print(">> loading of model costs {}s".format(load_time))
 
-    if not hasattr(model, "model_ptr"):
+    if not hasattr(model, "model_ptr") or repo_id in MINICPM_V_IDS:
         model = BenchmarkWrapper(model)
 
     result = {}
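Why the benchmark change: for MiniCPM-V, from_pretrained returns a multimodal wrapper that still drives Hugging Face generate(), even though its inner LLM carries a model_ptr into the C++ backend, so it must still be wrapped for timing. A minimal sketch of the idea; the real MINICPM_V_IDS list lives elsewhere in the harness, and the repo id below is an assumed example:

    # Hypothetical sketch; MINICPM_V_IDS is defined elsewhere in run.py.
    MINICPM_V_IDS = ["openbmb/MiniCPM-V-2_6"]

    def maybe_wrap(model, repo_id):
        # Plain LLMs with a model_ptr bypass HF generate(), so wrapping them is
        # pointless; multimodal MiniCPM-V still goes through it and must be wrapped.
        # BenchmarkWrapper comes from the surrounding run.py context.
        if not hasattr(model, "model_ptr") or repo_id in MINICPM_V_IDS:
            model = BenchmarkWrapper(model)
        return model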


@@ -301,8 +301,7 @@ class _BaseAutoModelClass:
         model.share_memory()
 
         if not pipeline:
-            if (not hasattr(model, 'llm') and
-                    model.config.model_type in ["qwen2", "llama", "minicpm"]):
+            if model.config.model_type in ["qwen2", "llama", "minicpm"]:
                 from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
                 optimize_llm_single_process(
                     llm,
@@ -312,7 +311,8 @@ class _BaseAutoModelClass:
                     group_size=quantization_group_size,
                     qtype=qtype,
                     save_directory=save_directory,
-                    fuse_layers=fuse_layers
+                    fuse_layers=fuse_layers,
+                    has_llm=hasattr(model, "llm")
                 )
             else:
                 optimize_llm(
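The `hasattr(model, 'llm')` guard is dropped from the condition and instead passed down as the new has_llm flag, which records whether the optimized module is the inner language model of a multimodal wrapper. A rough sketch of the dispatch, assuming (as the hunk above suggests) that MiniCPM-V exposes its language model as model.llm; convert_for_npu is a hypothetical wrapper, not the repo's API:

    from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process

    def convert_for_npu(model, **convert_kwargs):
        # Multimodal checkpoints keep the text model under `model.llm`;
        # plain LLMs are the top-level module themselves.
        llm = model.llm if hasattr(model, "llm") else model
        optimize_llm_single_process(llm, has_llm=hasattr(model, "llm"),
                                    **convert_kwargs)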


@@ -449,7 +449,8 @@ def optimize_llm_single_process(
     group_size: int,
     qtype: str,
     save_directory: str,
-    fuse_layers: int=None
+    fuse_layers: int=None,
+    has_llm: bool=False
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -468,8 +469,13 @@ def optimize_llm_single_process(
         model.kv_len = kv_len
         model.model_ptr = model_ptr
         model.save_directory = save_directory
-        model.vocab_size = model.config.vocab_size
+        if model.config.vocab_size == 151666:
+            # for MiniCPM-V 2.6, 152064 is vocab_size of Qwen2-7B
+            model.vocab_size = 152064
+        else:
+            model.vocab_size = model.config.vocab_size
         model.logits_buffer = torch.empty(1, 1, model.vocab_size, dtype=torch.float32)
+        model.max_prompt_len = max_prompt_len
     except:
         invalidInputError(False,
                           "False to InitLLMPipeline.")
@@ -478,6 +484,7 @@ def optimize_llm_single_process(
     general_convert(model, PreTrainedModel, prepare_input_ids, "prepare_inputs_for_generation")
     general_convert(model, PreTrainedModel, causal_lm_forward)
     # patch generate function
-    import types
-    model.original_generate = model.generate
-    model.generate = types.MethodType(generate, model)
+    if not has_llm:
+        import types
+        model.original_generate = model.generate
+        model.generate = types.MethodType(generate, model)
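With has_llm=True the inner LLM keeps the stock generate(): MiniCPM-V's own generate/chat builds the merged vision-text inputs_embeds first and then calls into the LLM, so only standalone models get the pipeline generate loop bound onto them. The monkey-patch pattern shown in isolation, with a stand-in body:

    import types

    def generate(self, *args, **kwargs):
        # Stand-in body; the real pipeline generate loop lives in convert.py.
        return self.original_generate(*args, **kwargs)

    def patch_generate(model):
        # Bind `generate` as a method on this one instance and keep the
        # original reachable for fallback.
        model.original_generate = model.generate
        model.generate = types.MethodType(generate, model)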
@@ -491,9 +498,10 @@ def prepare_input_ids(
     else:  # prefill, reset the model here
         from .npu_llm_cpp import reset
         reset(self.model_ptr)
-    model_inputs = {
-        "input_ids": input_ids
-    }
+    if inputs_embeds is not None and past_key_values is None:
+        model_inputs = {"inputs_embeds": inputs_embeds}
+    else:
+        model_inputs = {"input_ids": input_ids}
     return model_inputs
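prepare_inputs_for_generation now distinguishes the multimodal prefill step (embeddings, no KV cache yet) from every later decode step (token ids). The selection rule as a standalone function; a sketch rather than the repo's code:

    def select_inputs(input_ids, inputs_embeds, past_key_values):
        # Embeddings are only meaningful on the first forward pass; once the
        # KV cache exists, sampled token ids drive each decode step.
        if inputs_embeds is not None and past_key_values is None:
            return {"inputs_embeds": inputs_embeds}
        return {"input_ids": input_ids}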
@@ -511,6 +519,7 @@ def causal_lm_forward(
     return_dict: Optional[bool] = None,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     from .npu_llm_cpp import run_prefill_with_logits, run_decode_with_logits
-    if isinstance(input_ids[0], torch.Tensor):
-        input_list = input_ids[0].flatten().tolist()
-    else:
+    if input_ids is not None:
+        if isinstance(input_ids[0], torch.Tensor):
+            input_list = input_ids[0].flatten().tolist()
+        else:
@@ -522,6 +531,19 @@
-    else:
-        logits = run_decode_with_logits(self.model_ptr, input_list[0],
-                                        self.logits_buffer, self.vocab_size)
+        else:
+            logits = run_decode_with_logits(self.model_ptr, input_list[0],
+                                            self.logits_buffer, self.vocab_size)
+    elif inputs_embeds is not None:
+        seq_len = inputs_embeds.shape[1]
+        pad_len = self.max_prompt_len - seq_len
+        inputs_embeds = torch.nn.functional.pad(inputs_embeds.to(torch.float16),
+                                                (0, 0, 0, pad_len), value=0.0)
+        logits = run_prefill_with_logits(self.model_ptr, None, self.logits_buffer,
+                                         self.vocab_size, inputs_embeds, seq_len)
+    else:
+        invalidInputError(False, "Please specify either input_ids or inputs_embeds.")
+
+    if self.config.vocab_size == 151666:
+        # for MiniCPM-V 2.6
+        logits = logits[:, :, :151666]
     return CausalLMOutputWithPast(
         loss=None,
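Two details above are worth unpacking. The NPU prefill graph is compiled for a fixed prompt shape, so embeddings are zero-padded along the sequence dimension up to max_prompt_len before the call; and because the NPU lm_head is built with Qwen2-7B's padded vocabulary (152064), the logits are sliced back to MiniCPM-V 2.6's real 151666 entries. A self-contained sketch with illustrative sizes (max_prompt_len=512 is an assumption, not the library default; 3584 is Qwen2-7B's hidden size):

    import torch

    max_prompt_len, real_vocab, padded_vocab = 512, 151666, 152064

    inputs_embeds = torch.randn(1, 100, 3584, dtype=torch.float16)
    seq_len = inputs_embeds.shape[1]
    pad_len = max_prompt_len - seq_len
    # F.pad consumes pairs from the last dim backwards: (0, 0) leaves the
    # hidden dim alone, (0, pad_len) appends zero rows after the real tokens.
    padded = torch.nn.functional.pad(inputs_embeds, (0, 0, 0, pad_len), value=0.0)
    assert padded.shape == (1, max_prompt_len, 3584)

    logits = torch.empty(1, 1, padded_vocab)   # buffer sized for the padded lm_head
    logits = logits[:, :, :real_vocab]         # drop the padding columns again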


@@ -48,8 +48,8 @@ _lib = ctypes.cdll.LoadLibrary(_lib_path)
 _lib.load_model_from_file.argtypes = [ctypes.c_char_p]
 _lib.load_model_from_file.restype = ctypes.c_void_p
 
-_lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), ctypes.c_int,
-                             ctypes.c_float]
+_lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
+                             ctypes.c_float, ctypes.c_bool]
 _lib.run_prefill.restype = ctypes.POINTER(ctypes.c_float)
 
 _lib.run_decode.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_float]
@@ -61,8 +61,10 @@ _lib.llm_sample_token.restype = ctypes.c_int
 _lib.reset.argtypes = [ctypes.c_void_p]
 _lib.reset.restype = None
 
-_lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int),
-                                         ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.c_int]
+_lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_void_p,
+                                         ctypes.c_int, ctypes.POINTER(ctypes.c_float),
+                                         ctypes.c_int, ctypes.c_bool]
 _lib.run_prefill_with_logits.restype = None
 
 _lib.run_decode_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_int,
@@ -77,7 +79,7 @@ def load_model_from_file(model_dir: str):
 def run_prefill(model_ptr, input_ids, vocab_size, repetition_penalty=1.0):
     input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
     input_len = len(input_ids)
-    plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty)
+    plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty, False)
     new_token = _lib.llm_sample_token(plogits, True, vocab_size)
     return new_token
@@ -88,12 +90,19 @@ def run_decode(model_ptr, input_id, vocab_size, repetition_penalty=1.0):
     return new_token
 
 
-def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size):
-    input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
-    input_len = len(input_ids)
+def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size,
+                            inputs_embeds=None, seq_len=None):
+    if input_ids is not None:
+        input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
+        input_len = len(input_ids)
+    else:
+        input_ptr = inputs_embeds.contiguous().data.data_ptr()
+        input_ptr = ctypes.cast(input_ptr, ctypes.c_void_p)
+        input_len = seq_len
     logits_ptr = logits.data.data_ptr()
     logits_ptr = ctypes.cast(logits_ptr, ctypes.POINTER(ctypes.c_float))
-    _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr, vocab_size)
+    _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr,
+                                 vocab_size, (input_ids is None))
     return logits
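The binding change widens the second argument from POINTER(c_int) to c_void_p so one C entry point can receive either a token-id array or a raw fp16 embedding buffer, with the trailing c_bool telling the C++ side which it got. The tensor-to-pointer hand-off in isolation; fake_lib is a hypothetical stand-in for _lib:

    import ctypes
    import torch

    embeds = torch.randn(1, 4, 8, dtype=torch.float16).contiguous()
    # The tensor must stay alive (and contiguous) for as long as C reads it.
    ptr = ctypes.c_void_p(embeds.data_ptr())
    is_embeds = ctypes.c_bool(True)   # treat ptr as fp16 embeddings, not token ids
    # fake_lib.run_prefill_with_logits(model_ptr, ptr, seq_len, logits_ptr,
    #                                  vocab_size, is_embeds)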


@@ -34,6 +34,10 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         lm_head_n_splits = 1
     asym = getattr(model.config, "asym", False)
 
+    if vocab_size == 151666:
+        # for MiniCPM-V 2.6 lm_head on NPU
+        vocab_size = 152064
+
 if not isinstance(lm_head, SlicedLMHead):
     asym = lm_head.qtype == "asym_int4_rtn"
     if asym:
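Mirroring the Python-side vocab override, the lm_head graph for MiniCPM-V 2.6 is generated at Qwen2-7B's 152064-entry size. A hedged sketch of the corresponding weight padding (not the repo's code; the real conversion quantizes and splits the head):

    import torch

    real_vocab, padded_vocab, hidden = 151666, 152064, 3584
    weight = torch.randn(real_vocab, hidden, dtype=torch.float16)
    # Zero rows score 0 for every position; their logits are sliced off again
    # in causal_lm_forward, so the padding tokens can never be sampled.
    padded_weight = torch.zeros(padded_vocab, hidden, dtype=torch.float16)
    padded_weight[:real_vocab] = weight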