[NPU] Support minicpm-v with python cpp backend (#12637)
parent f289f68d57
commit 534566e290

5 changed files with 66 additions and 31 deletions
@@ -650,7 +650,7 @@ def transformers_int4_npu_win(repo_id,
     load_time = end - st
     print(">> loading of model costs {}s".format(load_time))
 
-    if not hasattr(model, "model_ptr"):
+    if not hasattr(model, "model_ptr") or repo_id in MINICPM_V_IDS:
         model = BenchmarkWrapper(model)
 
     result = {}
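A self-contained toy of the gating above, with stand-ins for the benchmark script's own `MINICPM_V_IDS` and `BenchmarkWrapper` (the list contents and the wrapper's behavior are assumptions, not taken from the patch): models served by the single-process C++ backend expose `model_ptr` and are normally left unwrapped, but MiniCPM-V keeps a Python top-level module, so it is wrapped regardless.

```python
# Hypothetical stand-ins; the real definitions live in the benchmark script.
MINICPM_V_IDS = ["openbmb/MiniCPM-V-2_6"]  # assumed contents

class BenchmarkWrapper:
    """Stand-in for the benchmark's wrapper (assumed to record per-token latency)."""
    def __init__(self, model):
        self.model = model

def maybe_wrap(model, repo_id):
    # Skip wrapping only for pure C++-backend models (they expose model_ptr);
    # MiniCPM-V still runs its multimodal wrapper in Python, so wrap it anyway.
    if not hasattr(model, "model_ptr") or repo_id in MINICPM_V_IDS:
        model = BenchmarkWrapper(model)
    return model
```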
@@ -301,8 +301,7 @@ class _BaseAutoModelClass:
             model.share_memory()
 
             if not pipeline:
-                if (not hasattr(model, 'llm') and
-                        model.config.model_type in ["qwen2", "llama", "minicpm"]):
+                if model.config.model_type in ["qwen2", "llama", "minicpm"]:
                     from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
                     optimize_llm_single_process(
                         llm,
@@ -312,7 +311,8 @@ class _BaseAutoModelClass:
                         group_size=quantization_group_size,
                         qtype=qtype,
                         save_directory=save_directory,
-                        fuse_layers=fuse_layers
+                        fuse_layers=fuse_layers,
+                        has_llm=hasattr(model, "llm")
                     )
                 else:
                     optimize_llm(
@@ -449,7 +449,8 @@ def optimize_llm_single_process(
     group_size: int,
     qtype: str,
     save_directory: str,
-    fuse_layers: int=None
+    fuse_layers: int=None,
+    has_llm: bool=False
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -468,8 +469,13 @@ def optimize_llm_single_process(
         model.kv_len = kv_len
         model.model_ptr = model_ptr
         model.save_directory = save_directory
-        model.vocab_size = model.config.vocab_size
+        if model.config.vocab_size == 151666:
+            # for MiniCPM-V 2.6, 152064 is vocab_size of Qwen2-7B
+            model.vocab_size = 152064
+        else:
+            model.vocab_size = model.config.vocab_size
         model.logits_buffer = torch.empty(1, 1, model.vocab_size, dtype=torch.float32)
+        model.max_prompt_len = max_prompt_len
     except:
         invalidInputError(False,
                           "False to InitLLMPipeline.")
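The vocab-size branch above pads MiniCPM-V 2.6's reported vocabulary (151666) up to Qwen2-7B's (152064) when sizing the logits buffer. A minimal sketch of the buffer arithmetic, assuming the C++ backend writes logits at the padded width and the caller later keeps only the real vocabulary:

```python
import torch

# Illustrative only: MiniCPM-V 2.6 reports vocab_size 151666, but its NPU
# lm_head is built at 152064 (Qwen2-7B's vocab size), so the logits buffer
# handed to the C++ backend must be allocated at the padded size.
config_vocab_size = 151666
vocab_size = 152064 if config_vocab_size == 151666 else config_vocab_size

logits_buffer = torch.empty(1, 1, vocab_size, dtype=torch.float32)

# After the backend fills the buffer, only the real vocabulary is kept.
logits = logits_buffer[:, :, :config_vocab_size]
print(logits.shape)  # torch.Size([1, 1, 151666])
```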
@@ -478,9 +484,10 @@ def optimize_llm_single_process(
     general_convert(model, PreTrainedModel, prepare_input_ids, "prepare_inputs_for_generation")
     general_convert(model, PreTrainedModel, causal_lm_forward)
     # patch generate function
-    import types
-    model.original_generate = model.generate
-    model.generate = types.MethodType(generate, model)
+    if not has_llm:
+        import types
+        model.original_generate = model.generate
+        model.generate = types.MethodType(generate, model)
     return model
 
 
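For reference, a self-contained sketch of the `types.MethodType` patch the converter applies; when `has_llm` is set (a multimodal wrapper such as MiniCPM-V), the patch is skipped, presumably because the wrapper drives generation itself and only its inner language model was converted. The class and functions below are toy stand-ins:

```python
import types

class Dummy:
    def generate(self):
        return "original"

def patched_generate(self, *args, **kwargs):
    # Stand-in for the converter's generate(): routes through the
    # single-process C++ pipeline instead of the HF generation loop.
    return "patched"

m = Dummy()
m.original_generate = m.generate                      # keep a handle to the old path
m.generate = types.MethodType(patched_generate, m)     # rebind on the instance
print(m.generate(), m.original_generate())             # patched original
```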
@@ -491,9 +498,10 @@ def prepare_input_ids(
     else:  # prefill, reset the model here
         from .npu_llm_cpp import reset
         reset(self.model_ptr)
-    model_inputs = {
-        "input_ids": input_ids
-    }
+    if inputs_embeds is not None and past_key_values is None:
+        model_inputs = {"inputs_embeds": inputs_embeds}
+    else:
+        model_inputs = {"input_ids": input_ids}
     return model_inputs
 
 
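A toy version of the input selection above: precomputed embeddings (for MiniCPM-V, presumably the merged vision and text features) are only used for the first prefill call, while every later decode step falls back to token ids. Shapes below are illustrative assumptions:

```python
import torch

def select_model_inputs(input_ids, inputs_embeds, past_key_values):
    # Use precomputed embeddings only when no KV cache exists yet (prefill);
    # every subsequent decode step passes token ids instead.
    if inputs_embeds is not None and past_key_values is None:
        return {"inputs_embeds": inputs_embeds}
    return {"input_ids": input_ids}

embeds = torch.randn(1, 12, 3584).to(torch.float16)   # hidden size assumed
print(select_model_inputs(None, embeds, None).keys())          # dict_keys(['inputs_embeds'])
print(select_model_inputs(torch.tensor([[7]]), None, "kv").keys())  # dict_keys(['input_ids'])
```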
@@ -511,17 +519,31 @@ def causal_lm_forward(
     return_dict: Optional[bool] = None,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     from .npu_llm_cpp import run_prefill_with_logits, run_decode_with_logits
-    if isinstance(input_ids[0], torch.Tensor):
-        input_list = input_ids[0].flatten().tolist()
-    else:
-        input_list = input_ids[0]
-    input_length = len(input_list)
-    if input_length > 1:
-        logits = run_prefill_with_logits(self.model_ptr, input_list,
-                                         self.logits_buffer, self.vocab_size)
-    else:
-        logits = run_decode_with_logits(self.model_ptr, input_list[0],
-                                        self.logits_buffer, self.vocab_size)
+    if input_ids is not None:
+        if isinstance(input_ids[0], torch.Tensor):
+            input_list = input_ids[0].flatten().tolist()
+        else:
+            input_list = input_ids[0]
+        input_length = len(input_list)
+        if input_length > 1:
+            logits = run_prefill_with_logits(self.model_ptr, input_list,
+                                             self.logits_buffer, self.vocab_size)
+        else:
+            logits = run_decode_with_logits(self.model_ptr, input_list[0],
+                                            self.logits_buffer, self.vocab_size)
+    elif inputs_embeds is not None:
+        seq_len = inputs_embeds.shape[1]
+        pad_len = self.max_prompt_len - seq_len
+        inputs_embeds = torch.nn.functional.pad(inputs_embeds.to(torch.float16),
+                                                (0, 0, 0, pad_len), value=0.0)
+        logits = run_prefill_with_logits(self.model_ptr, None, self.logits_buffer,
+                                         self.vocab_size, inputs_embeds, seq_len)
+    else:
+        invalidInputError(False, "Please specify either input_ids or inputs_embeds.")
 
+    if self.config.vocab_size == 151666:
+        # for MiniCPM-V 2.6
+        logits = logits[:, :, :151666]
+
     return CausalLMOutputWithPast(
         loss=None,
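The `inputs_embeds` branch above right-pads the embedding sequence to `max_prompt_len` and converts it to fp16 before handing it to the C++ prefill. A runnable sketch of just that padding step, with illustrative sizes (the hidden size is an assumption):

```python
import torch

# Assumed illustrative values: the converter stores max_prompt_len on the
# model, and the C++ prefill expects an fp16 buffer of that fixed length.
max_prompt_len = 512
hidden_size = 3584   # assumption for illustration

inputs_embeds = torch.randn(1, 100, hidden_size)
seq_len = inputs_embeds.shape[1]
pad_len = max_prompt_len - seq_len

# Right-pad the sequence dimension with zeros and convert to float16,
# matching what causal_lm_forward does before run_prefill_with_logits.
padded = torch.nn.functional.pad(inputs_embeds.to(torch.float16),
                                 (0, 0, 0, pad_len), value=0.0)
print(padded.shape, padded.dtype)  # torch.Size([1, 512, 3584]) torch.float16
```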
@@ -48,8 +48,8 @@ _lib = ctypes.cdll.LoadLibrary(_lib_path)
 _lib.load_model_from_file.argtypes = [ctypes.c_char_p]
 _lib.load_model_from_file.restype = ctypes.c_void_p
 
-_lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), ctypes.c_int,
-                             ctypes.c_float]
+_lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
+                             ctypes.c_float, ctypes.c_bool]
 _lib.run_prefill.restype = ctypes.POINTER(ctypes.c_float)
 
 _lib.run_decode.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_float]
@@ -61,8 +61,10 @@ _lib.llm_sample_token.restype = ctypes.c_int
 _lib.reset.argtypes = [ctypes.c_void_p]
 _lib.reset.restype = None
 
-_lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int),
-                                         ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.c_int]
+_lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_void_p,
+                                         ctypes.c_int, ctypes.POINTER(ctypes.c_float),
+                                         ctypes.c_int, ctypes.c_bool]
+
 _lib.run_prefill_with_logits.restype = None
 
 _lib.run_decode_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_int,
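A short sketch of why the first data argument becomes `ctypes.c_void_p`: the same C entry point can now receive either an int32 token array or an fp16 embedding buffer, with the trailing `c_bool` flagging which one was passed. Both pointer forms can be built like this (toy sizes, no library call made):

```python
import ctypes
import torch

# Token-id path: a C array of int32 viewed as an opaque pointer.
input_ids = [1, 2, 3, 4]
token_ptr = ctypes.cast((ctypes.c_int32 * len(input_ids))(*input_ids),
                        ctypes.c_void_p)

# Embedding path: the raw data pointer of a contiguous fp16 tensor.
inputs_embeds = torch.zeros(1, 4, 8, dtype=torch.float16)
embeds_ptr = ctypes.c_void_p(inputs_embeds.contiguous().data_ptr())

for ptr, is_embeds in [(token_ptr, False), (embeds_ptr, True)]:
    print(hex(ptr.value), is_embeds)   # the bool mirrors the new c_bool argument
```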
@@ -77,7 +79,7 @@ def load_model_from_file(model_dir: str):
 def run_prefill(model_ptr, input_ids, vocab_size, repetition_penalty=1.0):
     input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
     input_len = len(input_ids)
-    plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty)
+    plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty, False)
     new_token = _lib.llm_sample_token(plogits, True, vocab_size)
     return new_token
 
@@ -88,12 +90,19 @@ def run_decode(model_ptr, input_id, vocab_size, repetition_penalty=1.0):
     return new_token
 
 
-def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size):
-    input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
-    input_len = len(input_ids)
+def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size,
+                            inputs_embeds=None, seq_len=None):
+    if input_ids is not None:
+        input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
+        input_len = len(input_ids)
+    else:
+        input_ptr = inputs_embeds.contiguous().data.data_ptr()
+        input_ptr = ctypes.cast(input_ptr, ctypes.c_void_p)
+        input_len = seq_len
     logits_ptr = logits.data.data_ptr()
     logits_ptr = ctypes.cast(logits_ptr, ctypes.POINTER(ctypes.c_float))
-    _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr, vocab_size)
+    _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr,
+                                 vocab_size, (input_ids is None))
     return logits
 
 
@@ -34,6 +34,10 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         lm_head_n_splits = 1
     asym = getattr(model.config, "asym", False)
 
+    if vocab_size == 151666:
+        # for MiniCPM-V 2.6 lm_head on NPU
+        vocab_size = 152064
+
     if not isinstance(lm_head, SlicedLMHead):
         asym = lm_head.qtype == "asym_int4_rtn"
         if asym:
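A hypothetical illustration of what bumping `vocab_size` to 152064 implies for the lm_head conversion, not the converter's actual code: building the NPU head at the padded width presumably means appending zero rows to the projection weight so its shape matches the Qwen2-7B-sized kernel. Toy hidden size:

```python
import torch
import torch.nn.functional as F

# Sketch under stated assumptions; the real padding happens inside the
# NPU lm_head conversion, which this patch does not show.
hidden_size = 64                       # toy value
weight = torch.randn(151666, hidden_size)

padded_vocab = 152064 if weight.shape[0] == 151666 else weight.shape[0]
pad_rows = padded_vocab - weight.shape[0]
padded_weight = F.pad(weight, (0, 0, 0, pad_rows))   # zero rows appended at the end

print(padded_weight.shape)  # torch.Size([152064, 64])
```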