Initial NPU support for MiniCPM-V-2_6 (#11966)
* initial pr
* update npu model
* fix
* fix kv cache type
* fix
* small fix
* fix style
* fix model id
* change inter_pp=4
* address comment
* fix
* fix style
* fix
* rebase
parent 158289d205
commit 60aa1a2c0f

6 changed files with 129 additions and 20 deletions
@@ -0,0 +1,92 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


import torch
import os
import time
import argparse
import requests
from PIL import Image
from ipex_llm.transformers.npu_model import AutoModel
from transformers import AutoTokenizer


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for openbmb/MiniCPM-V-2_6 model')
    parser.add_argument('--repo-id-or-model-path', type=str, default="openbmb/MiniCPM-V-2_6",
                        help='The huggingface repo id for the openbmb/MiniCPM-V-2_6 model to be downloaded'
                             ', or the path to the huggingface checkpoint folder')
    parser.add_argument('--image-url-or-path', type=str,
                        default='http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg',
                        help='The URL or path to the image to infer')
    parser.add_argument('--prompt', type=str, default="What is in this image?",
                        help='Prompt to infer')
    parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
    parser.add_argument("--max-output-len", type=int, default=1024)
    parser.add_argument("--max-prompt-len", type=int, default=960)
    parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
    parser.add_argument("--intra-pp", type=int, default=2)
    parser.add_argument("--inter-pp", type=int, default=2)

    args = parser.parse_args()
    model_path = args.repo_id_or_model_path
    image_path = args.image_url_or_path

    model = AutoModel.from_pretrained(model_path,
                                      torch_dtype=torch.float32,
                                      trust_remote_code=True,
                                      attn_implementation="eager",
                                      load_in_low_bit="sym_int4",
                                      optimize_model=True,
                                      max_output_len=args.max_output_len,
                                      max_prompt_len=args.max_prompt_len,
                                      intra_pp=args.intra_pp,
                                      inter_pp=args.inter_pp,
                                      transpose_value_cache=not args.disable_transpose_value_cache,
                                      modules_to_not_convert=['vpm', 'resampler']
                                      )
    tokenizer = AutoTokenizer.from_pretrained(model_path,
                                              trust_remote_code=True)
    model.eval()

    query = args.prompt
    if os.path.exists(image_path):
        image = Image.open(image_path).convert('RGB')
    else:
        image = Image.open(requests.get(image_path, stream=True).raw).convert('RGB')

    # Generate predicted tokens
    # here the prompt tuning refers to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/README.md
    msg = [{'role': 'user', 'content': args.prompt}]
    st = time.time()
    with torch.inference_mode():
        res = model.chat(
            image=image,
            msgs=msg,
            context=None,
            tokenizer=tokenizer,
            sampling=True,
        )
    end = time.time()
    print(f'Inference time: {end-st} s')
    print('-'*20, 'Input', '-'*20)
    print(image_path)
    print('-'*20, 'Prompt', '-'*20)
    print(args.prompt)
    output_str = res
    print('-'*20, 'Output', '-'*20)
    print(output_str)
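
A note on the key new argument in this example: `modules_to_not_convert=['vpm', 'resampler']` asks the loader to leave the vision encoder (`vpm` in the MiniCPM-V remote code) and the token resampler untouched, so only the language-model part is converted to sym_int4 for the NPU. A quick way to see which top-level submodules exist (and were therefore eligible to be skipped) is the sketch below; it assumes the `model` object from the example above and relies only on standard torch.nn.Module introspection:

# Minimal sketch: list the top-level submodules of the loaded MiniCPM-V model.
# 'vpm' and 'resampler' should keep their original (unconverted) layer types,
# while the LLM backbone is the part that receives low-bit conversion.
for name, module in model.named_children():
    print(f"{name}: {type(module).__name__}")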

@@ -113,7 +113,6 @@ class _BaseAutoModelClass:
ignore_argument(kwargs, "cpu_embedding")
ignore_argument(kwargs, "embedding_qtype")
ignore_argument(kwargs, "enable_mp")
ignore_argument(kwargs, "modules_to_not_convert")
ignore_argument(kwargs, "quantization_config")
ignore_argument(kwargs, "speculative")
ignore_argument(kwargs, "pipeline_parallel_stages")

@@ -123,6 +122,7 @@ class _BaseAutoModelClass:
inter_pp = kwargs.pop("inter_pp", None)
intra_pp = kwargs.pop("intra_pp", None)
transpose_value_cache = kwargs.pop("transpose_value_cache", True)
modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])

_args = copy.deepcopy(args)
_kwargs = copy.deepcopy(kwargs)
@@ -152,17 +152,14 @@ class _BaseAutoModelClass:
)
from ipex_llm.transformers.npu_models.convert_mp import optimize_llm, optimize_llm_pre

if model.config.model_type == "minicpmv":
if hasattr(model, "llm"):
llm = model.llm
if llm.config.hidden_size == 4096 and llm.config.vocab_size == 128256:
# MiniCPM-llama3-V2.5
llm.config.model_type = "llama"
else:
llm = model

with torch.no_grad():
optimize_llm_pre(llm, qtype)
cls.load_convert(qtype, llm, "cpu", *args, **kwargs)
optimize_llm_pre(model, qtype)
cls.load_convert(qtype, model, "cpu", modules_to_not_convert, *args, **kwargs)
create_npu_kernels(llm)
model = model.eval()
logger.info(f"Finish to convert model")

@@ -181,7 +178,10 @@ class _BaseAutoModelClass:
from ipex_llm.transformers.npu_models.convert import optimize_llm
optimize_llm(model)
with torch.no_grad():
cls.load_convert(qtype, model, "cpu", *args, **kwargs)
cls.load_convert(qtype, model, "cpu", modules_to_not_convert, *args, **kwargs)
if hasattr(model, "llm"):
create_npu_kernels(model.llm)
else:
create_npu_kernels(model)
model = model.eval()
logger.info(f"Finish to convert model")
@@ -192,10 +192,11 @@ class _BaseAutoModelClass:
return model

@classmethod
def load_convert(cls, q_k, optimize_model, device, *arg, **kwarg):
def load_convert(cls, q_k, optimize_model, device, modules_to_not_convert, *arg, **kwarg):
from ipex_llm.transformers.npu_models.convert import replace_with_QuantizedLinear

replace_with_QuantizedLinear(optimize_model, q_k, device=device)
replace_with_QuantizedLinear(optimize_model, q_k, device=device,
modules_to_not_convert=modules_to_not_convert)

@classmethod
@patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
@@ -31,7 +31,7 @@ def module_optimization(func) -> torch.nn.Module:
torch.nn.Module: optimized module
"""

def wrapper(model: torch.nn.Module, qtype, device, *args, **kwargs):
def wrapper(model: torch.nn.Module, qtype, device, modules_to_not_convert, *args, **kwargs):
"""Recursively apply the optimization function.

Args:

@@ -41,18 +41,19 @@ def module_optimization(func) -> torch.nn.Module:

"""
for name, layer in model.named_children():
new_layer = func(layer, qtype, device, *args, **kwargs)
if name not in modules_to_not_convert:
new_layer = func(layer, qtype, device, modules_to_not_convert, *args, **kwargs)
if new_layer:
model.add_module(name, new_layer)
wrapper(new_layer, qtype, device, *args, **kwargs)
wrapper(new_layer, qtype, device, modules_to_not_convert, *args, **kwargs)
else:
wrapper(layer, qtype, device, *args, **kwargs)
wrapper(layer, qtype, device, modules_to_not_convert, *args, **kwargs)

return wrapper


@module_optimization
def replace_with_QuantizedLinear(layer, qtype, device):
def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert):
from ipex_llm.transformers.low_bit_linear import ggml_convert_qtype
from ipex_llm.ggml.quantize import ggml_tensor_qtype
iqtype = ggml_tensor_qtype[qtype]
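
The hunk above is the core of the new `modules_to_not_convert` handling: the recursive `wrapper` now skips any child module whose name is in the list, neither converting it nor descending into it, which is how `['vpm', 'resampler']` keeps the vision tower out of the low-bit path. Below is a self-contained toy sketch of that skip pattern; `MarkedLinear`, `convert_skipping`, and the `toy` model are made up for illustration and are not the real `replace_with_QuantizedLinear`:

import torch


class MarkedLinear(torch.nn.Module):
    """Stand-in wrapper so the effect of 'conversion' is visible when printing."""
    def __init__(self, linear: torch.nn.Linear):
        super().__init__()
        self.inner = linear

    def forward(self, x):
        return self.inner(x)


def convert_skipping(model: torch.nn.Module, modules_to_not_convert):
    # Mirror of the decorator's skip logic: children named in
    # modules_to_not_convert are left untouched and not recursed into.
    for name, layer in model.named_children():
        if name in modules_to_not_convert:
            continue  # e.g. 'vpm' and 'resampler' stay as-is
        if isinstance(layer, torch.nn.Linear):
            model.add_module(name, MarkedLinear(layer))
        else:
            convert_skipping(layer, modules_to_not_convert)


toy = torch.nn.ModuleDict({
    "llm": torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)),
    "vpm": torch.nn.Sequential(torch.nn.Linear(8, 8)),
})
convert_skipping(toy, modules_to_not_convert=["vpm"])
print(toy)  # 'llm' Linear layers become MarkedLinear, 'vpm' keeps plain Linear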

@@ -42,6 +42,16 @@ def optimize_llm_pre(model: torch.nn.Module, qtype):
from ipex_llm.transformers.models.baichuan import pre_compute_inv_freq
model.apply(pre_compute_inv_freq)

if model.config.model_type == "minicpmv" and hasattr(model, "llm"):
# MiniCPM-V
if model.config.hidden_size == 2304 and model.config.vocab_size == 122753:
model.llm.config.model_type = "minicpm"
elif model.config.hidden_size == 3584 and model.config.vocab_size == 151666:
model.llm.config.model_type = "qwen2"
elif model.config.hidden_size == 4096 and model.config.vocab_size == 128256:
model.llm.config.model_type = "llama"
model = model.llm

# lm_head to cpu optimization
if os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0":
# disable the optimization by default
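
The hidden_size/vocab_size checks above route each MiniCPM-V variant to the NPU code path of its language backbone (MiniCPM, Qwen2, or Llama). For reference, here is a hypothetical helper that mirrors the same thresholds and can be used to check which path a local checkpoint would take; the function name and its use of `AutoConfig` are illustrative, not part of this patch:

from transformers import AutoConfig


def guess_backbone(model_path: str) -> str:
    # Same thresholds as optimize_llm_pre above; returns the model_type
    # the MiniCPM-V wrapper's inner LLM is treated as on the NPU path.
    cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    if cfg.hidden_size == 2304 and cfg.vocab_size == 122753:
        return "minicpm"
    if cfg.hidden_size == 3584 and cfg.vocab_size == 151666:
        return "qwen2"   # the MiniCPM-V-2_6 case this PR targets
    if cfg.hidden_size == 4096 and cfg.vocab_size == 128256:
        return "llama"   # MiniCPM-llama3-V2.5
    return "unknown"


print(guess_backbone("openbmb/MiniCPM-V-2_6"))  # expected: "qwen2"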

@@ -173,7 +173,8 @@ class DynamicFusedNormalCache(DynamicCache):
head_dim,
0,
max_len,
key_states.dtype,
# key_states.dtype,
torch.float16,
key_states.device,
tranpose_value=transpose_value,
)

@@ -197,7 +197,9 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
new_key_states = self.convert_to_fp16(curr_key_values[i][0])
new_value_states = self.convert_to_fp16(curr_key_values[i][1])

print("start compiling")
self.compile()
print("end compiling")

def mlp(self, hidden_states):
mm1 = self.linear(

@@ -862,6 +864,8 @@ class PrefillRunner:
self.p.daemon = True
self.p.start()
output = self.prefill_result_queue.get()
print(Fore.GREEN + f"prefill process output: {output}")
print(Style.RESET_ALL)

def forward(
self,