Initial NPU support for MiniCPM-V-2_6 (#11966)
* initial pr
* update npu model
* fix
* fix kv cache type
* fix
* small fix
* fix style
* fix model id
* change inter_pp=4
* address comment
* fix
* fix style
* fix
* rebase
parent 158289d205
commit 60aa1a2c0f
6 changed files with 129 additions and 20 deletions
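
In short, the change adds a `modules_to_not_convert` option to the NPU loading path and uses it to keep MiniCPM-V-2_6's vision tower (`vpm`) and resampler out of low-bit conversion while the language model is quantized for the NPU. A minimal sketch of the resulting user-facing call, condensed from the new example script added below (the argument values are the script's own defaults, not additional API claims):

import torch
from ipex_llm.transformers.npu_model import AutoModel

# Quantize only the LLM part to sym_int4 for the NPU; leave 'vpm' and 'resampler' untouched.
model = AutoModel.from_pretrained("openbmb/MiniCPM-V-2_6",
                                  torch_dtype=torch.float32,
                                  trust_remote_code=True,
                                  attn_implementation="eager",
                                  load_in_low_bit="sym_int4",
                                  optimize_model=True,
                                  modules_to_not_convert=['vpm', 'resampler'])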
@@ -0,0 +1,92 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import torch
+import os
+import time
+import argparse
+import requests
+from PIL import Image
+from ipex_llm.transformers.npu_model import AutoModel
+from transformers import AutoTokenizer
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for openbmb/MiniCPM-V-2_6 model')
+    parser.add_argument('--repo-id-or-model-path', type=str, default="openbmb/MiniCPM-V-2_6",
+                        help='The huggingface repo id for the openbmb/MiniCPM-V-2_6 model to be downloaded'
+                             ', or the path to the huggingface checkpoint folder')
+    parser.add_argument('--image-url-or-path', type=str,
+                        default='http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg',
+                        help='The URL or path to the image to infer')
+    parser.add_argument('--prompt', type=str, default="What is in this image?",
+                        help='Prompt to infer')
+    parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
+    parser.add_argument("--max-output-len", type=int, default=1024)
+    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
+    parser.add_argument("--intra-pp", type=int, default=2)
+    parser.add_argument("--inter-pp", type=int, default=2)
+
+    args = parser.parse_args()
+    model_path = args.repo_id_or_model_path
+    image_path = args.image_url_or_path
+
+    model = AutoModel.from_pretrained(model_path,
+                                      torch_dtype=torch.float32,
+                                      trust_remote_code=True,
+                                      attn_implementation="eager",
+                                      load_in_low_bit="sym_int4",
+                                      optimize_model=True,
+                                      max_output_len=args.max_output_len,
+                                      max_prompt_len=args.max_prompt_len,
+                                      intra_pp=args.intra_pp,
+                                      inter_pp=args.inter_pp,
+                                      transpose_value_cache=not args.disable_transpose_value_cache,
+                                      modules_to_not_convert=['vpm', 'resampler']
+                                      )
+    tokenizer = AutoTokenizer.from_pretrained(model_path,
+                                              trust_remote_code=True)
+    model.eval()
+
+    query = args.prompt
+    if os.path.exists(image_path):
+        image = Image.open(image_path).convert('RGB')
+    else:
+        image = Image.open(requests.get(image_path, stream=True).raw).convert('RGB')
+
+    # Generate predicted tokens
+    # here the prompt tuning refers to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/README.md
+    msg = [{'role': 'user', 'content': args.prompt}]
+    st = time.time()
+    with torch.inference_mode():
+        res = model.chat(
+            image=image,
+            msgs=msg,
+            context=None,
+            tokenizer=tokenizer,
+            sampling=True,
+        )
+    end = time.time()
+    print(f'Inference time: {end-st} s')
+    print('-'*20, 'Input', '-'*20)
+    print(image_path)
+    print('-'*20, 'Prompt', '-'*20)
+    print(args.prompt)
+    output_str = res
+    print('-'*20, 'Output', '-'*20)
+    print(output_str)
@@ -113,7 +113,6 @@ class _BaseAutoModelClass:
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")
         ignore_argument(kwargs, "enable_mp")
-        ignore_argument(kwargs, "modules_to_not_convert")
         ignore_argument(kwargs, "quantization_config")
         ignore_argument(kwargs, "speculative")
         ignore_argument(kwargs, "pipeline_parallel_stages")
@@ -123,6 +122,7 @@ class _BaseAutoModelClass:
         inter_pp = kwargs.pop("inter_pp", None)
         intra_pp = kwargs.pop("intra_pp", None)
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
+        modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])

         _args = copy.deepcopy(args)
         _kwargs = copy.deepcopy(kwargs)
@@ -152,17 +152,14 @@ class _BaseAutoModelClass:
            )
        from ipex_llm.transformers.npu_models.convert_mp import optimize_llm, optimize_llm_pre

-        if model.config.model_type == "minicpmv":
+        if hasattr(model, "llm"):
            llm = model.llm
-            if llm.config.hidden_size == 4096 and llm.config.vocab_size == 128256:
-                # MiniCPM-llama3-V2.5
-                llm.config.model_type = "llama"
        else:
            llm = model

        with torch.no_grad():
-            optimize_llm_pre(llm, qtype)
-            cls.load_convert(qtype, llm, "cpu", *args, **kwargs)
+            optimize_llm_pre(model, qtype)
+            cls.load_convert(qtype, model, "cpu", modules_to_not_convert, *args, **kwargs)
            create_npu_kernels(llm)
        model = model.eval()
        logger.info(f"Finish to convert model")
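Note that the hard-coded MiniCPM-Llama3-V2.5 special case removed here is not dropped: it reappears, generalized to the 2.0/2.5/2.6 variants, inside `optimize_llm_pre` in a later hunk of this commit.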
@@ -181,7 +178,10 @@ class _BaseAutoModelClass:
            from ipex_llm.transformers.npu_models.convert import optimize_llm
            optimize_llm(model)
            with torch.no_grad():
-                cls.load_convert(qtype, model, "cpu", *args, **kwargs)
-                create_npu_kernels(model)
+                cls.load_convert(qtype, model, "cpu", modules_to_not_convert, *args, **kwargs)
+                if hasattr(model, "llm"):
+                    create_npu_kernels(model.llm)
+                else:
+                    create_npu_kernels(model)
            model = model.eval()
            logger.info(f"Finish to convert model")
@@ -192,10 +192,11 @@ class _BaseAutoModelClass:
        return model

    @classmethod
-    def load_convert(cls, q_k, optimize_model, device, *arg, **kwarg):
+    def load_convert(cls, q_k, optimize_model, device, modules_to_not_convert, *arg, **kwarg):
        from ipex_llm.transformers.npu_models.convert import replace_with_QuantizedLinear

-        replace_with_QuantizedLinear(optimize_model, q_k, device=device)
+        replace_with_QuantizedLinear(optimize_model, q_k, device=device,
+                                     modules_to_not_convert=modules_to_not_convert)

    @classmethod
    @patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
@@ -31,7 +31,7 @@ def module_optimization(func) -> torch.nn.Module:
        torch.nn.Module: optimized module
    """

-    def wrapper(model: torch.nn.Module, qtype, device, *args, **kwargs):
+    def wrapper(model: torch.nn.Module, qtype, device, modules_to_not_convert, *args, **kwargs):
        """Recursively apply the optimization function.

        Args:
@@ -41,18 +41,19 @@ def module_optimization(func) -> torch.nn.Module:

        """
        for name, layer in model.named_children():
-            new_layer = func(layer, qtype, device, *args, **kwargs)
-            if new_layer:
-                model.add_module(name, new_layer)
-                wrapper(new_layer, qtype, device, *args, **kwargs)
-            else:
-                wrapper(layer, qtype, device, *args, **kwargs)
+            if name not in modules_to_not_convert:
+                new_layer = func(layer, qtype, device, modules_to_not_convert, *args, **kwargs)
+                if new_layer:
+                    model.add_module(name, new_layer)
+                    wrapper(new_layer, qtype, device, modules_to_not_convert, *args, **kwargs)
+                else:
+                    wrapper(layer, qtype, device, modules_to_not_convert, *args, **kwargs)

    return wrapper


@module_optimization
-def replace_with_QuantizedLinear(layer, qtype, device):
+def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert):
    from ipex_llm.transformers.low_bit_linear import ggml_convert_qtype
    from ipex_llm.ggml.quantize import ggml_tensor_qtype
    iqtype = ggml_tensor_qtype[qtype]
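With this change, any direct child whose attribute name is listed in `modules_to_not_convert` is neither converted nor recursed into. A standalone sketch of the same skip-by-name recursion, using a toy module and a dummy replacement instead of the real `replace_with_QuantizedLinear` (all names below are illustrative, not part of the patch):

import torch

def convert_skipping(model: torch.nn.Module, modules_to_not_convert):
    # Recursively replace nn.Linear children, skipping any child listed by name.
    for name, layer in model.named_children():
        if name in modules_to_not_convert:
            continue  # e.g. 'vpm' or 'resampler' keep their original precision
        if isinstance(layer, torch.nn.Linear):
            model.add_module(name, torch.nn.Identity())  # stand-in for the low-bit replacement
        else:
            convert_skipping(layer, modules_to_not_convert)

toy = torch.nn.ModuleDict({
    "llm": torch.nn.Sequential(torch.nn.Linear(8, 8)),
    "vpm": torch.nn.Sequential(torch.nn.Linear(8, 8)),
})
convert_skipping(toy, ["vpm"])
print(toy)  # the Linear under 'llm' becomes Identity; 'vpm' is left as-is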
@@ -42,6 +42,16 @@ def optimize_llm_pre(model: torch.nn.Module, qtype):
        from ipex_llm.transformers.models.baichuan import pre_compute_inv_freq
        model.apply(pre_compute_inv_freq)

+    if model.config.model_type == "minicpmv" and hasattr(model, "llm"):
+        # MiniCPM-V
+        if model.config.hidden_size == 2304 and model.config.vocab_size == 122753:
+            model.llm.config.model_type = "minicpm"
+        elif model.config.hidden_size == 3584 and model.config.vocab_size == 151666:
+            model.llm.config.model_type = "qwen2"
+        elif model.config.hidden_size == 4096 and model.config.vocab_size == 128256:
+            model.llm.config.model_type = "llama"
+        model = model.llm
+
    # lm_head to cpu optimization
    if os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0":
        # disable the optimization by default
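The (hidden_size, vocab_size) pairs act as a fingerprint for the LLM wrapped inside the MiniCPM-V checkpoint, so the wrapped model can be re-labelled and reuse the existing per-architecture NPU paths. Restated as a lookup purely for reference (the variant names in the comments are my reading of the fingerprints; only the Llama-3 case is labelled in the pre-existing code removed earlier in this commit):

# (hidden_size, vocab_size) of the wrapped LLM -> model_type understood by the NPU optimizations
MINICPMV_LLM_TYPE = {
    (2304, 122753): "minicpm",  # presumably the MiniCPM-2B backbone
    (3584, 151666): "qwen2",    # presumably the Qwen2-7B backbone of MiniCPM-V-2_6
    (4096, 128256): "llama",    # the Llama-3-8B backbone (MiniCPM-Llama3-V 2.5)
}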
@@ -173,7 +173,8 @@ class DynamicFusedNormalCache(DynamicCache):
                head_dim,
                0,
                max_len,
-                key_states.dtype,
+                # key_states.dtype,
+                torch.float16,
                key_states.device,
                tranpose_value=transpose_value,
            )
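The fused NPU KV cache is now always allocated in float16 rather than following `key_states.dtype` (the example script loads the model with `torch_dtype=torch.float32`); this appears to correspond to the "fix kv cache type" item in the commit message.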
@@ -197,7 +197,9 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
            new_key_states = self.convert_to_fp16(curr_key_values[i][0])
            new_value_states = self.convert_to_fp16(curr_key_values[i][1])

+        print("start compiling")
        self.compile()
+        print("end compiling")

    def mlp(self, hidden_states):
        mm1 = self.linear(
@@ -862,6 +864,8 @@ class PrefillRunner:
        self.p.daemon = True
        self.p.start()
        output = self.prefill_result_queue.get()
+        print(Fore.GREEN + f"prefill process output: {output}")
+        print(Style.RESET_ALL)

    def forward(
        self,