[NPU] dump prefill IR for further C++ solution (#12402)
* save prefill ir
* fix
* shorten convert time
* fix
* fix
* fix
* fix
* fix style
* dump config.json
* meet review
* small fix
parent 1bfcbc0640
commit 54c62feb74
4 changed files with 93 additions and 39 deletions
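For context, a minimal sketch of how the two new `from_pretrained` kwargs added in this commit (`compile_full_model`, `save_directory`) would be used. The model id, context lengths, and low-bit format below are illustrative assumptions, not part of the commit:

```python
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

# Convert the model for NPU and, besides the decode blobs, dump the prefill IR
# plus config.json into save_directory for the planned C++ pipeline.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-1.5B-Instruct",      # assumed model id
    optimize_model=True,
    load_in_low_bit="sym_int4",      # assumed low-bit format
    max_context_len=1024,            # assumed lengths
    max_prompt_len=512,
    compile_full_model=True,         # new in this commit
    save_directory="./npu_dump",     # new in this commit
)
```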
@@ -134,6 +134,8 @@ class _BaseAutoModelClass:
         mixed_precision = kwargs.pop('mixed_precision', False)
         quantization_group_size = kwargs.pop("quantization_group_size", 0)
         mock_device = kwargs.pop('device', None)  # For mock on CPU
+        compile_full_model = kwargs.pop('compile_full_model', False)
+        save_directory = kwargs.pop('save_directory', None)
 
         invalidInputError(
             quantization_group_size in [0, 32, 64, 128],
@@ -199,7 +201,9 @@ class _BaseAutoModelClass:
                 "max_prompt_len": max_prompt_len,
                 "inter_pp": inter_pp,
                 "intra_pp": intra_pp,
-                "transpose_value_cache": transpose_value_cache
+                "transpose_value_cache": transpose_value_cache,
+                "compile_full_model": compile_full_model,
+                "save_directory": save_directory,
             }
             model = cls.optimize_npu_model(*args, **optimize_kwargs)
         else:
@@ -237,6 +241,8 @@ class _BaseAutoModelClass:
         inter_pp = kwargs.pop("inter_pp", None)
         intra_pp = kwargs.pop("intra_pp", None)
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
+        compile_full_model = kwargs.pop('compile_full_model', False)
+        save_directory = kwargs.pop('save_directory', None)
 
         if hasattr(model, "llm"):
             llm = model.llm
@@ -273,7 +279,9 @@ class _BaseAutoModelClass:
                         kv_len=max_context_len,
                         max_prompt_len=max_prompt_len,
                         transpose_value_cache=transpose_value_cache,
-                        group_size=quantization_group_size)
+                        group_size=quantization_group_size,
+                        compile_full_model=compile_full_model,
+                        save_directory=save_directory)
         model.save_low_bit = types.MethodType(save_low_bit, model)
         return model
@@ -23,16 +23,19 @@ from intel_npu_acceleration_library.backend.factory import NNFactory
 import numpy as np
 
 
-def update_names_of_IR_and_export_blob(model, model_name, dir):
+def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True):
     xml_path = os.path.join(dir, model_name + ".xml")
+    bin_path = os.path.join(dir, model_name + ".bin")
     model.save(xml_path)
     new_ir_path = os.path.join(dir, model_name + "_new.xml")
+    new_bin_path = os.path.join(dir, model_name + "_new.bin")
     blob_path = os.path.join(dir, model_name + ".blob")
 
     core = Core()
     core.set_property("NPU", {"NPU_COMPILATION_MODE_PARAMS":
                               "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"})
     core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
 
     model = core.read_model(xml_path)
     inputs = model.inputs
     for idx, input in enumerate(inputs):
@@ -46,14 +49,17 @@ def update_names_of_IR_and_export_blob(model, model_name, dir):
     if new_ir_path is not None:
         serialize(model, new_ir_path)
 
-    if blob_path is not None:
+    if compile_blob:
         compiledModel = core.compile_model(model, device_name="NPU")
         model_stream = compiledModel.export_model()
         with open(blob_path, 'wb') as f:
             f.write(model_stream)
 
     os.remove(xml_path)
 
-    os.remove(new_ir_path)
+    if not keep_ir:
+        os.remove(new_ir_path)
+        os.remove(new_bin_path)
 
     return blob_path
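A sketch of how the extended helper can now be driven. The import path is the module edited above (assumed to be `ipex_llm.transformers.npu_pipeline_model.common`), and `decoder_ir`, `prefill_ir`, `out_dir` are placeholders for already-built NNFactory graphs and an existing output directory:

```python
from ipex_llm.transformers.npu_pipeline_model.common import update_names_of_IR_and_export_blob

# Decode layer: compile an NPU .blob and keep the renamed *_new.xml/.bin IR next to it.
decode_blob = update_names_of_IR_and_export_blob(decoder_ir, "decoder_layer_0", out_dir,
                                                 compile_blob=True, keep_ir=True)

# Prefill layer: skip NPU compilation and keep only the IR, which is what the
# follow-up C++ solution is meant to consume.
update_names_of_IR_and_export_blob(prefill_ir, "decoder_layer_prefill", out_dir,
                                   compile_blob=False, keep_ir=True)
```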
@@ -123,6 +129,7 @@ class LLMEmbedding(NNFactory):
         embedding_weight,
         padding_idx,
         dtype,  # fp16
+        input_length: int = 1,
         device: str = "NPU",
     ):
         super().__init__(False, device)
@@ -133,7 +140,7 @@ class LLMEmbedding(NNFactory):
 
         # define input
         weight = self.constant(embedding_weight)
-        input = self.parameter((1, 1), dtype=np.int32)
+        input = self.parameter((1, input_length), dtype=np.int32)
 
         if padding_idx == -1:
             padding_idx += vocab_size
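The new `input_length` argument is what makes a prefill embedding graph possible: the input-ids parameter is no longer hard-wired to a single decode token. A sketch of the resulting shapes (512 stands in for an assumed `max_prompt_len`):

```python
# Inside LLMEmbedding the input is declared as
#     input = self.parameter((1, input_length), dtype=np.int32)
# so the decode and prefill graphs differ only in this one dimension.
decode_input_shape = (1, 1)      # decode graph: input_length=1, one new token per step
prefill_input_shape = (1, 512)   # prefill graph: input_length=max_prompt_len (assumed 512)
print(decode_input_shape, prefill_input_shape)
```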
@@ -192,7 +192,9 @@ def convert_llm(model: torch.nn.Module,
                 kv_len: int,
                 max_prompt_len: int,
                 transpose_value_cache: bool,
-                group_size: int):
+                group_size: int,
+                compile_full_model: bool=False,
+                save_directory: str=None):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
@@ -329,12 +331,16 @@ def convert_llm(model: torch.nn.Module,
     elif model.config.model_type == "qwen2":
         layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "0") == "1"
         with tempfile.TemporaryDirectory() as temp_dir:
+            if save_directory is not None:
+                temp_dir = save_directory
+                os.mkdir(temp_dir)
             weight_dir = os.path.join(temp_dir, "model_weights")
             os.mkdir(weight_dir)
             layer_num = len(model.model.layers)
             from .qwen import convert_qwen_layer, convert_lm_head_and_embedding
             first_blob_path, last_blob_path = convert_lm_head_and_embedding(model, n_splits_linear,
-                                                                            temp_dir, weight_dir)
+                                                                            temp_dir, weight_dir,
+                                                                            compile_full_model)
 
             param_list = []
             for layer_idx in range(0, layer_num):
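When `save_directory` is given, the conversion artifacts survive instead of landing in a throwaway temporary directory. A quick way to inspect the dump after conversion; the directory name is an assumption and the expected contents are inferred from this commit rather than guaranteed:

```python
import os

for root, _dirs, files in os.walk("./npu_dump"):          # assumed save_directory
    for name in sorted(files):
        print(os.path.join(root, name))

# Expected (inferred): config.json, embedding / lm_head / decoder_layer_* blobs with their
# *_new.xml/.bin IR, decoder_layer_prefill_new.xml/.bin (IR only, since the prefill layer is
# exported with compile_blob=False), and per-layer weight bins under model_weights/.
```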
@@ -344,6 +350,11 @@ def convert_llm(model: torch.nn.Module,
             with Pool() as pool:
                 result = pool.starmap(convert_qwen_layer, param_list)
 
+            if compile_full_model:
+                convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
+                                   temp_dir, weight_dir, transpose_value_cache, max_prompt_len,
+                                   group_size, layernorm_const, "prefill")
+
             # Prefill Runner
             from ipex_llm.transformers.npu_models.convert_mp import convert_qwen
             convert_qwen(model,
@@ -360,6 +371,16 @@ def convert_llm(model: torch.nn.Module,
         model.transpose_value_cache = transpose_value_cache
         model.vocab_size = model.config.vocab_size
 
+        if save_directory is not None:
+            update_dict = {"kv_len": kv_len, "num_head": model.num_head,
+                           "head_dim": model.head_dim,
+                           "transpose_value_cache": transpose_value_cache,
+                           "max_prompt_len": max_prompt_len,
+                           "layernorm_const": layernorm_const,
+                           "group_size": group_size}
+            model.config.update(update_dict)
+            model.config.save_pretrained(save_directory)
+
         try:
             res = InitLLMPipeline("qwen", kv_len, model.num_head, model.head_dim, layer_num,
                                   model.vocab_size, weight_dir, "model",
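The `update_dict` above is how the dumped `config.json` carries the runtime parameters a later consumer (e.g. the planned C++ pipeline) needs. A sketch of reading them back; `./npu_dump` is an assumed `save_directory`:

```python
import json
import os

with open(os.path.join("./npu_dump", "config.json")) as f:
    cfg = json.load(f)

# Fields added by this commit, alongside the usual Hugging Face config entries.
print(cfg["kv_len"], cfg["num_head"], cfg["head_dim"])
print(cfg["transpose_value_cache"], cfg["max_prompt_len"],
      cfg["layernorm_const"], cfg["group_size"])
```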
@@ -22,7 +22,8 @@ from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLML
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
-def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir):
+def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
+                                  compile_full_model=False):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
     rms_norm_eps = model.config.rms_norm_eps
@@ -57,7 +58,9 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir):
         vocab_size=vocab_size,
         n_splits=n_splits_linear
     )
-    last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir)
+
+    last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head",
+                                                        temp_dir, True, True)
 
     # save weights bins files
     if not isinstance(lm_head, SlicedLMHead):
@@ -78,15 +81,19 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir):
         embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
         padding_idx=model.config.pad_token_id,
         dtype=np.float16,
+        input_length=1,
     )
-    first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                         temp_dir)
+    first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
+                                                         temp_dir, True, keep_ir=True)
+    if compile_full_model:
+        bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+        embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
     return first_blob_path, last_blob_path
 
 
 def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                        temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                       layernorm_const):
+                       layernorm_const, mode="decode"):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
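When `compile_full_model` is set, the raw embedding table is additionally dumped as a flat float16 file (`model_embedding_input_0.bin`) under the weight directory. A sketch of reading it back; the path and the Qwen2-1.5B sizes are illustrative assumptions, and a real consumer would take `vocab_size`/`hidden_size` from the dumped `config.json`:

```python
import numpy as np

weight = np.fromfile("./npu_dump/model_weights/model_embedding_input_0.bin", dtype=np.float16)
vocab_size, hidden_size = 151936, 1536          # illustrative Qwen2-1.5B-Instruct dimensions
embedding_table = weight.reshape(vocab_size, hidden_size)
print(embedding_table.shape)
```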
@@ -123,10 +130,20 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     else:  # FP16 Linear
         np_dtype = np.float16
 
+    if mode == "decode":
+        input_len = 1
+        decoder_name = f"decoder_layer_{layer_idx}"
+        compile = True
+        keep_ir = True
+    else:
+        input_len = kv_len
+        decoder_name = "decoder_layer_prefill"
+        compile = False
+        keep_ir = True
     single_decoder = LowBitQwenMultiDecoderlayer(
-        [1, 1, num_heads * head_dim],
-        input_layernorm_weights=[layer_norm_0] if layernorm_const else None,
-        post_attn_layernorm_weights=[layer_norm_1] if layernorm_const else None,
+        [1, input_len, num_heads * head_dim],
+        input_layernorm_weights=None,
+        post_attn_layernorm_weights=None,
         q_biases=None,
         k_biases=None,
         v_biases=None,
@@ -138,7 +155,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         max_seq_len=kv_len,
         rms_norm_eps=rms_norm_eps,
         intermediate_size=intermediate_size,
-        mode="decode",
+        mode=mode,
         transpose_value=transpose_value_cache,
         dtype=np_dtype,
         n_splits_linear=n_splits_linear,
@@ -146,10 +163,11 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         group_size=group_size
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
-                                                        f"decoder_layer_{layer_idx}",
-                                                        temp_dir)
+                                                        decoder_name,
+                                                        temp_dir, compile, keep_ir)
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
-    if layernorm_const:
-        st_idx = 3
-    else:
+    if mode == "decode":
+        if layernorm_const:
+            st_idx = 3
+        else: