[NPU] Expose parameter to control blob / IR save logic (#12767)
* update api
* fix convert.py
* fix style
* remove unnecessary bin file
* fix style
parent 9c0daf6396
commit 094a25b740
7 changed files with 115 additions and 48 deletions
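For context, the two new parameters can be passed straight through `from_pretrained`. The snippet below is a minimal usage sketch, not code from this repository: the import path of the NPU `AutoModelForCausalLM`, the placeholder model path, and the output directory are assumptions modeled on the existing NPU examples, while the keyword arguments `keep_ir` and `compile_blob` and the other shown arguments come from this diff. At least one of the two flags must stay enabled; the example script wires them to the new `--keep-ir` and `--disable-compile-blob` CLI flags via `keep_ir=args.keep_ir` and `compile_blob=not args.disable_compile_blob`. Other conversion-related arguments used by the real example (quantization settings, etc.) are omitted here.

import torch
from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # import path assumed from the NPU examples

# Convert once; keep the intermediate IR in addition to the compiled NPU blob.
model = AutoModelForCausalLM.from_pretrained(
    "path/to/hf-model",               # placeholder model path
    torch_dtype=torch.float16,
    attn_implementation="eager",
    trust_remote_code=True,
    save_directory="npu-converted",   # placeholder output directory
    keep_ir=True,                     # new: keep the intermediate IR
    compile_blob=True,                # new: also compile the NPU blob (default)
)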
@@ -51,6 +51,8 @@ if __name__ == "__main__":
     parser.add_argument("--quantization-group-size", type=int, default=0)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
+    parser.add_argument("--keep-ir", action="store_true")
+    parser.add_argument("--disable-compile-blob", action="store_true")
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
@@ -66,7 +68,9 @@ if __name__ == "__main__":
                                                  torch_dtype=torch.float16,
                                                  attn_implementation="eager",
                                                  trust_remote_code=True,
-                                                 save_directory=save_dir)
+                                                 save_directory=save_dir,
+                                                 keep_ir=args.keep_ir,
+                                                 compile_blob=not args.disable_compile_blob)
     t1 = time.perf_counter()
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -139,8 +139,10 @@ class _BaseAutoModelClass:
         mock_device = kwargs.pop('device', None)  # For mock on CPU
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
-        fuse_layers = kwargs.pop('fuse_layers', None)
-        imatrix_file = kwargs.pop('imatrix_file', None)
+        fuse_layers = kwargs.pop("fuse_layers", None)
+        imatrix_file = kwargs.pop("imatrix_file", None)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)
 
         if imatrix_file is not None:
             imatrix_data = load_imatrix_data(imatrix_file)
@@ -236,6 +238,8 @@ class _BaseAutoModelClass:
             "fuse_layers": fuse_layers,
             "imatrix_data": imatrix_data,
             "skip_npu_logic": mock_device == "dummy",
+            "keep_ir": keep_ir,
+            "compile_blob": compile_blob,
         }
         # Dummy will skip npu related logic and save the quantized model
         if mock_device == "dummy":
@@ -280,9 +284,14 @@ class _BaseAutoModelClass:
         fuse_layers = kwargs.pop('fuse_layers', None)
         imatrix_data = kwargs.pop('imatrix_data', None)
         skip_npu_logic = kwargs.pop("skip_npu_logic", False)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)
 
         invalidInputError(save_directory is not None,
                           "Please provide the path to save converted model "
                           "through `save_directory`.")
+        invalidInputError(keep_ir or compile_blob,
+                          "Please save blob or save IR either.")
 
         if hasattr(model, "llm"):
             llm = model.llm
@@ -323,7 +332,9 @@ class _BaseAutoModelClass:
                 qtype=qtype,
                 save_directory=save_directory,
                 fuse_layers=fuse_layers,
-                has_llm=hasattr(model, "llm")
+                has_llm=hasattr(model, "llm"),
+                keep_ir=keep_ir,
+                compile_blob=compile_blob
             )
         else:
             optimize_llm(
@@ -346,7 +357,9 @@ class _BaseAutoModelClass:
                 qtype=qtype,
                 convert_model=convert_model,
                 save_directory=save_directory,
-                fuse_layers=fuse_layers)
+                fuse_layers=fuse_layers,
+                keep_ir=keep_ir,
+                compile_blob=compile_blob)
             model.save_low_bit = types.MethodType(save_low_bit, model)
             model.save_low_bit(save_directory)
             logger.info(f"Converted model has already saved to {save_directory}.")
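To make the intent of the two kwargs and the new guard concrete, here is a small, self-contained illustration (not code from this commit) of what each flag combination is expected to produce. The artifact descriptions are an interpretation of the parameter names and of the `invalidInputError(keep_ir or compile_blob, ...)` check added above, not text from the repository.

# Illustration only: expected outcome per flag combination.
combinations = [
    ((False, True),  "compiled NPU blob only (default behaviour)"),
    ((True,  True),  "compiled NPU blob plus the intermediate IR"),
    ((True,  False), "intermediate IR only, blob compilation skipped"),
    ((False, False), "rejected: 'Please save blob or save IR either.'"),
]
for (keep_ir, compile_blob), expected in combinations:
    status = "ok" if (keep_ir or compile_blob) else "invalid"
    print(f"keep_ir={keep_ir!s:5} compile_blob={compile_blob!s:5} -> {status}: {expected}")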
@@ -450,7 +450,9 @@ def optimize_llm_single_process(
         qtype: str,
         save_directory: str,
         fuse_layers: int=None,
-        has_llm: bool=False
+        has_llm: bool=False,
+        keep_ir: bool=False,
+        compile_blob: bool=True
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -463,7 +465,9 @@ def optimize_llm_single_process(
                 qtype=qtype,
                 convert_model=True,
                 save_directory=save_directory,
-                fuse_layers=fuse_layers)
+                fuse_layers=fuse_layers,
+                keep_ir=keep_ir,
+                compile_blob=compile_blob)
     try:
         model_ptr = load_model_from_file(save_directory)
         model.kv_len = kv_len
@@ -196,7 +196,9 @@ def convert_llm(model: torch.nn.Module,
                 qtype: str,
                 convert_model: bool=False,
                 save_directory: str=None,
-                fuse_layers: int=None):
+                fuse_layers: int=None,
+                keep_ir: bool=False,
+                compile_blob: bool=True):
    # whether to set layernorm weight as const
    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
    if group_size == 0:
@@ -220,7 +222,9 @@ def convert_llm(model: torch.nn.Module,
                               n_splits_down_proj,
                               group_size,
                               save_directory,
-                              fuse_layers=fuse_layers)
+                              fuse_layers=fuse_layers,
+                              keep_ir=keep_ir,
+                              compile_blob=compile_blob)
        return 0
    if model.config.model_type == "llama":
        with tempfile.TemporaryDirectory() as temp_dir:
@@ -428,7 +432,9 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                           n_splits_down_proj: int,
                           group_size: int,
                           save_directory: str=None,
-                           fuse_layers: int=None):
+                           fuse_layers: int=None,
+                           keep_ir: bool=False,
+                           compile_blob: bool=True):
    if not os.path.exists(save_directory):
        os.mkdir(save_directory)
    weight_dir = os.path.join(save_directory, "model_weights")
@@ -479,14 +485,17 @@ def convert_llm_for_deploy(model: torch.nn.Module,
        # save fused_layers blobs of fused decoder layers
        convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                 save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode")
+                                 group_size, layernorm_const, "decode",
+                                 keep_ir=keep_ir, compile_blob=compile_blob)
        # save blob of single prefill layer
        convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                           save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                           group_size, layernorm_const, "prefill")
+                           group_size, layernorm_const, "prefill",
+                           keep_ir=keep_ir, compile_blob=compile_blob)
        # save blob of lmhead and bin of embedding
        convert_lm_head_and_embedding(model, save_directory, weight_dir,
-                                      convert_model=True, group_size=group_size)
+                                      convert_model=True, group_size=group_size,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
    elif model.config.model_type == "llama":
        embedding_post = False
        cos_sin_input = False
@@ -540,15 +549,18 @@ def convert_llm_for_deploy(model: torch.nn.Module,
        convert_lm_head_and_embedding(model, n_splits_linear,
                                      save_directory, weight_dir,
                                      convert_model=True,
-                                      max_prompt_len=max_prompt_len)
+                                      max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
        # save fused_layers blobs of fused decoder layers
        convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                  save_directory, weight_dir, transpose_value_cache, kv_len,
-                                  group_size, layernorm_const, "decode")
+                                  group_size, layernorm_const, "decode",
+                                  keep_ir=keep_ir, compile_blob=compile_blob)
        # save blob of single prefill layer
        convert_llama_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                            group_size, layernorm_const, "prefill")
+                            group_size, layernorm_const, "prefill",
+                            keep_ir=keep_ir, compile_blob=compile_blob)
    elif model.config.model_type == "minicpm":
        if group_size == 0:
            fused_layers = 4 if fuse_layers is None else fuse_layers
@@ -577,16 +589,19 @@ def convert_llm_for_deploy(model: torch.nn.Module,
        # save fused_layers blobs of fused decoder layers
        convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                    save_directory, weight_dir, transpose_value_cache, kv_len,
-                                    group_size, layernorm_const, "decode")
+                                    group_size, layernorm_const, "decode",
+                                    keep_ir=keep_ir, compile_blob=compile_blob)
        # save blob of single prefill layer
        convert_minicpm_layer(model, 0, n_splits_linear, n_splits_down_proj,
                              save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                              group_size, layernorm_const, "prefill")
+                              group_size, layernorm_const, "prefill",
+                              keep_ir=keep_ir, compile_blob=compile_blob)
        # save blob of lmhead and bin of embedding and embedding_post
        convert_lm_head_and_embedding(model, n_splits_linear,
                                      save_directory, weight_dir,
                                      convert_model=True,
-                                      max_prompt_len=max_prompt_len)
+                                      max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
 
    model.config.update(update_dict)
    model.config.save_pretrained(save_directory)
@@ -123,7 +123,8 @@ class Llama32PostEmbedding(NNFactory):
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1):
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -175,7 +176,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-                                                        True, False)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if n_splits_linear == 1:
@@ -211,7 +213,9 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
            first_blob_path = None
        else:
            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, True, False)
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
    else:
        # llama-3.2-3B & llama-3.2-1B
        embedding_layer = model.model.embed_tokens
@@ -235,22 +239,28 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
                                               attention_scaling=attention_scaling,
                                               input_len=1)
            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                               temp_dir, True, False)
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
                                                          attention_scaling=attention_scaling,
                                                          input_len=max_prompt_len)
            update_names_of_IR_and_export_blob(embedding_post_prefill,
                                               "embedding_post_prefill",
-                                               temp_dir, True, False)
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
        else:
            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir)
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
 
    return first_blob_path, last_blob_path
 
 
 def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                         temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                        layernorm_const, mode="decode"):
+                        layernorm_const, mode="decode",
+                        keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -317,8 +327,9 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, False,
+                                                        keep_ir=keep_ir, compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
@@ -364,7 +375,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                               save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                              layernorm_const, mode="decode"):
+                              layernorm_const, mode="decode",
+                              keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -457,6 +469,7 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
         update_names_of_IR_and_export_blob(fused_decoder,
                                            f"decoder_layer_{i}",
                                            save_dir,
-                                           compile_blob=True,
-                                           keep_ir=False)
+                                           keep_ir=keep_ir,
+                                           compile_blob=compile_blob)
+        os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
@@ -162,7 +162,8 @@ class MiniCPMLMHead(LLMBaseNNFactory):
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1):
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -230,7 +231,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-                                                        True, True)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if n_splits_linear == 1:
@@ -280,22 +282,27 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
                                              dtype=np.float16,
                                              scale_emb=model.config.scale_emb)
        update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                           temp_dir, True, False)
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
        embedding_post_prefill = MiniCPMPostEmbedding(max_prompt_len, model.config.hidden_size,
                                                      dtype=np.float16,
                                                      scale_emb=model.config.scale_emb)
        update_names_of_IR_and_export_blob(embedding_post_prefill,
                                           "embedding_post_prefill",
-                                           temp_dir, True, False)
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+        os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
    else:
        first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                             temp_dir, True, False)
+                                                             temp_dir, keep_ir=keep_ir,
+                                                             compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding.bin"))
    return first_blob_path, last_blob_path
 
 
 def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                           temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                          layernorm_const, mode="decode"):
+                          layernorm_const, mode="decode",
+                          keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -353,7 +360,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, True)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
         if layernorm_const:
@@ -386,7 +394,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                 save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                layernorm_const, mode="decode"):
+                                layernorm_const, mode="decode",
+                                keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -477,6 +486,6 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
         update_names_of_IR_and_export_blob(fused_decoder,
                                            f"decoder_layer_{i}",
                                            save_dir,
-                                           compile_blob=True,
-                                           keep_ir=False)
+                                           keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
@@ -24,7 +24,8 @@ from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
 def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                  convert_model=False, group_size=0):
+                                  convert_model=False, group_size=0,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
     rms_norm_eps = model.config.rms_norm_eps
@@ -84,7 +85,9 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
     )
 
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head",
-                                                        temp_dir, True, False)
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if not isinstance(lm_head, SlicedLMHead):
@@ -119,13 +122,16 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         first_blob_path = True
     else:
         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                             temp_dir, True, keep_ir=True)
+                                                             temp_dir, keep_ir=keep_ir,
+                                                             compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding.bin"))
     return first_blob_path, last_blob_path
 
 
 def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                        temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                       layernorm_const, mode="decode"):
+                       layernorm_const, mode="decode",
+                       keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -183,8 +189,10 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
-                                                        temp_dir, True, False,
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
@@ -226,7 +234,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                              save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                             layernorm_const, mode="decode"):
+                             layernorm_const, mode="decode",
+                             keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -330,6 +339,6 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
         update_names_of_IR_and_export_blob(fused_decoder,
                                            f"decoder_layer_{i}",
                                            save_dir,
-                                           compile_blob=True,
-                                           keep_ir=False)
+                                           keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
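The diff only shows the call sites of `update_names_of_IR_and_export_blob`; its body is not part of this commit. Purely as an illustration of what the two flags control, the sketch below shows one way such a helper could be written with the public OpenVINO Python API (`ov.save_model`, `Core.compile_model`, `CompiledModel.export_model`). The function name `export_ir_or_blob` and all file-handling details are assumptions for illustration, not the library's implementation, and compiling for "NPU" requires an NPU device and driver.

import os
import openvino as ov

def export_ir_or_blob(ov_model: ov.Model, name: str, output_dir: str,
                      keep_ir: bool = False, compile_blob: bool = True) -> None:
    # Hypothetical helper, not ipex-llm code: mirrors the guard added in this PR.
    assert keep_ir or compile_blob, "Please save blob or save IR either."
    xml_path = os.path.join(output_dir, f"{name}.xml")
    # Serialize the OpenVINO IR (.xml plus a matching .bin weights file).
    ov.save_model(ov_model, xml_path)
    if compile_blob:
        # Compile the IR for the NPU and dump the resulting blob.
        compiled = ov.Core().compile_model(xml_path, device_name="NPU")
        with open(os.path.join(output_dir, f"{name}.blob"), "wb") as f:
            f.write(compiled.export_model())
    if not keep_ir:
        # Only the compiled blob is wanted: drop the intermediate IR files.
        os.remove(xml_path)
        os.remove(os.path.join(output_dir, f"{name}.bin"))

In the real call sites above, additional intermediate `.bin` files are removed explicitly with `os.remove(...)` after each export, in line with the "remove unnecessary bin file" item in the commit message.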