[NPU] Add L0 support for NPU C++ (#12454)

* add L0 models support
* meet review
* fix style

This commit is contained in:
parent ce6fcaa9ba
commit 281c9b0bb9

1 changed file with 26 additions and 12 deletions
@@ -197,7 +197,7 @@ def convert_llm(model: torch.nn.Module,
                 convert_model: bool=False,
                 save_directory: str=None):
     # whether to set layernorm weight as const
-    layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "1") == "1"
+    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
         n_splits_linear = 1
         if qtype == "sym_int8_rtn":
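Note: the hunk above only renames the environment switch (IPEX_LLM_LAYERNORM_CONST to IPEX_LLM_NPU_LAYERNORM_CONST); the default stays "1". A minimal sketch of a caller-side override, assuming the flag is read via os.environ.get at the time convert_llm runs:

import os

# Hypothetical caller-side override; must be set before convert_llm is called,
# since the flag is read with a default of "1" at conversion time.
os.environ["IPEX_LLM_NPU_LAYERNORM_CONST"] = "0"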
@@ -344,7 +344,7 @@ def convert_llm(model: torch.nn.Module,
             invalidInputError(False,
                               "False to InitLLMPipeline.")
     elif model.config.model_type == "qwen2":
-        layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "0") == "1"
+        layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "0") == "1"
         with tempfile.TemporaryDirectory() as temp_dir:
             if save_directory is not None:
                 temp_dir = save_directory
@@ -426,9 +426,11 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
     os.mkdir(weight_dir)
+    use_level_zero = os.environ.get("IPEX_LLM_NPU_USE_LEVEL0", "0") == "1"
+    layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"

     if model.config.model_type == "qwen2":
-        layernorm_const = True
+        layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "0") == "1"
         if model.config.hidden_size == 1536:
             # Qwen2-1.5B-Instruct
             fused_layers = 1
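The new IPEX_LLM_NPU_USE_LEVEL0 flag defaults to "0", so the Level Zero (L0) path is opt-in. A minimal sketch of enabling it, assuming the flag is read when convert_llm_for_deploy runs:

import os

# Opt in to saving per-layer L0 blobs instead of fused decoder blobs.
os.environ["IPEX_LLM_NPU_USE_LEVEL0"] = "1"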
@@ -447,16 +449,28 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "weight_num": 7,
                        "weight_idx": 8,
                        "n_splits_linear": n_splits_linear,
-                       "n_splits_down_proj": n_splits_down_proj}
+                       "n_splits_down_proj": n_splits_down_proj,
+                       "use_level_zero": use_level_zero}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)

         from .qwen import convert_qwen_layer, convert_fused_qwen_layer
         from .qwen import convert_lm_head_and_embedding
-        # save fused_layers blobs of fused decoder layers
-        convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
-                                 save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode")
+        if not use_level_zero:
+            # save fused_layers blobs of fused decoder layers
+            convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
+                                     save_directory, weight_dir, transpose_value_cache, kv_len,
+                                     group_size, layernorm_const, "decode")
+        else:
+            # save layer_num blobs of each decoder layer
+            layer_num = len(model.model.layers)
+            param_list = []
+            for layer_idx in range(0, layer_num):
+                param_list.append((model, layer_idx, n_splits_linear, n_splits_down_proj,
+                                   save_directory, weight_dir, transpose_value_cache, kv_len,
+                                   group_size, layernorm_const))
+            with Pool() as pool:
+                result = pool.starmap(convert_qwen_layer, param_list)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
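The new else branch fans per-layer conversion out over worker processes with multiprocessing.Pool.starmap. A self-contained sketch of that pattern; convert_layer here is a stand-in for convert_qwen_layer, not the real API:

from multiprocessing import Pool

def convert_layer(layer_idx, out_dir):
    # Stand-in for convert_qwen_layer: converts one decoder layer to a blob.
    return f"{out_dir}/decoder_layer_{layer_idx}.blob"

if __name__ == "__main__":  # required for Pool on spawn-start platforms such as Windows
    param_list = [(idx, "model_weights") for idx in range(4)]
    with Pool() as pool:
        # starmap unpacks each tuple into positional arguments, mirroring
        # pool.starmap(convert_qwen_layer, param_list) in the diff above.
        blobs = pool.starmap(convert_layer, param_list)
    print(blobs)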
@@ -466,7 +480,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                                        save_directory, weight_dir,
                                        convert_model=True)
     elif model.config.model_type == "llama":
-        layernorm_const = True
         embedding_post = False
         cos_sin_input = False
         use_prefill_sdp = False
@@ -499,7 +512,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "embedding_post": embedding_post,
                        "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
-                       "n_splits_down_proj": n_splits_down_proj}
+                       "n_splits_down_proj": n_splits_down_proj,
+                       "use_level_zero": use_level_zero}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)

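Because use_level_zero is added to update_dict and persisted by model.config.save_pretrained, downstream consumers (e.g. the NPU C++ pipeline) can presumably recover it from the saved config.json. A minimal sketch under that assumption; the directory name is a placeholder:

import json
import os

save_directory = "npu_model"  # placeholder for the conversion output directory
with open(os.path.join(save_directory, "config.json")) as f:
    cfg = json.load(f)
# Fall back to False for models converted before this change.
use_level_zero = cfg.get("use_level_zero", False)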
@@ -519,7 +533,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
                            group_size, layernorm_const, "prefill")
     elif model.config.model_type == "minicpm":
-        layernorm_const = True
         fused_layers = 4
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
@@ -536,7 +549,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "model_type": "minicpm",
                        "embedding_post": True,
                        "n_splits_linear": n_splits_linear,
-                       "n_splits_down_proj": n_splits_down_proj}
+                       "n_splits_down_proj": n_splits_down_proj,
+                       "use_level_zero": use_level_zero}
         model.config.update(update_dict)
         model.config.save_pretrained(save_directory)
