[NPU] update fused layers for GW (#12459)
* update fused layers for GW
* fix
* fix llama condition for glm model
* update
parent 1b533a105c
commit 490bb0ca53
2 changed files with 42 additions and 18 deletions
@@ -136,6 +136,7 @@ class _BaseAutoModelClass:
         mock_device = kwargs.pop('device', None)  # For mock on CPU
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
+        fuse_layers = kwargs.pop('fuse_layers', None)
 
         invalidInputError(
             quantization_group_size in [0, 32, 64, 128],
@@ -204,6 +205,7 @@ class _BaseAutoModelClass:
                 "transpose_value_cache": transpose_value_cache,
                 "convert_model": convert_model,
                 "save_directory": save_directory,
+                "fuse_layers": fuse_layers
             }
             model = cls.optimize_npu_model(*args, **optimize_kwargs)
         else:
@@ -243,6 +245,7 @@ class _BaseAutoModelClass:
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
+        fuse_layers = kwargs.pop('fuse_layers', None)
 
         if hasattr(model, "llm"):
             llm = model.llm
@@ -282,7 +285,8 @@ class _BaseAutoModelClass:
                         group_size=quantization_group_size,
                         qtype=qtype,
                         convert_model=convert_model,
-                        save_directory=save_directory)
+                        save_directory=save_directory,
+                        fuse_layers=fuse_layers)
         model.save_low_bit = types.MethodType(save_low_bit, model)
         return model
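The hunks above add an optional `fuse_layers` kwarg that is popped in `from_pretrained` and `optimize_npu_model` and forwarded to `convert_llm`. A minimal usage sketch follows; only `fuse_layers`, `convert_model`, `save_directory` and `quantization_group_size` are confirmed by this commit, while the import path and the remaining kwargs are assumptions (the changed file names are not visible in this view), so treat it as illustrative rather than the exact API:

# Hypothetical usage sketch; only the kwargs visible in the hunks above are confirmed by this commit.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM   # assumed NPU entry point

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-1.5B-Instruct",
    load_in_low_bit="sym_int4",        # assumed low-bit setting
    optimize_model=True,               # assumed
    quantization_group_size=0,         # validated against [0, 32, 64, 128] in the first hunk
    convert_model=True,
    save_directory="./qwen2-npu",
    fuse_layers=2,                     # new in this commit; None keeps the per-model default
)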
@@ -195,7 +195,8 @@ def convert_llm(model: torch.nn.Module,
                 group_size: int,
                 qtype: str,
                 convert_model: bool=False,
-                save_directory: str=None):
+                save_directory: str=None,
+                fuse_layers: int=None):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
@@ -216,7 +217,8 @@ def convert_llm(model: torch.nn.Module,
                                n_splits_linear,
                                n_splits_down_proj,
                                group_size,
-                               save_directory)
+                               save_directory,
+                               fuse_layers=fuse_layers)
         return 0
     if model.config.model_type == "llama":
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -422,18 +424,22 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            n_splits_linear: int,
                            n_splits_down_proj: int,
                            group_size: int,
-                           save_directory: str=None):
+                           save_directory: str=None,
+                           fuse_layers: int=None):
     os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
     os.mkdir(weight_dir)
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
 
     if model.config.model_type == "qwen2":
-        if model.config.hidden_size == 1536:
-            # Qwen2-1.5B-Instruct
-            fused_layers = 1
+        if group_size == 0:
+            if model.config.hidden_size == 1536:
+                # Qwen2-1.5B-Instruct
+                fused_layers = 1 if fuse_layers is None else fuse_layers
+            else:
+                fused_layers = 2 if fuse_layers is None else fuse_layers
         else:
-            fused_layers = 2
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
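For qwen2 the default fused-layer count now depends on `group_size`, and an explicit `fuse_layers` always wins. A standalone sketch of that selection logic (the helper name is hypothetical; the constants mirror the hunk above):

def pick_qwen2_fused_layers(hidden_size, num_layers, group_size, fuse_layers=None):
    # Hypothetical helper mirroring the qwen2 branch of convert_llm_for_deploy.
    if fuse_layers is not None:
        return fuse_layers                       # explicit override always wins
    if group_size == 0:
        return 1 if hidden_size == 1536 else 2   # hidden_size 1536 -> Qwen2-1.5B-Instruct
    return num_layers                            # grouped quantization: default equals the layer count

assert pick_qwen2_fused_layers(1536, 28, group_size=0) == 1
assert pick_qwen2_fused_layers(1536, 28, group_size=64) == 28
assert pick_qwen2_fused_layers(1536, 28, group_size=64, fuse_layers=4) == 4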
@@ -469,20 +475,31 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         embedding_post = False
         cos_sin_input = False
         use_prefill_sdp = False
-        if model.config.vocab_size == 32000:
-            # for Llama2-7B
-            fused_layers = 4
-            use_prefill_sdp = True
-        else:
-            if model.config.intermediate_size == 8192:
-                # llama3.2 1B & # llama3.2 3B
-                embedding_post = True
-                cos_sin_input = True
-                fused_layers = 2
+        if group_size == 0:
+            if model.config.intermediate_size == 11008:
+                # for Llama2-7B
+                fused_layers = 4 if fuse_layers is None else fuse_layers
+                use_prefill_sdp = True
+            elif model.config.intermediate_size == 14336:
+                # for Llama3-8B
+                fused_layers = 2 if fuse_layers is None else fuse_layers
+                use_prefill_sdp = True
+            elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+                # llama3.2 1B & # llama3.2 3B
+                embedding_post = True
+                cos_sin_input = True
+                fused_layers = 2 if fuse_layers is None else fuse_layers
             else:
-                # for Llama3-8B
-                fused_layers = 2
+                fused_layers = 2 if fuse_layers is None else fuse_layers
+        else:
+            if model.config.intermediate_size in [11008, 14336]:
+                # for Llama2-7B & Llama3-8B
                 use_prefill_sdp = True
+            elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+                # llama3.2 1B & # llama3.2 3B
+                embedding_post = True
+                cos_sin_input = True
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
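The llama branch no longer keys on `vocab_size == 32000`; it now dispatches on `group_size`, on `intermediate_size` (11008 for Llama2-7B, 14336 for Llama3-8B) and on whether `rotary_emb` still exposes `cos_cached` (absent on Llama3.2 1B/3B). A condensed sketch of the resulting decision table, using a hypothetical helper and a minimal config stand-in:

from dataclasses import dataclass

@dataclass
class LlamaCfg:                      # minimal stand-in for model.config / rotary_emb
    intermediate_size: int
    num_layers: int
    has_cos_cached: bool             # mirrors hasattr(rotary_emb, "cos_cached")

def pick_llama_deploy_options(cfg, group_size, fuse_layers=None):
    # Hypothetical helper mirroring the llama branch of convert_llm_for_deploy.
    embedding_post = cos_sin_input = use_prefill_sdp = False
    if group_size == 0:
        if cfg.intermediate_size == 11008:        # Llama2-7B
            default, use_prefill_sdp = 4, True
        elif cfg.intermediate_size == 14336:      # Llama3-8B
            default, use_prefill_sdp = 2, True
        elif not cfg.has_cos_cached:              # Llama3.2 1B / 3B
            embedding_post = cos_sin_input = True
            default = 2
        else:
            default = 2
    else:
        if cfg.intermediate_size in [11008, 14336]:   # Llama2-7B / Llama3-8B
            use_prefill_sdp = True
        elif not cfg.has_cos_cached:                  # Llama3.2 1B / 3B
            embedding_post = cos_sin_input = True
        default = cfg.num_layers
    fused_layers = default if fuse_layers is None else fuse_layers
    return fused_layers, embedding_post, cos_sin_input, use_prefill_sdp

# e.g. Llama3-8B with per-channel quantization -> (2, False, False, True)
print(pick_llama_deploy_options(LlamaCfg(14336, 32, True), group_size=0))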
@@ -518,7 +535,10 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
                            group_size, layernorm_const, "prefill")
     elif model.config.model_type == "minicpm":
-        fused_layers = 4
+        if group_size == 0:
+            fused_layers = 4 if fuse_layers is None else fuse_layers
+        else:
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,