[NPU] further fix of qwen2 int8 pipeline & C++ (#12449)
* fix
* fix style
parent 303b104c10
commit 24b46b2b19

2 changed files with 8 additions and 2 deletions
```diff
@@ -231,7 +231,7 @@ class _BaseAutoModelClass:
         from intel_npu_acceleration_library.compiler import create_npu_kernels
 
         model = kwargs.pop("model")
-        qtype = kwargs.pop("qtype", "sym_int4")
+        qtype = kwargs.pop("qtype", "sym_int4_rtn")
         mixed_precision = kwargs.pop("mixed_precision", False)
         quantization_group_size = kwargs.pop("quantization_group_size", 0)
         modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
```
```diff
@@ -280,6 +280,7 @@ class _BaseAutoModelClass:
                     max_prompt_len=max_prompt_len,
                     transpose_value_cache=transpose_value_cache,
                     group_size=quantization_group_size,
+                    qtype=qtype,
                     convert_model=convert_model,
                     save_directory=save_directory)
         model.save_low_bit = types.MethodType(save_low_bit, model)
```
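Taken together, the two hunks above change one code path: the `qtype` keyword now defaults to the "sym_int4_rtn" spelling and is forwarded into `convert_llm`, so the pipeline converter can branch on it. A minimal, self-contained sketch of that flow follows; the stub names and the fallback values for kwargs not shown in the hunks are assumptions for illustration, not the library's code.

```python
# Sketch of the qtype flow after this commit. Only the keyword names and the
# "sym_int4_rtn" / "sym_int8_rtn" strings come from the hunks above; the stub
# functions and default values are hypothetical.

def convert_llm_stub(model, *, max_prompt_len, transpose_value_cache,
                     group_size, qtype, convert_model=False, save_directory=None):
    # Stand-in for convert_llm: just report which qtype reached the converter.
    return qtype

def optimize_npu_model_stub(model, **kwargs):
    # Mirrors the kwargs handling in the first hunk: the default is now
    # "sym_int4_rtn" (was "sym_int4"), and qtype is passed through.
    qtype = kwargs.pop("qtype", "sym_int4_rtn")
    quantization_group_size = kwargs.pop("quantization_group_size", 0)
    return convert_llm_stub(model,
                            max_prompt_len=kwargs.pop("max_prompt_len", 512),
                            transpose_value_cache=kwargs.pop("transpose_value_cache", True),
                            group_size=quantization_group_size,
                            qtype=qtype,                    # newly forwarded
                            convert_model=kwargs.pop("convert_model", False),
                            save_directory=kwargs.pop("save_directory", None))

print(optimize_npu_model_stub(None))                        # -> sym_int4_rtn
print(optimize_npu_model_stub(None, qtype="sym_int8_rtn"))  # -> sym_int8_rtn
```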
```diff
@@ -193,12 +193,17 @@ def convert_llm(model: torch.nn.Module,
                 max_prompt_len: int,
                 transpose_value_cache: bool,
                 group_size: int,
+                qtype: str,
                 convert_model: bool=False,
                 save_directory: str=None):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
         n_splits_linear = 1
-        n_splits_down_proj = 2 if model.config.intermediate_size == 18944 else 1
+        if qtype == "sym_int8_rtn":
+            # do not split mlp down_proj for Qwen2-7B & sym_int8
+            n_splits_down_proj = 1
+        else:
+            n_splits_down_proj = 2 if model.config.intermediate_size == 18944 else 1
     else:
         n_splits_linear = model.config.hidden_size // group_size
```
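The hunk above adds `qtype` to `convert_llm` and uses it to keep the MLP `down_proj` unsplit for Qwen2-7B when the weights are symmetric int8 ("sym_int8_rtn"), while the int4 path still splits the 18944-wide `down_proj` in two. A standalone re-statement of that branch, as a hypothetical `select_splits` helper covering the channel-wise (`group_size == 0`) case, is sketched below; the grouped case's `down_proj` rule lies outside this hunk and is omitted.

```python
# Re-statement of the split-selection logic from the hunk above, for the
# group_size == 0 (channel-wise) case. The down_proj rule for grouped
# quantization is not visible in this diff and is therefore left out.

def select_splits(hidden_size: int, intermediate_size: int,
                  group_size: int, qtype: str):
    if group_size == 0:
        n_splits_linear = 1
        if qtype == "sym_int8_rtn":
            # do not split mlp down_proj for Qwen2-7B & sym_int8
            n_splits_down_proj = 1
        else:
            n_splits_down_proj = 2 if intermediate_size == 18944 else 1
        return n_splits_linear, n_splits_down_proj
    # grouped quantization: only the linear split is shown in the hunk
    return hidden_size // group_size, None

# Qwen2-7B uses hidden_size=3584, intermediate_size=18944:
print(select_splits(3584, 18944, 0, "sym_int8_rtn"))  # -> (1, 1)
print(select_splits(3584, 18944, 0, "sym_int4_rtn"))  # -> (1, 2)
```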