From 303b104c1090346d1e0919d64de1323b14054d1c Mon Sep 17 00:00:00 2001
From: Yuwen Hu <54161268+Oscilloscope98@users.noreply.github.com>
Date: Tue, 26 Nov 2024 15:53:04 +0800
Subject: [PATCH] Fix abnormal output for Qwen2-7B when sym_int8 (#12446)

---
 .../llm/src/ipex_llm/transformers/npu_models/convert_mp.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
index f38c79d8..2e98c1eb 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
@@ -128,7 +128,11 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
     from ipex_llm.transformers.npu_models.common import split_linears
     if quantization_group_size == 0:
         n_splits_linear = 1
-        n_splits_down_proj = 2 if model.config.intermediate_size == 18944 else 1
+        if qtype == "sym_int8_rtn":
+            # do not split mlp down_proj for Qwen2-7B & sym_int8
+            n_splits_down_proj = 1
+        else:
+            n_splits_down_proj = 2 if model.config.intermediate_size == 18944 else 1
     else:
         invalidInputError(
             model.config.hidden_size % quantization_group_size == 0 and
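
For reviewers, a minimal standalone sketch of the split-selection behavior this patch introduces. The helper name `choose_down_proj_splits` is a hypothetical stand-in for illustration and does not exist in the repository; it simply mirrors the patched branch in convert_mp.py under the assumption that 18944 is the Qwen2-7B intermediate size keyed on here.

    # Illustrative sketch only; mirrors the patched logic, not part of the diff.
    # `choose_down_proj_splits` is a hypothetical helper, not an ipex-llm API.

    def choose_down_proj_splits(intermediate_size: int, qtype: str) -> int:
        """Never split mlp down_proj under sym_int8_rtn; otherwise split in
        two only for the Qwen2-7B intermediate size (18944)."""
        if qtype == "sym_int8_rtn":
            # splitting down_proj with sym_int8 gave abnormal output on
            # Qwen2-7B, per this patch (#12446)
            return 1
        return 2 if intermediate_size == 18944 else 1

    # Usage examples:
    assert choose_down_proj_splits(18944, "sym_int8_rtn") == 1  # Qwen2-7B, sym_int8
    assert choose_down_proj_splits(18944, "sym_int4_rtn") == 2  # Qwen2-7B, other qtypes
    assert choose_down_proj_splits(11008, "sym_int4_rtn") == 1  # other intermediate sizes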