From 0763268e4ce65985280c924ce76ed91daa4fb8c2 Mon Sep 17 00:00:00 2001
From: Yina Chen <33650826+cyita@users.noreply.github.com>
Date: Wed, 30 Oct 2024 11:40:21 +0200
Subject: [PATCH] [NPU]Qwen2 groupwise performance opt (#12299)

* qwen2 gw performance opt

* remove debug
---
 python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
index 8459ddf5..9ad99947 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
@@ -229,7 +229,10 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
             new_value_states = self.convert_to_fp16(curr_key_values[i][1])
 
         print(f"{mode} start compiling")
-        self.compile()
+        if group_size != 0 and (mode == "prefill" or num_layers == 2):
+            self.compile(npu_dpu_groups=6)
+        else:
+            self.compile()
         print(f"{mode} end compiling")
 
     def build_decoder(