[NPU] Qwen2 group-wise performance optimization (#12299)

* Qwen2 group-wise (gw) performance optimization
* Remove debug
2024-10-30 11:40:21 +02:00 · 2024-10-30 11:40:21 +02:00 · 0763268e4c
commit 0763268e4c
parent 41b8064554
1 changed file with 4 additions and 1 deletion
--- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
@@ -229,6 +229,9 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
            new_value_states = self.convert_to_fp16(curr_key_values[i][1])
        print(f"{mode} start compiling")
        if group_size != 0 and (mode == "prefill" or num_layers == 2):
            self.compile(npu_dpu_groups=6)
        else:
            self.compile()
        print(f"{mode} end compiling")