diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
index a5aa5791..6abe95bc 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
@@ -23,7 +23,8 @@ from intel_npu_acceleration_library.backend.factory import NNFactory
 import numpy as np
 
 
-def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True):
+def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True,
+                                       npu_dpu_groups=None):
     xml_path = os.path.join(dir, model_name + ".xml")
     bin_path = os.path.join(dir, model_name + ".bin")
     model.save(xml_path)
@@ -35,6 +36,11 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
     core.set_property("NPU",
                       {"NPU_COMPILATION_MODE_PARAMS":
                        "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"})
     core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
+    if (
+        npu_dpu_groups is not None
+        and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
+    ):
+        core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})
     model = core.read_model(xml_path)
     inputs = model.inputs
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py
index 0899ef4a..435ba4ff 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py
@@ -272,11 +272,13 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
         keep_position_ids = True
+        npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
         layernorm_const = False
         keep_position_ids = False
+        npu_dpu_groups = 6
 
     single_decoder = LowBitLlamaMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
@@ -303,7 +305,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, False)
+                                                        True, False,
+                                                        npu_dpu_groups=npu_dpu_groups)
 
     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
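
For reference, below is a minimal sketch of the compile-option gating this patch introduces, written against the OpenVINO runtime API that `common.py` already uses. The helper name `apply_npu_compile_opts` is hypothetical; the property names, the prefill value `npu_dpu_groups=6`, and the `IPEX_LLM_NPU_DISABLE_COMPILE_OPT` escape hatch are taken from the diff above. Actually running it requires an OpenVINO build with the NPU plugin available.

```python
import os

from openvino.runtime import Core


def apply_npu_compile_opts(core: Core, npu_dpu_groups=None):
    # Always-on compile setting, as in update_names_of_IR_and_export_blob.
    core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
    # NPU_DPU_GROUPS is applied only when a value was requested (prefill
    # passes 6, decode passes None) and the user has not opted out via the
    # IPEX_LLM_NPU_DISABLE_COMPILE_OPT environment variable.
    if (
        npu_dpu_groups is not None
        and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
    ):
        core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})


if __name__ == "__main__":
    core = Core()
    # Prefill graph: request 6 DPU groups unless the opt-out variable is set.
    apply_npu_compile_opts(core, npu_dpu_groups=6)
    # Decode graph: pass None to leave the compiler default untouched.
    apply_npu_compile_opts(core, npu_dpu_groups=None)
```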