Optimize first token of C++ NPU by adding npu_dpu_groups (#12443)
* add npu_dpu_groups
* add check for env
* fix style
This commit is contained in:
parent 66bd7abae4
commit 52c17fe104

2 changed files with 11 additions and 2 deletions
```diff
@@ -23,7 +23,8 @@ from intel_npu_acceleration_library.backend.factory import NNFactory
 import numpy as np
 
 
-def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True):
+def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True,
+                                       npu_dpu_groups=None):
     xml_path = os.path.join(dir, model_name + ".xml")
     bin_path = os.path.join(dir, model_name + ".bin")
     model.save(xml_path)
```
```diff
@@ -35,6 +36,11 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
     core.set_property("NPU", {"NPU_COMPILATION_MODE_PARAMS":
                               "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"})
     core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
+    if (
+        npu_dpu_groups is not None
+        and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
+    ):
+        core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})
     model = core.read_model(xml_path)
     inputs = model.inputs
 
```
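For reference, the added guard only takes effect when a group count is actually passed in and the user has not opted out via the environment. Below is a minimal standalone sketch of the same gating logic, assuming `core` is an `openvino.Core` instance as the surrounding `set_property` calls suggest; the helper name `apply_npu_dpu_groups` is invented for illustration:

```python
import os

from openvino import Core


def apply_npu_dpu_groups(core: Core, npu_dpu_groups=None):
    # Skip when no group count is given, or when the user opts out
    # by setting IPEX_LLM_NPU_DISABLE_COMPILE_OPT=1 in the environment.
    if (
        npu_dpu_groups is not None
        and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
    ):
        core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})
```

Because the new parameter defaults to `None`, existing callers of `update_names_of_IR_and_export_blob` compile exactly as before.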
```diff
@@ -272,11 +272,13 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
         keep_position_ids = True
+        npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
         layernorm_const = False
         keep_position_ids = False
+        npu_dpu_groups = 6
 
     single_decoder = LowBitLlamaMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
```
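The branch above picks the grouping per graph: the single-token decode graph (`input_len = 1`) keeps the compiler's default, while the prefill graph (`input_len = kv_len`) pins 6 DPU groups, which is where the first-token speedup comes from. A sketch of that selection, assuming the enclosing branch tests `mode == "decode"` as the later `if mode == "decode":` context suggests:

```python
def pick_npu_dpu_groups(mode: str):
    # Decode compiles a 1-token graph; leave DPU grouping to the compiler.
    # Prefill compiles the full-prompt graph; pin 6 DPU groups to cut
    # first-token latency (the value this commit hard-codes).
    return None if mode == "decode" else 6
```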
```diff
@@ -303,7 +305,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                             decoder_name,
                                                             temp_dir,
-                                                            True, False)
+                                                            True, False,
+                                                            npu_dpu_groups=npu_dpu_groups)
 
     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
```
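If the pinned grouping ever misbehaves on a particular NPU driver, the optimization can be switched off without code changes through the environment knob checked in the guard; it must be set before any blob is exported:

```python
import os

# Opt out of the NPU_DPU_GROUPS hint and fall back to the previous
# behavior; set this before update_names_of_IR_and_export_blob runs.
os.environ["IPEX_LLM_NPU_DISABLE_COMPILE_OPT"] = "1"
```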