Optimize first token of C++ NPU by adding npu_dpu_groups (#12443)
* add npu_dpu_groups
* add check for env (IPEX_LLM_NPU_DISABLE_COMPILE_OPT)
* fix style
This commit is contained in:
		
							parent
							
								
									66bd7abae4
								
							
						
					
					
						commit
						52c17fe104
					
				
					 2 changed files with 11 additions and 2 deletions
				
			
		| 
						 | 
				
			
			@ -23,7 +23,8 @@ from intel_npu_acceleration_library.backend.factory import NNFactory
 | 
			
		|||
import numpy as np
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True):
 | 
			
		||||
def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True,
 | 
			
		||||
                                       npu_dpu_groups=None):
 | 
			
		||||
    xml_path = os.path.join(dir, model_name + ".xml")
 | 
			
		||||
    bin_path = os.path.join(dir, model_name + ".bin")
 | 
			
		||||
    model.save(xml_path)
 | 
			
		||||
| 
						 | 
				
			
			@ -35,6 +36,11 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
 | 
			
		|||
    core.set_property("NPU", {"NPU_COMPILATION_MODE_PARAMS":
 | 
			
		||||
                              "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"})
 | 
			
		||||
    core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
 | 
			
		||||
    if (
 | 
			
		||||
        npu_dpu_groups is not None
 | 
			
		||||
        and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
 | 
			
		||||
    ):
 | 
			
		||||
        core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})
 | 
			
		||||
 | 
			
		||||
    model = core.read_model(xml_path)
 | 
			
		||||
    inputs = model.inputs
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -272,11 +272,13 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 | 
			
		|||
        input_len = 1
 | 
			
		||||
        decoder_name = f"decoder_layer_{layer_idx}"
 | 
			
		||||
        keep_position_ids = True
 | 
			
		||||
        npu_dpu_groups = None
 | 
			
		||||
    else:
 | 
			
		||||
        input_len = kv_len
 | 
			
		||||
        decoder_name = "decoder_layer_prefill"
 | 
			
		||||
        layernorm_const = False
 | 
			
		||||
        keep_position_ids = False
 | 
			
		||||
        npu_dpu_groups = 6
 | 
			
		||||
 | 
			
		||||
    single_decoder = LowBitLlamaMultiDecoderlayer(
 | 
			
		||||
        [1, input_len, num_heads * head_dim],
 | 
			
		||||
| 
						 | 
				
			
			@ -303,7 +305,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 | 
			
		|||
    rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
 | 
			
		||||
                                                        decoder_name,
 | 
			
		||||
                                                        temp_dir,
 | 
			
		||||
                                                        True, False)
 | 
			
		||||
                                                        True, False,
 | 
			
		||||
                                                        npu_dpu_groups=npu_dpu_groups)
 | 
			
		||||
 | 
			
		||||
    if mode == "decode":
 | 
			
		||||
        if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue