[NPU] Modify IPEX_LLM_NPU_DISABLE_COMPILE_OPT setting for long input (#12537)
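This change threads the user-supplied `max_prompt_len` from the NPU `from_pretrained` paths into `optimize_llm_pre`, and uses it to set `IPEX_LLM_NPU_DISABLE_COMPILE_OPT` automatically for the long-input channel-wise case noted in the diff (llama3.2-3b / glm-edge-4b). The variable is only written when it is not already set, so an explicit value exported before loading still wins. A minimal sketch of that manual override, assuming it runs before the model is loaded (not part of this diff):

    import os

    # Pin the flag before model loading; optimize_llm_pre() leaves a pre-set value untouched.
    os.environ["IPEX_LLM_NPU_DISABLE_COMPILE_OPT"] = "0"   # or "1" to force the compile opt off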
parent 7cc01fdc86
commit 6596c18489

2 changed files with 13 additions and 4 deletions
@@ -290,7 +290,8 @@ class _BaseAutoModelClass:
             model.config.update({"group_size": quantization_group_size})
             model.config.update({"asym": qtype == "asym_int4_rtn"})
             optimize_llm_pre(model, qtype, mixed_precision,
-                             quantization_group_size=quantization_group_size)
+                             quantization_group_size=quantization_group_size,
+                             max_prompt_len=max_prompt_len)
             cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
                              quantization_group_size, imatrix_data,
                              *args, **kwargs)
@@ -580,7 +581,7 @@ class _BaseAutoModelClass:
             with torch.no_grad():
                 optimize_llm_pre(model, qtype, mixed_precision,
                                  quantization_group_size=quantization_group_size,
-                                 load=bigdl_lcmu_enabled)
+                                 load=bigdl_lcmu_enabled, max_prompt_len=max_prompt_len)
                 cls.load_convert(qtype, model, quant_device, modules_to_not_convert,
                                  quantization_group_size, *model_args, **kwargs)
                 create_npu_kernels(llm)
@@ -804,7 +805,8 @@ class EmbeddingModel(_BaseAutoModelClass):
 
         with torch.no_grad():
             optimize_llm_pre(model, qtype, mixed_precision,
-                             quantization_group_size=quantization_group_size)
+                             quantization_group_size=quantization_group_size,
+                             max_prompt_len=max_prompt_len)
             cls.load_convert_fp16(qtype, model.encoder, "cpu", modules_to_not_convert,
                                   quantization_group_size)
             create_npu_kernels(model.encoder)
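All three call sites above forward the same value, so the new heuristic ultimately depends on what the caller passes for `max_prompt_len`. A hedged sketch of a typical entry point, assuming the usual ipex-llm NPU `AutoModelForCausalLM` loader (the model id and the keyword arguments other than `max_prompt_len` are illustrative, not taken from this diff):

    from ipex_llm.transformers.npu_model import AutoModelForCausalLM

    # max_prompt_len flows into optimize_llm_pre() through the from_pretrained paths changed above;
    # 1920 tokens is the threshold the new heuristic checks for 3072-hidden-size llama models.
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.2-3B-Instruct",   # illustrative model id
        optimize_model=True,
        load_in_low_bit="sym_int4",
        max_prompt_len=1920,
    )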
@@ -31,7 +31,7 @@ def convert_forward(m, target_m, new_forward):
 
 
 def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
-                     quantization_group_size=0, load=False):
+                     quantization_group_size=0, load=False, max_prompt_len=512):
     if model.config.model_type == "baichuan":
         # process NormHead module in Baichuan2 7B
         if hasattr(model, 'lm_head') and model.lm_head is not None:
@@ -48,6 +48,13 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
 
     cpu_lm_head = os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0"
 
+    # workaround for long input performance of llama3.2-3b and glm-edge-4b CW
+    if os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT") is None:
+        disable_compile_opt = model.config.model_type == "llama" and \
+            model.config.hidden_size == 3072 and max_prompt_len >= 1920 and \
+            quantization_group_size == 0
+        os.environ["IPEX_LLM_NPU_DISABLE_COMPILE_OPT"] = "1" if disable_compile_opt else "0"
+
     # workaround for MiniCPM-2B
     if model.config.model_type == "minicpm" and model.config.num_hidden_layers == 40:
         # 73440 is vocab_size of MiniCPM-1B
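A standalone restatement of the heuristic added above, useful for checking which configurations flip the flag; it mirrors the added lines, with the inputs pulled out as plain arguments (a sketch, not code from this commit):

    import os

    def should_disable_compile_opt(model_type, hidden_size, max_prompt_len, quantization_group_size):
        # Same condition as the diff: a "llama" model with hidden_size 3072 (e.g. llama3.2-3b),
        # a long prompt budget (>= 1920 tokens), and channel-wise quantization (group_size == 0).
        return (model_type == "llama"
                and hidden_size == 3072
                and max_prompt_len >= 1920
                and quantization_group_size == 0)

    # As in the diff, a value already present in the environment takes precedence.
    if os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT") is None:
        flag = should_disable_compile_opt("llama", 3072, 2048, 0)
        os.environ["IPEX_LLM_NPU_DISABLE_COMPILE_OPT"] = "1" if flag else "0"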