[NPU] further fix of qwen2 int8 pipeline & C++ (#12449)
* fix
* fix style
This commit is contained in:
parent 303b104c10
commit 24b46b2b19

2 changed files with 8 additions and 2 deletions
@@ -231,7 +231,7 @@ class _BaseAutoModelClass:
         from intel_npu_acceleration_library.compiler import create_npu_kernels

         model = kwargs.pop("model")
-        qtype = kwargs.pop("qtype", "sym_int4")
+        qtype = kwargs.pop("qtype", "sym_int4_rtn")
         mixed_precision = kwargs.pop("mixed_precision", False)
         quantization_group_size = kwargs.pop("quantization_group_size", 0)
         modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
@@ -280,6 +280,7 @@ class _BaseAutoModelClass:
                         max_prompt_len=max_prompt_len,
                         transpose_value_cache=transpose_value_cache,
                         group_size=quantization_group_size,
+                        qtype=qtype,
                         convert_model=convert_model,
                         save_directory=save_directory)
         model.save_low_bit = types.MethodType(save_low_bit, model)
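For context, a minimal standalone sketch of the kwargs handling changed above (the helper name below is hypothetical, not the actual ipex-llm entry point). The point of the new default is that the qtype string must carry the "_rtn" suffix expected by the downstream checks, and that the popped value is now forwarded into convert_llm:

# Hypothetical helper mirroring the kwargs-popping pattern in _BaseAutoModelClass;
# only the default value of "qtype" changed in this commit.
def _pop_npu_kwargs(kwargs: dict) -> dict:
    return {
        "qtype": kwargs.pop("qtype", "sym_int4_rtn"),                    # new default with "_rtn" suffix
        "mixed_precision": kwargs.pop("mixed_precision", False),
        "quantization_group_size": kwargs.pop("quantization_group_size", 0),
        "modules_to_not_convert": kwargs.pop("modules_to_not_convert", []),
    }

# The popped qtype is what later reaches convert_llm(..., qtype=qtype, ...),
# so its value must match strings such as "sym_int8_rtn" checked there.
opts = _pop_npu_kwargs({"qtype": "sym_int8_rtn"})
assert opts["qtype"] == "sym_int8_rtn"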
@@ -193,13 +193,18 @@ def convert_llm(model: torch.nn.Module,
                 max_prompt_len: int,
                 transpose_value_cache: bool,
                 group_size: int,
+                qtype: str,
                 convert_model: bool=False,
                 save_directory: str=None):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
         n_splits_linear = 1
-        n_splits_down_proj = 2 if model.config.intermediate_size == 18944 else 1
+        if qtype == "sym_int8_rtn":
+            # do not split mlp down_proj for Qwen2-7B & sym_int8
+            n_splits_down_proj = 1
+        else:
+            n_splits_down_proj = 2 if model.config.intermediate_size == 18944 else 1
     else:
         n_splits_linear = model.config.hidden_size // group_size
         n_splits_down_proj = model.config.intermediate_size // group_size
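For reference, a standalone sketch of the updated split selection (the function name is hypothetical; 18944 is Qwen2-7B's intermediate_size, as in the hunk above). With sym_int8_rtn and no quantization group, down_proj is kept as a single weight instead of being split in two:

# Sketch of the split-count logic in convert_llm after this fix.
def pick_splits(hidden_size: int, intermediate_size: int,
                group_size: int, qtype: str) -> tuple[int, int]:
    if group_size == 0:
        n_splits_linear = 1
        if qtype == "sym_int8_rtn":
            # do not split mlp down_proj for Qwen2-7B & sym_int8
            n_splits_down_proj = 1
        else:
            n_splits_down_proj = 2 if intermediate_size == 18944 else 1
    else:
        n_splits_linear = hidden_size // group_size
        n_splits_down_proj = intermediate_size // group_size
    return n_splits_linear, n_splits_down_proj

# Qwen2-7B-like config (intermediate_size=18944), no grouping:
assert pick_splits(3584, 18944, 0, "sym_int8_rtn") == (1, 1)   # int8: keep down_proj whole
assert pick_splits(3584, 18944, 0, "sym_int4_rtn") == (1, 2)   # int4: split down_proj in two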