small fix of imatrix (#12480)
This commit is contained in:
parent ab01753b1c
commit 598603bea6

3 changed files with 11 additions and 10 deletions
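This commit threads the importance-matrix data (imatrix_data) through the NPU conversion path: load_convert_cpu and load_convert_fp16 gain an imatrix_data=None parameter placed before *arg so callers can pass it positionally, and they forward it as imatrix= to replace_with_DequantizedLinear / replace_with_FP16Linear; the DequantizedLinear path in turn hands it to ggml_convert_qtype. The following is a minimal, self-contained sketch of that parameter threading; the function bodies are hypothetical stubs for illustration only, not the ipex_llm implementation (where replace_with_DequantizedLinear is a per-layer replacement wrapped by @module_optimization).

# Minimal sketch of the parameter threading added in this commit. The bodies
# below are hypothetical stubs; only the signatures and the way imatrix_data
# is forwarded mirror the diff that follows.

def ggml_convert_qtype(weight, iqtype, device="cpu",
                       enable_scale_search=False, imatrix=None):
    # Stand-in for the real quantizer: just report whether an imatrix arrived.
    return ("qweights", "scale", imatrix is not None)


def replace_with_DequantizedLinear(optimize_model, q_k, device,
                                   modules_to_not_convert, group_size, imatrix):
    # The relevant point: imatrix is now an explicit parameter that gets
    # forwarded to ggml_convert_qtype.
    return ggml_convert_qtype(optimize_model, q_k, device=device,
                              imatrix=imatrix)


def load_convert_cpu(q_k, optimize_model, device, modules_to_not_convert,
                     group_size=0, imatrix_data=None, *arg, **kwarg):
    # imatrix_data sits before *arg, so the caller in _BaseAutoModelClass can
    # pass it positionally:
    #     cls.load_convert_cpu(qtype, model, "cpu", modules_to_not_convert, 0,
    #                          imatrix_data, *args, **kwargs)
    return replace_with_DequantizedLinear(optimize_model, q_k, device=device,
                                          modules_to_not_convert=modules_to_not_convert,
                                          group_size=group_size,
                                          imatrix=imatrix_data)


if __name__ == "__main__":
    # The embedding path passes None explicitly, matching
    # `quantization_group_size, None, *args, **kwargs` in the diff below.
    print(load_convert_cpu("sym_int4", "model", "cpu", [], 0, {"blk.0": [1.0]}))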
@@ -186,7 +186,7 @@ class _BaseAutoModelClass:
             with torch.no_grad():
                 # Only mock quantization_group_size=0 for now
                 cls.load_convert_cpu(qtype, model, "cpu", modules_to_not_convert, 0,
-                                     *args, **kwargs)
+                                     imatrix_data, *args, **kwargs)
             model = model.eval()
             logger.info(f"Finish to convert model")
         else:
@@ -223,7 +223,7 @@ class _BaseAutoModelClass:
                 optimize_llm(model)
                 with torch.no_grad():
                     cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
-                                     quantization_group_size, imatrix_data=imatrix_data,
+                                     quantization_group_size, imatrix_data,
                                      *args, **kwargs)
                     if hasattr(model, "llm"):
                         create_npu_kernels(model.llm)
@@ -333,12 +333,12 @@ class _BaseAutoModelClass:

     @classmethod
     def load_convert_cpu(cls, q_k, optimize_model, device, modules_to_not_convert,
-                         group_size=0, *arg, **kwarg):
+                         group_size=0, imatrix_data=None, *arg, **kwarg):
         from ipex_llm.transformers.npu_models.convert import replace_with_DequantizedLinear

         replace_with_DequantizedLinear(optimize_model, q_k, device=device,
                                        modules_to_not_convert=modules_to_not_convert,
-                                       group_size=group_size)
+                                       group_size=group_size, imatrix=imatrix_data)

     @classmethod
     @patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
@@ -766,7 +766,7 @@ class EmbeddingModel(_BaseAutoModelClass):
             optimize_llm_pre(model, qtype, mixed_precision,
                              quantization_group_size=quantization_group_size)
             cls.load_convert_fp16(qtype, model.encoder, "cpu", modules_to_not_convert,
-                                  quantization_group_size, *args, **kwargs)
+                                  quantization_group_size, None, *args, **kwargs)
             create_npu_kernels(model.encoder)
         model = model.eval()
         logger.info(f"Finish to convert model")
@@ -781,11 +781,11 @@ class EmbeddingModel(_BaseAutoModelClass):

     @classmethod
     def load_convert_fp16(cls, q_k, optimize_model, device, modules_to_not_convert,
-                          group_size=0, *arg, **kwarg):
+                          group_size=0, imatrix_data=None, *arg, **kwarg):
         from ipex_llm.transformers.npu_models.xlm_mp import replace_with_FP16Linear
         replace_with_FP16Linear(optimize_model, q_k, device=device,
                                 modules_to_not_convert=modules_to_not_convert,
-                                group_size=group_size)
+                                group_size=group_size, imatrix=imatrix_data)

     def encode(self,
                sentences,

@@ -104,7 +104,7 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,

 @module_optimization
 def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert,
-                                   group_size):
+                                   group_size, imatrix):
     from ipex_llm.transformers.npu_models.linear import DequantizedLinear
     from ipex_llm.transformers.low_bit_linear import ggml_convert_qtype
     from ipex_llm.ggml.quantize import ggml_tensor_qtype
@@ -113,7 +113,8 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert,
         enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0"
         qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32),
                                              iqtype, device=device,
-                                             enable_scale_search=enable_scale_search)
+                                             enable_scale_search=enable_scale_search,
+                                             imatrix=imatrix)
         return DequantizedLinear(qweights, scale, layer.bias)


@@ -721,7 +721,7 @@ def replace_with_Layernorm(layer, qtype=None, device='NPU',

 @module_optimization
 def replace_with_FP16Linear(layer, qtype, device, modules_to_not_convert,
-                            group_size):
+                            group_size, imatrix=None):
     from ipex_llm.transformers.npu_models.linear import Linear
     if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
         return Linear(layer.weight, layer.bias)
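For background on why the forwarded argument matters: an importance matrix ("imatrix", typically collected from calibration activations) weights the per-weight quantization error so that scale selection favors the weights that influence activations most. The snippet below is only a conceptual, self-contained illustration of importance-weighted scale selection under that assumption; the helper name and grid search are made up and are not the ggml/ipex_llm kernels reached through ggml_convert_qtype.

# Conceptual illustration only: importance-weighted choice of a symmetric
# low-bit quantization scale. Hypothetical helper, not ipex_llm/ggml code.

def pick_scale(weights, importance, qmax=7, candidates=64):
    # Pick the scale minimizing sum(importance[i] * (dequant[i] - weights[i])**2).
    wmax = max(abs(w) for w in weights) or 1.0
    best_scale, best_err = wmax / qmax, float("inf")
    for k in range(1, candidates + 1):
        scale = (wmax / qmax) * k / candidates
        err = 0.0
        for w, imp in zip(weights, importance):
            q = max(-qmax, min(qmax, round(w / scale)))
            err += imp * (q * scale - w) ** 2
        if err < best_err:
            best_scale, best_err = scale, err
    return best_scale


if __name__ == "__main__":
    ws = [0.9, -0.05, 0.02, -1.1]
    uniform = [1.0, 1.0, 1.0, 1.0]   # no imatrix: every weight counts equally
    skewed = [0.1, 5.0, 5.0, 0.1]    # calibration says the small weights matter more
    print(pick_scale(ws, uniform), pick_scale(ws, skewed))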