add disable opts for awq (#12641)
parent 62318964fa
commit 8e5328e9b4

4 changed files with 17 additions and 6 deletions
@@ -254,7 +254,9 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_
                                  torch_dtype=torch_dtype,
                                  optimize_model=optimize_llm,
                                  modules_to_not_convert=modules_to_not_convert,
-                                 cpu_embedding=cpu_embedding)
+                                 cpu_embedding=cpu_embedding,
+                                 disable_optimize_pre=kwargs.pop("disable_optimize_pre",
+                                                                 False))
     # add save_low_bit to pretrained model dynamically
     import types
     model._bigdl_config = dict()
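For reference, a minimal usage sketch of the new keyword as it is consumed here via kwargs.pop(); the model id, dtype, and the import of optimize_model from the package root are assumptions for illustration, not part of this diff:

# Minimal usage sketch (assumed caller side, not part of this change):
# load a Hugging Face model, then optimize it while skipping the
# pre-optimization rewrite controlled by the new disable_optimize_pre kwarg.
import torch
from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model  # assumed public import path

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",  # placeholder id
                                             torch_dtype=torch.float16)
model = optimize_model(model, low_bit="sym_int4",
                       disable_optimize_pre=True)  # defaults to False, as in this diff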
@@ -1081,7 +1081,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
                          torch_dtype="auto",
                          imatrix_data=None,
                          embedding_qtype=None,
-                         mixed_precision=False):
+                         mixed_precision=False,
+                         disable_optimize_pre=False):
     if qtype in ggml_tensor_qtype.values():
         index = list(ggml_tensor_qtype.values()).index(qtype)
         logger.info(f"Converting the current model to "

@@ -1104,7 +1105,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
         model = _optimize_ipex(model, qtype)
         return model
 
-    if optimize_model:
+    if optimize_model and not disable_optimize_pre:
         model = _optimize_pre(model, qtype)
 
     act_order = False
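Read together with the first hunk, the new flag is simply threaded from the public entry point into the converter. A rough sketch of that control flow, with stand-in function bodies rather than the real implementation:

# Control-flow sketch only: stand-in bodies; the real logic lives in
# optimize_model() / ggml_convert_low_bit() / _optimize_pre().
def optimize_model_sketch(model, optimize_llm=True, **kwargs):
    # the entry point pops the flag out of **kwargs with a False default ...
    return ggml_convert_low_bit_sketch(
        model,
        optimize_model=optimize_llm,
        disable_optimize_pre=kwargs.pop("disable_optimize_pre", False))

def ggml_convert_low_bit_sketch(model, optimize_model=True, disable_optimize_pre=False):
    # ... and the converter runs the pre-optimization pass only when the
    # flag is left off (mirroring the guard added in the hunk above)
    if optimize_model and not disable_optimize_pre:
        model = _optimize_pre_sketch(model)
    return model

def _optimize_pre_sketch(model):
    return model  # placeholder for _optimize_pre(model, qtype)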
@@ -764,6 +764,7 @@ class FP16Linear(nn.Linear):
         # weigh_type = 3 means weight has been transposed by esimd method
         self.weight_type = 1
         self.optimize_lm_head = optimize_lm_head
+        self.disable_fp16_opt = False
 
     def forward(self, x: torch.Tensor):
         # only work for GPU

@@ -779,8 +780,11 @@ class FP16Linear(nn.Linear):
             self.weight.data = self.weight.data.to(x.dtype)
 
         if not self.use_esimd_kernel(x):
-            if get_ipex_version() < "2.1.10+xpu" \
-                    or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]:
+            if (
+                get_ipex_version() < "2.1.10+xpu"
+                or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]
+                or self.disable_fp16_opt
+            ):
                 if self.weight_type == 2:
                     self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
                                                      requires_grad=False)

@@ -845,6 +849,8 @@ class FP16Linear(nn.Linear):
 
     def use_esimd_kernel(self, x):
         gpu_type = get_xpu_device_type(x)
+        if self.disable_fp16_opt:
+            return False
         # esimd kernel can only be used for Arc and Flex
         if gpu_type not in ["arc", "flex"]:
             return False
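The hunks above only add the attribute with a hard-coded default of False; nothing in this file wires it to a user-facing option. A minimal sketch of how it could be flipped on an already-converted model (the import path and the traversal are assumptions for illustration):

# Minimal sketch (assumed usage, not part of this change): walk a converted
# model and set the new per-layer flag, which forces FP16Linear.forward()
# onto the non-esimd fallback path and makes use_esimd_kernel() return False.
from ipex_llm.transformers.low_bit_linear import FP16Linear  # assumed import path

def disable_fp16_optimizations(model):
    for module in model.modules():
        if isinstance(module, FP16Linear):
            module.disable_fp16_opt = True  # attribute introduced above
    return model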
@@ -445,6 +445,7 @@ class _BaseAutoModelClass:
         mixed_precision = kwargs.pop("mixed_precision", False)
         if embedding_qtype is not None:
             embedding_qtype = ggml_tensor_qtype[embedding_qtype]
+        disable_optimize_pre = kwargs.pop("disable_optimize_pre", False)
         _args = copy.deepcopy(args)
         _kwargs = copy.deepcopy(kwargs)
         awq_config = None

@@ -513,7 +514,8 @@ class _BaseAutoModelClass:
                                      torch_dtype=kwargs.get("torch_dtype", 'auto'),
                                      imatrix_data=imatrix_data,
                                      embedding_qtype=embedding_qtype,
-                                     mixed_precision=mixed_precision)
+                                     mixed_precision=mixed_precision,
+                                     disable_optimize_pre=disable_optimize_pre)
 
         if disk_embedding:
             from ipex_llm.transformers.embedding import DiskEmbedding
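Because the flag is popped straight from from_pretrained's kwargs, it can be passed when loading a checkpoint, e.g. the AWQ case the commit title targets. A minimal usage sketch; the model id and the other keyword arguments are placeholders, not part of this change:

# Minimal usage sketch (placeholders only): load an AWQ checkpoint through
# ipex_llm and skip the pre-optimization rewrite via the new kwarg.
from ipex_llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7B-Chat-AWQ",   # placeholder AWQ model id
    load_in_4bit=True,
    trust_remote_code=True,
    disable_optimize_pre=True)        # new option threaded through this hunk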