[NPU] Add env to enable scale search (#12462)
* add env enable scale search
* address comment
* move logic
parent d272f6b471
commit 1b533a105c

1 changed file with 7 additions and 2 deletions
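The commit gates the NPU quantization scale-search path behind the IPEX_LLM_NPU_QUANTIZATION_OPT environment variable. A minimal usage sketch (only the variable name and its "0" default come from this commit; the surrounding model-loading code is assumed and elided): any value other than "0" enables scale search.

import os

# Must be set before quantization runs; the diff below reads this variable
# inside replace_with_QuantizedLinear / replace_with_DequantizedLinear.
os.environ["IPEX_LLM_NPU_QUANTIZATION_OPT"] = "1"

Equivalently, export IPEX_LLM_NPU_QUANTIZATION_OPT=1 in the shell before launching the script.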
				
			
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 
+import os
 import torch
 import importlib
 from ipex_llm.transformers.npu_models.linear import QuantizedLinear
@@ -69,8 +70,10 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,
                (layer.in_features == 18944 and layer.out_features == 3584):
                 qtype = "sym_int8_rtn"
                 iqtype = ggml_tensor_qtype[qtype]
+        enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0"
         qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32),
-                                             iqtype, device=device)
+                                             iqtype, device=device,
+                                             enable_scale_search=enable_scale_search)
         return QuantizedLinear(qweights, scale, layer.bias,
                                group_size=group_size)
 
@@ -83,8 +86,10 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert,
     from ipex_llm.ggml.quantize import ggml_tensor_qtype
     iqtype = ggml_tensor_qtype[qtype]
     if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
+        enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0"
         qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32),
-                                             iqtype, device=device)
+                                             iqtype, device=device,
+                                             enable_scale_search=enable_scale_search)
         return DequantizedLinear(qweights, scale, layer.bias)
 
 
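For clarity, a self-contained sketch of the flag semantics used in both hunks above; the scale_search_enabled helper name is hypothetical, since the commit inlines the same expression at each call site:

import os

def scale_search_enabled() -> bool:
    # Hypothetical helper. Unset or "0" disables scale search;
    # any other value enables it.
    return os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0"

os.environ.pop("IPEX_LLM_NPU_QUANTIZATION_OPT", None)
assert not scale_search_enabled()  # default: disabled

os.environ["IPEX_LLM_NPU_QUANTIZATION_OPT"] = "1"
assert scale_search_enabled()      # any non-"0" value: enabled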