small fix of imatrix (#12480)
This commit is contained in:
parent ab01753b1c
commit 598603bea6

3 changed files with 11 additions and 10 deletions
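Summary (taken from the diff below): this fix threads the importance matrix (imatrix) through the NPU conversion helpers. load_convert_cpu and load_convert_fp16 gain an imatrix_data parameter, replace_with_DequantizedLinear and replace_with_FP16Linear gain a matching imatrix parameter, and replace_with_DequantizedLinear now forwards it to ggml_convert_qtype. The existing load_convert call site also switches from the keyword form imatrix_data=imatrix_data to a positional argument.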
@@ -186,7 +186,7 @@ class _BaseAutoModelClass:
                 with torch.no_grad():
                     # Only mock quantization_group_size=0 for now
                     cls.load_convert_cpu(qtype, model, "cpu", modules_to_not_convert, 0,
-                                         *args, **kwargs)
+                                         imatrix_data, *args, **kwargs)
                     model = model.eval()
                     logger.info(f"Finish to convert model")
             else:
@@ -223,7 +223,7 @@ class _BaseAutoModelClass:
                 optimize_llm(model)
                 with torch.no_grad():
                     cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
-                                     quantization_group_size, imatrix_data=imatrix_data,
+                                     quantization_group_size, imatrix_data,
                                      *args, **kwargs)
                     if hasattr(model, "llm"):
                         create_npu_kernels(model.llm)
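The call-site change above (keyword imatrix_data=imatrix_data replaced by a positional argument) matters because the keyword is followed by *args. A minimal sketch of the failure mode it avoids, using a stand-in function with the same shape as the patched signature (the qtype string and dict layout are placeholders, not values from ipex-llm):

def load_convert(q_k, model, device, modules_to_not_convert,
                 group_size=0, imatrix_data=None, *args, **kwargs):
    return imatrix_data

# Positional forwarding, as in the patched call site: works for any *args.
load_convert("sym_int4", None, "cpu", [], 64, {"w": [1.0]}, "extra")

# Keyword followed by a non-empty *args: the unpacked values bind positionally
# and collide with the keyword.
# load_convert("sym_int4", None, "cpu", [], 64,
#              imatrix_data={"w": [1.0]}, *("extra",))
# -> TypeError: load_convert() got multiple values for argument 'imatrix_data'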
@@ -333,12 +333,12 @@ class _BaseAutoModelClass:

     @classmethod
     def load_convert_cpu(cls, q_k, optimize_model, device, modules_to_not_convert,
-                         group_size=0, *arg, **kwarg):
+                         group_size=0, imatrix_data=None, *arg, **kwarg):
         from ipex_llm.transformers.npu_models.convert import replace_with_DequantizedLinear

         replace_with_DequantizedLinear(optimize_model, q_k, device=device,
                                        modules_to_not_convert=modules_to_not_convert,
-                                       group_size=group_size)
+                                       group_size=group_size, imatrix=imatrix_data)

     @classmethod
     @patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
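With the new signature, the CPU path forwards the importance data into the dequantized-linear replacement. A self-contained sketch of that forwarding chain; the function names mirror the diff, but the bodies and the imatrix_data layout are simplified placeholders rather than ipex-llm's real implementation:

def replace_with_DequantizedLinear(model, q_k, device, modules_to_not_convert,
                                   group_size, imatrix):
    # Stand-in body: the real function quantizes each eligible Linear layer.
    print(f"group_size={group_size}, imatrix supplied={imatrix is not None}")

def load_convert_cpu(q_k, model, device, modules_to_not_convert,
                     group_size=0, imatrix_data=None, *args, **kwargs):
    # After the fix, imatrix_data is passed through as the imatrix argument.
    replace_with_DequantizedLinear(model, q_k, device=device,
                                   modules_to_not_convert=modules_to_not_convert,
                                   group_size=group_size, imatrix=imatrix_data)

imatrix_data = {"model.layers.0.mlp.down_proj": [0.8, 1.2, 0.4]}  # hypothetical layout
load_convert_cpu("sym_int4", None, "cpu", ["lm_head"], 0, imatrix_data)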
@@ -766,7 +766,7 @@ class EmbeddingModel(_BaseAutoModelClass):
                 optimize_llm_pre(model, qtype, mixed_precision,
                                  quantization_group_size=quantization_group_size)
                 cls.load_convert_fp16(qtype, model.encoder, "cpu", modules_to_not_convert,
-                                      quantization_group_size, *args, **kwargs)
+                                      quantization_group_size, None, *args, **kwargs)
                 create_npu_kernels(model.encoder)
                 model = model.eval()
                 logger.info(f"Finish to convert model")
@@ -781,11 +781,11 @@ class EmbeddingModel(_BaseAutoModelClass):

     @classmethod
     def load_convert_fp16(cls, q_k, optimize_model, device, modules_to_not_convert,
-                          group_size=0, *arg, **kwarg):
+                          group_size=0, imatrix_data=None, *arg, **kwarg):
         from ipex_llm.transformers.npu_models.xlm_mp import replace_with_FP16Linear
         replace_with_FP16Linear(optimize_model, q_k, device=device,
                                 modules_to_not_convert=modules_to_not_convert,
-                                group_size=group_size)
+                                group_size=group_size, imatrix=imatrix_data)

     def encode(self,
                sentences,
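On the FP16 embedding path the new parameter is effectively a no-op: the call site above passes None explicitly, and the imatrix_data=None default keeps any older positional callers working. A minimal compatibility sketch with a stand-in function of the same shape:

def load_convert_fp16(q_k, model, device, modules_to_not_convert,
                      group_size=0, imatrix_data=None, *args, **kwargs):
    return imatrix_data

# Pre-patch call style (no imatrix argument) still binds correctly...
assert load_convert_fp16("fp16", None, "cpu", [], 64) is None
# ...and the patched call site simply makes the None explicit.
assert load_convert_fp16("fp16", None, "cpu", [], 64, None) is None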
@@ -104,7 +104,7 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,

 @module_optimization
 def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert,
-                                   group_size):
+                                   group_size, imatrix):
     from ipex_llm.transformers.npu_models.linear import DequantizedLinear
     from ipex_llm.transformers.low_bit_linear import ggml_convert_qtype
     from ipex_llm.ggml.quantize import ggml_tensor_qtype
@@ -113,7 +113,8 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert,
         enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0"
         qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32),
                                              iqtype, device=device,
-                                             enable_scale_search=enable_scale_search)
+                                             enable_scale_search=enable_scale_search,
+                                             imatrix=imatrix)
         return DequantizedLinear(qweights, scale, layer.bias)

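The functional core of the fix is that ggml_convert_qtype now receives the importance matrix. As rough intuition for what such data buys, here is a simplified importance-weighted rounding-error illustration; it is not ggml's actual quantization kernel, and all values are made up:

import numpy as np

def weighted_quant_error(w, scale, importance):
    # Round onto an int4-style grid, then measure the reconstruction error
    # weighted by per-weight importance.
    q = np.clip(np.round(w / scale), -8, 7)
    return float(np.sum(importance * (w - q * scale) ** 2))

w = np.array([0.12, -0.90, 0.33, 0.05])
importance = np.array([4.0, 1.0, 1.0, 0.1])   # hypothetical activation statistics

candidates = np.linspace(0.05, 0.20, 16)
plain_best = min(candidates, key=lambda s: weighted_quant_error(w, s, np.ones_like(w)))
imatrix_best = min(candidates, key=lambda s: weighted_quant_error(w, s, importance))
print(plain_best, imatrix_best)  # the chosen scale can differ once importance is used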
@@ -721,7 +721,7 @@ def replace_with_Layernorm(layer, qtype=None, device='NPU',

 @module_optimization
 def replace_with_FP16Linear(layer, qtype, device, modules_to_not_convert,
-                            group_size):
+                            group_size, imatrix=None):
     from ipex_llm.transformers.npu_models.linear import Linear
     if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
         return Linear(layer.weight, layer.bias)