add disable opts for awq (#12641)

parent 62318964fa
commit 8e5328e9b4
4 changed files with 17 additions and 6 deletions
@@ -254,7 +254,9 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_
                               torch_dtype=torch_dtype,
                               optimize_model=optimize_llm,
                               modules_to_not_convert=modules_to_not_convert,
-                              cpu_embedding=cpu_embedding)
+                              cpu_embedding=cpu_embedding,
+                              disable_optimize_pre=kwargs.pop("disable_optimize_pre",
+                                                              False))
     # add save_low_bit to pretrained model dynamically
     import types
     model._bigdl_config = dict()
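
Editor's note: the hunk above threads a new disable_optimize_pre keyword through optimize_model via kwargs. A minimal usage sketch follows; the model object is a placeholder, and only the kwarg itself comes from this commit:

    # Usage sketch (not part of the commit): skip the pre-conversion rewrites,
    # e.g. for an AWQ checkpoint, via the new keyword argument.
    from ipex_llm import optimize_model

    model = ...  # placeholder: any supported PyTorch / Hugging Face model instance
    model = optimize_model(model,
                           low_bit="sym_int4",
                           disable_optimize_pre=True)  # kwarg added by this commit
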
@@ -1081,7 +1081,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
                           torch_dtype="auto",
                           imatrix_data=None,
                           embedding_qtype=None,
-                          mixed_precision=False):
+                          mixed_precision=False,
+                          disable_optimize_pre=False):
     if qtype in ggml_tensor_qtype.values():
         index = list(ggml_tensor_qtype.values()).index(qtype)
         logger.info(f"Converting the current model to "
@@ -1104,7 +1105,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
         model = _optimize_ipex(model, qtype)
         return model

-    if optimize_model:
+    if optimize_model and not disable_optimize_pre:
         model = _optimize_pre(model, qtype)

     act_order = False
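
Editor's note: the guard above is the core of the change: when disable_optimize_pre is set, ggml_convert_low_bit skips the _optimize_pre rewrite but still performs the low-bit conversion itself. A condensed control-flow sketch, where every helper other than _optimize_pre is a placeholder rather than the real ipex_llm internals:

    def convert_low_bit_sketch(model, qtype, optimize_model=True, disable_optimize_pre=False):
        # The pre-conversion rewrite now runs only if optimization is enabled
        # AND the caller has not opted out of it.
        if optimize_model and not disable_optimize_pre:
            model = _optimize_pre(model, qtype)
        model = replace_linears_with_low_bit(model, qtype)  # placeholder for the actual conversion step
        return model
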
@@ -764,6 +764,7 @@ class FP16Linear(nn.Linear):
         # weigh_type = 3 means weight has been transposed by esimd method
         self.weight_type = 1
         self.optimize_lm_head = optimize_lm_head
+        self.disable_fp16_opt = False

     def forward(self, x: torch.Tensor):
         # only work for GPU
@@ -779,8 +780,11 @@ class FP16Linear(nn.Linear):
             self.weight.data = self.weight.data.to(x.dtype)

         if not self.use_esimd_kernel(x):
-            if get_ipex_version() < "2.1.10+xpu" \
-                    or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]:
+            if (
+                get_ipex_version() < "2.1.10+xpu"
+                or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]
+                or self.disable_fp16_opt
+            ):
                 if self.weight_type == 2:
                     self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
                                                      requires_grad=False)
@@ -845,6 +849,8 @@ class FP16Linear(nn.Linear):

     def use_esimd_kernel(self, x):
         gpu_type = get_xpu_device_type(x)
+        if self.disable_fp16_opt:
+            return False
         # esimd kernel can only be used for Arc and Flex
         if gpu_type not in ["arc", "flex"]:
             return False
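
Editor's note: disable_fp16_opt is initialized to False and is not set anywhere in these hunks, so it is presumably flipped by a caller (e.g. for AWQ models, per the commit title). One way a caller might toggle it on every FP16Linear layer; the import path is assumed from the class shown above, not confirmed by this commit:

    import torch

    def set_disable_fp16_opt(model: torch.nn.Module, value: bool = True) -> None:
        # Walk the module tree and flip the new flag on every FP16Linear layer,
        # which forces forward() onto the plain-fp16 path and makes
        # use_esimd_kernel() return False.
        from ipex_llm.transformers.low_bit_linear import FP16Linear  # assumed module path
        for module in model.modules():
            if isinstance(module, FP16Linear):
                module.disable_fp16_opt = value
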
@@ -445,6 +445,7 @@ class _BaseAutoModelClass:
         mixed_precision = kwargs.pop("mixed_precision", False)
         if embedding_qtype is not None:
             embedding_qtype = ggml_tensor_qtype[embedding_qtype]
+        disable_optimize_pre = kwargs.pop("disable_optimize_pre", False)
         _args = copy.deepcopy(args)
         _kwargs = copy.deepcopy(kwargs)
         awq_config = None
@@ -513,7 +514,8 @@ class _BaseAutoModelClass:
                                     torch_dtype=kwargs.get("torch_dtype", 'auto'),
                                     imatrix_data=imatrix_data,
                                     embedding_qtype=embedding_qtype,
-                                    mixed_precision=mixed_precision)
+                                    mixed_precision=mixed_precision,
+                                    disable_optimize_pre=disable_optimize_pre)

         if disk_embedding:
             from ipex_llm.transformers.embedding import DiskEmbedding
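
Editor's note: the last two hunks pop disable_optimize_pre in _BaseAutoModelClass.from_pretrained and pass it down to ggml_convert_low_bit, so the option is also reachable from the auto-model loading path. A usage sketch, assuming the usual ipex_llm auto-model class; the checkpoint path and load_in_4bit flag are illustrative, not taken from this commit:

    # Usage sketch (not part of the commit): pass the new kwarg straight through
    # from_pretrained when loading a model, e.g. an AWQ checkpoint.
    from ipex_llm.transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "path/to/awq-model",          # placeholder path
        load_in_4bit=True,
        disable_optimize_pre=True,    # kwarg popped in the hunk above
    )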