add disable opts for awq (#12641)
parent 62318964fa
commit 8e5328e9b4
4 changed files with 17 additions and 6 deletions
@@ -254,7 +254,9 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_
                               torch_dtype=torch_dtype,
                               optimize_model=optimize_llm,
                               modules_to_not_convert=modules_to_not_convert,
-                              cpu_embedding=cpu_embedding)
+                              cpu_embedding=cpu_embedding,
+                              disable_optimize_pre=kwargs.pop("disable_optimize_pre",
+                                                              False))
     # add save_low_bit to pretrained model dynamically
     import types
     model._bigdl_config = dict()
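The hunk above lets `optimize_model` accept a new `disable_optimize_pre` keyword and forward it to the low-bit conversion. A minimal usage sketch, assuming the public `ipex_llm.optimize_model` entry point and a placeholder checkpoint path (neither is shown in this diff):

    import torch
    from transformers import AutoModelForCausalLM
    from ipex_llm import optimize_model

    # Placeholder model path; load the model as usual, then convert it.
    model = AutoModelForCausalLM.from_pretrained("path/to/model",
                                                 torch_dtype=torch.float16)
    # The new kwarg is simply popped from **kwargs and forwarded, so it can be
    # passed alongside the existing options to skip the pre-conversion rewrites.
    model = optimize_model(model, low_bit="sym_int4", disable_optimize_pre=True)
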
@@ -1081,7 +1081,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
                           torch_dtype="auto",
                           imatrix_data=None,
                           embedding_qtype=None,
-                          mixed_precision=False):
+                          mixed_precision=False,
+                          disable_optimize_pre=False):
     if qtype in ggml_tensor_qtype.values():
         index = list(ggml_tensor_qtype.values()).index(qtype)
         logger.info(f"Converting the current model to "
@@ -1104,7 +1105,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
         model = _optimize_ipex(model, qtype)
         return model

-    if optimize_model:
+    if optimize_model and not disable_optimize_pre:
         model = _optimize_pre(model, qtype)

     act_order = False
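These two hunks thread the flag into `ggml_convert_low_bit`, where it gates the `_optimize_pre` step. A sketch of calling the conversion function directly, assuming the usual module locations for `ggml_convert_low_bit` and `ggml_tensor_qtype` (not confirmed by this diff):

    from transformers import AutoModelForCausalLM
    from ipex_llm.transformers.convert import ggml_convert_low_bit   # assumed location
    from ipex_llm.ggml.quantize import ggml_tensor_qtype             # assumed location

    model = AutoModelForCausalLM.from_pretrained("path/to/model")    # placeholder path
    qtype = ggml_tensor_qtype["sym_int4"]
    # With disable_optimize_pre=True the new guard
    # `if optimize_model and not disable_optimize_pre` skips _optimize_pre.
    model = ggml_convert_low_bit(model, qtype, optimize_model=True,
                                 disable_optimize_pre=True)
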
@@ -764,6 +764,7 @@ class FP16Linear(nn.Linear):
         # weigh_type = 3 means weight has been transposed by esimd method
         self.weight_type = 1
         self.optimize_lm_head = optimize_lm_head
+        self.disable_fp16_opt = False

     def forward(self, x: torch.Tensor):
         # only work for GPU
@@ -779,8 +780,11 @@ class FP16Linear(nn.Linear):
             self.weight.data = self.weight.data.to(x.dtype)

         if not self.use_esimd_kernel(x):
-            if get_ipex_version() < "2.1.10+xpu" \
-                    or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]:
+            if (
+                get_ipex_version() < "2.1.10+xpu"
+                or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]
+                or self.disable_fp16_opt
+            ):
                 if self.weight_type == 2:
                     self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
                                                      requires_grad=False)
@@ -845,6 +849,8 @@ class FP16Linear(nn.Linear):

     def use_esimd_kernel(self, x):
         gpu_type = get_xpu_device_type(x)
+        if self.disable_fp16_opt:
+            return False
         # esimd kernel can only be used for Arc and Flex
         if gpu_type not in ["arc", "flex"]:
             return False
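The `FP16Linear` hunks add a per-module `disable_fp16_opt` flag: when it is set, both the ESIMD kernel and the transposed-weight fast path are bypassed. The commit only initializes the flag to False and does not show how it is meant to be toggled, so the sketch below is just one plausible way a caller could flip it, assuming `FP16Linear` lives in `ipex_llm.transformers.low_bit_linear`:

    from ipex_llm.transformers.low_bit_linear import FP16Linear   # assumed location

    def disable_fp16_optimizations(model):
        # `model` is assumed to already be converted by ipex-llm, so that some
        # of its linear layers are FP16Linear instances.
        for module in model.modules():
            if isinstance(module, FP16Linear):
                module.disable_fp16_opt = True   # use_esimd_kernel() now returns False
        return model
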
@@ -445,6 +445,7 @@ class _BaseAutoModelClass:
         mixed_precision = kwargs.pop("mixed_precision", False)
         if embedding_qtype is not None:
             embedding_qtype = ggml_tensor_qtype[embedding_qtype]
+        disable_optimize_pre = kwargs.pop("disable_optimize_pre", False)
         _args = copy.deepcopy(args)
         _kwargs = copy.deepcopy(kwargs)
         awq_config = None
@@ -513,7 +514,8 @@ class _BaseAutoModelClass:
                                        torch_dtype=kwargs.get("torch_dtype", 'auto'),
                                        imatrix_data=imatrix_data,
                                        embedding_qtype=embedding_qtype,
-                                       mixed_precision=mixed_precision)
+                                       mixed_precision=mixed_precision,
+                                       disable_optimize_pre=disable_optimize_pre)

         if disk_embedding:
             from ipex_llm.transformers.embedding import DiskEmbedding
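Finally, `_BaseAutoModelClass.from_pretrained` pops `disable_optimize_pre` from its kwargs and hands it to `ggml_convert_low_bit`, so the pre-optimization step can be disabled when loading a checkpoint (the commit title suggests AWQ checkpoints as the motivating case). A hedged loading sketch, with a placeholder model path:

    from ipex_llm.transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "path/to/awq-model",            # placeholder checkpoint
        load_in_4bit=True,
        disable_optimize_pre=True,      # new in this commit: skip _optimize_pre
        trust_remote_code=True,
    )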