From 54bf3a23a647da4da404a2a5bdb10c3bf5e62826 Mon Sep 17 00:00:00 2001
From: Ruonan Wang
Date: Wed, 31 Jul 2024 06:39:58 +0300
Subject: [PATCH] add fallback for unsupported k-quants (#11691)

* add fallback

* fix style

* fix
---
 .../llm/src/ipex_llm/transformers/convert.py  |  5 ++++-
 python/llm/src/ipex_llm/transformers/utils.py | 21 +++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
index 274ce467..32cf166f 100644
--- a/python/llm/src/ipex_llm/transformers/convert.py
+++ b/python/llm/src/ipex_llm/transformers/convert.py
@@ -44,7 +44,7 @@ import warnings
 import transformers
 import importlib.util
 from ipex_llm.ggml.quantize import ggml_tensor_qtype, gguf_mixed_qtype
-from .utils import logger, get_cur_qtype_and_imatrix
+from .utils import logger, get_cur_qtype_and_imatrix, check_hidden_size
 import numpy as np
 import os
 from ipex_llm.utils.common import invalidInputError
@@ -396,6 +396,9 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                                              ggml_tensor_qtype["asym_int4"]]:
                             cur_qtype = ggml_tensor_qtype["sym_int8"]
 
+                    # check whether hidden size is a multiple of 256
+                    cur_qtype = check_hidden_size(cur_qtype, in_features)
+
                     new_linear = LowBitLinear(
                         in_features,
                         out_features,
diff --git a/python/llm/src/ipex_llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py
index 74e10244..cf8c5612 100644
--- a/python/llm/src/ipex_llm/transformers/utils.py
+++ b/python/llm/src/ipex_llm/transformers/utils.py
@@ -361,3 +361,24 @@ def get_modelscope_hf_config(model_id_or_path: str,
 def is_torch_bf16_gpu_available():
     # always true for XPU and CPU
     return True
+
+
+def check_hidden_size(qtype, hidden_size):
+    if hidden_size % 256 != 0:
+        if qtype == ggml_tensor_qtype["q4_k"]:
+            logger.info(f"hidden size {hidden_size} is not divisible by 256, "
+                        "which is required for q4_k - using fallback quantization asym_int4.")
+            return ggml_tensor_qtype["asym_int4"]
+        elif qtype == ggml_tensor_qtype["q5_k"]:
+            logger.info(f"hidden size {hidden_size} is not divisible by 256, "
+                        "which is required for q5_k - using fallback quantization asym_int5.")
+            return ggml_tensor_qtype["asym_int5"]
+        elif qtype == ggml_tensor_qtype["q6_k"]:
+            logger.info(f"hidden size {hidden_size} is not divisible by 256, "
+                        "which is required for q6_k - using fallback quantization sym_int8.")
+            return ggml_tensor_qtype["sym_int8"]
+        elif qtype == ggml_tensor_qtype["fp6_k"]:
+            logger.info(f"hidden size {hidden_size} is not divisible by 256, "
+                        "which is required for fp6_k - using fallback quantization fp6.")
+            return ggml_tensor_qtype["fp6"]
+    return qtype
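
Note for reviewers: below is a minimal, self-contained sketch of the fallback mapping this patch adds, handy for sanity-checking the behavior outside the repo. GGML k-quants pack weights into 256-element super-blocks, hence the divisibility check. The qtype codes in the dictionary are illustrative stand-ins, not the real values from ipex_llm.ggml.quantize.ggml_tensor_qtype, and the hidden sizes in the asserts are hypothetical examples.

# Sketch of the k-quant fallback logic, under the assumptions above.
# NOTE: these qtype codes are stand-ins; the real mapping lives in
# ipex_llm.ggml.quantize.ggml_tensor_qtype.
ggml_tensor_qtype = {
    "asym_int4": 1, "asym_int5": 2, "sym_int8": 3,
    "q4_k": 12, "q5_k": 13, "q6_k": 14, "fp6_k": 15, "fp6": 16,
}

# k-quant -> non-k-quant fallback of comparable precision.
K_QUANT_FALLBACK = {
    ggml_tensor_qtype["q4_k"]: ggml_tensor_qtype["asym_int4"],
    ggml_tensor_qtype["q5_k"]: ggml_tensor_qtype["asym_int5"],
    ggml_tensor_qtype["q6_k"]: ggml_tensor_qtype["sym_int8"],
    ggml_tensor_qtype["fp6_k"]: ggml_tensor_qtype["fp6"],
}

def check_hidden_size(qtype, hidden_size):
    # k-quants quantize in super-blocks of 256 weights, so the input
    # dimension of the layer must be a multiple of 256; otherwise fall
    # back to a quantization that has no such constraint.
    if hidden_size % 256 != 0:
        return K_QUANT_FALLBACK.get(qtype, qtype)
    return qtype

# 4096 is a multiple of 256, so q4_k is kept; 4544 is not (4544 % 256
# == 192), so the layer falls back to asym_int4.
assert check_hidden_size(ggml_tensor_qtype["q4_k"], 4096) == ggml_tensor_qtype["q4_k"]
assert check_hidden_size(ggml_tensor_qtype["q4_k"], 4544) == ggml_tensor_qtype["asym_int4"]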