From 54bf3a23a647da4da404a2a5bdb10c3bf5e62826 Mon Sep 17 00:00:00 2001
From: Ruonan Wang
Date: Wed, 31 Jul 2024 06:39:58 +0300
Subject: [PATCH] add fallback for unsupported k-quants (#11691)

* add fallback

* fix style

* fix
---
 .../llm/src/ipex_llm/transformers/convert.py  |  5 ++++-
 python/llm/src/ipex_llm/transformers/utils.py | 21 +++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
index 274ce467..32cf166f 100644
--- a/python/llm/src/ipex_llm/transformers/convert.py
+++ b/python/llm/src/ipex_llm/transformers/convert.py
@@ -44,7 +44,7 @@ import warnings
 import transformers
 import importlib.util
 from ipex_llm.ggml.quantize import ggml_tensor_qtype, gguf_mixed_qtype
-from .utils import logger, get_cur_qtype_and_imatrix
+from .utils import logger, get_cur_qtype_and_imatrix, check_hidden_size
 import numpy as np
 import os
 from ipex_llm.utils.common import invalidInputError
@@ -396,6 +396,9 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                                              ggml_tensor_qtype["asym_int4"]]:
                             cur_qtype = ggml_tensor_qtype["sym_int8"]
 
+                    # check whether hidden size is a multiple of 256
+                    cur_qtype = check_hidden_size(cur_qtype, in_features)
+
                     new_linear = LowBitLinear(
                         in_features,
                         out_features,
diff --git a/python/llm/src/ipex_llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py
index 74e10244..cf8c5612 100644
--- a/python/llm/src/ipex_llm/transformers/utils.py
+++ b/python/llm/src/ipex_llm/transformers/utils.py
@@ -361,3 +361,24 @@ def get_modelscope_hf_config(model_id_or_path: str,
 def is_torch_bf16_gpu_available():
     # always true for XPU and CPU
     return True
+
+
+def check_hidden_size(qtype, hidden_size):
+    if hidden_size % 256 != 0:
+        if qtype == ggml_tensor_qtype["q4_k"]:
+            logger.info(f"hidden size {hidden_size} is not divisible by 256, "
+                        "which is required for q4_k - using fallback quantization asym_int4.")
+            return ggml_tensor_qtype["asym_int4"]
+        elif qtype == ggml_tensor_qtype["q5_k"]:
+            logger.info(f"hidden size {hidden_size} is not divisible by 256, "
+                        "which is required for q5_k - using fallback quantization asym_int5.")
+            return ggml_tensor_qtype["asym_int5"]
+        elif qtype == ggml_tensor_qtype["q6_k"]:
+            logger.info(f"hidden size {hidden_size} is not divisible by 256, "
+                        "which is required for q6_k - using fallback quantization sym_int8.")
+            return ggml_tensor_qtype["sym_int8"]
+        elif qtype == ggml_tensor_qtype["fp6_k"]:
+            logger.info(f"hidden size {hidden_size} is not divisible by 256, "
+                        "which is required for fp6_k - using fallback quantization fp6.")
+            return ggml_tensor_qtype["fp6"]
+    return qtype
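
Note for reviewers: below is a minimal, self-contained sketch of the fallback mapping this patch adds, handy for sanity-checking the behavior outside the repo. GGML k-quants pack weights into 256-element super-blocks, hence the divisibility check. The qtype codes in the dictionary are illustrative stand-ins, not the real values from ipex_llm.ggml.quantize.ggml_tensor_qtype, and the hidden sizes in the asserts are hypothetical examples.

# Sketch of the k-quant fallback logic, under the assumptions above.
# NOTE: these qtype codes are stand-ins; the real mapping lives in
# ipex_llm.ggml.quantize.ggml_tensor_qtype.
ggml_tensor_qtype = {
    "asym_int4": 1, "asym_int5": 2, "sym_int8": 3,
    "q4_k": 12, "q5_k": 13, "q6_k": 14, "fp6_k": 15, "fp6": 16,
}

# k-quant -> non-k-quant fallback of comparable precision.
K_QUANT_FALLBACK = {
    ggml_tensor_qtype["q4_k"]: ggml_tensor_qtype["asym_int4"],
    ggml_tensor_qtype["q5_k"]: ggml_tensor_qtype["asym_int5"],
    ggml_tensor_qtype["q6_k"]: ggml_tensor_qtype["sym_int8"],
    ggml_tensor_qtype["fp6_k"]: ggml_tensor_qtype["fp6"],
}

def check_hidden_size(qtype, hidden_size):
    # k-quants quantize in super-blocks of 256 weights, so the input
    # dimension of the layer must be a multiple of 256; otherwise fall
    # back to a quantization that has no such constraint.
    if hidden_size % 256 != 0:
        return K_QUANT_FALLBACK.get(qtype, qtype)
    return qtype

# 4096 is a multiple of 256, so q4_k is kept; 4544 is not (4544 % 256
# == 192), so the layer falls back to asym_int4.
assert check_hidden_size(ggml_tensor_qtype["q4_k"], 4096) == ggml_tensor_qtype["q4_k"]
assert check_hidden_size(ggml_tensor_qtype["q4_k"], 4544) == ggml_tensor_qtype["asym_int4"]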