From 063dc145aca0cd320f532bcaff0ff611659d77b8 Mon Sep 17 00:00:00 2001
From: Ruonan Wang
Date: Thu, 8 Feb 2024 13:52:01 +0800
Subject: [PATCH] LLM: basic support for q2k (#10132)

* basic support for q2k

* fix style
---
 python/llm/src/bigdl/llm/ggml/quantize.py            |  3 ++-
 .../llm/src/bigdl/llm/transformers/low_bit_linear.py | 10 ++++++----
 python/llm/src/bigdl/llm/transformers/model.py       |  3 ++-
 python/llm/src/bigdl/llm/transformers/utils.py       |  9 ++++-----
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/python/llm/src/bigdl/llm/ggml/quantize.py b/python/llm/src/bigdl/llm/ggml/quantize.py
index 31be0f8f..b25dd7ce 100644
--- a/python/llm/src/bigdl/llm/ggml/quantize.py
+++ b/python/llm/src/bigdl/llm/ggml/quantize.py
@@ -41,7 +41,8 @@ ggml_tensor_qtype = {"sym_int4": 2,  # q4_0 in ggml
                      "fp8": 19,  # fp8 in e5m2 format
                      "bf16": 20,
                      "iq2_xxs": 21,
-                     "iq2_xs": 22}
+                     "iq2_xs": 22,
+                     "q2_k": 23}
 
 _llama_quantize_type = {"q4_0": 2,
                         "q4_1": 3,
diff --git a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py
index 62c97da1..9676de62 100644
--- a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py
+++ b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py
@@ -72,6 +72,7 @@ MOFQ8 = ggml_tensor_qtype["mixed_fp8"]
 FP8E5 = ggml_tensor_qtype["fp8_e5m2"]
 IQ2_XXS = ggml_tensor_qtype["iq2_xxs"]
 IQ2_XS = ggml_tensor_qtype["iq2_xs"]
+Q2_K = ggml_tensor_qtype["q2_k"]
 
 
 def get_block_size(qtype: str):
@@ -105,12 +106,13 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int,
     if not convert_shape_only and device != 'meta':
         dst = ctypes.c_void_p(dst_tensor.data.data_ptr())
         hist = (ctypes.c_int64 * 16)()
-        if qtype not in [IQ2_XXS, IQ2_XS]:
+        if qtype not in [IQ2_XXS, IQ2_XS, Q2_K]:
             ggml.ggml_quantize_tensor(src, dst, qtype, n, k, hist)
         else:
-            # quantize with importance matrix
-            imatrix = imatrix.data.data_ptr()
-            imatrix = ctypes.cast(imatrix, ctypes.POINTER(ctypes.c_float))
+            if imatrix is not None:
+                # quantize with importance matrix
+                imatrix = imatrix.data.data_ptr()
+                imatrix = ctypes.cast(imatrix, ctypes.POINTER(ctypes.c_float))
             # pass nrow and n_per_row
             ggml.ggml_quantize_tensor_with_weights(src, dst, qtype,
                                                    n // in_features, in_features,
diff --git a/python/llm/src/bigdl/llm/transformers/model.py b/python/llm/src/bigdl/llm/transformers/model.py
index 022190e6..c3cdde4a 100644
--- a/python/llm/src/bigdl/llm/transformers/model.py
+++ b/python/llm/src/bigdl/llm/transformers/model.py
@@ -271,10 +271,11 @@ class _BaseAutoModelClass:
         else:
             kwargs["pretraining_tp"] = 1
         q_k = load_in_low_bit if load_in_low_bit else "sym_int4"
+        imatrix_file = kwargs.pop("imatrix", None)
         if q_k in ["iq2_xxs", "iq2_xs"]:
-            imatrix_file = kwargs.pop("imatrix", None)
             invalidInputError(imatrix_file is not None,
                               "For iq2_xxs and iq2_xs quantization, imatrix is needed.")
+        if imatrix_file is not None:
             imatrix_data = load_imatrix_data(imatrix_file)
             kwargs['imatrix_data'] = imatrix_data
         model = cls.load_convert(q_k, optimize_model, *args, **kwargs)
diff --git a/python/llm/src/bigdl/llm/transformers/utils.py b/python/llm/src/bigdl/llm/transformers/utils.py
index e7ab41aa..e3751fe7 100644
--- a/python/llm/src/bigdl/llm/transformers/utils.py
+++ b/python/llm/src/bigdl/llm/transformers/utils.py
@@ -225,7 +225,8 @@ def load_imatrix_data(imatrix_file):
 
 
 def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data):
-    if qtype in [ggml_tensor_qtype["iq2_xxs"], ggml_tensor_qtype["iq2_xs"]]:
+    if qtype in [ggml_tensor_qtype["iq2_xxs"], ggml_tensor_qtype["iq2_xs"],
+                 ggml_tensor_qtype["q2_k"]] and imatrix_data is not None:
         # For quantization which needs importance matrix
         # module name preprocess
         # full name maybe model.layers.31.self_attn.o_proj
@@ -253,11 +254,9 @@ def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data):
         if cur_module == 'v' or (cur_module == 'down' and int(layer) in [0, 1, 10, 11]) \
                 or new_module_name == 'lm_head':
             cur_qtype = ggml_tensor_qtype['sym_int4']
+        return cur_qtype, cur_imatrix
     else:
-        cur_imatrix = None
-        cur_qtype = qtype
-
-    return cur_qtype, cur_imatrix
+        return qtype, None
 
 
 def get_modelscope_hf_config(model_id_or_path: str,