LLM: support iq1s for llama2-70b-hf (#10596)
commit bfc1caa5e5
parent d6af4877dd

2 changed files with 20 additions and 11 deletions
@@ -192,7 +192,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                                  convert_shape_only=False,
                                  cpu_embedding=False, prefix_name='',
                                  imatrix_data=None, embedding_qtype=None,
-                                 model_type=None, torch_dtype=torch.float32,
+                                 model_config=None, torch_dtype=torch.float32,
                                  enable_xetla=False):
     from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \
         FP16Linear, BF16Linear
@@ -211,6 +211,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
             in_features, out_features, mp_group = linear_args
             optimize_lm_head = False
             if name == "lm_head":
+                model_type = getattr(model_config, "model_type", None)
                 if model_type in ["gptj", "llama"] and os.environ.get("BIGDL_OPTIMIZE_LM_HEAD",
                                                                       None) == "1":
                     optimize_lm_head = True
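Not part of the patch, but useful context for the hunk above: the lm_head optimization is opt-in. A minimal sketch of enabling it, assuming the variable is set before the model is loaded and converted:

import os

# Opt in before conversion; per the check above it only takes effect when the
# config's model_type is "gptj" or "llama" and the value is exactly "1".
os.environ["BIGDL_OPTIMIZE_LM_HEAD"] = "1"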
@@ -262,7 +263,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                     cur_qtype, cur_imatrix = get_cur_qtype_and_imatrix(qtype,
                                                                        full_module_name,
                                                                        imatrix_data,
-                                                                       model_type)
+                                                                       model_config)
                     device = module.weight.data.device
                     # Copy the weights
                     paramsLowBit = FP4Params(data=module.weight.data,
@@ -378,7 +379,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                 prefix_name=prefix_name + '.' + name if prefix_name != '' else name,
                 imatrix_data=imatrix_data,
                 embedding_qtype=embedding_qtype,
-                model_type=model_type,
+                model_config=model_config,
                 torch_dtype=torch_dtype,
                 enable_xetla=enable_xetla,
             )
@@ -652,17 +653,13 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
     if optimize_model:
         model = _optimize_pre(model)
 
-    # mixed quantization needs model_type to choose custom quantization strategy
-    if hasattr(model, "config"):
-        model_type = getattr(model.config, "model_type", None)
-    else:
-        model_type = None
+    # mixed quantization needs model_config to choose custom quantization strategy
     model, has_been_replaced = _replace_with_low_bit_linear(
         model, qtype, modules_to_not_convert,
         convert_shape_only, cpu_embedding,
         imatrix_data=imatrix_data,
         embedding_qtype=embedding_qtype,
-        model_type=model_type,
+        model_config=getattr(model, "config", None),
         torch_dtype=torch_dtype,
         enable_xetla=enable_xetla,
     )
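Not part of the patch: a short sketch of the net effect in this file. Instead of deriving a bare model_type string up front, ggml_convert_low_bit now forwards the whole config object (model_config=getattr(model, "config", None)), so the per-module quantization logic can also read fields such as hidden_size and num_hidden_layers. A SimpleNamespace stands in for a real transformers config here:

from types import SimpleNamespace

# Stand-in for a transformers PretrainedConfig with a llama2-70b-like shape.
config = SimpleNamespace(model_type="llama", hidden_size=8192, num_hidden_layers=80)

# Before: only the architecture name travelled down the call chain.
model_type = getattr(config, "model_type", None)   # "llama"

# After: the full config is passed instead, e.g.
#   _replace_with_low_bit_linear(..., model_config=getattr(model, "config", None), ...)
# so callees can also inspect:
print(model_type, config.hidden_size, config.num_hidden_layers)   # llama 8192 80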
@@ -267,8 +267,12 @@ def module_name_process(full_module_name):
     return new_module_name, layer, cur_module
 
 
-def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_type=None):
+def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_config=None):
     cur_qtype = qtype
+    if model_config is not None:
+        model_type = getattr(model_config, "model_type", None)
+    else:
+        model_type = None
     if qtype in [ggml_tensor_qtype["gguf_iq2_xxs"], ggml_tensor_qtype["gguf_iq2_xs"],
                  ggml_tensor_qtype["gguf_iq1_s"]]:
         # For quantization which needs importance matrix
@@ -281,7 +285,15 @@ def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_type=
             elif cur_module == 'down' and int(layer) in [0, 1, 2, 3]:
                 cur_qtype = ggml_tensor_qtype['q2_k']
         else:
-            if cur_module == 'v' or (cur_module == 'down' and int(layer) in [0, 1, 10, 11]):
+            num_hidden_layers = getattr(model_config, "num_hidden_layers", None)
+            hidden_size = getattr(model_config, "hidden_size", None)
+            if model_type == "llama" and hidden_size == 8192:
+                # for llama2-70b
+                if cur_module == 'v':
+                    cur_qtype = ggml_tensor_qtype['sym_int4']  # llama.cpp use q4k here
+                if cur_module == 'down' and int(layer) < int(num_hidden_layers/8):
+                    cur_qtype = ggml_tensor_qtype['q2_k']
+            elif cur_module == 'v' or (cur_module == 'down' and int(layer) in [0, 1, 10, 11]):
                 cur_qtype = ggml_tensor_qtype['q2_k']
             if qtype == ggml_tensor_qtype["gguf_iq1_s"] and cur_module == 'o':
                 cur_qtype = ggml_tensor_qtype['gguf_iq2_xxs']
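Not part of the patch: a rough paraphrase of the llama2-70b branch above, assuming the requested qtype is gguf_iq1_s and ignoring the importance-matrix handling. The function name is made up for illustration, and qtype names are returned instead of ggml_tensor_qtype integer codes:

# Paraphrase of the per-module selection rules added above (not the library code).
def pick_iq1s_qtype(cur_module, layer, model_type, hidden_size, num_hidden_layers):
    cur_qtype = 'gguf_iq1_s'
    if model_type == "llama" and hidden_size == 8192:
        # llama2-70b: keep the v projections at sym_int4 (llama.cpp uses q4_k here)
        if cur_module == 'v':
            cur_qtype = 'sym_int4'
        # and the earliest down projections (first num_hidden_layers/8 layers) at q2_k
        if cur_module == 'down' and int(layer) < int(num_hidden_layers / 8):
            cur_qtype = 'q2_k'
    elif cur_module == 'v' or (cur_module == 'down' and int(layer) in [0, 1, 10, 11]):
        cur_qtype = 'q2_k'
    # output projections never drop below iq2_xxs when iq1_s is requested
    if cur_module == 'o':
        cur_qtype = 'gguf_iq2_xxs'
    return cur_qtype


# For an 80-layer llama2-70b config: v -> sym_int4, down in layers 0-9 -> q2_k,
# o -> gguf_iq2_xxs, everything else stays gguf_iq1_s.
assert pick_iq1s_qtype('v', 5, "llama", 8192, 80) == 'sym_int4'
assert pick_iq1s_qtype('down', 3, "llama", 8192, 80) == 'q2_k'
assert pick_iq1s_qtype('o', 40, "llama", 8192, 80) == 'gguf_iq2_xxs'
assert pick_iq1s_qtype('q', 40, "llama", 8192, 80) == 'gguf_iq1_s'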