From bfc1caa5e5c5b16ff5fbf939538ed128722cd44f Mon Sep 17 00:00:00 2001
From: Ruonan Wang
Date: Mon, 1 Apr 2024 13:13:13 +0800
Subject: [PATCH] LLM: support iq1s for llama2-70b-hf (#10596)

---
 python/llm/src/ipex_llm/transformers/convert.py | 15 ++++++---------
 python/llm/src/ipex_llm/transformers/utils.py   | 16 ++++++++++++++--
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
index c0f9c857..d607dbaa 100644
--- a/python/llm/src/ipex_llm/transformers/convert.py
+++ b/python/llm/src/ipex_llm/transformers/convert.py
@@ -192,7 +192,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                                   convert_shape_only=False,
                                   cpu_embedding=False, prefix_name='',
                                   imatrix_data=None, embedding_qtype=None,
-                                 model_type=None, torch_dtype=torch.float32,
+                                 model_config=None, torch_dtype=torch.float32,
                                   enable_xetla=False):
     from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \
         FP16Linear, BF16Linear
@@ -211,6 +211,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                 in_features, out_features, mp_group = linear_args
                 optimize_lm_head = False
                 if name == "lm_head":
+                    model_type = getattr(model_config, "model_type", None)
                     if model_type in ["gptj", "llama"] and os.environ.get("BIGDL_OPTIMIZE_LM_HEAD",
                                                                           None) == "1":
                         optimize_lm_head = True
@@ -262,7 +263,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                     cur_qtype, cur_imatrix = get_cur_qtype_and_imatrix(qtype,
                                                                        full_module_name,
                                                                        imatrix_data,
-                                                                       model_type)
+                                                                       model_config)
                     device = module.weight.data.device
                     # Copy the weights
                     paramsLowBit = FP4Params(data=module.weight.data,
@@ -378,7 +379,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                 prefix_name=prefix_name + '.' + name if prefix_name != '' else name,
                 imatrix_data=imatrix_data,
                 embedding_qtype=embedding_qtype,
-                model_type=model_type,
+                model_config=model_config,
                 torch_dtype=torch_dtype,
                 enable_xetla=enable_xetla,
             )
@@ -652,17 +653,13 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
     if optimize_model:
         model = _optimize_pre(model)
 
-    # mixed quantization needs model_type to choose custom quantization strategy
-    if hasattr(model, "config"):
-        model_type = getattr(model.config, "model_type", None)
-    else:
-        model_type = None
+    # mixed quantization needs model_config to choose custom quantization strategy
     model, has_been_replaced = _replace_with_low_bit_linear(
         model, qtype, modules_to_not_convert,
         convert_shape_only, cpu_embedding,
         imatrix_data=imatrix_data,
         embedding_qtype=embedding_qtype,
-        model_type=model_type,
+        model_config=getattr(model, "config", None),
         torch_dtype=torch_dtype,
         enable_xetla=enable_xetla,
     )
diff --git a/python/llm/src/ipex_llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py
index 1450cd0c..b7873045 100644
--- a/python/llm/src/ipex_llm/transformers/utils.py
+++ b/python/llm/src/ipex_llm/transformers/utils.py
@@ -267,8 +267,12 @@ def module_name_process(full_module_name):
     return new_module_name, layer, cur_module
 
 
-def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_type=None):
+def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_config=None):
     cur_qtype = qtype
+    if model_config is not None:
+        model_type = getattr(model_config, "model_type", None)
+    else:
+        model_type = None
     if qtype in [ggml_tensor_qtype["gguf_iq2_xxs"], ggml_tensor_qtype["gguf_iq2_xs"],
                  ggml_tensor_qtype["gguf_iq1_s"]]:
         # For quantization which needs importance matrix
@@ -281,7 +285,15 @@ def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_type=
             elif cur_module == 'down' and int(layer) in [0, 1, 2, 3]:
                 cur_qtype = ggml_tensor_qtype['q2_k']
         else:
-            if cur_module == 'v' or (cur_module == 'down' and int(layer) in [0, 1, 10, 11]):
+            num_hidden_layers = getattr(model_config, "num_hidden_layers", None)
+            hidden_size = getattr(model_config, "hidden_size", None)
+            if model_type == "llama" and hidden_size == 8192:
+                # for llama2-70b
+                if cur_module == 'v':
+                    cur_qtype = ggml_tensor_qtype['sym_int4']  # llama.cpp uses q4_k here
+                if cur_module == 'down' and int(layer) < int(num_hidden_layers/8):
+                    cur_qtype = ggml_tensor_qtype['q2_k']
+            elif cur_module == 'v' or (cur_module == 'down' and int(layer) in [0, 1, 10, 11]):
                 cur_qtype = ggml_tensor_qtype['q2_k']
             if qtype == ggml_tensor_qtype["gguf_iq1_s"] and cur_module == 'o':
                 cur_qtype = ggml_tensor_qtype['gguf_iq2_xxs']
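
Reviewer note, not part of the patch: a minimal standalone sketch of the mixed-quantization rule this change adds for llama2-70b under gguf_iq1_s. The helper name pick_qtype_for_llama2_70b, the defaults num_hidden_layers=80 / hidden_size=8192, and the plain-string qtype labels are illustrative assumptions only; the real logic lives in get_cur_qtype_and_imatrix, also checks model_type == "llama", and indexes into ggml_tensor_qtype.

# Sketch only: mirrors the llama2-70b branch added to get_cur_qtype_and_imatrix.
def pick_qtype_for_llama2_70b(cur_module, layer,
                              num_hidden_layers=80, hidden_size=8192,
                              base_qtype="gguf_iq1_s"):
    cur_qtype = base_qtype
    if hidden_size == 8192:                      # the patch uses hidden_size 8192 to spot llama2-70b
        if cur_module == "v":
            cur_qtype = "sym_int4"               # v_proj falls back to 4-bit (q4_k in llama.cpp)
        if cur_module == "down" and layer < num_hidden_layers // 8:
            cur_qtype = "q2_k"                   # first 1/8 of down_proj layers get q2_k
    if base_qtype == "gguf_iq1_s" and cur_module == "o":
        cur_qtype = "gguf_iq2_xxs"               # with base iq1_s, o_proj is bumped to iq2_xxs
    return cur_qtype

# e.g. pick_qtype_for_llama2_70b("down", 5)  -> "q2_k"
#      pick_qtype_for_llama2_70b("v", 40)    -> "sym_int4"
#      pick_qtype_for_llama2_70b("o", 40)    -> "gguf_iq2_xxs"
#      pick_qtype_for_llama2_70b("down", 40) -> "gguf_iq1_s"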