ipex-llm/python/llm/src/ipex_llm/ggml/quantize.py
Cengguang Zhang 9930351112
LLM: add new qtype woq_int4 to temporarily support GEMM int4. (#12706)
This PR adds a temporary qtype, woq_int4, to avoid affecting other qtypes and models.

Co-authored-by: leonardozcm <leonardo1997zcm@gmail.com>
2025-01-15 14:41:33 +08:00


#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import subprocess
from ipex_llm.utils.common import invalidInputError
import platform
from pathlib import Path
dirname, _ = os.path.split(os.path.abspath(__file__))
libs_dirname = os.path.dirname(dirname)
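# Note (added for clarity): `dirname` above is the directory of this file and
# `libs_dirname` is its parent package directory, which is expected to contain the
# prebuilt native quantize binaries under `libs/` (e.g. `libs/quantize-llama`)
# invoked by `quantize()` below.
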
# ggml quantized tensor types; note that these differ from the file quantization
# types below (_quantize_type)
ggml_tensor_qtype = {"sym_int4": 2,        # q4_0 in ggml
                     "asym_int4": 3,       # q4_1 in ggml
                     "sym_int5": 6,        # q5_0 in ggml
                     "asym_int5": 7,       # q5_1 in ggml
                     "sym_int8": 8,        # q8_0 in ggml
                     "nf4": 10,
                     "nf3": 11,
                     "fp16": 12,
                     "fp8_e4m3": 15,       # fp8 in e4m3 format
                     "fp4": 16,
                     "mixed_fp4": 17,      # Mixture of Formats Quantization 4 bits
                     "mixed_fp8": 18,      # Mixture of Formats Quantization 8 bits
                     "fp8_e5m2": 19,       # fp8 in e5m2 format
                     "fp8": 19,            # fp8 in e5m2 format
                     "bf16": 20,
                     "gguf_iq2_xxs": 21,
                     "gguf_iq2_xs": 22,
                     "q2_k": 23,
                     "gguf_iq1_s": 24,
                     "gguf_iq1_m": 25,
                     "q6_k": 26,
                     "q4_k": 27,
                     "q5_k": 28,
                     "fp6": 29,
                     "fp6_k": 30,
                     "sym_int4_rtn": 31,
                     "sym_int8_rtn": 32,
                     "asym_int4_rtn": 33,
                     "woq_int4": 34,       # temporary qtype for gemm int4 (#12706)
                     }
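
# Illustrative note (not part of the original file): these names are plain dict keys,
# so resolving a qtype such as the newly added "woq_int4" to its ggml integer id is a
# simple lookup, e.g.
#     ggml_tensor_qtype["woq_int4"]   # -> 34
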
# mixed precision from llama.cpp
gguf_mixed_qtype = {"gguf_q4k_s": 101,
                    "gguf_q4k_m": 102}

_llama_quantize_type = {"q4_0": 2,
                        "q4_1": 3,
                        "q5_0": 8,
                        "q5_1": 9,
                        "q8_0": 7}
_bloom_quantize_type = {"q4_0": 2,
                        "q4_1": 3}
_gptneox_quantize_type = {"q4_0": 2,
                          "q4_1": 3,
                          "q5_0": 8,
                          "q5_1": 9,
                          "q8_0": 7}
_starcoder_quantize_type = {"q4_0": 2,
                            "q4_1": 3,
                            "q5_0": 8,
                            "q5_1": 9,
                            "q8_0": 7}

_quantize_type = {"llama": _llama_quantize_type,
                  "bloom": _bloom_quantize_type,
                  "gptneox": _gptneox_quantize_type,
                  "starcoder": _starcoder_quantize_type}


def quantize(input_path: str, output_path: str,
             model_family: str, dtype: str = 'q4_0'):
    """
    Quantize a ggml file to lower precision.

    :param input_path: Path of the input ggml file, for example `./ggml-model-f16.bin`.
    :param output_path: Save path of the output quantized model. You must pass a directory
           to save all related output. The filename of the quantized model will be like
           `bigdl_llm_llama_q4_0.bin`.
    :param model_family: Which model family your input model belongs to.
           Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported.
    :param dtype: Quantization method, which differs in the resulting model disk size and
           inference speed. Defaults to `q4_0`. Different model families may support
           different types; the currently supported list is:

           llama : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"
           bloom : "q4_0", "q4_1"
           gptneox : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"
           starcoder : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"

    :return: the path str to the converted ggml binary checkpoint
    """
    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder'],
                      "Now we only support quantization of model "
                      "family('llama', 'bloom', 'gptneox', 'starcoder')",
                      "{} is not in the list.".format(model_family))
    invalidInputError(os.path.isfile(input_path),
                      "The file {} is not found".format(input_path))
    invalidInputError(os.path.isdir(output_path),
                      "The output_path {} is not a directory".format(output_path))
    # convert the quantize type str into its corresponding int value
    quantize_type_map = _quantize_type[model_family]
    output_filename = "bigdl_llm_{}_{}.bin".format(model_family,
                                                   dtype.lower())
    output_path = os.path.join(output_path, output_filename)
    invalidInputError(dtype.lower() in quantize_type_map,
                      "{0} model only accepts {1} now, but you passed in {2}.".format(
                          model_family,
                          list(quantize_type_map.keys()),
                          dtype))
    quantize_type = quantize_type_map[dtype.lower()]
    # the prebuilt quantize binary carries an '.exe' suffix on Windows
    if platform.platform().startswith('Windows'):
        suffix = '.exe'
    else:
        suffix = ''
    quantize_args = "{0}/libs/quantize-{1}{2} {3} {4} {5}".format(libs_dirname,
                                                                  model_family,
                                                                  suffix,
                                                                  input_path,
                                                                  output_path,
                                                                  str(quantize_type))
    p = subprocess.run(quantize_args.split(), capture_output=True)
    error_message = p.stderr
    invalidInputError(not p.returncode,
                      "Failed to quantize {}, error message is {}.".format(str(input_path),
                                                                           error_message))
    return str(output_path)
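

# Minimal usage sketch (illustrative, not part of the original module). The paths are
# hypothetical placeholders: a converted fp16 ggml checkpoint and an existing output
# directory, matching the docstring of quantize() above.
if __name__ == "__main__":
    quantized_ckpt = quantize(input_path="./ggml-model-f16.bin",
                              output_path="./quantized-models/",
                              model_family="llama",
                              dtype="q4_0")
    print("Quantized checkpoint written to {}".format(quantized_ckpt))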