#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from ipex_llm.utils.common import invalidInputError
import argparse
import os


def _special_kwarg_check(kwargs, check_args):
    _used_args = {}
    for arg in kwargs:
        if arg in check_args:
            _used_args[arg] = kwargs[arg]
    return True, _used_args
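
# For illustration only (hypothetical values, not part of the original module):
# _special_kwarg_check({"tmp_path": "/tmp/x", "unused": 1}, ["tmp_path"])
# returns (True, {"tmp_path": "/tmp/x"}); keyword arguments not listed in
# check_args are silently dropped.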


def llm_convert(model,
                outfile,
                model_family,
                outtype='int4',
                model_format="pth",
                **kwargs):
    """
    This function is able to:

        1. Convert a Hugging Face llama-like / gpt-neox-like / bloom-like / starcoder-like
           / chatglm-like PyTorch model to lower precision in BigDL-LLM optimized GGML format.
        2. Convert a Hugging Face GPTQ format llama-like model to BigDL-LLM optimized
           GGML format.

    :param model: Path to a **directory**:

        1. If ``model_format='pth'``, the folder should be a Hugging Face checkpoint
           that is directly pulled from the Hugging Face hub, for example ``./llama-7b-hf``.
           This should be a dir path that contains: weight bin, tokenizer config,
           ``tokenizer.model`` (required for llama) and ``added_tokens.json`` (if applied).
           For a LoRA finetuned model, the path should point to the merged weight.
        2. If ``model_format='gptq'``, the folder should be a Hugging Face checkpoint
           in GPTQ format, which contains weights in PyTorch's .pt format,
           and ``tokenizer.model``.

    :param outfile: Save path of the output quantized model. You must pass a **directory**
        to save all related output.
    :param model_family: Which model family your input model belongs to.
        Now ``llama``/``bloom``/``gptneox``/``starcoder``/``chatglm`` are supported.
        If ``model_format='gptq'``, only ``llama`` is supported.
    :param outtype: Which quantized precision will be converted.
        If ``model_format='pth'``, ``int4`` and ``int8`` are supported,
        while ``int8`` only works for ``llama`` and ``gptneox``.
        If ``model_format='gptq'``, only ``int4`` is supported.
    :param model_format: Specify the model format to be converted. ``pth`` is for a
        PyTorch model checkpoint from Hugging Face. ``gptq`` is for a GPTQ format
        model from Hugging Face.
    :param **kwargs: Supported keyword arguments include:

        * ``tmp_path``: Valid when ``model_format='pth'``. It refers to the path
          that stores the intermediate model during the conversion process.
        * ``tokenizer_path``: Valid when ``model_format='gptq'``. It refers to the path
          where ``tokenizer.model`` is located (if it is not in the ``model`` directory).

    :return: The path string to the converted lower precision checkpoint.
    """
    if model_format == "pth":
        from ipex_llm.ggml.convert_model import convert_model as ggml_convert_model
        _, _used_args = _special_kwarg_check(kwargs=kwargs,
                                             check_args=["tmp_path"])
        return ggml_convert_model(input_path=model,
                                  output_path=outfile,
                                  model_family=model_family,
                                  dtype=outtype,
                                  **_used_args,
                                  )
    elif model_format == "gptq":
        from ipex_llm.gptq.convert.convert_gptq_to_ggml import convert_gptq2ggml
        invalidInputError(model_family == "llama" and outtype == 'int4',
                          "Converting GPTQ models should always specify "
                          "`--model-family llama --dtype int4` in the command line.")
        os.makedirs(outfile, exist_ok=True)
        invalidInputError(os.path.isdir(outfile),
                          "The output_path {} is not a directory".format(outfile))
        _, _used_args = _special_kwarg_check(kwargs=kwargs,
                                             check_args=["tokenizer_path"])

        output_filename = "bigdl_llm_{}_{}_from_gptq.bin".format(model_family,
                                                                 outtype.lower())
        outfile = os.path.join(outfile, output_filename)

        # TODO: delete this when AutoTokenizer is supported
        if "tokenizer_path" in _used_args:
            gptq_tokenizer_path = _used_args["tokenizer_path"]
        else:
            gptq_tokenizer_path = None

        convert_gptq2ggml(model_path=model,
                          output_path=outfile,
                          tokenizer_path=gptq_tokenizer_path,
                          )
        return outfile
    else:
        invalidInputError(False, f"Unsupported input model_format: {model_format}")

    return None
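

# Example usage (a sketch, not part of the original module; all paths below are
# illustrative):
#
#     # Convert a local Hugging Face llama checkpoint to an int4 GGML file:
#     ggml_path = llm_convert(model="./llama-7b-hf",
#                             outfile="./converted_model",
#                             model_family="llama",
#                             outtype="int4",
#                             model_format="pth")
#
#     # Convert a llama GPTQ checkpoint (only llama + int4 is accepted here):
#     ggml_path = llm_convert(model="./llama-7b-gptq",
#                             outfile="./converted_model",
#                             model_family="llama",
#                             outtype="int4",
#                             model_format="gptq",
#                             tokenizer_path="./llama-7b-gptq/tokenizer.model")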


def main():
    parser = argparse.ArgumentParser(description='Model Convert Parameters')
    parser.add_argument('model', type=str,
                        help=("model, a path to a *directory* containing model weights"))
    parser.add_argument('-o', '--outfile', type=str, required=True,
                        help=("outfile, save path of the output quantized model."))
    parser.add_argument('-x', '--model-family', type=str, required=True,
                        help=("--model-family: Which model family your input model belongs to. "
                              "Now only `llama`/`bloom`/`gptneox`/`chatglm` are supported."))
    parser.add_argument('-f', '--model-format', type=str, required=True,
                        help=("The model format to be converted to a ggml compatible file. "
                              "Now only `pth`/`gptq` are supported."))
    parser.add_argument('-t', '--outtype', type=str, default="int4",
                        help="Which quantized precision will be converted.")

    # pth specific args
    parser.add_argument('-p', '--tmp-path', type=str, default=None,
                        help="Which path to store the intermediate model during the "
                             "conversion process.")

    # gptq specific args
    parser.add_argument('-k', '--tokenizer-path', type=str, default=None,
                        help="tokenizer_path, a path of tokenizer.model")
    args = parser.parse_args()
    params = vars(args)
    llm_convert(**params)
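

# Example CLI invocation (a sketch; this assumes the package wires main() up as
# the `llm-convert` console script, and the paths are illustrative):
#
#     llm-convert ./llama-7b-hf --outfile ./converted_model \
#         --model-family llama --model-format pth --outtype int4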