diff --git a/python/llm/setup.py b/python/llm/setup.py
index b4acd347..1caa24d5 100644
--- a/python/llm/setup.py
+++ b/python/llm/setup.py
@@ -42,7 +42,7 @@ BIGDL_PYTHON_HOME = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 VERSION = open(os.path.join(BIGDL_PYTHON_HOME, 'version.txt'), 'r').read().strip()
 llm_home = os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")
 libs_dir = os.path.join(llm_home, "bigdl", "llm", "libs")
-CONVERT_DEP = ['numpy', 'torch', 'transformers', 'sentencepiece']
+CONVERT_DEP = ['numpy', 'torch', 'transformers', 'sentencepiece', 'accelerate']
 
 
 def get_llm_packages():
@@ -151,6 +151,11 @@ def setup_package():
         package_dir={"": "src"},
         package_data={"bigdl.llm": package_data[platform_name]},
         include_package_data=True,
+        entry_points={
+            "console_scripts": [
+                'convert_model=bigdl.llm.ggml.convert_model:main'
+            ]
+        },
         extras_require={"all": all_requires},
         classifiers=[
             'License :: OSI Approved :: Apache Software License',
diff --git a/python/llm/src/bigdl/llm/ggml/convert.py b/python/llm/src/bigdl/llm/ggml/convert.py
index 69c2aa75..1814965e 100644
--- a/python/llm/src/bigdl/llm/ggml/convert.py
+++ b/python/llm/src/bigdl/llm/ggml/convert.py
@@ -77,7 +77,11 @@ def _convert_to_ggml(model_path: str, outfile_dir: str,
     """
     Convert Hugging Face llama-like / gpt-neox-like / bloom-like model to ggml format.
 
-    :param model_path: str, path of model, for example `./llama-7b-hf`.
+    :param model_path: Path to a *directory* of a huggingface checkpoint that was directly
+           pulled from the huggingface hub, for example `./llama-7b-hf`. This directory
+           should contain: weight bin(s), tokenizer config, tokenizer.model (required for
+           llama) and added_tokens.json (if applicable).
+           For a LoRA fine-tuned model, the path should point to the merged weights.
     :param outfile_dir: str, the directory to save ggml compatible file, for example `./models`.
     :param model_family: Which model family your input model belongs to. Default to `llama`.
         Now only `llama`/`bloom`/`gptneox` are supported.
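As a quick illustration of the helper whose docstring is updated above, the sketch below calls `_convert_to_ggml` directly. The paths are placeholders and not part of this patch; `./llama-7b-hf` is assumed to be a checkpoint directory laid out as the docstring describes.

from bigdl.llm.ggml.convert import _convert_to_ggml

# Hypothetical paths: the input directory holds the weight bin(s), tokenizer config
# and tokenizer.model; the fp16 ggml file is written into outfile_dir.
_convert_to_ggml(model_path="./llama-7b-hf",
                 outfile_dir="./models",
                 model_family="llama",
                 outtype="fp16")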
diff --git a/python/llm/src/bigdl/llm/ggml/convert_model.py b/python/llm/src/bigdl/llm/ggml/convert_model.py
index 653b5cc0..4164314d 100644
--- a/python/llm/src/bigdl/llm/ggml/convert_model.py
+++ b/python/llm/src/bigdl/llm/ggml/convert_model.py
@@ -16,47 +16,99 @@
 import os
 import time
 from pathlib import Path
-
 from bigdl.llm.ggml.convert import _convert_to_ggml
 from bigdl.llm.ggml.quantize import quantize
+from bigdl.llm.utils.common import invalidInputError
+import argparse
+import tempfile
 
 
 def convert_model(input_path: str,
                   output_path: str,
                   model_family: str,
                   dtype: str = 'int4',
-                  tmp_path: str = '/tmp'):
+                  tmp_path: str = None):
     """
     Convert Hugging Face llama-like / gpt-neox-like / bloom-like model to lower precision
 
-    :param input_path: str, path of model, for example `./llama-7b-hf`.
-    :param output_path: Save path of output quantized model. Default to `None`.
-            If you don't specify this parameter, quantized model will be saved in
-            the same directory as the input and just replace precision with quantize_type
-            like `./ggml-model-q4_0.bin`.
+    :param input_path: Path to a *directory* of a huggingface checkpoint that was directly
+           pulled from the huggingface hub, for example `./llama-7b-hf`. This directory
+           should contain: weight bin(s), tokenizer config, tokenizer.model (required for
+           llama) and added_tokens.json (if applicable).
+           For a LoRA fine-tuned model, the path should point to the merged weights.
+    :param output_path: Save path of the output quantized model. You must pass a *directory*
+           in which all related output will be saved.
     :param model_family: Which model family your input model belongs to.
            Now only `llama`/`bloom`/`gptneox` are supported.
     :param dtype: Which quantized precision will be converted.
-            Now only int4 supported.
+            Now only int4 is supported.
     :param tmp_path: Which path to store the intermediate model during the conversion process.
+            Default to `None`, in which case the intermediate model will not be saved.
 
-    :return: the path str to the converted lower precision checkpoint
+    :return: the path string to the converted lower precision checkpoint.
     """
     dtype = dtype.lower()
+    # make sure the output directory exists
+    os.makedirs(output_path, exist_ok=True)
+    # check input values
+    invalidInputError(model_family in ['llama', 'bloom', 'gptneox'],
+                      "Now we only support quantization of model \
+                       family('llama', 'bloom', 'gptneox')",
+                      "{} is not in the list.".format(model_family))
+    invalidInputError(os.path.isdir(output_path),
+                      "The output_path {} was not a directory".format(output_path))
+    invalidInputError(dtype == 'int4',
+                      "Now only int4 is supported.")
+    # check for input_path
+    invalidInputError(os.path.exists(input_path),
+                      "The input path {} was not found".format(input_path))
+    invalidInputError(os.path.isdir(input_path),
+                      "The input path {} was not a directory".format(input_path))
+    # TODO: shall we support a model_id or just a model directory?
+
     if dtype == 'int4':
         dtype = 'q4_0'
-    model_name = Path(input_path).stem
-    tmp_ggml_file_path = os.path.join(tmp_path, f'{model_name}_{int(time.time())}')
-    _convert_to_ggml(model_path=input_path,
-                     outfile_dir=tmp_ggml_file_path,
-                     model_family=model_family,
-                     outtype="fp16")
+    if tmp_path is not None:
+        model_name = Path(input_path).stem
+        tmp_ggml_file_path = os.path.join(tmp_path, f'{model_name}_{int(time.time())}')
+        _convert_to_ggml(model_path=input_path,
+                         outfile_dir=tmp_ggml_file_path,
+                         model_family=model_family,
+                         outtype="fp16")
+        tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
+        return quantize(input_path=tmp_ggml_file_path,
+                        output_path=output_path,
+                        model_family=model_family,
+                        dtype=dtype)
+    else:
+        with tempfile.TemporaryDirectory() as tmp_ggml_file_path:
+            _convert_to_ggml(model_path=input_path,
+                             outfile_dir=tmp_ggml_file_path,
+                             model_family=model_family,
+                             outtype="fp16")
+            tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
+            return quantize(input_path=tmp_ggml_file_path,
+                            output_path=output_path,
+                            model_family=model_family,
+                            dtype=dtype)
 
-    tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
-    return quantize(input_path=tmp_ggml_file_path,
-                    output_path=output_path,
-                    model_family=model_family,
-                    dtype=dtype)
+
+def main():
+    parser = argparse.ArgumentParser(description='Model Convert Parameters')
+    parser.add_argument('-i', '--input_path', type=str, required=True,
+                        help="input_path, a path to a *directory* containing model weights")
+    parser.add_argument('-o', '--output_path', type=str, required=True,
+                        help="output_path, save path of the output quantized model.")
+    parser.add_argument('-x', '--model_family', type=str, required=True,
+                        help=("model_family: Which model family your input model belongs to. "
+                              "Now only `llama`/`bloom`/`gptneox` are supported."))
+    parser.add_argument('-t', '--dtype', type=str, default="int4",
+                        help="Which quantized precision will be converted.")
+    parser.add_argument('-p', '--tmp_path', type=str, default=None,
+                        help="Which path to store the intermediate model during the "
+                             "conversion process.")
+    args = parser.parse_args()
+    params = vars(args)
+    convert_model(**params)
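For context, a rough usage sketch of the new `convert_model` API and of the console script registered in setup.py. The checkpoint and output paths below are placeholders, not part of this patch.

from bigdl.llm.ggml.convert_model import convert_model

# Equivalent CLI form (available once the console_scripts entry point is installed):
#   convert_model -i ./llama-7b-hf -o ./llama-7b-int4 -x llama
# Placeholder paths; output_path is created if it does not already exist.
ckpt = convert_model(input_path="./llama-7b-hf",
                     output_path="./llama-7b-int4",
                     model_family="llama",
                     dtype="int4")
print(ckpt)  # e.g. ./llama-7b-int4/bigdl_llm_llama_q4_0.bin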
+ "Now only `llama`/`bloom`/`gptneox` are supported.")) + parser.add_argument('-t', '--dtype', type=str, default="int4", + help="Which quantized precision will be converted.") + parser.add_argument('-p', '--tmp_path', type=str, default=None, + help="Which path to store the intermediate model during the" + "conversion process.") + args = parser.parse_args() + params = vars(args) + convert_model(**params) diff --git a/python/llm/src/bigdl/llm/ggml/quantize.py b/python/llm/src/bigdl/llm/ggml/quantize.py index 973cbc7e..f28b76a8 100644 --- a/python/llm/src/bigdl/llm/ggml/quantize.py +++ b/python/llm/src/bigdl/llm/ggml/quantize.py @@ -42,24 +42,23 @@ _quantize_type = {"llama": _llama_quantize_type, "gptneox": _gptneox_quantize_type} -def quantize(input_path: str, output_path: str=None, - model_family: str = 'llama', dtype: str='q4_0'): +def quantize(input_path: str, output_path: str, + model_family: str, dtype: str='q4_0'): """ Quantize ggml file to lower precision. :param input_path: Path of input ggml file, for example `./ggml-model-f16.bin`. - :param output_path: Save path of output quantized model. Default to `None`. - If you don't specify this parameter, quantized model will be saved in - the same directory as the input and just replace precision with quantize_type - like `./ggml-model-q4_0.bin`. - :param model_family: Which model family your input model belongs to. Default to `llama`. + :param output_path: Save path of output quantized model. You must pass a directory to + save all related output. Filename of quantized model will be like + `bigdl_llm_llama_q4_0.bin`. + :param model_family: Which model family your input model belongs to. Now only `llama`/`bloom`/`gptneox` are supported. :param dtype: Quantization method which differs in the resulting model disk size and - inference speed. Defalut to `q4_0`. Difference model family may support different types, - now the supported list is: + inference speed. Defalut to `q4_0`. 
diff --git a/python/llm/src/bigdl/llm/ggml/transformers/model.py b/python/llm/src/bigdl/llm/ggml/transformers/model.py
index c1ff926d..b09d326a 100644
--- a/python/llm/src/bigdl/llm/ggml/transformers/model.py
+++ b/python/llm/src/bigdl/llm/ggml/transformers/model.py
@@ -42,7 +42,7 @@ class AutoModelForCausalLM:
         """
         :param pretrained_model_name_or_path: We support 3 kinds of pretrained model checkpoint
 
-               1. path for huggingface checkpoint that are directly pulled from hugginface hub.
+               1. path for huggingface checkpoint that are directly pulled from huggingface hub.
                   This should be a dir path that contains: weight bin, tokenizer config,
                   tokenizer.model (required for llama) and added_tokens.json (if applied).
                   For lora fine tuned model, the path should be pointed to a merged weight.
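Finally, a sketch of loading a checkpoint through `AutoModelForCausalLM.from_pretrained` as described in the docstring touched by the last hunk. Only `pretrained_model_name_or_path` is documented in this diff, so any further keyword arguments the method may require (for example a model-family selector) are not shown and would be assumptions.

from bigdl.llm.ggml.transformers.model import AutoModelForCausalLM

# `./llama-7b-hf` is a placeholder for a huggingface-style checkpoint directory;
# additional required keyword arguments, if any, are not covered by this diff.
model = AutoModelForCausalLM.from_pretrained("./llama-7b-hf")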