diff --git a/python/llm/src/bigdl/llm/ggml/__init__.py b/python/llm/src/bigdl/llm/ggml/__init__.py
index cba29fd7..adeb474d 100644
--- a/python/llm/src/bigdl/llm/ggml/__init__.py
+++ b/python/llm/src/bigdl/llm/ggml/__init__.py
@@ -21,3 +21,4 @@
 
 from .quantize import quantize
 from .convert import _convert_to_ggml
+from .convert_model import convert_model
diff --git a/python/llm/src/bigdl/llm/ggml/convert_model.py b/python/llm/src/bigdl/llm/ggml/convert_model.py
new file mode 100644
index 00000000..8547e1b5
--- /dev/null
+++ b/python/llm/src/bigdl/llm/ggml/convert_model.py
@@ -0,0 +1,41 @@
+from bigdl.llm.ggml.convert import _convert_to_ggml
+from bigdl.llm.ggml.quantize import quantize
+from pathlib import Path
+import time
+
+
+def convert_model(input_path: str,
+                  output_path: str,
+                  model_family: str,
+                  dtype: str = 'int4'):
+    """
+    Convert a Hugging Face llama-like / gpt-neox-like / bloom-like model to lower precision.
+
+    :param input_path: str, path of the original model, for example `./llama-7b-hf`.
+    :param output_path: Save path of output quantized model. Default to `None`.
+            If you don't specify this parameter, quantized model will be saved in
+            the same directory as the input and just replace precision with quantize_type
+            like `./ggml-model-q4_0.bin`.
+    :param model_family: Which model family your input model belongs to.
+            Now only `llama`/`bloom`/`gptneox` are supported.
+    :param dtype: Which precision the model will be quantized to.
+            Now only `int4` is supported.
+    """
+
+    dtype = dtype.lower()
+    if dtype == 'int4':
+        dtype = 'q4_0'
+
+    model_name = Path(input_path).stem
+    tmp_ggml_file_path = f'/tmp/{model_name}_{int(time.time())}'
+    _convert_to_ggml(model_path=input_path,
+                     outfile_dir=tmp_ggml_file_path,
+                     model_family=model_family,
+                     outtype="fp16")
+
+    tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
+
+    quantize(input_path=tmp_ggml_file_path,
+             output_path=output_path,
+             model_family=model_family,
+             dtype=dtype)
\ No newline at end of file
diff --git a/python/llm/src/bigdl/llm/ggml/quantize.py b/python/llm/src/bigdl/llm/ggml/quantize.py
index a5527ee8..baee0ce4 100644
--- a/python/llm/src/bigdl/llm/ggml/quantize.py
+++ b/python/llm/src/bigdl/llm/ggml/quantize.py
@@ -18,6 +18,7 @@ import os
 import subprocess
 from bigdl.llm.utils.common import invalidInputError
 import platform
+from pathlib import Path
 
 dirname, _ = os.path.split(os.path.abspath(__file__))
 
@@ -28,7 +29,7 @@ _llama_quantize_type = {"q4_0": 2,
                         "q5_0": 8,
                         "q5_1": 9,
                         "q8_0": 7}
-_bloomz_quantize_type = {"q4_0": 2,
+_bloom_quantize_type = {"q4_0": 2,
                          "q4_1": 3}
 _gptneox_quantize_type = {"q4_0": 2,
                           "q4_1": 3,
@@ -38,7 +39,7 @@ _gptneox_quantize_type = {"q4_0": 2,
                           "q8_0": 7}
 
 _quantize_type = {"llama": _llama_quantize_type,
-                  "bloomz": _bloomz_quantize_type,
+                  "bloom": _bloom_quantize_type,
                   "gptneox": _gptneox_quantize_type}
 
-_valid_types = set(list(_llama_quantize_type.keys()) + list(_bloomz_quantize_type.keys()) +
+_valid_types = set(list(_llama_quantize_type.keys()) + list(_bloom_quantize_type.keys()) +
@@ -56,23 +57,23 @@ def quantize(input_path: str, output_path: str=None,
             the same directory as the input and just replace precision with quantize_type
             like `./ggml-model-q4_0.bin`.
     :param model_family: Which model family your input model belongs to. Default to `llama`.
-           Now only `llama`/`bloomz`/`gptneox` are supported.
+           Now only `llama`/`bloom`/`gptneox` are supported.
     :param dtype: Quantization method which differs in the resulting model disk size and
            inference speed. Defalut to `q4_0`.
            Difference model family may support different types, now the supported list is:
            llama : "q4_0", "q4_1", "q4_2"
-           bloomz : "q4_0", "q4_1"
+           bloom : "q4_0", "q4_1"
            gptneox : "q4_0", "q4_1", "q4_2", "q5_0", "q5_1", "q8_0"
     """
-    invalidInputError(model_family in ['llama', 'bloomz', 'gptneox'],
+    invalidInputError(model_family in ['llama', 'bloom', 'gptneox'],
                       "Now we only support quantization of model \
-                       family('llama', 'bloomz', 'gptneox')",
+                       family('llama', 'bloom', 'gptneox')",
                       "{} is not in the list.".format(model_family))
     invalidInputError(os.path.isfile(input_path),
                       "The file {} was not found".format(input_path))
    # TODO : multi input model path
     if output_path is None:
-        output_path = input_path.replace("f16", dtype)
+        output_path = Path(str(input_path).replace('f16', dtype))
     # convert quantize type str into corresponding int value
     quantize_type_map = _quantize_type[model_family]
     invalidInputError(dtype in quantize_type_map, "{0} model just accept {1} now, \
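Usage sketch (not part of the diff): the new convert_model entry point chains _convert_to_ggml and quantize, first writing an fp16 ggml file under /tmp/<model_name>_<timestamp> and then quantizing it to q4_0 at output_path. The checkpoint and output paths below are illustrative placeholders, not files shipped with this change.

    from bigdl.llm.ggml import convert_model

    # Convert a local llama-family Hugging Face checkpoint to an int4 (q4_0) ggml file.
    # './llama-7b-hf' and './ggml-llama-7b-q4_0.bin' are hypothetical example paths.
    convert_model(input_path='./llama-7b-hf',
                  output_path='./ggml-llama-7b-q4_0.bin',
                  model_family='llama',   # one of 'llama', 'bloom', 'gptneox'
                  dtype='int4')           # currently the only supported precision, mapped to 'q4_0'

Note that the intermediate fp16 file is created under /tmp and is not cleaned up by convert_model itself; the per-family quantization types accepted by the underlying quantize step are listed in its docstring above.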