[llm] Add convert_model api (#8244)

* add convert_model api

* change the model_path to input_path

* map int4 to q4_0

* fix blank line

* change bloomz to bloom

* remove default model_family

* change dtype to lower first
Jun Wang 2023-06-03 10:18:29 +08:00 committed by GitHub
parent e290660b20
commit 2bc0e7abbb
3 changed files with 50 additions and 7 deletions


@@ -21,3 +21,4 @@
from .quantize import quantize
from .convert import _convert_to_ggml
from .convert_model import convert_model
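With this re-export in place, the new function becomes importable from the package namespace. A minimal import sketch, assuming this file is bigdl/llm/ggml/__init__.py (the package path is inferred from the imports in the new module below, not stated in the diff):

from bigdl.llm.ggml import convert_model  # exposed alongside quantize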


@@ -0,0 +1,41 @@
from bigdl.llm.ggml.convert import _convert_to_ggml
from bigdl.llm.ggml.quantize import quantize
from pathlib import Path
import time
def convert_model(input_path: str,
                  output_path: str,
                  model_family: str,
                  dtype: str = 'int4'):
"""
Convert Hugging Face llama-like / gpt-neox-like / bloom-like model to lower precision
:param input_path: str, path of model, for example `./llama-7b-hf`.
:param output_path: Save path of output quantized model. Default to `None`.
If you don't specify this parameter, quantized model will be saved in
the same directory as the input and just replace precision with quantize_type
like `./ggml-model-q4_0.bin`.
:param model_family: Which model family your input model belongs to.
Now only `llama`/`bloom`/`gptneox` are supported.
:param dtype: Which quantized precision will be converted.
Now only int4 supported.
"""
    dtype = dtype.lower()
    # map the friendly name 'int4' to the ggml quantization type 'q4_0'
    if dtype == 'int4':
        dtype = 'q4_0'
    model_name = Path(input_path).stem
    # first convert the HF checkpoint to an fp16 ggml file in a temporary directory
    tmp_ggml_file_path = f'/tmp/{model_name}_{int(time.time())}'
    _convert_to_ggml(model_path=input_path,
                     outfile_dir=tmp_ggml_file_path,
                     model_family=model_family,
                     outtype="fp16")
    # then quantize the intermediate fp16 file to the requested precision
    tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
    quantize(input_path=tmp_ggml_file_path,
             output_path=output_path,
             model_family=model_family,
             dtype=dtype)
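A minimal usage sketch of the new API; the checkpoint path, output location, and model family below are illustrative placeholders, not part of this commit:

from bigdl.llm.ggml import convert_model

# Convert a llama-family Hugging Face checkpoint to int4 (mapped to q4_0 internally).
# './llama-7b-hf' and './llama-7b-int4' are hypothetical paths.
convert_model(input_path='./llama-7b-hf',
              output_path='./llama-7b-int4',
              model_family='llama',
              dtype='int4')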


@@ -18,6 +18,7 @@ import os
import subprocess
from bigdl.llm.utils.common import invalidInputError
import platform
from pathlib import Path
dirname, _ = os.path.split(os.path.abspath(__file__))
@@ -28,7 +29,7 @@ _llama_quantize_type = {"q4_0": 2,
"q5_0": 8,
"q5_1": 9,
"q8_0": 7}
_bloomz_quantize_type = {"q4_0": 2,
_bloom_quantize_type = {"q4_0": 2,
"q4_1": 3}
_gptneox_quantize_type = {"q4_0": 2,
"q4_1": 3,
@@ -38,7 +39,7 @@ _gptneox_quantize_type = {"q4_0": 2,
"q8_0": 7}
_quantize_type = {"llama": _llama_quantize_type,
"bloomz": _bloomz_quantize_type,
"bloom": _bloom_quantize_type,
"gptneox": _gptneox_quantize_type}
_valid_types = set(list(_llama_quantize_type.keys()) + list(_bloomz_quantize_type.keys()) +
@@ -56,23 +57,23 @@ def quantize(input_path: str, output_path: str=None,
the same directory as the input and just replace precision with quantize_type
like `./ggml-model-q4_0.bin`.
:param model_family: Which model family your input model belongs to. Default to `llama`.
Now only `llama`/`bloomz`/`gptneox` are supported.
Now only `llama`/`bloom`/`gptneox` are supported.
:param dtype: Quantization method, which differs in the resulting model disk size and
inference speed. Default to `q4_0`. Different model families may support different
types; the currently supported list is:
llama : "q4_0", "q4_1", "q4_2"
bloomz : "q4_0", "q4_1"
bloom : "q4_0", "q4_1"
gptneox : "q4_0", "q4_1", "q4_2", "q5_0", "q5_1", "q8_0"
"""
invalidInputError(model_family in ['llama', 'bloomz', 'gptneox'],
invalidInputError(model_family in ['llama', 'bloom', 'gptneox'],
"Now we only support quantization of model \
family('llama', 'bloomz', 'gptneox')",
family('llama', 'bloom', 'gptneox')",
"{} is not in the list.".format(model_family))
invalidInputError(os.path.isfile(input_path),
"The file {} was not found".format(input_path))
# TODO : multi input model path
if output_path is None:
output_path = input_path.replace("f16", dtype)
output_path = Path(str(input_path).replace('f16', dtype))
# convert quantize type str into corresponding int value
quantize_type_map = _quantize_type[model_family]
invalidInputError(dtype in quantize_type_map, "{0} model just accept {1} now, \
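For reference, a usage sketch of quantize with the renamed bloom family, assuming an fp16 ggml file already produced by _convert_to_ggml; the input path is a hypothetical placeholder:

from bigdl.llm.ggml.quantize import quantize

# Quantize an fp16 ggml file of a bloom-family model to q4_0.
# With output_path=None the result is written next to the input,
# with 'f16' replaced by the dtype in the file name.
quantize(input_path='./ggml-model-f16.bin',
         output_path=None,
         model_family='bloom',
         dtype='q4_0')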