LLM: enhancements for convert_model (#8278)

* update convert

* change output name

* add description for input_path, add check for input values

* basic support for command line

* fix style

* update based on comment

* update based on comment
Ruonan Wang 2023-06-07 13:22:14 +08:00 committed by GitHub
parent 2d14e593f0
commit 39ad68e786
5 changed files with 99 additions and 37 deletions

View file

@@ -42,7 +42,7 @@ BIGDL_PYTHON_HOME = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
VERSION = open(os.path.join(BIGDL_PYTHON_HOME, 'version.txt'), 'r').read().strip()
llm_home = os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")
libs_dir = os.path.join(llm_home, "bigdl", "llm", "libs")
CONVERT_DEP = ['numpy', 'torch', 'transformers', 'sentencepiece']
CONVERT_DEP = ['numpy', 'torch', 'transformers', 'sentencepiece', 'accelerate']
def get_llm_packages():
@@ -151,6 +151,11 @@ def setup_package():
package_dir={"": "src"},
package_data={"bigdl.llm": package_data[platform_name]},
include_package_data=True,
entry_points={
"console_scripts": [
'convert_model=bigdl.llm.ggml.convert_model:main'
]
},
extras_require={"all": all_requires},
classifiers=[
'License :: OSI Approved :: Apache Software License',
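The new `console_scripts` entry point exposes a `convert_model` command that dispatches to `bigdl.llm.ggml.convert_model:main`. A minimal sketch of what the generated wrapper amounts to (setuptools writes the real script at install time):

# Sketch only: setuptools generates the actual console script from the entry point above.
from bigdl.llm.ggml.convert_model import main

if __name__ == '__main__':
    main()  # parses the CLI flags and forwards them to convert_model(**vars(args))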

View file

@@ -77,7 +77,11 @@ def _convert_to_ggml(model_path: str, outfile_dir: str,
"""
Convert Hugging Face llama-like / gpt-neox-like / bloom-like model to ggml format.
:param model_path: str, path of model, for example `./llama-7b-hf`.
:param model_path: Path to a *directory* of a huggingface checkpoint that is directly
pulled from the huggingface hub, for example `./llama-7b-hf`. This should be a dir
path that contains: weight bin, tokenizer config, tokenizer.model (required for
llama) and added_tokens.json (if applicable).
For a LoRA fine-tuned model, the path should point to the merged weights.
:param outfile_dir: str, the directory to save ggml compatible file, for example `./models`.
:param model_family: Which model family your input model belongs to. Default to `llama`.
Now only `llama`/`bloom`/`gptneox` are supported.
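For reference, a minimal sketch of calling this internal helper directly, with placeholder paths (in normal use `convert_model` drives this step):

from bigdl.llm.ggml.convert import _convert_to_ggml

# Write an fp16 ggml file converted from a Hugging Face checkpoint directory.
_convert_to_ggml(model_path='./llama-7b-hf',   # directory with weight bin and tokenizer files
                 outfile_dir='./models',        # the fp16 ggml output lands in this directory
                 model_family='llama',
                 outtype='fp16')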

View file

@@ -16,47 +16,99 @@
import os
import time
from pathlib import Path
from bigdl.llm.ggml.convert import _convert_to_ggml
from bigdl.llm.ggml.quantize import quantize
from bigdl.llm.utils.common import invalidInputError
import argparse
import tempfile
def convert_model(input_path: str,
output_path: str,
model_family: str,
dtype: str = 'int4',
tmp_path: str = '/tmp'):
tmp_path: str = None):
"""
Convert Hugging Face llama-like / gpt-neox-like / bloom-like model to lower precision
:param input_path: str, path of model, for example `./llama-7b-hf`.
:param output_path: Save path of output quantized model. Default to `None`.
If you don't specify this parameter, quantized model will be saved in
the same directory as the input and just replace precision with quantize_type
like `./ggml-model-q4_0.bin`.
:param input_path: Path to a *directory* of a huggingface checkpoint that is directly
pulled from the huggingface hub, for example `./llama-7b-hf`. This should be a dir
path that contains: weight bin, tokenizer config, tokenizer.model (required for
llama) and added_tokens.json (if applicable).
For a LoRA fine-tuned model, the path should point to the merged weights.
:param output_path: Save path of output quantized model. You must pass a *directory* to
save all related output.
:param model_family: Which model family your input model belongs to.
Now only `llama`/`bloom`/`gptneox` are supported.
:param dtype: Which quantized precision the model will be converted to.
Now only int4 supported.
Now only int4 is supported.
:param tmp_path: Which path to store the intermediate model during the conversion process.
Default to `None`, meaning the intermediate model will not be saved.
:return: the path str to the converted lower precision checkpoint
:return: the path string to the converted lower precision checkpoint.
"""
dtype = dtype.lower()
# make sure directory exists
os.makedirs(output_path, exist_ok=True)
# check input value
invalidInputError(model_family in ['llama', 'bloom', 'gptneox'],
"Now we only support quantization of model \
family('llama', 'bloom', 'gptneox')",
"{} is not in the list.".format(model_family))
invalidInputError(os.path.isdir(output_path),
"The output_path {} was not a directory".format(output_path))
invalidInputError(dtype == 'int4',
"Now only int4 is supported.")
# check for input_path
invalidInputError(os.path.exists(input_path),
"The input path {} was not found".format(input_path))
invalidInputError(os.path.isdir(input_path),
"The input path {} was not a directory".format(input_path))
# shall we support model_id or just model directory?
if dtype == 'int4':
dtype = 'q4_0'
model_name = Path(input_path).stem
tmp_ggml_file_path = os.path.join(tmp_path, f'{model_name}_{int(time.time())}')
_convert_to_ggml(model_path=input_path,
outfile_dir=tmp_ggml_file_path,
model_family=model_family,
outtype="fp16")
if tmp_path is not None:
model_name = Path(input_path).stem
tmp_ggml_file_path = os.path.join(tmp_path, f'{model_name}_{int(time.time())}')
_convert_to_ggml(model_path=input_path,
outfile_dir=tmp_ggml_file_path,
model_family=model_family,
outtype="fp16")
tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
return quantize(input_path=tmp_ggml_file_path,
output_path=output_path,
model_family=model_family,
dtype=dtype)
else:
with tempfile.TemporaryDirectory() as tmp_ggml_file_path:
_convert_to_ggml(model_path=input_path,
outfile_dir=tmp_ggml_file_path,
model_family=model_family,
outtype="fp16")
tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
return quantize(input_path=tmp_ggml_file_path,
output_path=output_path,
model_family=model_family,
dtype=dtype)
tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
return quantize(input_path=tmp_ggml_file_path,
output_path=output_path,
model_family=model_family,
dtype=dtype)
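Taken together, a minimal usage sketch of the updated Python API, with placeholder directory names:

from bigdl.llm.ggml.convert_model import convert_model

# With tmp_path left as None, the fp16 intermediate lives in a TemporaryDirectory
# and is discarded; only the quantized file is kept in output_path.
output_ckpt = convert_model(input_path='./llama-7b-hf',     # Hugging Face checkpoint directory
                            output_path='./llama-7b-int4',  # must be (or will be created as) a directory
                            model_family='llama',
                            dtype='int4')
print(output_ckpt)  # e.g. ./llama-7b-int4/bigdl_llm_llama_q4_0.bin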
def main():
parser = argparse.ArgumentParser(description='Model Convert Parameters')
parser.add_argument('-i', '--input_path', type=str, required=True,
help=("input_path, a path to a *directory* containing model weights"))
parser.add_argument('-o', '--output_path', type=str, required=True,
help=("output_path,save path of output quantized model."))
parser.add_argument('-x', '--model_family', type=str, required=True,
help=("model_family: Which model family your input model belongs to."
"Now only `llama`/`bloom`/`gptneox` are supported."))
parser.add_argument('-t', '--dtype', type=str, default="int4",
help="Which quantized precision will be converted.")
parser.add_argument('-p', '--tmp_path', type=str, default=None,
help="Which path to store the intermediate model during the"
"conversion process.")
args = parser.parse_args()
params = vars(args)
convert_model(**params)
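With the new `console_scripts` entry point installed, the same conversion can be launched from a shell, roughly as follows (paths are placeholders):

convert_model -i ./llama-7b-hf -o ./llama-7b-int4 -x llama -t int4

The flags map one-to-one onto the arguments parsed above, and `-p` may be added to keep the fp16 intermediate in a chosen directory.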

View file

@@ -42,24 +42,23 @@ _quantize_type = {"llama": _llama_quantize_type,
"gptneox": _gptneox_quantize_type}
def quantize(input_path: str, output_path: str=None,
model_family: str = 'llama', dtype: str='q4_0'):
def quantize(input_path: str, output_path: str,
model_family: str, dtype: str='q4_0'):
"""
Quantize ggml file to lower precision.
:param input_path: Path of input ggml file, for example `./ggml-model-f16.bin`.
:param output_path: Save path of output quantized model. Default to `None`.
If you don't specify this parameter, quantized model will be saved in
the same directory as the input and just replace precision with quantize_type
like `./ggml-model-q4_0.bin`.
:param model_family: Which model family your input model belongs to. Default to `llama`.
:param output_path: Save path of output quantized model. You must pass a directory to
save all related output. The filename of the quantized model will be like
`bigdl_llm_llama_q4_0.bin`.
:param model_family: Which model family your input model belongs to.
Now only `llama`/`bloom`/`gptneox` are supported.
:param dtype: Quantization method which differs in the resulting model disk size and
inference speed. Defalut to `q4_0`. Difference model family may support different types,
now the supported list is:
inference speed. Default to `q4_0`. Different model families may support
different types; now the supported list is:
llama : "q4_0", "q4_1", "q4_2"
bloom : "q4_0", "q4_1"
gptneox : "q4_0", "q4_1", "q4_2", "q5_0", "q5_1", "q8_0"
gptneox : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"
:return: the path str to the converted ggml binary checkpoint
"""
@@ -69,12 +68,14 @@ def quantize(input_path: str, output_path: str=None,
"{} is not in the list.".format(model_family))
invalidInputError(os.path.isfile(input_path),
"The file {} was not found".format(input_path))
# TODO : multi input model path
if output_path is None:
output_path = Path(str(input_path).replace('f16', dtype))
invalidInputError(os.path.isdir(output_path),
"The output_path {} was not a directory".format(output_path))
# convert quantize type str into corresponding int value
quantize_type_map = _quantize_type[model_family]
invalidInputError(dtype in quantize_type_map, "{0} model just accept {1} now, \
output_filename = "bigdl_llm_{}_{}.bin".format(model_family,
dtype.lower())
output_path = os.path.join(output_path, output_filename)
invalidInputError(dtype.lower() in quantize_type_map, "{0} model just accept {1} now, \
but you pass in {2}.".format(
model_family,
list(quantize_type_map.keys()),
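For reference, a minimal sketch of calling `quantize` directly on an existing fp16 ggml file, with placeholder paths (`convert_model` normally does this for you):

from bigdl.llm.ggml.quantize import quantize

# output_path must be a directory; the filename is derived as bigdl_llm_<model_family>_<dtype>.bin.
out = quantize(input_path='./models/ggml-model-f16.bin',
               output_path='./models',
               model_family='llama',
               dtype='q4_0')
print(out)  # e.g. ./models/bigdl_llm_llama_q4_0.bin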

View file

@@ -42,7 +42,7 @@ class AutoModelForCausalLM:
"""
:param pretrained_model_name_or_path: We support 3 kinds of pretrained model checkpoint
1. path for huggingface checkpoint that are directly pulled from hugginface hub.
1. path for huggingface checkpoint that are directly pulled from huggingface hub.
This should be a dir path that contains: weight bin, tokenizer config,
tokenizer.model (required for llama) and added_tokens.json (if applicable).
For a LoRA fine-tuned model, the path should point to the merged weights.
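For context, the loader this docstring belongs to is typically used along these lines; note that the import path and the `model_family` keyword are assumptions based on the surrounding API and are not shown in this diff:

# Hedged sketch only: module path and keyword are assumed, not confirmed by this diff.
from bigdl.llm.ggml.transformers import AutoModelForCausalLM  # assumed import location

# Case 1 from the docstring: a Hugging Face checkpoint directory pulled from the hub.
model = AutoModelForCausalLM.from_pretrained('./llama-7b-hf',
                                             model_family='llama')  # assumed keyword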