LLM: enhancements for convert_model (#8278)
* update convert
* change output name
* add description for input_path, add check for input_values
* basic support for command line
* fix style
* update based on comment
* update based on comment
parent 2d14e593f0
commit 39ad68e786
5 changed files with 99 additions and 37 deletions
@@ -42,7 +42,7 @@ BIGDL_PYTHON_HOME = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 VERSION = open(os.path.join(BIGDL_PYTHON_HOME, 'version.txt'), 'r').read().strip()
 llm_home = os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")
 libs_dir = os.path.join(llm_home, "bigdl", "llm", "libs")
-CONVERT_DEP = ['numpy', 'torch', 'transformers', 'sentencepiece']
+CONVERT_DEP = ['numpy', 'torch', 'transformers', 'sentencepiece', 'accelerate']


 def get_llm_packages():
@@ -151,6 +151,11 @@ def setup_package():
         package_dir={"": "src"},
         package_data={"bigdl.llm": package_data[platform_name]},
         include_package_data=True,
+        entry_points={
+            "console_scripts": [
+                'convert_model=bigdl.llm.ggml.convert_model:main'
+            ]
+        },
         extras_require={"all": all_requires},
         classifiers=[
             'License :: OSI Approved :: Apache Software License',
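With this entry point, installing the wheel exposes a `convert_model` executable that simply runs `bigdl.llm.ggml.convert_model:main`. A minimal sketch of what the console script amounts to, assuming the package is installed; the checkpoint and output directories below are placeholders, not paths from this patch:

    import sys
    from bigdl.llm.ggml.convert_model import main

    # Equivalent to running on the command line:
    #   convert_model -i ./llama-7b-hf -o ./models -x llama
    sys.argv = ['convert_model',
                '-i', './llama-7b-hf',   # hypothetical input checkpoint directory
                '-o', './models',        # hypothetical output directory
                '-x', 'llama']           # model family: llama / bloom / gptneox
    main()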
@@ -77,7 +77,11 @@ def _convert_to_ggml(model_path: str, outfile_dir: str,
     """
     Convert Hugging Face llama-like / gpt-neox-like / bloom-like model to ggml format.

-    :param model_path: str, path of model, for example `./llama-7b-hf`.
+    :param input_path: Path to a *directory* for huggingface checkpoint that are directly
+            pulled from huggingface hub, for example `./llama-7b-hf`. This should be a dir
+            path that contains: weight bin, tokenizer config, tokenizer.model (required for
+            llama) and added_tokens.json (if applied).
+            For lora finetuned model, the path should be pointed to a merged weight.
     :param outfile_dir: str, the directory to save ggml compatible file, for example `./models`.
     :param model_family: Which model family your input model belongs to. Default to `llama`.
         Now only `llama`/`bloom`/`gptneox` are supported.
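The reworded docstring pins down what the checkpoint directory must contain. A small optional pre-flight check along those lines; the concrete file names (`tokenizer_config.json`, `*.bin` weight shards) follow the usual Hugging Face conventions and are an assumption here, not part of this patch:

    import glob
    import os

    def looks_like_hf_checkpoint(input_path: str, model_family: str) -> bool:
        # weight bin files (single file or shards)
        has_weights = bool(glob.glob(os.path.join(input_path, '*.bin')))
        # tokenizer config
        has_tokenizer_cfg = os.path.isfile(os.path.join(input_path, 'tokenizer_config.json'))
        # tokenizer.model is only required for llama-family checkpoints
        has_sp_model = os.path.isfile(os.path.join(input_path, 'tokenizer.model'))
        return has_weights and has_tokenizer_cfg and (model_family != 'llama' or has_sp_model)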
@@ -16,47 +16,99 @@
 import os
 import time
 from pathlib import Path

 from bigdl.llm.ggml.convert import _convert_to_ggml
 from bigdl.llm.ggml.quantize import quantize
+from bigdl.llm.utils.common import invalidInputError
+import argparse
+import tempfile


 def convert_model(input_path: str,
                   output_path: str,
                   model_family: str,
                   dtype: str = 'int4',
-                  tmp_path: str = '/tmp'):
+                  tmp_path: str = None):
     """
     Convert Hugging Face llama-like / gpt-neox-like / bloom-like model to lower precision

-    :param input_path: str, path of model, for example `./llama-7b-hf`.
-    :param output_path: Save path of output quantized model. Default to `None`.
-            If you don't specify this parameter, quantized model will be saved in
-            the same directory as the input and just replace precision with quantize_type
-            like `./ggml-model-q4_0.bin`.
+    :param input_path: Path to a *directory* for huggingface checkpoint that are directly
+            pulled from huggingface hub, for example `./llama-7b-hf`. This should be a dir
+            path that contains: weight bin, tokenizer config, tokenizer.model (required for
+            llama) and added_tokens.json (if applied).
+            For lora finetuned model, the path should be pointed to a merged weight.
+    :param output_path: Save path of output quantized model. You must pass a *directory* to
+            save all related output.
     :param model_family: Which model family your input model belongs to.
             Now only `llama`/`bloom`/`gptneox` are supported.
     :param dtype: Which quantized precision will be converted.
-            Now only int4 supported.
+            Now only int4 is supported.
     :param tmp_path: Which path to store the intermediate model during the conversion process.
+            Default to `None` so that intermediate model will not be saved.

-    :return: the path str to the converted lower precision checkpoint
+    :return: the path string to the converted lower precision checkpoint.
     """

     dtype = dtype.lower()
+    # make sure directory exists
+    os.makedirs(output_path, exist_ok=True)
+    # check input value
+    invalidInputError(model_family in ['llama', 'bloom', 'gptneox'],
+                      "Now we only support quantization of model \
+                       family('llama', 'bloom', 'gptneox')",
+                      "{} is not in the list.".format(model_family))
+    invalidInputError(os.path.isdir(output_path),
+                      "The output_path {} was not a directory".format(output_path))
+    invalidInputError(dtype == 'int4',
+                      "Now only int4 is supported.")
+    # check for input_path
+    invalidInputError(os.path.exists(input_path),
+                      "The input path {} was not found".format(input_path))
+    invalidInputError(os.path.isdir(input_path),
+                      "The input path {} was not a directory".format(input_path))
+    # shall we support model_id or just model directory?

     if dtype == 'int4':
         dtype = 'q4_0'
-        model_name = Path(input_path).stem
-        tmp_ggml_file_path = os.path.join(tmp_path, f'{model_name}_{int(time.time())}')
-        _convert_to_ggml(model_path=input_path,
-                         outfile_dir=tmp_ggml_file_path,
-                         model_family=model_family,
-                         outtype="fp16")
-
-        tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
-
-        return quantize(input_path=tmp_ggml_file_path,
-                        output_path=output_path,
-                        model_family=model_family,
-                        dtype=dtype)
+        if tmp_path is not None:
+            model_name = Path(input_path).stem
+            tmp_ggml_file_path = os.path.join(tmp_path, f'{model_name}_{int(time.time())}')
+            _convert_to_ggml(model_path=input_path,
+                             outfile_dir=tmp_ggml_file_path,
+                             model_family=model_family,
+                             outtype="fp16")
+
+            tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
+
+            return quantize(input_path=tmp_ggml_file_path,
+                            output_path=output_path,
+                            model_family=model_family,
+                            dtype=dtype)
+        else:
+            with tempfile.TemporaryDirectory() as tmp_ggml_file_path:
+                _convert_to_ggml(model_path=input_path,
+                                 outfile_dir=tmp_ggml_file_path,
+                                 model_family=model_family,
+                                 outtype="fp16")
+                tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
+                return quantize(input_path=tmp_ggml_file_path,
+                                output_path=output_path,
+                                model_family=model_family,
+                                dtype=dtype)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Model Convert Parameters')
+    parser.add_argument('-i', '--input_path', type=str, required=True,
+                        help=("input_path, a path to a *directory* containing model weights"))
+    parser.add_argument('-o', '--output_path', type=str, required=True,
+                        help=("output_path,save path of output quantized model."))
+    parser.add_argument('-x', '--model_family', type=str, required=True,
+                        help=("model_family: Which model family your input model belongs to."
+                              "Now only `llama`/`bloom`/`gptneox` are supported."))
+    parser.add_argument('-t', '--dtype', type=str, default="int4",
+                        help="Which quantized precision will be converted.")
+    parser.add_argument('-p', '--tmp_path', type=str, default=None,
+                        help="Which path to store the intermediate model during the"
+                             "conversion process.")
+    args = parser.parse_args()
+    params = vars(args)
+    convert_model(**params)
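Taken together, the reworked convert_model now validates its inputs, converts through a temporary fp16 ggml file (kept only when tmp_path is given), and quantizes into output_path. A usage sketch based on the signature above; the checkpoint and output directories are placeholders:

    from bigdl.llm.ggml.convert_model import convert_model

    # input_path must be a directory holding a merged Hugging Face checkpoint;
    # output_path must be a directory (it is created if missing).
    ggml_path = convert_model(input_path='./llama-7b-hf',
                              output_path='./models',
                              model_family='llama',
                              dtype='int4')   # only int4 is accepted for now
    print(ggml_path)  # e.g. ./models/bigdl_llm_llama_q4_0.bin

Leaving tmp_path as None routes the intermediate fp16 model through a tempfile.TemporaryDirectory, so nothing extra is left on disk after the call.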
@@ -42,24 +42,23 @@ _quantize_type = {"llama": _llama_quantize_type,
                   "gptneox": _gptneox_quantize_type}


-def quantize(input_path: str, output_path: str=None,
-             model_family: str = 'llama', dtype: str='q4_0'):
+def quantize(input_path: str, output_path: str,
+             model_family: str, dtype: str='q4_0'):
     """
     Quantize ggml file to lower precision.

     :param input_path: Path of input ggml file, for example `./ggml-model-f16.bin`.
-    :param output_path: Save path of output quantized model. Default to `None`.
-            If you don't specify this parameter, quantized model will be saved in
-            the same directory as the input and just replace precision with quantize_type
-            like `./ggml-model-q4_0.bin`.
-    :param model_family: Which model family your input model belongs to. Default to `llama`.
+    :param output_path: Save path of output quantized model. You must pass a directory to
+            save all related output. Filename of quantized model will be like
+            `bigdl_llm_llama_q4_0.bin`.
+    :param model_family: Which model family your input model belongs to.
             Now only `llama`/`bloom`/`gptneox` are supported.
     :param dtype: Quantization method which differs in the resulting model disk size and
-            inference speed. Defalut to `q4_0`. Difference model family may support different types,
-            now the supported list is:
+            inference speed. Defalut to `q4_0`. Difference model family may support
+            different types, now the supported list is:
             llama : "q4_0", "q4_1", "q4_2"
             bloom : "q4_0", "q4_1"
-            gptneox : "q4_0", "q4_1", "q4_2", "q5_0", "q5_1", "q8_0"
+            gptneox : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"

     :return: the path str to the converted ggml binary checkpoint
     """
@@ -69,12 +68,14 @@ def quantize(input_path: str, output_path: str=None,
                       "{} is not in the list.".format(model_family))
     invalidInputError(os.path.isfile(input_path),
                       "The file {} was not found".format(input_path))
-    # TODO : multi input model path
-    if output_path is None:
-        output_path = Path(str(input_path).replace('f16', dtype))
+    invalidInputError(os.path.isdir(output_path),
+                      "The output_path {} was not a directory".format(output_path))
     # convert quantize type str into corresponding int value
     quantize_type_map = _quantize_type[model_family]
-    invalidInputError(dtype in quantize_type_map, "{0} model just accept {1} now, \
+    output_filename = "bigdl_llm_{}_{}.bin".format(model_family,
+                                                   dtype.lower())
+    output_path = os.path.join(output_path, output_filename)
+    invalidInputError(dtype.lower() in quantize_type_map, "{0} model just accept {1} now, \
                       but you pass in {2}.".format(
                           model_family,
                           list(quantize_type_map.keys()),
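The quantized file name is now derived from the model family and dtype instead of being guessed from the input file name. A quick illustration of the naming rule; quantized_filename is a hypothetical helper that mirrors the logic added above, not part of the patch:

    import os

    def quantized_filename(output_dir: str, model_family: str, dtype: str) -> str:
        # mirrors the new rule: bigdl_llm_<family>_<dtype>.bin inside output_dir
        return os.path.join(output_dir,
                            "bigdl_llm_{}_{}.bin".format(model_family, dtype.lower()))

    print(quantized_filename('./models', 'llama', 'q4_0'))
    # ./models/bigdl_llm_llama_q4_0.bin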
@@ -42,7 +42,7 @@ class AutoModelForCausalLM:
         """
         :param pretrained_model_name_or_path: We support 3 kinds of pretrained model checkpoint

-               1. path for huggingface checkpoint that are directly pulled from hugginface hub.
+               1. path for huggingface checkpoint that are directly pulled from huggingface hub.
                   This should be a dir path that contains: weight bin, tokenizer config,
                   tokenizer.model (required for llama) and added_tokens.json (if applied).
                   For lora fine tuned model, the path should be pointed to a merged weight.
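For context, the docstring touched here appears to describe the checkpoint argument of AutoModelForCausalLM's loader. A heavily hedged sketch of how a converted checkpoint might then be loaded; the module path, the from_pretrained signature, and the model_family keyword are assumptions not shown in this hunk:

    # assumption: AutoModelForCausalLM lives under bigdl.llm.ggml.transformers and
    # exposes a from_pretrained classmethod with a model_family keyword
    from bigdl.llm.ggml.transformers import AutoModelForCausalLM

    llm = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path='./llama-7b-hf',   # hypothetical merged HF checkpoint dir
        model_family='llama')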