#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os
import tempfile
import time
from pathlib import Path

from ipex_llm.ggml.convert import _convert_to_ggml, _convert_chatglm
from ipex_llm.ggml.quantize import quantize
from ipex_llm.utils.common import invalidInputError


def convert_model(input_path: str,
                  output_path: str,
                  model_family: str,
                  dtype: str = 'int4',
                  tmp_path: str = None):
    """
    Convert a Hugging Face llama-like / gpt-neox-like / bloom-like / starcoder-like
    PyTorch model to lower precision.

    :param input_path: Path to a **directory** holding a Hugging Face checkpoint that was
           pulled directly from the Hugging Face Hub, for example `./llama-7b-hf`. The
           directory should contain: the weight bin, the tokenizer config, tokenizer.model
           (required for llama) and added_tokens.json (if applicable).
           For a LoRA fine-tuned model, the path should point to the merged weights.
    :param output_path: Save path of the output quantized model. You must pass a
           **directory** in which all related output will be saved.
    :param model_family: Which model family your input model belongs to.
           Now only ``llama``/``bloom``/``gptneox``/``starcoder``/``chatglm`` are supported.
    :param dtype: Which quantized precision the model will be converted to.
           Now only `int4` and `int8` are supported, and `int8` only works for `llama`,
           `gptneox` and `starcoder`.
    :param tmp_path: Which path to store the intermediate model in during the conversion
           process. Defaults to `None`, in which case the intermediate model is not saved.

    :return: the path string to the converted lower precision checkpoint.
    """

    dtype = dtype.lower()
    # make sure the output directory exists
    os.makedirs(output_path, exist_ok=True)
    # check input values
    invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder', 'chatglm'],
                      "Now we only support quantization of model "
                      "family('llama', 'bloom', 'gptneox', 'starcoder', 'chatglm')",
                      "{} is not in the list.".format(model_family))
    invalidInputError(os.path.isdir(output_path),
                      "The output_path {} was not a directory".format(output_path))
    invalidInputError(dtype in ['int4', 'int8'],
                      "Now only int4 and int8 are supported.")
    # check the input_path
    invalidInputError(os.path.exists(input_path),
                      "The input path {} was not found".format(input_path))
    invalidInputError(os.path.isdir(input_path),
                      "The input path {} was not a directory".format(input_path))
    # shall we support model_id or just model directory?

    if dtype == 'int4':
        dtype = 'q4_0'
    elif dtype == 'int8':
        dtype = 'q8_0'
        invalidInputError(model_family in ['llama', 'gptneox', 'starcoder'],
                          "Now we only support int8 quantization of model "
                          "family('llama', 'gptneox', 'starcoder')",
                          "{} is not in the list.".format(model_family))

    # chatglm merges conversion and quantization into one operation.
    if model_family == 'chatglm':
        return _convert_chatglm(model_path=input_path,
                                outfile_dir=output_path,
                                outtype=dtype)

    if tmp_path is not None:
        # keep the fp16 GGML intermediate under the user-provided tmp_path
        model_name = Path(input_path).stem
        tmp_ggml_file_path = os.path.join(tmp_path, f'{model_name}_{int(time.time())}')
        _convert_to_ggml(model_path=input_path,
                         outfile_dir=tmp_ggml_file_path,
                         model_family=model_family,
                         outtype="fp16")
        tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
        return quantize(input_path=tmp_ggml_file_path,
                        output_path=output_path,
                        model_family=model_family,
                        dtype=dtype)
    else:
        # otherwise write the fp16 intermediate to a temporary directory that is
        # removed automatically once quantization finishes
        with tempfile.TemporaryDirectory() as tmp_ggml_file_path:
            _convert_to_ggml(model_path=input_path,
                             outfile_dir=tmp_ggml_file_path,
                             model_family=model_family,
                             outtype="fp16")
            tmp_ggml_file_path = next(Path(tmp_ggml_file_path).iterdir())
            return quantize(input_path=tmp_ggml_file_path,
                            output_path=output_path,
                            model_family=model_family,
                            dtype=dtype)
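

# A minimal usage sketch (not part of the original module). It assumes this file is
# importable as `ipex_llm.ggml.convert_model` and that `./llama-7b-hf` is a local
# llama checkpoint directory; both names are illustrative placeholders rather than
# paths confirmed by this file.
#
#     from ipex_llm.ggml.convert_model import convert_model
#
#     ggml_path = convert_model(input_path='./llama-7b-hf',
#                               output_path='./llama-7b-ggml/',
#                               model_family='llama',
#                               dtype='int4')
#     print(ggml_path)  # path string to the quantized q4_0 checkpoint

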
def main():
    parser = argparse.ArgumentParser(description='Model Convert Parameters')
    parser.add_argument('-i', '--input_path', type=str, required=True,
                        help="input_path, a path to a *directory* containing model weights")
    parser.add_argument('-o', '--output_path', type=str, required=True,
                        help="output_path, save path of the output quantized model.")
    parser.add_argument('-x', '--model_family', type=str, required=True,
                        help=("model_family: Which model family your input model belongs to. "
                              "Now only `llama`/`bloom`/`gptneox`/`starcoder`/`chatglm` "
                              "are supported."))
    parser.add_argument('-t', '--dtype', type=str, default="int4",
                        help="Which quantized precision the model will be converted to.")
    parser.add_argument('-p', '--tmp_path', type=str, default=None,
                        help="Which path to store the intermediate model in during the "
                             "conversion process.")
    args = parser.parse_args()
    params = vars(args)
    convert_model(**params)
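

# A hedged command-line sketch: `main()` reads its options from sys.argv via argparse,
# so it can be exercised directly as below. The checkpoint paths are hypothetical
# placeholders, and how `main()` is exposed as a console script (if at all) is not
# determined by this file.
#
#     import sys
#     sys.argv = ['convert_model',
#                 '-i', './llama-7b-hf',
#                 '-o', './llama-7b-ggml/',
#                 '-x', 'llama',
#                 '-t', 'int4']
#     main()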