[LLM] Supports GPTQ convert in transformers-like API, and supports folder outfile for llm-convert (#8366)
* Add docstrings to llm_convert
* Small docstrings fix
* Unify outfile type to be a folder path for either gptq or pth model_format
* Supports gptq model input for from_pretrained
* Fix example and readme
* Small fix
* Python style fix
* Bug fix in llm_convert
* Python style check
* Fix based on comments
* Small fix
parent 4ec46afa4f
commit 7ef1c890eb

6 changed files with 102 additions and 34 deletions
@@ -39,8 +39,9 @@ Here is an example to use `llm-convert` command line tool.
 # pth model
 llm-convert "/path/to/llama-7b-hf/" --model-format pth --outfile "/path/to/llama-7b-int4/" --model-family "llama"
 # gptq model
-llm-convert "/path/to/vicuna-13B-1.1-GPTQ-4bit-128g/" --model-format gptq --outfile "/path/to/out.bin" --model-family "llama"
+llm-convert "/path/to/vicuna-13B-1.1-GPTQ-4bit-128g/" --model-format gptq --outfile "/path/to/vicuna-13B-int4/" --model-family "llama"
 ```
+> An example GPTQ model can be found [here](https://huggingface.co/TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g/tree/main)
 
 Here is an example to use `llm_convert` python API.
 ```bash
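The `llm_convert` python API example referenced above is truncated in this view. A minimal sketch of the equivalent call, based on the parameter names documented later in this commit (paths are placeholders):

```python
from bigdl.llm import llm_convert

# convert a GPTQ checkpoint folder to a BigDL-LLM optimized GGML binary;
# the path of the converted .bin file is returned
output_ckpt_path = llm_convert(
    model="/path/to/vicuna-13B-1.1-GPTQ-4bit-128g/",
    outfile="/path/to/vicuna-13B-int4/",  # must be a directory
    outtype="int4",
    model_family="llama",
    model_format="gptq",
)
print(output_ckpt_path)
```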
@@ -40,13 +40,13 @@ def convert_and_load(repo_id_or_model_path, model_family, n_threads):
 # to convert the downloaded Huggungface checkpoint first,
 # and then load the binary checkpoint directly.
 #
-# from bigdl.llm.ggml import llm_convert
+# from bigdl.llm import llm_convert
 #
 # model_path = repo_id_or_model_path
 # output_ckpt_path = llm_convert(
-# input_path=model_path,
-# output_path='./',
-# dtype='int4',
+# model=model_path,
+# outfile='./',
+# outtype='int4',
 # model_family=model_family)
 #
 # llm = AutoModelForCausalLM.from_pretrained(
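The load step that follows this commented-out conversion is cut off in the hunk. A minimal sketch of it, assuming `AutoModelForCausalLM` is the transformers-like class touched later in this commit (its import path is not visible here, so it is an assumption):

```python
# Assumption: the import path below is not shown in this diff; adjust it to
# wherever AutoModelForCausalLM lives in your bigdl-llm installation.
from bigdl.llm.ggml.transformers import AutoModelForCausalLM

# load a GGML binary produced by llm_convert directly
llm = AutoModelForCausalLM.from_pretrained(
    "/path/to/converted/bigdl_llm_llama_int4.bin",  # placeholder path
    model_family="llama",
)
```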
@@ -19,6 +19,7 @@ from bigdl.llm.ggml.convert_model import convert_model as ggml_convert_model
 from bigdl.llm.gptq.convert.convert_gptq_to_ggml import convert_gptq2ggml
 from bigdl.llm.utils.common import invalidInputError
 import argparse
+import os
 
 
 def _special_kwarg_check(kwargs, check_args):
@@ -35,6 +36,46 @@ def llm_convert(model,
 outtype='int4',
 model_format="pth",
 **kwargs):
+"""
+This function is able to:
+
+1. Convert Hugging Face llama-like / gpt-neox-like / bloom-like / starcoder-like
+PyTorch model to lower precision in BigDL-LLM optimized GGML format.
+2. Convert Hugging Face GPTQ format llama-like model to BigDL-LLM optimized
+GGML format.
+
+:param model: Path to a **directory**:
+
+1. If ``model_format='pth'``, the folder should be a Hugging Face checkpoint
+that is directly pulled from Hugging Face hub, for example ``./llama-7b-hf``.
+This should be a dir path that contains: weight bin, tokenizer config,
+tokenizer.model (required for llama) and added_tokens.json (if applied).
+For lora finetuned model, the path should be pointed to a merged weight.
+2. If ``model_format='gptq'``, the folder should be be a Hugging Face checkpoint
+in GPTQ format, which contains weights in pytorch's .pt format,
+and ``tokenizer.model``.
+
+:param outfile: Save path of output quantized model. You must pass a **directory** to
+save all related output.
+:param model_family: Which model family your input model belongs to.
+Now ``llama``/``bloom``/``gptneox``/``starcoder`` has been supported.
+If ``model_format='gptq'``, only ``llama`` is supported.
+:param dtype: Which quantized precision will be converted.
+If ``model_format='pth'``, `int4` and `int8` are supported,
+meanwhile `int8` only works for `llama` and `gptneox`.
+If ``model_format='gptq'``, only ``int4`` is supported.
+:param model_format: Specify the model format to be converted. ``pth`` is for
+PyTorch model checkpoint from Hugging Face. ``gptq`` is for GPTQ format
+model from Hugging Face.
+:param **kwargs: Supported keyword arguments includes:
+
+* ``tmp_path``: Valid when ``model_format='pth'``. It refers to the path
+that stores the intermediate model during the conversion process.
+* ``tokenizer_path``: Valid when ``model_format='gptq'``. It refers to the path
+where ``tokenizer.model`` is located (if it is not in the ``model`` directory)
+
+:return: the path string to the converted lower precision checkpoint.
+"""
 if model_format == "pth":
 _, _used_args = _special_kwarg_check(kwargs=kwargs,
 check_args=["tmp_path"])
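A short usage sketch of the keyword arguments documented above (paths are placeholders):

```python
from bigdl.llm import llm_convert

# pth checkpoint: tmp_path stores the intermediate model during conversion
pth_ckpt = llm_convert(
    model="/path/to/llama-7b-hf/",
    outfile="/path/to/llama-7b-int4/",  # a directory
    outtype="int4",
    model_family="llama",
    model_format="pth",
    tmp_path="/path/to/tmp/",
)

# gptq checkpoint: tokenizer_path points to tokenizer.model when it is not
# stored inside the model directory
gptq_ckpt = llm_convert(
    model="/path/to/vicuna-13B-1.1-GPTQ-4bit-128g/",
    outfile="/path/to/vicuna-13B-int4/",  # a directory
    outtype="int4",
    model_family="llama",
    model_format="gptq",
    tokenizer_path="/path/to/tokenizer.model",
)
```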
@@ -48,11 +89,23 @@ def llm_convert(model,
 invalidInputError(model_family == "llama" and outtype == 'int4',
 "Convert GPTQ models should always "
 "specify `--model-family llama --dtype int4` in the command line.")
+invalidInputError(os.path.isdir(outfile),
+"The output_path {} is not a directory".format(outfile))
 _, _used_args = _special_kwarg_check(kwargs=kwargs,
 check_args=["tokenizer_path"])
 
+output_filename = "bigdl_llm_{}_{}_from_gptq.bin".format(model_family,
+outtype.lower())
+outfile = os.path.join(outfile, output_filename)
+
+if "tokenizer_path" in _used_args:
+gptq_tokenizer_path = _used_args["tokenizer_path"]
+else:
+gptq_tokenizer_path = None
+
 convert_gptq2ggml(input_path=model,
 output_path=outfile,
-tokenizer_path=_used_args["tokenizer_path"],
+tokenizer_path=gptq_tokenizer_path,
 )
 return outfile
 else:
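With the logic added above, the GPTQ branch now writes its output inside the `outfile` directory under a generated file name. A small illustration of the resulting path (the directory is a placeholder):

```python
import os

model_family, outtype = "llama", "int4"
outfile = "/path/to/vicuna-13B-int4/"  # user-supplied output directory

output_filename = "bigdl_llm_{}_{}_from_gptq.bin".format(model_family, outtype.lower())
print(os.path.join(outfile, output_filename))
# -> /path/to/vicuna-13B-int4/bigdl_llm_llama_int4_from_gptq.bin
```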
@@ -29,17 +29,18 @@ def convert_model(input_path: str,
 dtype: str = 'int4',
 tmp_path: str = None):
 """
-Convert Hugging Face llama-like / gpt-neox-like / bloom-like model to lower precision
+Convert Hugging Face llama-like / gpt-neox-like / bloom-like / starcoder-like
+PyTorch model to lower precision
 
-:param input_path: Path to a *directory* for huggingface checkpoint that are directly
+:param input_path: Path to a **directory** for huggingface checkpoint that is directly
 pulled from huggingface hub, for example `./llama-7b-hf`. This should be a dir
 path that contains: weight bin, tokenizer config, tokenizer.model (required for
 llama) and added_tokens.json (if applied).
 For lora finetuned model, the path should be pointed to a merged weight.
-:param output_path: Save path of output quantized model. You must pass a *directory* to
+:param output_path: Save path of output quantized model. You must pass a **directory** to
 save all related output.
 :param model_family: Which model family your input model belongs to.
-Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported.
+Now only ``llama``/``bloom``/``gptneox``/``starcoder`` are supported.
 :param dtype: Which quantized precision will be converted.
 Now only `int4` and `int8` are supported, and `int8` only works for `llama`
 and `gptneox`.
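A minimal usage sketch of this lower-level `convert_model` API; the import path is taken from the code replaced elsewhere in this commit, and the paths are placeholders:

```python
from bigdl.llm.ggml import convert_model

# convert a Hugging Face PyTorch checkpoint folder and get back the path of
# the generated GGML binary
output_ckpt_path = convert_model(
    input_path="/path/to/llama-7b-hf/",     # Hugging Face checkpoint directory
    output_path="/path/to/llama-7b-int4/",  # must be a directory
    model_family="llama",
    dtype="int4",
)
```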
@@ -74,9 +74,9 @@ def quantize(input_path: str, output_path: str,
 family('llama', 'bloom', 'gptneox', 'starcoder')",
 "{} is not in the list.".format(model_family))
 invalidInputError(os.path.isfile(input_path),
-"The file {} was not found".format(input_path))
+"The file {} is not found".format(input_path))
 invalidInputError(os.path.isdir(output_path),
-"The output_path {} was not a directory".format(output_path))
+"The output_path {} is not a directory".format(output_path))
 # convert quantize type str into corresponding int value
 quantize_type_map = _quantize_type[model_family]
 output_filename = "bigdl_llm_{}_{}.bin".format(model_family,
@@ -33,6 +33,7 @@ class AutoModelForCausalLM:
 @classmethod
 def from_pretrained(cls,
 pretrained_model_name_or_path: str,
+model_format: str = 'pth',
 model_family: str = 'llama',
 dtype: str = 'int4',
 cache_dir: str = './',
@@ -41,20 +42,30 @@ class AutoModelForCausalLM:
 """
 :param pretrained_model_name_or_path: We support 3 kinds of pretrained model checkpoint
 
-1. path for huggingface checkpoint that are directly pulled from huggingface hub.
-This should be a dir path that contains: weight bin, tokenizer config,
-tokenizer.model (required for llama) and added_tokens.json (if applied).
-For lora fine tuned model, the path should be pointed to a merged weight.
-2. path for converted ggml binary checkpoint. The checkpoint should be converted by
-``bigdl.llm.ggml.convert_model``.
-3. a str for huggingface hub repo id.
-
-:param model_family: the model family of the pretrained checkpoint.
+1. Path to directory for Hugging Face checkpoint that are directly pulled from
+Hugging Face hub.
+
+If ``model_format='pth'``, the folder should contain: weight bin, tokenizer
+config, tokenizer.model (required for llama) and added_tokens.json (if applied).
+For lora fine tuned model, the path should be pointed to a merged weight.
+
+If ``model_format='gptq'``, the folder should be be a Hugging Face checkpoint
+in GPTQ format, which contains weights in pytorch's .pt format,
+and ``tokenizer.model``.
+
+2. Path for converted BigDL-LLM optimized ggml binary checkpoint.
+The checkpoint should be converted by ``bigdl.llm.llm_convert``.
+3. A str for Hugging Face hub repo id.
+
+:param model_format: Specify the model format to be converted. ``pth`` is for
+PyTorch model checkpoint from Hugging Face. ``gptq`` is for GPTQ format
+model from Hugging Face.
+:param model_family: The model family of the pretrained checkpoint.
 Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"`` and ``"starcoder"``.
 :param dtype: Which quantized precision will be converted.
 Now only `int4` and `int8` are supported, and `int8` only works for `llama`
 , `gptneox` and `starcoder`.
-:param cache_dir: (optional) this parameter will only be used when
+:param cache_dir: (optional) This parameter will only be used when
 ``pretrained_model_name_or_path`` is a hugginface checkpoint or hub repo id.
 It indicates the saving path for the converted low precision model.
 :param tmp_path: (optional) Which path to store the intermediate fp16 model during the
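A brief usage sketch of the parameters documented above; the import path for `AutoModelForCausalLM` is an assumption (it is not shown in this diff), and paths/repo ids are placeholders:

```python
# Assumed import path; adjust to your bigdl-llm installation.
from bigdl.llm.ggml.transformers import AutoModelForCausalLM

# Case 1 with model_format='gptq': a Hugging Face GPTQ checkpoint folder
llm = AutoModelForCausalLM.from_pretrained(
    "/path/to/vicuna-13B-1.1-GPTQ-4bit-128g/",
    model_format="gptq",
    model_family="llama",
    dtype="int4",
    cache_dir="./",  # where the converted low-precision model is saved
)

# Case 3: a Hugging Face hub repo id (downloaded, converted, then loaded)
llm = AutoModelForCausalLM.from_pretrained(
    "some-org/llama-7b-hf",  # placeholder repo id
    model_family="llama",
    dtype="int4",
)
```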
@@ -73,7 +84,7 @@ class AutoModelForCausalLM:
 # if not, it is likely that the user wants to pass in the repo id.
 if not os.path.exists(pretrained_model_name_or_path):
 try:
-# download from huggingface based on repo id
+# download from Hugging Face based on repo id
 from huggingface_hub import snapshot_download
 pretrained_model_name_or_path = snapshot_download(
 repo_id=pretrained_model_name_or_path)
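The fallback above relies on `huggingface_hub.snapshot_download`; the same helper can be used directly to pre-fetch a repo and then pass the local folder to `from_pretrained` (the repo id is a placeholder):

```python
from huggingface_hub import snapshot_download

# download a hub repo into the local cache and return its folder path
local_dir = snapshot_download(repo_id="some-org/llama-7b-hf")  # placeholder repo id
print(local_dir)
```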
@@ -82,24 +93,26 @@ class AutoModelForCausalLM:
 # if downloading fails, it could be the case that repo id is invalid,
 # or the user pass in the wrong path for checkpoint
 invalidInputError(False,
-"Downloadng from huggingface repo id {} failed. "
-"Please input valid huggingface hub repo id, "
-"or provide the valid path to huggingface / "
-"ggml binary checkpoint, for pretrained_model_name_or_path"
+"Downloadng from Hugging Face repo id {} failed. "
+"Please input valid Hugging Face hub repo id, "
+"or provide the valid path to Hugging Face / "
+"BigDL-LLM optimized ggml binary checkpoint, "
+"for pretrained_model_name_or_path"
 .format(pretrained_model_name_or_path))
 
 ggml_model_path = pretrained_model_name_or_path
 # check whether pretrained_model_name_or_path is a file.
 # if not, it is likely that pretrained_model_name_or_path
-# points to a huggingface checkpoint
+# points to a Hugging Face checkpoint
 if not os.path.isfile(pretrained_model_name_or_path):
-# huggingface checkpoint
-from bigdl.llm.ggml import convert_model
-ggml_model_path = convert_model(input_path=pretrained_model_name_or_path,
-output_path=cache_dir,
+# Hugging Face checkpoint
+from bigdl.llm import llm_convert
+ggml_model_path = llm_convert(model=pretrained_model_name_or_path,
+outfile=cache_dir,
 model_family=model_family,
-dtype=dtype,
-tmp_path=tmp_path)
+outtype=dtype,
+model_format=model_format,
+tmp_path=tmp_path)
 
 if model_family == 'llama':
 from bigdl.llm.ggml.model.llama import Llama
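For a checkpoint folder, `from_pretrained` now routes through `llm_convert` instead of `bigdl.llm.ggml.convert_model`. Roughly, the internal step is equivalent to the following sketch (the folder path is a placeholder):

```python
from bigdl.llm import llm_convert

# what from_pretrained now does internally for a checkpoint folder
ggml_model_path = llm_convert(
    model="/path/to/checkpoint-folder/",  # placeholder
    outfile="./",                         # cache_dir
    model_family="llama",
    outtype="int4",                       # dtype
    model_format="pth",                   # or "gptq"
    tmp_path=None,
)
# the returned path is then handed to the family-specific loader,
# e.g. bigdl.llm.ggml.model.llama.Llama when model_family == 'llama'
```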