diff --git a/python/llm/README.md b/python/llm/README.md
index 47a06784..2910ba9e 100644
--- a/python/llm/README.md
+++ b/python/llm/README.md
@@ -39,8 +39,9 @@ Here is an example to use `llm-convert` command line tool.
 # pth model
 llm-convert "/path/to/llama-7b-hf/" --model-format pth --outfile "/path/to/llama-7b-int4/" --model-family "llama"
 # gptq model
-llm-convert "/path/to/vicuna-13B-1.1-GPTQ-4bit-128g/" --model-format gptq --outfile "/path/to/out.bin" --model-family "llama"
+llm-convert "/path/to/vicuna-13B-1.1-GPTQ-4bit-128g/" --model-format gptq --outfile "/path/to/vicuna-13B-int4/" --model-family "llama"
 ```
+> An example GPTQ model can be found [here](https://huggingface.co/TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g/tree/main).
 
 Here is an example to use `llm_convert` python API.
 ```bash
diff --git a/python/llm/example/transformers/int4_pipeline.py b/python/llm/example/transformers/int4_pipeline.py
index 05b30ff5..d8ae9238 100644
--- a/python/llm/example/transformers/int4_pipeline.py
+++ b/python/llm/example/transformers/int4_pipeline.py
@@ -40,13 +40,13 @@ def convert_and_load(repo_id_or_model_path, model_family, n_threads):
     # to convert the downloaded Huggungface checkpoint first,
     # and then load the binary checkpoint directly.
     #
-    # from bigdl.llm.ggml import llm_convert
+    # from bigdl.llm import llm_convert
     #
     # model_path = repo_id_or_model_path
     # output_ckpt_path = llm_convert(
-    #     input_path=model_path,
-    #     output_path='./',
-    #     dtype='int4',
+    #     model=model_path,
+    #     outfile='./',
+    #     outtype='int4',
     #     model_family=model_family)
     #
     # llm = AutoModelForCausalLM.from_pretrained(
diff --git a/python/llm/src/bigdl/llm/convert_model.py b/python/llm/src/bigdl/llm/convert_model.py
index 56cf0345..49758ddc 100644
--- a/python/llm/src/bigdl/llm/convert_model.py
+++ b/python/llm/src/bigdl/llm/convert_model.py
@@ -19,6 +19,7 @@ from bigdl.llm.ggml.convert_model import convert_model as ggml_convert_model
 from bigdl.llm.gptq.convert.convert_gptq_to_ggml import convert_gptq2ggml
 from bigdl.llm.utils.common import invalidInputError
 import argparse
+import os
 
 
 def _special_kwarg_check(kwargs, check_args):
@@ -35,6 +36,46 @@ def llm_convert(model,
                 outtype='int4',
                 model_format="pth",
                 **kwargs):
+    """
+    This function is able to:
+
+    1. Convert Hugging Face llama-like / gpt-neox-like / bloom-like / starcoder-like
+       PyTorch model to lower precision in BigDL-LLM optimized GGML format.
+    2. Convert Hugging Face GPTQ format llama-like model to BigDL-LLM optimized
+       GGML format.
+
+    :param model: Path to a **directory**:
+
+           1. If ``model_format='pth'``, the folder should be a Hugging Face checkpoint
+              that is directly pulled from Hugging Face hub, for example ``./llama-7b-hf``.
+              This should be a dir path that contains: weight bin, tokenizer config,
+              tokenizer.model (required for llama) and added_tokens.json (if applied).
+              For lora finetuned model, the path should be pointed to a merged weight.
+           2. If ``model_format='gptq'``, the folder should be a Hugging Face checkpoint
+              in GPTQ format, which contains weights in pytorch's .pt format,
+              and ``tokenizer.model``.
+
+    :param outfile: Save path of output quantized model. You must pass a **directory** to
+           save all related output.
+    :param model_family: Which model family your input model belongs to.
+           Now ``llama``/``bloom``/``gptneox``/``starcoder`` are supported.
+           If ``model_format='gptq'``, only ``llama`` is supported.
+    :param outtype: Which quantized precision will be converted.
+           If ``model_format='pth'``, `int4` and `int8` are supported,
+           while `int8` only works for `llama` and `gptneox`.
+           If ``model_format='gptq'``, only ``int4`` is supported.
+    :param model_format: Specify the model format to be converted. ``pth`` is for
+           PyTorch model checkpoint from Hugging Face. ``gptq`` is for GPTQ format
+           model from Hugging Face.
+    :param **kwargs: Supported keyword arguments include:
+
+           * ``tmp_path``: Valid when ``model_format='pth'``. It refers to the path
+             that stores the intermediate model during the conversion process.
+           * ``tokenizer_path``: Valid when ``model_format='gptq'``. It refers to the path
+             where ``tokenizer.model`` is located (if it is not in the ``model`` directory).
+
+    :return: The path string to the converted lower precision checkpoint.
+    """
     if model_format == "pth":
         _, _used_args = _special_kwarg_check(kwargs=kwargs,
                                              check_args=["tmp_path"])
@@ -48,11 +89,23 @@ def llm_convert(model,
         invalidInputError(model_family == "llama" and outtype == 'int4',
                           "Convert GPTQ models should always "
                           "specify `--model-family llama --dtype int4` in the command line.")
+        invalidInputError(os.path.isdir(outfile),
+                          "The output_path {} is not a directory".format(outfile))
         _, _used_args = _special_kwarg_check(kwargs=kwargs,
                                              check_args=["tokenizer_path"])
+
+        output_filename = "bigdl_llm_{}_{}_from_gptq.bin".format(model_family,
+                                                                 outtype.lower())
+        outfile = os.path.join(outfile, output_filename)
+
+        if "tokenizer_path" in _used_args:
+            gptq_tokenizer_path = _used_args["tokenizer_path"]
+        else:
+            gptq_tokenizer_path = None
+
         convert_gptq2ggml(input_path=model,
                           output_path=outfile,
-                          tokenizer_path=_used_args["tokenizer_path"],
+                          tokenizer_path=gptq_tokenizer_path,
                           )
         return outfile
     else:
diff --git a/python/llm/src/bigdl/llm/ggml/convert_model.py b/python/llm/src/bigdl/llm/ggml/convert_model.py
index 495eac3f..066b6b02 100644
--- a/python/llm/src/bigdl/llm/ggml/convert_model.py
+++ b/python/llm/src/bigdl/llm/ggml/convert_model.py
@@ -29,17 +29,18 @@ def convert_model(input_path: str,
                   dtype: str = 'int4',
                   tmp_path: str = None):
     """
-    Convert Hugging Face llama-like / gpt-neox-like / bloom-like model to lower precision
+    Convert Hugging Face llama-like / gpt-neox-like / bloom-like / starcoder-like
+    PyTorch model to lower precision
 
-    :param input_path: Path to a *directory* for huggingface checkpoint that are directly
+    :param input_path: Path to a **directory** for huggingface checkpoint that is directly
            pulled from huggingface hub, for example `./llama-7b-hf`.
            This should be a dir path that contains: weight bin, tokenizer config,
            tokenizer.model (required for llama) and added_tokens.json (if applied).
            For lora finetuned model, the path should be pointed to a merged weight.
-    :param output_path: Save path of output quantized model. You must pass a *directory* to
+    :param output_path: Save path of output quantized model. You must pass a **directory** to
            save all related output.
     :param model_family: Which model family your input model belongs to.
-           Now only `llama`/`bloom`/`gptneox`/`starcoder` are supported.
+           Now only ``llama``/``bloom``/``gptneox``/``starcoder`` are supported.
     :param dtype: Which quantized precision will be converted.
            Now only `int4` and `int8` are supported, and `int8` only works for `llama`
            and `gptneox`.
diff --git a/python/llm/src/bigdl/llm/ggml/quantize.py b/python/llm/src/bigdl/llm/ggml/quantize.py
index 8a20477a..952b48e1 100644
--- a/python/llm/src/bigdl/llm/ggml/quantize.py
+++ b/python/llm/src/bigdl/llm/ggml/quantize.py
@@ -74,9 +74,9 @@ def quantize(input_path: str, output_path: str,
                       family('llama', 'bloom', 'gptneox', 'starcoder')",
                       "{} is not in the list.".format(model_family))
     invalidInputError(os.path.isfile(input_path),
-                      "The file {} was not found".format(input_path))
+                      "The file {} is not found".format(input_path))
     invalidInputError(os.path.isdir(output_path),
-                      "The output_path {} was not a directory".format(output_path))
+                      "The output_path {} is not a directory".format(output_path))
     # convert quantize type str into corresponding int value
     quantize_type_map = _quantize_type[model_family]
     output_filename = "bigdl_llm_{}_{}.bin".format(model_family,
diff --git a/python/llm/src/bigdl/llm/ggml/transformers/model.py b/python/llm/src/bigdl/llm/ggml/transformers/model.py
index 9fc8557f..86999bad 100644
--- a/python/llm/src/bigdl/llm/ggml/transformers/model.py
+++ b/python/llm/src/bigdl/llm/ggml/transformers/model.py
@@ -33,6 +33,7 @@ class AutoModelForCausalLM:
     @classmethod
     def from_pretrained(cls,
                         pretrained_model_name_or_path: str,
+                        model_format: str = 'pth',
                         model_family: str = 'llama',
                         dtype: str = 'int4',
                         cache_dir: str = './',
@@ -41,20 +42,30 @@
         """
         :param pretrained_model_name_or_path: We support 3 kinds of pretrained model checkpoint
 
-               1. path for huggingface checkpoint that are directly pulled from huggingface hub.
-                  This should be a dir path that contains: weight bin, tokenizer config,
-                  tokenizer.model (required for llama) and added_tokens.json (if applied).
-                  For lora fine tuned model, the path should be pointed to a merged weight.
-               2. path for converted ggml binary checkpoint. The checkpoint should be converted by
-                  ``bigdl.llm.ggml.convert_model``.
-               3. a str for huggingface hub repo id.
+               1. Path to directory for Hugging Face checkpoint that is directly pulled from
+                  Hugging Face hub.
 
-        :param model_family: the model family of the pretrained checkpoint.
+                  If ``model_format='pth'``, the folder should contain: weight bin, tokenizer
+                  config, tokenizer.model (required for llama) and added_tokens.json (if applied).
+                  For lora fine tuned model, the path should be pointed to a merged weight.
+
+                  If ``model_format='gptq'``, the folder should be a Hugging Face checkpoint
+                  in GPTQ format, which contains weights in pytorch's .pt format,
+                  and ``tokenizer.model``.
+
+               2. Path for converted BigDL-LLM optimized ggml binary checkpoint.
+                  The checkpoint should be converted by ``bigdl.llm.llm_convert``.
+               3. A str for Hugging Face hub repo id.
+
+        :param model_format: Specify the model format to be converted. ``pth`` is for
+               PyTorch model checkpoint from Hugging Face. ``gptq`` is for GPTQ format
+               model from Hugging Face.
+        :param model_family: The model family of the pretrained checkpoint.
                Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"`` and ``"starcoder"``.
         :param dtype: Which quantized precision will be converted.
                Now only `int4` and `int8` are supported, and `int8` only works for `llama`
                , `gptneox` and `starcoder`.
-        :param cache_dir: (optional) this parameter will only be used when
+        :param cache_dir: (optional) This parameter will only be used when
               ``pretrained_model_name_or_path`` is a hugginface checkpoint or hub repo id.
               It indicates the saving path for the converted low precision model.
         :param tmp_path: (optional) Which path to store the intermediate fp16 model during the
@@ -73,7 +84,7 @@ class AutoModelForCausalLM:
         # if not, it is likely that the user wants to pass in the repo id.
         if not os.path.exists(pretrained_model_name_or_path):
             try:
-                # download from huggingface based on repo id
+                # download from Hugging Face based on repo id
                 from huggingface_hub import snapshot_download
                 pretrained_model_name_or_path = snapshot_download(
                     repo_id=pretrained_model_name_or_path)
@@ -82,24 +93,26 @@ class AutoModelForCausalLM:
                 # if downloading fails, it could be the case that repo id is invalid,
                 # or the user pass in the wrong path for checkpoint
                 invalidInputError(False,
-                                  "Downloadng from huggingface repo id {} failed. "
-                                  "Please input valid huggingface hub repo id, "
-                                  "or provide the valid path to huggingface / "
-                                  "ggml binary checkpoint, for pretrained_model_name_or_path"
+                                  "Downloading from Hugging Face repo id {} failed. "
+                                  "Please input valid Hugging Face hub repo id, "
+                                  "or provide the valid path to Hugging Face / "
+                                  "BigDL-LLM optimized ggml binary checkpoint, "
+                                  "for pretrained_model_name_or_path"
                                   .format(pretrained_model_name_or_path))
 
         ggml_model_path = pretrained_model_name_or_path
         # check whether pretrained_model_name_or_path is a file.
         # if not, it is likely that pretrained_model_name_or_path
-        # points to a huggingface checkpoint
+        # points to a Hugging Face checkpoint
         if not os.path.isfile(pretrained_model_name_or_path):
-            # huggingface checkpoint
-            from bigdl.llm.ggml import convert_model
-            ggml_model_path = convert_model(input_path=pretrained_model_name_or_path,
-                                            output_path=cache_dir,
-                                            model_family=model_family,
-                                            dtype=dtype,
-                                            tmp_path=tmp_path)
+            # Hugging Face checkpoint
+            from bigdl.llm import llm_convert
+            ggml_model_path = llm_convert(model=pretrained_model_name_or_path,
+                                          outfile=cache_dir,
+                                          model_family=model_family,
+                                          outtype=dtype,
+                                          model_format=model_format,
+                                          tmp_path=tmp_path)
 
         if model_family == 'llama':
             from bigdl.llm.ggml.model.llama import Llama
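
For reference, a minimal sketch of calling the reworked `llm_convert` entry point on a GPTQ checkpoint after this change. The paths below are placeholders; per the new check, `outfile` must be an existing directory, and the converted file name (e.g. `bigdl_llm_llama_int4_from_gptq.bin`) is generated inside it.

```python
# Sketch with placeholder paths: convert a GPTQ llama checkpoint to the
# BigDL-LLM optimized GGML format using the updated keyword names.
from bigdl.llm import llm_convert

output_ckpt_path = llm_convert(
    model="/path/to/vicuna-13B-1.1-GPTQ-4bit-128g/",  # GPTQ folder with .pt weights and tokenizer.model
    outfile="/path/to/vicuna-13B-int4/",              # must be an existing directory
    model_family="llama",                             # only llama is supported for GPTQ input
    outtype="int4",                                   # only int4 is supported for GPTQ input
    model_format="gptq",
    # tokenizer_path="/path/to/tokenizer.model",      # only needed if tokenizer.model
    #                                                 # is not inside the model directory
)
print(output_ckpt_path)  # path to the converted GGML binary inside outfile
```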
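Similarly, a sketch of the transformers-style API that now accepts `model_format`. The checkpoint path is a placeholder, and `AutoModelForCausalLM` is assumed to be importable from `bigdl.llm.ggml.transformers`, the package that contains the modified `model.py`.

```python
# Sketch with a placeholder path: one-step load of a Hugging Face checkpoint
# (or hub repo id); from_pretrained converts it via llm_convert under the hood
# and then loads the resulting GGML binary.
from bigdl.llm.ggml.transformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "/path/to/llama-7b-hf/",  # HF checkpoint dir, converted ggml binary, or hub repo id
    model_format="pth",       # new parameter; pass "gptq" for a GPTQ checkpoint folder
    model_family="llama",
    dtype="int4",
    cache_dir="./",           # where the converted binary is saved
)
```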