transformer api refactor (#8389)

* transformer api refactor

* fix style

* add huggingface tokenizer usage in example and make the ggml tokenizer option 1 and the huggingface tokenizer option 2

* fix style
Shengsheng Huang 2023-06-25 17:15:33 +08:00 committed by GitHub
parent ce6d06eb0a
commit 446175cc05
6 changed files with 139 additions and 217 deletions


@@ -75,40 +75,46 @@ llm-cli -x llama -h
```
#### Transformers like API
You can also load the converted model using `BigdlForCausalLM` with a transformers-like API,
```python
from bigdl.llm.transformers import BigdlForCausalLM
llm = BigdlForCausalLM.from_pretrained("/path/to/llama-7b-int4/bigdl-llm-xxx.bin",
                                       model_family="llama")
prompt = "What is AI?"
```
and simply do inference end-to-end like
```python
output = llm(prompt, max_tokens=32)
```
If you need to separate the tokenization and generation, you can also do inference like
```python
tokens_id = llm.tokenize(prompt)
output_tokens_id = llm.generate(tokens_id, max_new_tokens=32)
output = llm.batch_decode(output_tokens_id)
```
Alternatively, you can load a huggingface model directly using `AutoModelForCausalLM.from_pretrained`.
```python
from bigdl.llm.transformers import AutoModelForCausalLM
# option 1: load huggingface checkpoint
llm = AutoModelForCausalLM.from_pretrained("/path/to/llama-7b-hf/",
                                           model_family="llama")
# option 2: load from huggingface hub repo
llm = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf",
                                           model_family="llama")
```
You can then use the model the same way as you use transformers.
```python
# Use transformers tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
tokens = tokenizer("what is ai").input_ids
tokens_id = llm.generate(tokens, max_new_tokens=32)
tokenizer.batch_decode(tokens_id)
```
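The conversion and loading steps compose end-to-end. The snippet below is a minimal sketch, assuming a locally downloaded checkpoint (the paths are placeholders, not files shipped with the library): it produces the converted binary with `llm_convert` and then loads it with `BigdlForCausalLM`.
```python
from bigdl.llm import llm_convert
from bigdl.llm.transformers import BigdlForCausalLM

# convert a downloaded huggingface checkpoint to an int4 ggml binary
# ("/path/to/llama-7b-hf/" is a placeholder for your own checkpoint folder)
bigdl_llm_path = llm_convert(model="/path/to/llama-7b-hf/",
                             outfile="./",
                             outtype="int4",
                             model_family="llama")

# load the converted binary and generate
llm = BigdlForCausalLM.from_pretrained(bigdl_llm_path, model_family="llama")
tokens_id = llm.tokenize("What is AI?")
output_tokens_id = llm.generate(tokens_id, max_new_tokens=32)
print(llm.batch_decode(output_tokens_id))
```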
#### llama-cpp-python like API


@@ -18,54 +18,49 @@ import time
import argparse


def convert(repo_id_or_model_path, model_family, tmp_path):
    from bigdl.llm import llm_convert
    original_llm_path = repo_id_or_model_path
    bigdl_llm_path = llm_convert(
        model=original_llm_path,
        outfile='./',
        outtype='int4',
        tmp_path=tmp_path,
        model_family=model_family)
    return bigdl_llm_path


def load(model_path, model_family, n_threads):
    from bigdl.llm.transformers import BigdlForCausalLM
    llm = BigdlForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_path,
        model_family=model_family,
        n_threads=n_threads)
    return llm


def inference(llm, repo_id_or_model_path, model_family, prompt):
    if model_family in ['llama', 'gptneox']:
        # ------ Option 1: Use bigdl-llm based tokenizer
        print('-'*20, ' bigdl-llm based tokenizer ', '-'*20)
        st = time.time()
        # please note that the prompt here can either be a string or a list of strings
        tokens_id = llm.tokenize(prompt)
        output_tokens_id = llm.generate(tokens_id, max_new_tokens=32)
        output = llm.batch_decode(output_tokens_id)
        print(f'Inference time: {time.time()-st} s')
        print(f'Output:\n{output}')

        # ------ Option 2: Use HuggingFace transformers tokenizer
        print('-'*20, ' HuggingFace transformers tokenizer ', '-'*20)
        print('Please note that the loading of HuggingFace transformers tokenizer may take some time.\n')
        # here is only a workaround for default example model 'decapoda-research/llama-7b-hf' in LLaMA family,
        # due to its out-of-date 'tokenizer_class' defined in its tokenizer_config.json.
        # for most cases, you could use `AutoTokenizer`.
        if model_family == 'llama':
            from transformers import LlamaTokenizer
@@ -84,17 +79,6 @@ def inference(llm, repo_id_or_model_path, model_family, prompt):
        print(f'Inference time: {time.time()-st} s')
        print(f'Output:\n{output}')

    if model_family in ['llama', 'gptneox', 'bloom']:
        # Option 3: fast forward
@@ -121,6 +105,8 @@ def main():
                             ', or the path to the huggingface checkpoint folder')
    parser.add_argument('--prompt', type=str, default='Q: What is CPU? A:',
                        help='Prompt to infer')
    parser.add_argument('--tmp-path', type=str, default='/tmp',
                        help='path to store intermediate model during the conversion process')

    args = parser.parse_args()
    repo_id_or_model_path = args.repo_id_or_model_path
@@ -132,12 +118,18 @@ def main():
    elif args.model_family == 'bloom':
        repo_id_or_model_path = 'bigscience/bloomz-7b1'

    # Step 1: convert original model to BigDL llm model
    bigdl_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path,
                             model_family=args.model_family,
                             tmp_path=args.tmp_path)

    # Step 2: load int4 model
    llm = load(model_path=bigdl_llm_path,
               model_family=args.model_family,
               n_threads=args.thread_num)

    # Step 3: inference
    inference(llm=llm,
              repo_id_or_model_path=repo_id_or_model_path,
              model_family=args.model_family,


@@ -1,22 +0,0 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This would makes sure Python is aware there is more than one sub-package within bigdl,
# physically located elsewhere.
# Otherwise there would be module not found error in non-pip's setting as Python would
# only search the first bigdl package and end up finding only one sub-package.
from .model import AutoModelForCausalLM


@@ -1,128 +0,0 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This would makes sure Python is aware there is more than one sub-package within bigdl,
# physically located elsewhere.
# Otherwise there would be module not found error in non-pip's setting as Python would
# only search the first bigdl package and end up finding only one sub-package.
import os
import traceback
from bigdl.llm.utils.common import invalidInputError
class AutoModelForCausalLM:
"""
A generic model class that mimics the behavior of
``transformers.AutoModelForCausalLM.from_pretrained`` API
"""
@classmethod
def from_pretrained(cls,
pretrained_model_name_or_path: str,
model_format: str = 'pth',
model_family: str = 'llama',
dtype: str = 'int4',
cache_dir: str = './',
tmp_path: str = None,
**kwargs):
"""
:param pretrained_model_name_or_path: We support 3 kinds of pretrained model checkpoint
1. Path to directory for Hugging Face checkpoint that are directly pulled from
Hugging Face hub.
If ``model_format='pth'``, the folder should contain: weight bin, tokenizer
config, tokenizer.model (required for llama) and added_tokens.json (if applied).
For lora fine tuned model, the path should be pointed to a merged weight.
If ``model_format='gptq'``, the folder should be a Hugging Face checkpoint
in GPTQ format, which contains weights in pytorch's .pt format,
and ``tokenizer.model``.
2. Path for converted BigDL-LLM optimized ggml binary checkpoint.
The checkpoint should be converted by ``bigdl.llm.llm_convert``.
3. A str for Hugging Face hub repo id.
:param model_format: Specify the model format to be converted. ``pth`` is for
PyTorch model checkpoint from Hugging Face. ``gptq`` is for GPTQ format
model from Hugging Face.
:param model_family: The model family of the pretrained checkpoint.
Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"`` and ``"starcoder"``.
:param dtype: Which quantized precision will be converted.
Now only `int4` and `int8` are supported, and `int8` only works for `llama`
, `gptneox` and `starcoder`.
:param cache_dir: (optional) This parameter will only be used when
``pretrained_model_name_or_path`` is a huggingface checkpoint or hub repo id.
It indicates the saving path for the converted low precision model.
:param tmp_path: (optional) Which path to store the intermediate fp16 model during the
conversion process. Default to `None` so that intermediate model will not be saved.
:param **kwargs: keyword arguments which will be passed to the model instance
:return: a model instance
"""
invalidInputError(model_family in ['llama', 'gptneox', 'bloom', 'starcoder'],
"Now we only support model family: 'llama', 'gptneox', 'bloom',"
" 'starcoder', '{}' is not in the list.".format(model_family))
invalidInputError(dtype.lower() in ['int4', 'int8'],
"Now we only support int4 and int8 as date type for weight")
# check whether pretrained_model_name_or_path exists.
# if not, it is likely that the user wants to pass in the repo id.
if not os.path.exists(pretrained_model_name_or_path):
try:
# download from Hugging Face based on repo id
from huggingface_hub import snapshot_download
pretrained_model_name_or_path = snapshot_download(
repo_id=pretrained_model_name_or_path)
except Exception as e:
traceback.print_exc()
# if downloading fails, it could be the case that repo id is invalid,
# or the user pass in the wrong path for checkpoint
invalidInputError(False,
"Downloadng from Hugging Face repo id {} failed. "
"Please input valid Hugging Face hub repo id, "
"or provide the valid path to Hugging Face / "
"BigDL-LLM optimized ggml binary checkpoint, "
"for pretrained_model_name_or_path"
.format(pretrained_model_name_or_path))
ggml_model_path = pretrained_model_name_or_path
# check whether pretrained_model_name_or_path is a file.
# if not, it is likely that pretrained_model_name_or_path
# points to a Hugging Face checkpoint
if not os.path.isfile(pretrained_model_name_or_path):
# Hugging Face checkpoint
from bigdl.llm import llm_convert
ggml_model_path = llm_convert(model=pretrained_model_name_or_path,
outfile=cache_dir,
model_family=model_family,
outtype=dtype,
model_format=model_format,
tmp_path=tmp_path)
if model_family == 'llama':
from bigdl.llm.ggml.model.llama import Llama
return Llama(model_path=ggml_model_path, **kwargs)
elif model_family == 'gptneox':
from bigdl.llm.ggml.model.gptneox import Gptneox
return Gptneox(model_path=ggml_model_path, **kwargs)
elif model_family == 'bloom':
from bigdl.llm.ggml.model.bloom import Bloom
return Bloom(model_path=ggml_model_path, **kwargs)
elif model_family == 'starcoder':
from bigdl.llm.ggml.model.starcoder import Starcoder
return Starcoder(model_path=ggml_model_path, **kwargs)
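With this class removed, its implicit download-and-convert behavior maps onto the explicit two-step flow used in the updated example script above. A rough migration sketch under that assumption (the checkpoint path and output directory below are placeholders, not part of this commit):
```python
# old, removed in this commit:
# from bigdl.llm.ggml.transformers import AutoModelForCausalLM
# llm = AutoModelForCausalLM.from_pretrained("/path/to/llama-7b-hf/",
#                                            model_family="llama",
#                                            dtype="int4",
#                                            cache_dir="./")

# new: convert explicitly, then load the produced ggml binary
from bigdl.llm import llm_convert
from bigdl.llm.transformers import BigdlForCausalLM

bigdl_llm_path = llm_convert(model="/path/to/llama-7b-hf/",  # placeholder checkpoint folder
                             outfile="./",
                             outtype="int4",
                             model_family="llama")
llm = BigdlForCausalLM.from_pretrained(bigdl_llm_path, model_family="llama")
```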


@@ -16,3 +16,4 @@
from .convert import ggml_convert_int4
from .model import AutoModelForCausalLM, AutoModel
from .modelling_bigdl import BigdlForCausalLM
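The practical effect of this refactor on user code is the import path: the classes now live under `bigdl.llm.transformers` instead of `bigdl.llm.ggml.transformers`. A minimal sketch of the change (the usage comments reflect the README snippets above):
```python
# before this commit (module removed):
# from bigdl.llm.ggml.transformers import AutoModelForCausalLM

# after this commit:
from bigdl.llm.transformers import AutoModelForCausalLM  # load a huggingface checkpoint or hub repo id
from bigdl.llm.transformers import BigdlForCausalLM      # load a converted ggml binary checkpoint
```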


@@ -0,0 +1,73 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This would makes sure Python is aware there is more than one sub-package within bigdl,
# physically located elsewhere.
# Otherwise there would be module not found error in non-pip's setting as Python would
# only search the first bigdl package and end up finding only one sub-package.
from bigdl.llm.utils.common import invalidInputError
class BigdlForCausalLM:
"""
A generic model class that mimics the behavior of
``transformers.LlamaForCausalLM.from_pretrained`` API
"""
@classmethod
def from_pretrained(cls,
pretrained_model_name_or_path: str,
model_family: str = 'llama',
dtype: str = 'int4',
**kwargs):
"""
:param pretrained_model_name_or_path: Path for converted BigDL-LLM optimized ggml
binary checkpoint. The checkpoint should be converted by ``bigdl.llm.llm_convert``.
:param model_family: The model family of the pretrained checkpoint.
Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"`` and ``"starcoder"``.
:param dtype: The quantized precision of the converted checkpoint.
Now only `int4` and `int8` are supported, and `int8` only works for `llama`,
`gptneox` and `starcoder`.
:param **kwargs: keyword arguments which will be passed to the model instance
:return: a model instance
"""
invalidInputError(model_family in ['llama', 'gptneox', 'bloom', 'starcoder'],
"Now we only support model family: 'llama', 'gptneox', 'bloom',"
" 'starcoder', '{}' is not in the list.".format(model_family))
invalidInputError(dtype.lower() in ['int4', 'int8'],
"Now we only support int4 and int8 as date type for weight")
ggml_model_path = pretrained_model_name_or_path
if model_family == 'llama':
from bigdl.llm.ggml.model.llama import Llama
return Llama(model_path=ggml_model_path, **kwargs)
elif model_family == 'gptneox':
from bigdl.llm.ggml.model.gptneox import Gptneox
return Gptneox(model_path=ggml_model_path, **kwargs)
elif model_family == 'bloom':
from bigdl.llm.ggml.model.bloom import Bloom
return Bloom(model_path=ggml_model_path, **kwargs)
elif model_family == 'starcoder':
from bigdl.llm.ggml.model.starcoder import Starcoder
return Starcoder(model_path=ggml_model_path, **kwargs)
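As a usage note, keyword arguments other than `model_family` and `dtype` are forwarded to the underlying ggml model class (`Llama`, `Gptneox`, `Bloom` or `Starcoder`), which is how the updated example passes `n_threads`. A minimal sketch, assuming a gptneox checkpoint already converted by `bigdl.llm.llm_convert` (the path is a placeholder):
```python
from bigdl.llm.transformers import BigdlForCausalLM

# the checkpoint path is a placeholder for a converted int4 ggml binary
llm = BigdlForCausalLM.from_pretrained(
    pretrained_model_name_or_path="/path/to/gptneox-7b-int4/bigdl-llm-xxx.bin",
    model_family="gptneox",
    n_threads=4)  # forwarded via **kwargs to bigdl.llm.ggml.model.gptneox.Gptneox

tokens_id = llm.tokenize("Q: What is CPU? A:")
output_tokens_id = llm.generate(tokens_id, max_new_tokens=32)
print(llm.batch_decode(output_tokens_id))
```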