[FastChat-integration] Add initial implementation for loader (#10323)

* add initial implementation for loader

* add test method for model_loader

* data

* Refine
Guancheng Fu 2024-03-12 10:54:59 +08:00 committed by GitHub
parent 17bdb1a60b
commit cc4148636d
4 changed files with 168 additions and 1 deletions


@@ -15,7 +15,7 @@
 #
-from .convert import ggml_convert_low_bit
+from .convert import ggml_convert_low_bit, get_enable_ipex
 from .model import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM, \
     AutoModelForSpeechSeq2Seq, AutoModelForQuestionAnswering, \
     AutoModelForSequenceClassification, AutoModelForMaskedLM, \
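
With this re-export, downstream code (such as the loader added below) can pull the helper from the package root next to the existing Auto* classes; a minimal sketch:

from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel, get_enable_ipex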


@@ -590,6 +590,14 @@ def _optimize_pre(model):
     return model


+def get_enable_ipex(low_bit):
+    _enable_ipex = os.getenv("BIGDL_OPT_IPEX")
+    _enable_ipex = (_enable_ipex is not None) and (_enable_ipex.lower() == "true")
+    qtype = ggml_tensor_qtype[low_bit]
+    _enable_ipex = _enable_ipex and (qtype == ggml_tensor_qtype["bf16"])
+    return _enable_ipex
+
+
 def ggml_convert_low_bit(model, qtype, optimize_model=True,
                          convert_shape_only=False, device="cpu",
                          modules_to_not_convert=None, cpu_embedding=False,
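
A small usage sketch of the new helper; the behaviour follows directly from the function body: the BIGDL_OPT_IPEX environment variable must be set to "true" and the requested low_bit must be bf16.

import os
from bigdl.llm.transformers import get_enable_ipex

os.environ["BIGDL_OPT_IPEX"] = "true"
get_enable_ipex("bf16")      # True: env variable set and bf16 requested
get_enable_ipex("sym_int4")  # False: IPEX stays disabled for any non-bf16 qtype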


@@ -0,0 +1,15 @@
# TODO: move this to a different repo
repo_id:
# - 'THUDM/chatglm-6b'
# - 'THUDM/chatglm2-6b'
- 'meta-llama/Llama-2-7b-chat-hf'
- 'baichuan-inc/Baichuan2-7B-Chat'
- 'Qwen/Qwen-7B-Chat'
# - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variable pointing to the llava dir; currently added only for the GPU Windows-related test_api
local_model_hub: 'path to your local model hub'
low_bit:
- 'sym_int4' # defaults to 'sym_int4' (i.e. symmetric int4)
- 'bf16'
device:
- 'cpu'
# - 'xpu'
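
As a hedged illustration of how these fields are consumed by the loader script below (the hub path '/mnt/models' is made up for the example): local_model_hub is joined with the part of each repo_id after the '/', and the resulting folder must already exist.

get_model_path('meta-llama/Llama-2-7b-chat-hf', '/mnt/models')
# -> '/mnt/models/Llama-2-7b-chat-hf' (the check in get_model_path requires this folder to exist)
get_model_path('meta-llama/Llama-2-7b-chat-hf', None)
# -> 'meta-llama/Llama-2-7b-chat-hf' (falls back to the HF repo id)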


@@ -0,0 +1,144 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file provides an interface for loading models in other repos like FastChat
import os
import torch
from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel, get_enable_ipex
import time
from datetime import date
import argparse
from bigdl.llm.utils.common import invalidInputError
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer

LLAMA_IDS = ['llama', 'vicuna', 'merged-baize']

def get_tokenizer_cls(model_path: str):
    return LlamaTokenizer if any(llama_id in model_path.lower() for llama_id in LLAMA_IDS) \
        else AutoTokenizer


def get_model_cls(model_path: str, low_bit: str):
    if "chatglm" in model_path.lower() and low_bit == "bf16":
        invalidInputError(False,
                          "Currently, PyTorch does not support "
                          "bfloat16 on CPU for chatglm models.")
    return AutoModel if "chatglm" in model_path.lower() else AutoModelForCausalLM

def load_model(
    model_path: str,
    device: str = "cpu",
    low_bit: str = 'sym_int4',
    trust_remote_code: bool = True,
):
    """Load a model using BigDL LLM backend."""
    # Do a sanity check for device:
    invalidInputError(device == 'cpu' or device == 'xpu',
                      "BigDL-LLM only supports device cpu or xpu")
    tokenizer_cls = get_tokenizer_cls(model_path)
    model_cls = get_model_cls(model_path, low_bit)
    model_kwargs = {"use_cache": True}
    if trust_remote_code:
        model_kwargs["trust_remote_code"] = True
    if low_bit == "bf16":
        model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": torch.bfloat16})
    else:
        model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": 'auto'})
    # Load tokenizer
    tokenizer = tokenizer_cls.from_pretrained(model_path, trust_remote_code=True)
    model = model_cls.from_pretrained(model_path, **model_kwargs)
    if not get_enable_ipex(low_bit):
        model = model.eval()
    if device == "xpu":
        import intel_extension_for_pytorch as ipex
        model = model.to('xpu')
    return model, tokenizer

def try_run_test_generation(local_model_hub, model_path, device, low_bit):
    path = get_model_path(model_path, local_model_hub)
    try:
        run_test_generation(path, device, low_bit)
    except Exception:
        print(f"Loading model failed for model {model_path} "
              f"with device:{device} and low_bit:{low_bit}")
        return "False"
    return "True"

def get_model_path(repo_id, local_model_hub):
    if local_model_hub:
        repo_model_name = repo_id.split("/")[1]
        local_model_path = local_model_hub + os.path.sep + repo_model_name
        invalidInputError(os.path.isdir(local_model_path),
                          local_model_path + " does not exist! Please check your models' folder.")
        return local_model_path
    else:
        return repo_id

def run_test_generation(model_path, device, low_bit):
    model, tokenizer = load_model(model_path, device, low_bit, True)
    with torch.inference_mode():
        prompt = "What is AI?"
        # TODO: if gpu, will need to move the tensor to xpu
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        if device == 'xpu':
            input_ids = input_ids.to('xpu')
        st = time.time()
        # if your selected model is capable of utilizing previous key/value attentions
        # to enhance decoding speed, but has `"use_cache": false` in its model config,
        # it is important to set `use_cache=True` explicitly in the `generate` function
        # to obtain optimal performance with BigDL-LLM INT4 optimizations
        output = model.generate(input_ids,
                                max_new_tokens=32)
        end = time.time()
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        print(f'Inference time: {end-st} s')
        print('-'*20, 'Prompt', '-'*20)
        print(prompt)
        print('-'*20, 'Output', '-'*20)
        print(output_str)

# Provide a main method for test loads
# Note that this only tests loading models, not generation correctness
if __name__ == '__main__':
    # TODO: move config.yaml to a different folder
    current_dir = os.path.dirname(os.path.realpath(__file__))
    results = []
    from omegaconf import OmegaConf
    conf = OmegaConf.load(f'{current_dir}/load_config.yaml')
    today = date.today()
    import pandas as pd
    csv_name = f'{current_dir}/loader-results-{today}.csv'

    for model in conf.repo_id:
        for low_bit in conf.low_bit:
            for device in conf.device:
                result = try_run_test_generation(conf['local_model_hub'], model, device, low_bit)
                results.append([model, device, low_bit, result])

    df = pd.DataFrame(results, columns=['model', 'device', 'low_bit', 'result'])
    df.to_csv(csv_name)
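
For context, a hedged sketch of how an external caller such as FastChat could consume this interface. The import path bigdl.llm.serving.model_loader is an assumption for illustration (it depends on where this file ultimately lives), and the model id is just one entry from the config above.

# Hypothetical call site in a FastChat-style worker; the module path below is assumed.
from bigdl.llm.serving.model_loader import load_model

model, tokenizer = load_model(
    "meta-llama/Llama-2-7b-chat-hf",  # HF repo id or a local model directory
    device="cpu",                      # or "xpu" for Intel GPUs
    low_bit="sym_int4",                # any low_bit listed in the config, e.g. "bf16"
    trust_remote_code=True,
)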