[FastChat-integration] Add initial implementation for loader (#10323)

* add initial implementation for loader
* add test method for model_loader
* data
* Refine
This commit is contained in:
parent 17bdb1a60b
commit cc4148636d
4 changed files with 168 additions and 1 deletion
@@ -15,7 +15,7 @@
 #
-from .convert import ggml_convert_low_bit
+from .convert import ggml_convert_low_bit, get_enable_ipex
 from .model import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM, \
     AutoModelForSpeechSeq2Seq, AutoModelForQuestionAnswering, \
     AutoModelForSequenceClassification, AutoModelForMaskedLM, \
@@ -590,6 +590,14 @@ def _optimize_pre(model):
     return model
 
 
+def get_enable_ipex(low_bit):
+    _enable_ipex = os.getenv("BIGDL_OPT_IPEX")
+    _enable_ipex = (_enable_ipex is not None) and (_enable_ipex.lower() == "true")
+    qtype = ggml_tensor_qtype[low_bit]
+    _enable_ipex = _enable_ipex and (qtype == ggml_tensor_qtype["bf16"])
+    return _enable_ipex
+
+
 def ggml_convert_low_bit(model, qtype, optimize_model=True,
                          convert_shape_only=False, device="cpu",
                          modules_to_not_convert=None, cpu_embedding=False,
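For context, get_enable_ipex only reports True when the BIGDL_OPT_IPEX environment variable is set to "true" and the requested precision is bf16. A minimal sketch of how a caller might check it, assuming bigdl-llm is installed and the symbol is re-exported from bigdl.llm.transformers as the import change above and loader.py below suggest:

import os
from bigdl.llm.transformers import get_enable_ipex

# The IPEX path is only considered when the env var is "true" and low_bit is bf16.
os.environ["BIGDL_OPT_IPEX"] = "true"
print(get_enable_ipex("bf16"))      # True: env var enabled and precision is bf16
print(get_enable_ipex("sym_int4"))  # False: IPEX is not used for int4
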
python/llm/src/bigdl/llm/transformers/load_config.yaml (new file, 15 lines)
@@ -0,0 +1,15 @@
+# TODO: move this to a different repo
+repo_id:
+  # - 'THUDM/chatglm-6b'
+  # - 'THUDM/chatglm2-6b'
+  - 'meta-llama/Llama-2-7b-chat-hf'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'Qwen/Qwen-7B-Chat'
+  # - 'liuhaotian/llava-v1.5-7b'  # requires a LLAVA_REPO_DIR env variable pointing to the llava dir; added only for gpu win related test_api now
+local_model_hub: 'path to your local model hub'
+low_bit:
+  - 'sym_int4'  # default to use 'sym_int4' (i.e. symmetric int4)
+  - 'bf16'
+device:
+  - 'cpu'
+  # - 'xpu'
python/llm/src/bigdl/llm/transformers/loader.py (new file, 144 lines)
@@ -0,0 +1,144 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This file provides an interface for loading models in other repos like FastChat
+
+import torch
+
+from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel, get_enable_ipex
+import time
+from datetime import date
+import argparse
+from bigdl.llm.utils.common import invalidInputError
+from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
+
+LLAMA_IDS = ['llama', 'vicuna', 'merged-baize']
+
+
+def get_tokenizer_cls(model_path: str):
+    return LlamaTokenizer if any(llama_id in model_path.lower() for llama_id in LLAMA_IDS) \
+        else AutoTokenizer
+
+
+def get_model_cls(model_path: str, low_bit: str):
+    if "chatglm" in model_path.lower() and low_bit == "bf16":
+        invalidInputError(False,
+                          "Currently, PyTorch does not support "
+                          "bfloat16 on CPU for chatglm models.")
+    return AutoModel if "chatglm" in model_path.lower() else AutoModelForCausalLM
+
+
+def load_model(
+    model_path: str,
+    device: str = "cpu",
+    low_bit: str = 'sym_int4',
+    trust_remote_code: bool = True,
+):
+    """Load a model using BigDL LLM backend."""
+
+    # Do a sanity check for device:
+    invalidInputError(device == 'cpu' or device == 'xpu',
+                      "BigDL-LLM only supports device cpu or xpu")
+
+    tokenizer_cls = get_tokenizer_cls(model_path)
+    model_cls = get_model_cls(model_path, low_bit)
+    model_kwargs = {"use_cache": True}
+    if trust_remote_code:
+        model_kwargs["trust_remote_code"] = True
+    if low_bit == "bf16":
+        model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": torch.bfloat16})
+    else:
+        model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": 'auto'})
+
+    # Load tokenizer
+    tokenizer = tokenizer_cls.from_pretrained(model_path, trust_remote_code=True)
+    model = model_cls.from_pretrained(model_path, **model_kwargs)
+    if not get_enable_ipex(low_bit):
+        model = model.eval()
+
+    if device == "xpu":
+        import intel_extension_for_pytorch as ipex
+        model = model.to('xpu')
+
+    return model, tokenizer
+
+
+def try_run_test_generation(local_model_hub, model_path, device, low_bit):
+    path = get_model_path(model_path, local_model_hub)
+    try:
+        run_test_generation(path, device, low_bit)
+    except:
+        print(f"Loading model failed for model {model_path} \
+              with device:{device} and low_bit:{low_bit}")
+        return "False"
+    return "True"
+
+
+def get_model_path(repo_id, local_model_hub):
+    if local_model_hub:
+        repo_model_name = repo_id.split("/")[1]
+        local_model_path = local_model_hub + os.path.sep + repo_model_name
+        invalidInputError(os.path.isdir(local_model_path),
+                          local_model_path + " does not exist! Please check your models' folder.")
+        return local_model_path
+    else:
+        return repo_id
+
+
+def run_test_generation(model_path, device, low_bit):
+    model, tokenizer = load_model(model_path, device, low_bit, True)
+    with torch.inference_mode():
+        prompt = "What is AI?"
+        # TODO: if gpu, will need to move the tensor to xpu
+        input_ids = tokenizer.encode(prompt, return_tensors="pt")
+        if device == 'xpu':
+            input_ids = input_ids.to('xpu')
+        st = time.time()
+        # if your selected model is capable of utilizing previous key/value attentions
+        # to enhance decoding speed, but has `"use_cache": false` in its model config,
+        # it is important to set `use_cache=True` explicitly in the `generate` function
+        # to obtain optimal performance with BigDL-LLM INT4 optimizations
+        output = model.generate(input_ids,
+                                max_new_tokens=32)
+        end = time.time()
+        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+        print(f'Inference time: {end-st} s')
+        print('-'*20, 'Prompt', '-'*20)
+        print(prompt)
+        print('-'*20, 'Output', '-'*20)
+        print(output_str)
+
+
+# Provide a main method for test loads
+# Note that this only tests loading models, not generation correctness
+if __name__ == '__main__':
+    import os
+    # TODO: move config.yaml to a different folder
+    current_dir = os.path.dirname(os.path.realpath(__file__))
+    results = []
+    from omegaconf import OmegaConf
+    conf = OmegaConf.load(f'{current_dir}/load_config.yaml')
+    today = date.today()
+    import pandas as pd
+    csv_name = f'{current_dir}/loader-results-{today}.csv'
+    for model in conf.repo_id:
+        for low_bit in conf.low_bit:
+            for device in conf.device:
+                result = try_run_test_generation(conf['local_model_hub'], model, device, low_bit)
+                results.append([model, device, low_bit, result])
+
+    df = pd.DataFrame(results, columns=['model', 'device', 'low_bit', 'result'])
+    df.to_csv(csv_name)
+    results = []
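As a usage note, a minimal sketch of how an external project such as FastChat might call the new loader; this assumes bigdl-llm is installed and the weights are available, and reuses a model id and the default settings from load_config.yaml above rather than prescribing any particular deployment:

from bigdl.llm.transformers.loader import load_model

# Any HF-style repo id or local model directory should work here.
model, tokenizer = load_model("meta-llama/Llama-2-7b-chat-hf",
                              device="cpu",
                              low_bit="sym_int4",
                              trust_remote_code=True)
input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))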