[FastChat-integration] Add initial implementation for loader (#10323)

* add initial implementation for loader

* add test method for model_loader

* data

* Refine
Guancheng Fu 2024-03-12 10:54:59 +08:00 committed by GitHub
parent 17bdb1a60b
commit cc4148636d
4 changed files with 168 additions and 1 deletions


@@ -15,7 +15,7 @@
 #
-from .convert import ggml_convert_low_bit
+from .convert import ggml_convert_low_bit, get_enable_ipex
 from .model import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM, \
     AutoModelForSpeechSeq2Seq, AutoModelForQuestionAnswering, \
     AutoModelForSequenceClassification, AutoModelForMaskedLM, \
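
With this re-export, downstream code (such as the loader added below) can pull the helper from the package root next to the existing Auto* classes; a minimal sketch:

from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel, get_enable_ipex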


@@ -590,6 +590,14 @@ def _optimize_pre(model):
     return model


+def get_enable_ipex(low_bit):
+    _enable_ipex = os.getenv("BIGDL_OPT_IPEX")
+    _enable_ipex = (_enable_ipex is not None) and (_enable_ipex.lower() == "true")
+    qtype = ggml_tensor_qtype[low_bit]
+    _enable_ipex = _enable_ipex and (qtype == ggml_tensor_qtype["bf16"])
+    return _enable_ipex
+
+
 def ggml_convert_low_bit(model, qtype, optimize_model=True,
                          convert_shape_only=False, device="cpu",
                          modules_to_not_convert=None, cpu_embedding=False,
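
A small usage sketch of the new helper; the behaviour follows directly from the function body: the BIGDL_OPT_IPEX environment variable must be set to "true" and the requested low_bit must be bf16.

import os
from bigdl.llm.transformers import get_enable_ipex

os.environ["BIGDL_OPT_IPEX"] = "true"
get_enable_ipex("bf16")      # True: env variable set and bf16 requested
get_enable_ipex("sym_int4")  # False: IPEX stays disabled for any non-bf16 qtype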


@@ -0,0 +1,15 @@
# TODO: move this to a different repo
repo_id:
# - 'THUDM/chatglm-6b'
# - 'THUDM/chatglm2-6b'
- 'meta-llama/Llama-2-7b-chat-hf'
- 'baichuan-inc/Baichuan2-7B-Chat'
- 'Qwen/Qwen-7B-Chat'
# - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variable pointing to the llava dir; currently added only for the GPU Windows-related test_api
local_model_hub: 'path to your local model hub'
low_bit:
- 'sym_int4' # defaults to 'sym_int4' (i.e. symmetric int4)
- 'bf16'
device:
- 'cpu'
# - 'xpu'
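
As a hedged illustration of how these fields are consumed by the loader script below (the hub path '/mnt/models' is made up for the example): local_model_hub is joined with the part of each repo_id after the '/', and the resulting folder must already exist.

get_model_path('meta-llama/Llama-2-7b-chat-hf', '/mnt/models')
# -> '/mnt/models/Llama-2-7b-chat-hf' (the check in get_model_path requires this folder to exist)
get_model_path('meta-llama/Llama-2-7b-chat-hf', None)
# -> 'meta-llama/Llama-2-7b-chat-hf' (falls back to the HF repo id)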


@@ -0,0 +1,144 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file provides an interface for loading models in other repos like FastChat
import os
import torch
from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel, get_enable_ipex
import time
from datetime import date
import argparse
from bigdl.llm.utils.common import invalidInputError
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer

LLAMA_IDS = ['llama', 'vicuna', 'merged-baize']

def get_tokenizer_cls(model_path: str):
    return LlamaTokenizer if any(llama_id in model_path.lower() for llama_id in LLAMA_IDS) \
        else AutoTokenizer


def get_model_cls(model_path: str, low_bit: str):
    if "chatglm" in model_path.lower() and low_bit == "bf16":
        invalidInputError(False,
                          "Currently, PyTorch does not support "
                          "bfloat16 on CPU for chatglm models.")
    return AutoModel if "chatglm" in model_path.lower() else AutoModelForCausalLM

def load_model(
    model_path: str,
    device: str = "cpu",
    low_bit: str = 'sym_int4',
    trust_remote_code: bool = True,
):
    """Load a model using BigDL LLM backend."""
    # Do a sanity check for device:
    invalidInputError(device == 'cpu' or device == 'xpu',
                      "BigDL-LLM only supports device cpu or xpu")
    tokenizer_cls = get_tokenizer_cls(model_path)
    model_cls = get_model_cls(model_path, low_bit)
    model_kwargs = {"use_cache": True}
    if trust_remote_code:
        model_kwargs["trust_remote_code"] = True
    if low_bit == "bf16":
        model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": torch.bfloat16})
    else:
        model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": 'auto'})
    # Load tokenizer
    tokenizer = tokenizer_cls.from_pretrained(model_path, trust_remote_code=True)
    model = model_cls.from_pretrained(model_path, **model_kwargs)
    if not get_enable_ipex(low_bit):
        model = model.eval()
    if device == "xpu":
        import intel_extension_for_pytorch as ipex
        model = model.to('xpu')
    return model, tokenizer

def try_run_test_generation(local_model_hub, model_path, device, low_bit):
    path = get_model_path(model_path, local_model_hub)
    try:
        run_test_generation(path, device, low_bit)
    except Exception:
        print(f"Loading model failed for model {model_path} "
              f"with device:{device} and low_bit:{low_bit}")
        return "False"
    return "True"

def get_model_path(repo_id, local_model_hub):
    if local_model_hub:
        repo_model_name = repo_id.split("/")[1]
        local_model_path = local_model_hub + os.path.sep + repo_model_name
        invalidInputError(os.path.isdir(local_model_path),
                          local_model_path + " does not exist! Please check your models' folder.")
        return local_model_path
    else:
        return repo_id

def run_test_generation(model_path, device, low_bit):
    model, tokenizer = load_model(model_path, device, low_bit, True)
    with torch.inference_mode():
        prompt = "What is AI?"
        # TODO: if gpu, will need to move the tensor to xpu
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        if device == 'xpu':
            input_ids = input_ids.to('xpu')
        st = time.time()
        # if your selected model is capable of utilizing previous key/value attentions
        # to enhance decoding speed, but has `"use_cache": false` in its model config,
        # it is important to set `use_cache=True` explicitly in the `generate` function
        # to obtain optimal performance with BigDL-LLM INT4 optimizations
        output = model.generate(input_ids,
                                max_new_tokens=32)
        end = time.time()
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        print(f'Inference time: {end-st} s')
        print('-'*20, 'Prompt', '-'*20)
        print(prompt)
        print('-'*20, 'Output', '-'*20)
        print(output_str)

# Provide a main method for test loads
# Note that this only tests loading models, not generation correctness
if __name__ == '__main__':
    # TODO: move config.yaml to a different folder
    current_dir = os.path.dirname(os.path.realpath(__file__))
    results = []
    from omegaconf import OmegaConf
    conf = OmegaConf.load(f'{current_dir}/load_config.yaml')
    today = date.today()
    import pandas as pd
    csv_name = f'{current_dir}/loader-results-{today}.csv'

    for model in conf.repo_id:
        for low_bit in conf.low_bit:
            for device in conf.device:
                result = try_run_test_generation(conf['local_model_hub'], model, device, low_bit)
                results.append([model, device, low_bit, result])

    df = pd.DataFrame(results, columns=['model', 'device', 'low_bit', 'result'])
    df.to_csv(csv_name)
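
For context, a hedged sketch of how an external caller such as FastChat could consume this interface. The import path bigdl.llm.serving.model_loader is an assumption for illustration (it depends on where this file ultimately lives), and the model id is just one entry from the config above.

# Hypothetical call site in a FastChat-style worker; the module path below is assumed.
from bigdl.llm.serving.model_loader import load_model

model, tokenizer = load_model(
    "meta-llama/Llama-2-7b-chat-hf",  # HF repo id or a local model directory
    device="cpu",                      # or "xpu" for Intel GPUs
    low_bit="sym_int4",                # any low_bit listed in the config, e.g. "bf16"
    trust_remote_code=True,
)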