Add load low-bit in model-serving to reduce EPC (#9239)

* init load low-bit

* fix

* fix
Wang, Jian4 2023-10-23 11:28:20 +08:00 committed by GitHub
parent 0383306688
commit c14a61681b


@@ -256,10 +256,28 @@ class BigDLLLMAdapter(BaseModelAdapter):
        return model, tokenizer


class BigDLLMLOWBITAdapter(BaseModelAdapter):
    "Model adapter for bigdl-llm backend low-bit models"

    def match(self, model_path: str):
        return "bigdl-lowbit" in model_path

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, use_fast=False, revision=revision
        )
        print("Customized bigdl-llm loader")
        from bigdl.llm.transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.load_low_bit(model_path)
        return model, tokenizer


def patch_fastchat():
    global is_fastchat_patched
    if is_fastchat_patched:
        return
    register_model_adapter(BigDLLMLOWBITAdapter)
    register_model_adapter(BigDLLLMAdapter)
    mapping_fastchat = _get_patch_map()
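
For context, load_low_bit only works on a checkpoint that was previously quantized and saved with bigdl-llm's save_low_bit, and the new adapter only matches model directories whose path contains "bigdl-lowbit". A minimal sketch of preparing such a checkpoint is shown below; the model id and save path are illustrative assumptions, not part of this change.

from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-7b-chat-hf"      # assumed source model; any causal LM works
save_path = "./llama-2-7b-chat-bigdl-lowbit"    # path contains "bigdl-lowbit" so match() picks the adapter

# Quantize once with bigdl-llm, then persist the low-bit weights and tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
model.save_low_bit(save_path)
AutoTokenizer.from_pretrained(model_id).save_pretrained(save_path)

Serving then points FastChat at save_path; the adapter calls AutoModelForCausalLM.load_low_bit(save_path), which loads the already-quantized weights directly instead of converting the full-precision checkpoint at startup, keeping peak memory (and hence EPC usage inside SGX enclaves) lower.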