Add load_low_bit in model-serving to reduce EPC (#9239)
* init load low-bit * fix * fix
This commit is contained in:
parent
0383306688
commit
c14a61681b
1 changed file with 18 additions and 0 deletions
|
|
@ -256,10 +256,28 @@ class BigDLLLMAdapter(BaseModelAdapter):
|
|||
return model, tokenizer
|
||||
|
||||
|
||||
class BigDLLMLOWBITAdapter(BaseModelAdapter):
    """Model adapter for bigdl-llm backend low-bit models."""

    def match(self, model_path: str):
        # Low-bit checkpoints are selected purely by a marker substring
        # in the model path.
        return "bigdl-lowbit" in model_path

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load a pre-converted low-bit model and its tokenizer.

        Returns a ``(model, tokenizer)`` pair, mirroring the other
        adapters' ``load_model`` contract.
        """
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=False,
            revision=revision,
        )
        print("Customized bigdl-llm loader")
        # Imported lazily so bigdl-llm is only required when this
        # adapter is actually used.
        from bigdl.llm.transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.load_low_bit(model_path)
        return model, tokenizer
|
||||
|
||||
|
||||
def patch_fastchat():
|
||||
global is_fastchat_patched
|
||||
if is_fastchat_patched:
|
||||
return
|
||||
register_model_adapter(BigDLLMLOWBITAdapter)
|
||||
register_model_adapter(BigDLLLMAdapter)
|
||||
mapping_fastchat = _get_patch_map()
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue