Add low-bit loading in model serving to reduce EPC usage (#9239)
* init load low-bit
* fix
* fix
parent 0383306688
commit c14a61681b
1 changed file with 18 additions and 0 deletions
@@ -256,10 +256,28 @@ class BigDLLLMAdapter(BaseModelAdapter):
         return model, tokenizer
 
 
+class BigDLLMLOWBITAdapter(BaseModelAdapter):
+    "Model adapter for bigdl-llm backend low-bit models"
+
+    def match(self, model_path: str):
+        return "bigdl-lowbit" in model_path
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=False, revision=revision
+        )
+        print("Customized bigdl-llm loader")
+        from bigdl.llm.transformers import AutoModelForCausalLM
+        model = AutoModelForCausalLM.load_low_bit(model_path)
+        return model, tokenizer
+
+
 def patch_fastchat():
     global is_fastchat_patched
     if is_fastchat_patched:
         return
+    register_model_adapter(BigDLLMLOWBITAdapter)
     register_model_adapter(BigDLLLMAdapter)
     mapping_fastchat = _get_patch_map()
 
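For context, a checkpoint that this adapter can load would be produced ahead of time with bigdl-llm's save_low_bit API. The sketch below is illustrative and not part of the commit: the source model and paths are placeholder assumptions, and the output directory is deliberately named to contain "bigdl-lowbit" so that the match() rule above selects the new adapter.

# Illustrative only (not part of this commit): prepare a low-bit checkpoint
# that BigDLLMLOWBITAdapter can later serve. Model name and paths are
# hypothetical placeholders.
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"    # placeholder source model
save_path = "/models/llama-2-7b-bigdl-lowbit"   # must contain "bigdl-lowbit"

# Quantize to 4-bit while loading, then persist the already-quantized weights.
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True)
model.save_low_bit(save_path)

# Save the tokenizer alongside so load_model() finds it at the same path.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.save_pretrained(save_path)

Serving from such a pre-quantized checkpoint means the full-precision weights never need to be materialized at load time, which is presumably how the commit reduces EPC (SGX Enclave Page Cache) pressure.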
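One detail worth noting: patch_fastchat() registers BigDLLMLOWBITAdapter before the existing BigDLLLMAdapter. Assuming FastChat resolves adapters first-match-wins over the registration list, this ordering keeps a path containing "bigdl-lowbit" from being captured first by a broader bigdl match. A minimal stand-alone sketch of the routing rule follows; the adapter is stubbed here, since this hunk does not show its module path.

# Stubbed copy of the adapter's match() rule; the real class lives in the
# patched module, whose import path is not shown in this hunk.
class BigDLLMLOWBITAdapter:
    def match(self, model_path: str):
        return "bigdl-lowbit" in model_path

adapter = BigDLLMLOWBITAdapter()
print(adapter.match("/models/llama-2-7b-bigdl-lowbit"))  # True: low-bit loader
print(adapter.match("/models/llama-2-7b"))               # False: falls through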