Add low-bit loading in model serving to reduce EPC usage (#9239)
* init load low-bit
* fix
* fix
parent 0383306688
commit c14a61681b
1 changed file with 18 additions and 0 deletions
@@ -256,10 +256,28 @@ class BigDLLLMAdapter(BaseModelAdapter):
         return model, tokenizer
 
 
+class BigDLLMLOWBITAdapter(BaseModelAdapter):
+    "Model adapter for bigdl-llm backend low-bit models"
+
+    def match(self, model_path: str):
+        return "bigdl-lowbit" in model_path
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=False, revision=revision
+        )
+        print("Customized bigdl-llm loader")
+        from bigdl.llm.transformers import AutoModelForCausalLM
+        model = AutoModelForCausalLM.load_low_bit(model_path)
+        return model, tokenizer
+
+
 def patch_fastchat():
     global is_fastchat_patched
     if is_fastchat_patched:
         return
+    register_model_adapter(BigDLLMLOWBITAdapter)
     register_model_adapter(BigDLLLMAdapter)
     mapping_fastchat = _get_patch_map()
 
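For context, a checkpoint that this adapter can load would be produced ahead of time with bigdl-llm's save_low_bit API. The sketch below is illustrative and not part of the commit: the source model and paths are placeholder assumptions, and the output directory is deliberately named to contain "bigdl-lowbit" so that the match() rule above selects the new adapter.

# Illustrative only (not part of this commit): prepare a low-bit checkpoint
# that BigDLLMLOWBITAdapter can later serve. Model name and paths are
# hypothetical placeholders.
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"    # placeholder source model
save_path = "/models/llama-2-7b-bigdl-lowbit"   # must contain "bigdl-lowbit"

# Quantize to 4-bit while loading, then persist the already-quantized weights.
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True)
model.save_low_bit(save_path)

# Save the tokenizer alongside so load_model() finds it at the same path.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.save_pretrained(save_path)

Serving from such a pre-quantized checkpoint means the full-precision weights never need to be materialized at load time, which is presumably how the commit reduces EPC (SGX Enclave Page Cache) pressure.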
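One detail worth noting: patch_fastchat() registers BigDLLMLOWBITAdapter before the existing BigDLLLMAdapter. Assuming FastChat resolves adapters first-match-wins over the registration list, this ordering keeps a path containing "bigdl-lowbit" from being captured first by a broader bigdl match. A minimal stand-alone sketch of the routing rule follows; the adapter is stubbed here, since this hunk does not show its module path.

# Stubbed copy of the adapter's match() rule; the real class lives in the
# patched module, whose import path is not shown in this hunk.
class BigDLLMLOWBITAdapter:
    def match(self, model_path: str):
        return "bigdl-lowbit" in model_path

adapter = BigDLLMLOWBITAdapter()
print(adapter.match("/models/llama-2-7b-bigdl-lowbit"))  # True: low-bit loader
print(adapter.match("/models/llama-2-7b"))               # False: falls through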