Add load low-bit in model-serving to reduce EPC (#9239)

* init load low-bit

* fix

* fix
Wang, Jian4 2023-10-23 11:28:20 +08:00 committed by GitHub
parent 0383306688
commit c14a61681b


@@ -256,10 +256,28 @@ class BigDLLLMAdapter(BaseModelAdapter):
        return model, tokenizer


class BigDLLMLOWBITAdapter(BaseModelAdapter):
    "Model adapter for bigdl-llm backend low-bit models"

    def match(self, model_path: str):
        return "bigdl-lowbit" in model_path

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, use_fast=False, revision=revision
        )
        print("Customized bigdl-llm loader")
        from bigdl.llm.transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.load_low_bit(model_path)
        return model, tokenizer


def patch_fastchat():
    global is_fastchat_patched
    if is_fastchat_patched:
        return
    register_model_adapter(BigDLLMLOWBITAdapter)
    register_model_adapter(BigDLLLMAdapter)
    mapping_fastchat = _get_patch_map()
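
For context, load_low_bit only works on a checkpoint that was previously quantized and saved with bigdl-llm's save_low_bit, and the new adapter only matches model directories whose path contains "bigdl-lowbit". A minimal sketch of preparing such a checkpoint is shown below; the model id and save path are illustrative assumptions, not part of this change.

from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-7b-chat-hf"      # assumed source model; any causal LM works
save_path = "./llama-2-7b-chat-bigdl-lowbit"    # path contains "bigdl-lowbit" so match() picks the adapter

# Quantize once with bigdl-llm, then persist the low-bit weights and tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)
model.save_low_bit(save_path)
AutoTokenizer.from_pretrained(model_id).save_pretrained(save_path)

Serving then points FastChat at save_path; the adapter calls AutoModelForCausalLM.load_low_bit(save_path), which loads the already-quantized weights directly instead of converting the full-precision checkpoint at startup, keeping peak memory (and hence EPC usage inside SGX enclaves) lower.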