From c14a61681bdd337765a724dbd66f25941667d410 Mon Sep 17 00:00:00 2001
From: "Wang, Jian4" <61138589+hzjane@users.noreply.github.com>
Date: Mon, 23 Oct 2023 11:28:20 +0800
Subject: [PATCH] Add load low-bit in model-serving to reduce EPC (#9239)

* init load low-bit

* fix

* fix
---
 .../src/bigdl/llm/serving/bigdl_llm_model.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/python/llm/src/bigdl/llm/serving/bigdl_llm_model.py b/python/llm/src/bigdl/llm/serving/bigdl_llm_model.py
index 0f09b346..6c8cc780 100644
--- a/python/llm/src/bigdl/llm/serving/bigdl_llm_model.py
+++ b/python/llm/src/bigdl/llm/serving/bigdl_llm_model.py
@@ -256,10 +256,28 @@ class BigDLLLMAdapter(BaseModelAdapter):
         return model, tokenizer
 
 
+class BigDLLMLOWBITAdapter(BaseModelAdapter):
+    "Model adapter for bigdl-llm backend low-bit models"
+
+    def match(self, model_path: str):
+        return "bigdl-lowbit" in model_path
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=False, revision=revision
+        )
+        print("Customized bigdl-llm loader")
+        from bigdl.llm.transformers import AutoModelForCausalLM
+        model = AutoModelForCausalLM.load_low_bit(model_path)
+        return model, tokenizer
+
+
 def patch_fastchat():
     global is_fastchat_patched
     if is_fastchat_patched:
         return
+    register_model_adapter(BigDLLMLOWBITAdapter)
     register_model_adapter(BigDLLLMAdapter)
     mapping_fastchat = _get_patch_map()
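
Note on usage: the new adapter dispatches purely on the substring "bigdl-lowbit"
in the model path, so a checkpoint has to be converted and saved in BigDL-LLM's
low-bit format under a matching directory name before it can be served. The sketch
below shows that preparation step. It is illustrative, not part of the patch: the
source and destination paths are made up, and it assumes the load_in_4bit /
save_low_bit API exposed by bigdl.llm.transformers.AutoModelForCausalLM.

    # Prepare a low-bit checkpoint that BigDLLMLOWBITAdapter will match.
    # Illustrative paths; the directory name must contain "bigdl-lowbit"
    # so that match() selects this adapter at serving time.
    from transformers import AutoTokenizer
    from bigdl.llm.transformers import AutoModelForCausalLM

    src_path = "meta-llama/Llama-2-7b-chat-hf"   # any HF checkpoint (illustrative)
    dst_path = "./llama-2-7b-bigdl-lowbit"       # name contains "bigdl-lowbit"

    # Quantize once on load, then persist the quantized weights so the
    # serving process can call load_low_bit() directly instead of loading
    # the full-precision model and quantizing it again.
    model = AutoModelForCausalLM.from_pretrained(src_path, load_in_4bit=True)
    model.save_low_bit(dst_path)

    # load_model() reloads the tokenizer from the same path, so save it too.
    tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
    tokenizer.save_pretrained(dst_path)

Loading pre-quantized weights this way avoids ever materializing the
full-precision model in memory, which is what reduces pressure on the EPC
(presumably the SGX Enclave Page Cache) when the server runs inside an enclave.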