ipex-llm/docker/llm/serving/cpu/docker/model_adapter.py.patch

--- model_adapter.py.old        2024-03-05 15:08:47.169275336 +0800
+++ model_adapter.py    2024-03-05 15:10:13.434703674 +0800
@@ -1690,15 +1690,17 @@
         )
         # NOTE: if you use the old version of model file, please remove the comments below
         # config.use_flash_attn = False
-        self.float_set(config, "fp16")
+        # self.float_set(config, "fp16")
         generation_config = GenerationConfig.from_pretrained(
             model_path, trust_remote_code=True
         )
+        from ipex_llm.transformers import AutoModelForCausalLM
         model = AutoModelForCausalLM.from_pretrained(
             model_path,
             config=config,
             low_cpu_mem_usage=True,
             trust_remote_code=True,
+            load_in_4bit=True,
             **from_pretrained_kwargs,
         ).eval()
         if hasattr(model.config, "use_dynamic_ntk") and model.config.use_dynamic_ntk: