From 74997a3ed19226951b9cf9e3041753c0e6adfc19 Mon Sep 17 00:00:00 2001
From: Guancheng Fu <110874468+gc-fu@users.noreply.github.com>
Date: Mon, 13 May 2024 15:30:19 +0800
Subject: [PATCH] Adding load_low_bit interface for ipex_llm_worker (#11000)

* initial implementation, need tests

* fix

* fix baichuan issue

* fix typo
---
 .../src/ipex_llm/serving/fastchat/README.md   | 26 +++++---
 .../serving/fastchat/ipex_llm_worker.py       | 17 ++++-
 .../ipex_llm/transformers/load_config.yaml    | 11 ++--
 .../llm/src/ipex_llm/transformers/loader.py   | 65 ++++++++++++-------
 4 files changed, 81 insertions(+), 38 deletions(-)

diff --git a/python/llm/src/ipex_llm/serving/fastchat/README.md b/python/llm/src/ipex_llm/serving/fastchat/README.md
index 553f5fa7..1408e050 100644
--- a/python/llm/src/ipex_llm/serving/fastchat/README.md
+++ b/python/llm/src/ipex_llm/serving/fastchat/README.md
@@ -46,7 +46,7 @@ pip install --pre --upgrade ipex-llm[xpu,serving] --extra-index-url https://pyto
 You need first run the fastchat controller
 
 ```bash
-python3 -m fastchat.serve.controller
+python -m fastchat.serve.controller
 ```
 
 ### Launch model worker(s) and load models
@@ -63,14 +63,22 @@ To run the `ipex_llm_worker` on CPU, using the following code:
 source ipex-llm-init -t
 
 # Available low_bit format including sym_int4, sym_int8, bf16 etc.
-python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "cpu"
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "cpu"
 ```
 
 For GPU example:
 
 ```bash
 # Available low_bit format including sym_int4, sym_int8, fp16 etc.
-python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "xpu"
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "xpu"
+```
+
+We have also provided an option `--load-low-bit-model` to load models that have been converted and saved to disk using the `save_low_bit` interface as introduced in this [document](https://github.com/intel-analytics/ipex-llm/blob/main/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/README.md).
+
+Check the following examples:
+```bash
+# Or --device "cpu"
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path /Low/Bit/Model/Path --load-low-bit-model --trust-remote-code --device "xpu"
 ```
 
 #### For self-speculative decoding example:
@@ -80,14 +88,14 @@ You can use IPEX-LLM to run `self-speculative decoding` example. Refer to [here]
 ```bash
 # Available low_bit format only including bf16 on CPU.
 source ipex-llm-init -t
-python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "bf16" --trust-remote-code --device "cpu" --speculative
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "bf16" --trust-remote-code --device "cpu" --speculative
 
 # Available low_bit format only including fp16 on GPU.
 source /opt/intel/oneapi/setvars.sh
 export ENABLE_SDP_FUSION=1
 export SYCL_CACHE_PERSISTENT=1
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "fp16" --trust-remote-code --device "xpu" --speculative
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "fp16" --trust-remote-code --device "xpu" --speculative
 ```
 
 For a full list of accepted arguments, you can refer to the main method of the `ipex_llm_worker.py`
@@ -100,16 +108,16 @@ To run using the `vLLM_worker`, we don't need to change model name, just simply
 
 ```bash
 # On CPU
-python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device cpu
+python -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device cpu
 
 # On GPU
-python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu
+python -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu
 ```
 
 ### Launch Gradio web server
 
 ```bash
-python3 -m fastchat.serve.gradio_web_server
+python -m fastchat.serve.gradio_web_server
 ```
 
 This is the user interface that users will interact with.
@@ -121,5 +129,5 @@ By following these steps, you will be able to serve your models using the web UI
 To start an OpenAI API server that provides compatible APIs using IPEX-LLM backend, you can launch the `openai_api_server` and follow this [doc](https://github.com/lm-sys/FastChat/blob/main/docs/openai_api.md) to use it.
 
 ```bash
-python3 -m fastchat.serve.openai_api_server --host localhost --port 8000
+python -m fastchat.serve.openai_api_server --host localhost --port 8000
 ```
diff --git a/python/llm/src/ipex_llm/serving/fastchat/ipex_llm_worker.py b/python/llm/src/ipex_llm/serving/fastchat/ipex_llm_worker.py
index cf08e928..dd859f42 100644
--- a/python/llm/src/ipex_llm/serving/fastchat/ipex_llm_worker.py
+++ b/python/llm/src/ipex_llm/serving/fastchat/ipex_llm_worker.py
@@ -69,6 +69,7 @@ class BigDLLLMWorker(BaseModelWorker):
         trust_remote_code: bool = False,
         embed_in_truncate: bool = False,
         speculative: bool = False,
+        load_low_bit_model: bool = False,
         stream_interval: int = 4,
     ):
         super().__init__(
@@ -82,6 +83,7 @@ class BigDLLLMWorker(BaseModelWorker):
         )
 
         self.load_in_low_bit = load_in_low_bit
+        self.load_low_bit_model = load_low_bit_model
         logger.info(
             f"Loading the model {self.model_names} on worker {worker_id},"
             f" worker type: BigDLLLM worker..."
@@ -94,7 +96,12 @@ class BigDLLLMWorker(BaseModelWorker):
         self.device = device
         self.speculative = speculative
         self.model, self.tokenizer = load_model(
-            model_path, device, self.load_in_low_bit, trust_remote_code, speculative
+            model_path,
+            device,
+            self.load_in_low_bit,
+            trust_remote_code,
+            speculative,
+            load_low_bit_model,
         )
         self.stream_interval = stream_interval
         self.context_len = get_context_length(self.model.config)
@@ -495,6 +502,12 @@ if __name__ == "__main__":
         help="Trust remote code (e.g., from HuggingFace) when"
         "downloading the model and tokenizer.",
     )
+    parser.add_argument(
+        "--load-low-bit-model",
+        action="store_true",
+        default=False,
+        help="Load models that have been converted/saved using ipex-llm's save_low_bit interface",
+    )
     parser.add_argument("--embed-in-truncate", action="store_true")
 
     args = parser.parse_args()
@@ -512,5 +525,7 @@ if __name__ == "__main__":
         args.trust_remote_code,
         args.embed_in_truncate,
         args.speculative,
+        args.load_low_bit_model,
+        args.stream_interval,
     )
     uvicorn.run(app, host=args.host, port=args.port, log_level="info")
diff --git a/python/llm/src/ipex_llm/transformers/load_config.yaml b/python/llm/src/ipex_llm/transformers/load_config.yaml
index e7e25e66..7f56ae52 100644
--- a/python/llm/src/ipex_llm/transformers/load_config.yaml
+++ b/python/llm/src/ipex_llm/transformers/load_config.yaml
@@ -3,13 +3,14 @@ repo_id:
   # - 'THUDM/chatglm-6b'
   # - 'THUDM/chatglm2-6b'
   - 'meta-llama/Llama-2-7b-chat-hf'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-  - 'Qwen/Qwen-7B-Chat'
+  # - 'baichuan-inc/Baichuan2-7B-Chat'
+  # - 'Qwen/Qwen-7B-Chat'
   # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variables pointing to the llava dir; added only for gpu win related test_api now
-local_model_hub: 'path to your local model hub'
+local_model_hub: '/mnt/disk1/models'
 low_bit:
   - 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
   - 'bf16'
 device:
-  - 'cpu'
-  # - 'xpu'
+  #- 'cpu'
+  - 'xpu'
+load_low_bit_model: False
diff --git a/python/llm/src/ipex_llm/transformers/loader.py b/python/llm/src/ipex_llm/transformers/loader.py
index 1a05ba3b..acaddb7a 100644
--- a/python/llm/src/ipex_llm/transformers/loader.py
+++ b/python/llm/src/ipex_llm/transformers/loader.py
@@ -46,6 +46,7 @@ def load_model(
     low_bit: str = 'sym_int4',
     trust_remote_code: bool = True,
     speculative: bool = False,
+    load_low_bit_model: bool = False,
 ):
     """Load a model using BigDL LLM backend."""
 
@@ -53,26 +54,38 @@ def load_model(
     invalidInputError(device == 'cpu' or device == 'xpu',
                       "BigDL-LLM only supports device cpu or xpu")
 
-    tokenizer_cls = get_tokenizer_cls(model_path)
     model_cls = get_model_cls(model_path, low_bit)
-    model_kwargs = {"use_cache": True}
-    if trust_remote_code:
-        model_kwargs["trust_remote_code"] = True
-    if low_bit == "bf16":
-        model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": torch.bfloat16})
-    elif low_bit == "fp16":
-        model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": torch.float16})
-    else:
-        model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": 'auto'})
+    # Load tokenizer
+    tokenizer_cls = get_tokenizer_cls(model_path)
+    model_kwargs = {"use_cache": True}
 
     if speculative:
+        invalidInputError(load_low_bit_model is not True,
+                          "Self-Speculative currently does not support loading low-bit format models")
         invalidInputError(low_bit == "fp16" or low_bit == "bf16",
                           "Self-Speculative only supports low_bit fp16 or bf16")
         model_kwargs["speculative"] = True
 
-    # Load tokenizer
-    tokenizer = tokenizer_cls.from_pretrained(model_path, trust_remote_code=True)
-    model = model_cls.from_pretrained(model_path, **model_kwargs)
+    if trust_remote_code:
+        model_kwargs["trust_remote_code"] = True
+
+    if load_low_bit_model:
+        # After save_low_bit, the from_pretrained interface does not accept trust_remote_code=True
+        tokenizer = tokenizer_cls.from_pretrained(model_path)
+        model = model_cls.load_low_bit(model_path, **model_kwargs)
+    else:
+        if trust_remote_code:
+            tokenizer = tokenizer_cls.from_pretrained(model_path, trust_remote_code=True)
+        else:
+            tokenizer = tokenizer_cls.from_pretrained(model_path)
+        if low_bit == "bf16":
+            model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": torch.bfloat16})
+        elif low_bit == "fp16":
+            model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": torch.float16})
+        else:
+            model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": 'auto'})
+
+        model = model_cls.from_pretrained(model_path, **model_kwargs)
 
     if not get_enable_ipex():
         model = model.eval()
@@ -83,13 +96,14 @@ def load_model(
     return model, tokenizer
 
 
-def try_run_test_generation(local_model_hub, model_path, device, low_bit):
+def try_run_test_generation(local_model_hub, model_path, device, low_bit, load_low_bit_model):
     path = get_model_path(model_path, local_model_hub)
     try:
-        run_test_generation(path, device, low_bit)
+        run_test_generation(path, device, low_bit, load_low_bit_model)
     except:
         print(f"Loading model failed for model {model_path} \
-              with device:{device} and low_bit:{low_bit}")
+              with device:{device} and low_bit:{low_bit} \
+              and load_low_bit_model {load_low_bit_model}")
         return "False"
     return "True"
 
@@ -105,11 +119,11 @@ def get_model_path(repo_id, local_model_hub):
     return repo_id
 
 
-def run_test_generation(model_path, device, low_bit):
-    model, tokenizer = load_model(model_path, device, low_bit, True)
+def run_test_generation(model_path, device, low_bit, load_low_bit_model):
+    # Disable speculative by default
+    model, tokenizer = load_model(model_path, device, low_bit, True, False, load_low_bit_model)
     with torch.inference_mode():
         prompt = "What is AI?"
-        # TODO: if gpu, will need to move the tensor to xpu
         input_ids = tokenizer.encode(prompt, return_tensors="pt")
         if device == 'xpu':
             input_ids = input_ids.to('xpu')
@@ -133,7 +147,6 @@ def run_test_generation(model_path, device, low_bit):
 # Note that this only test loading models instead of generation correctness
 if __name__ == '__main__':
     import os
-    # TODO: move config.yaml to a different folder
     current_dir = os.path.dirname(os.path.realpath(__file__))
     results = []
     from omegaconf import OmegaConf
@@ -144,9 +157,15 @@ if __name__ == '__main__':
     for model in conf.repo_id:
         for low_bit in conf.low_bit:
             for device in conf.device:
-                result = try_run_test_generation(conf['local_model_hub'], model, device, low_bit)
-                results.append([model, device, low_bit, result])
+                result = try_run_test_generation(conf['local_model_hub'],
+                                                 model,
+                                                 device,
+                                                 low_bit,
+                                                 conf["load_low_bit_model"]
+                                                 )
+                results.append([model, device, low_bit, conf["load_low_bit_model"], result])
 
-    df = pd.DataFrame(results, columns=['model', 'device', 'low_bit', 'result'])
+    df = pd.DataFrame(results,
+                      columns=['model', 'device', 'low_bit', 'use_low_bit_model', 'result'])
     df.to_csv(csv_name)
     results = []
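
For reference, a minimal sketch of how a low-bit model directory for the new `--load-low-bit-model` flag can be produced, assuming the `ipex_llm.transformers` AutoModel API described in the Save-Load document linked in the README change above; the checkpoint name and the output path `/Low/Bit/Model/Path` are placeholders only:

```python
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "lmsys/vicuna-7b-v1.5"   # original Hugging Face checkpoint (placeholder)
save_path = "/Low/Bit/Model/Path"     # output directory (placeholder)

# Convert the model to a low-bit format once and save the converted weights to disk.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_low_bit="sym_int4",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model.save_low_bit(save_path)
tokenizer.save_pretrained(save_path)
```

The saved directory can then be passed to the worker as in the README example added by this patch, e.g. `python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path /Low/Bit/Model/Path --load-low-bit-model --trust-remote-code --device "xpu"`.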