From 74997a3ed19226951b9cf9e3041753c0e6adfc19 Mon Sep 17 00:00:00 2001
From: Guancheng Fu <110874468+gc-fu@users.noreply.github.com>
Date: Mon, 13 May 2024 15:30:19 +0800
Subject: [PATCH] Adding load_low_bit interface for ipex_llm_worker (#11000)

* initial implementation, need tests

* fix

* fix baichuan issue

* fix typo
---
 .../src/ipex_llm/serving/fastchat/README.md   | 26 +++++---
 .../serving/fastchat/ipex_llm_worker.py       | 17 ++++-
 .../ipex_llm/transformers/load_config.yaml    | 11 ++--
 .../llm/src/ipex_llm/transformers/loader.py   | 65 ++++++++++++-------
 4 files changed, 81 insertions(+), 38 deletions(-)

diff --git a/python/llm/src/ipex_llm/serving/fastchat/README.md b/python/llm/src/ipex_llm/serving/fastchat/README.md
index 553f5fa7..1408e050 100644
--- a/python/llm/src/ipex_llm/serving/fastchat/README.md
+++ b/python/llm/src/ipex_llm/serving/fastchat/README.md
@@ -46,7 +46,7 @@ pip install --pre --upgrade ipex-llm[xpu,serving] --extra-index-url https://pyto
 You need first run the fastchat controller
 
 ```bash
-python3 -m fastchat.serve.controller
+python -m fastchat.serve.controller
 ```
 
 ### Launch model worker(s) and load models
@@ -63,14 +63,22 @@ To run the `ipex_llm_worker` on CPU, using the following code:
 source ipex-llm-init -t
 
 # Available low_bit format including sym_int4, sym_int8, bf16 etc.
-python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "cpu"
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "cpu"
 ```
 
 For GPU example:
 
 ```bash
 # Available low_bit format including sym_int4, sym_int8, fp16 etc.
-python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "xpu"
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "xpu"
+```
+
+We have also provided an option `--load-low-bit-model` to load models that have been converted and saved to disk using the `save_low_bit` interface as introduced in this [document](https://github.com/intel-analytics/ipex-llm/blob/main/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/README.md).
+
+Check the following examples:
+```bash
+# Or --device "cpu"
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path /Low/Bit/Model/Path --load-low-bit-model --trust-remote-code --device "xpu"
 ```
 
 #### For self-speculative decoding example:
@@ -80,14 +88,14 @@ You can use IPEX-LLM to run `self-speculative decoding` example. Refer to [here]
 ```bash
 # Available low_bit format only including bf16 on CPU.
 source ipex-llm-init -t
-python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "bf16" --trust-remote-code --device "cpu" --speculative
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "bf16" --trust-remote-code --device "cpu" --speculative
 
 # Available low_bit format only including fp16 on GPU.
 source /opt/intel/oneapi/setvars.sh
 export ENABLE_SDP_FUSION=1
 export SYCL_CACHE_PERSISTENT=1
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "fp16" --trust-remote-code --device "xpu" --speculative
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "fp16" --trust-remote-code --device "xpu" --speculative
 ```
 
 For a full list of accepted arguments, you can refer to the main method of the `ipex_llm_worker.py`
@@ -100,16 +108,16 @@ To run using the `vLLM_worker`, we don't need to change model name, just simply
 
 ```bash
 # On CPU
-python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device cpu
+python -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device cpu
 
 # On GPU
-python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu
+python -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu
 ```
 
 ### Launch Gradio web server
 
 ```bash
-python3 -m fastchat.serve.gradio_web_server
+python -m fastchat.serve.gradio_web_server
 ```
 
 This is the user interface that users will interact with.
@@ -121,5 +129,5 @@ By following these steps, you will be able to serve your models using the web UI
 To start an OpenAI API server that provides compatible APIs using IPEX-LLM backend, you can launch the `openai_api_server` and follow this [doc](https://github.com/lm-sys/FastChat/blob/main/docs/openai_api.md) to use it.
 
 ```bash
-python3 -m fastchat.serve.openai_api_server --host localhost --port 8000
+python -m fastchat.serve.openai_api_server --host localhost --port 8000
 ```
diff --git a/python/llm/src/ipex_llm/serving/fastchat/ipex_llm_worker.py b/python/llm/src/ipex_llm/serving/fastchat/ipex_llm_worker.py
index cf08e928..dd859f42 100644
--- a/python/llm/src/ipex_llm/serving/fastchat/ipex_llm_worker.py
+++ b/python/llm/src/ipex_llm/serving/fastchat/ipex_llm_worker.py
@@ -69,6 +69,7 @@ class BigDLLLMWorker(BaseModelWorker):
         trust_remote_code: bool = False,
         embed_in_truncate: bool = False,
         speculative: bool = False,
+        load_low_bit_model: bool = False,
         stream_interval: int = 4,
     ):
         super().__init__(
@@ -82,6 +83,7 @@ class BigDLLLMWorker(BaseModelWorker):
         )
 
         self.load_in_low_bit = load_in_low_bit
+        self.load_low_bit_model = load_low_bit_model
         logger.info(
             f"Loading the model {self.model_names} on worker {worker_id},"
             f" worker type: BigDLLLM worker..."
@@ -94,7 +96,12 @@ class BigDLLLMWorker(BaseModelWorker):
         self.device = device
         self.speculative = speculative
         self.model, self.tokenizer = load_model(
-            model_path, device, self.load_in_low_bit, trust_remote_code, speculative
+            model_path,
+            device,
+            self.load_in_low_bit,
+            trust_remote_code,
+            speculative,
+            load_low_bit_model,
         )
         self.stream_interval = stream_interval
         self.context_len = get_context_length(self.model.config)
@@ -495,6 +502,12 @@ if __name__ == "__main__":
         help="Trust remote code (e.g., from HuggingFace) when"
         "downloading the model and tokenizer.",
     )
+    parser.add_argument(
+        "--load-low-bit-model",
+        action="store_true",
+        default=False,
+        help="Load models that have been converted/saved using ipex-llm's save_low_bit interface",
+    )
     parser.add_argument("--embed-in-truncate", action="store_true")
 
     args = parser.parse_args()
@@ -512,5 +525,7 @@ if __name__ == "__main__":
         args.trust_remote_code,
         args.embed_in_truncate,
         args.speculative,
+        args.load_low_bit_model,
+        args.stream_interval,
     )
     uvicorn.run(app, host=args.host, port=args.port, log_level="info")
diff --git a/python/llm/src/ipex_llm/transformers/load_config.yaml b/python/llm/src/ipex_llm/transformers/load_config.yaml
index e7e25e66..7f56ae52 100644
--- a/python/llm/src/ipex_llm/transformers/load_config.yaml
+++ b/python/llm/src/ipex_llm/transformers/load_config.yaml
@@ -3,13 +3,14 @@ repo_id:
   # - 'THUDM/chatglm-6b'
   # - 'THUDM/chatglm2-6b'
   - 'meta-llama/Llama-2-7b-chat-hf'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-  - 'Qwen/Qwen-7B-Chat'
+  # - 'baichuan-inc/Baichuan2-7B-Chat'
+  # - 'Qwen/Qwen-7B-Chat'
   # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variables pointing to the llava dir; added only for gpu win related test_api now
-local_model_hub: 'path to your local model hub'
+local_model_hub: '/mnt/disk1/models'
 low_bit:
   - 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
   - 'bf16'
 device:
-  - 'cpu'
-  # - 'xpu'
+  #- 'cpu'
+  - 'xpu'
+load_low_bit_model: False
diff --git a/python/llm/src/ipex_llm/transformers/loader.py b/python/llm/src/ipex_llm/transformers/loader.py
index 1a05ba3b..acaddb7a 100644
--- a/python/llm/src/ipex_llm/transformers/loader.py
+++ b/python/llm/src/ipex_llm/transformers/loader.py
@@ -46,6 +46,7 @@ def load_model(
     low_bit: str = 'sym_int4',
     trust_remote_code: bool = True,
     speculative: bool = False,
+    load_low_bit_model: bool = False,
 ):
     """Load a model using BigDL LLM backend."""
 
@@ -53,26 +54,38 @@ def load_model(
     invalidInputError(device == 'cpu' or device == 'xpu',
                       "BigDL-LLM only supports device cpu or xpu")
 
-    tokenizer_cls = get_tokenizer_cls(model_path)
     model_cls = get_model_cls(model_path, low_bit)
-    model_kwargs = {"use_cache": True}
-    if trust_remote_code:
-        model_kwargs["trust_remote_code"] = True
-    if low_bit == "bf16":
-        model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": torch.bfloat16})
-    elif low_bit == "fp16":
-        model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": torch.float16})
-    else:
-        model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": 'auto'})
+    # Load tokenizer
+    tokenizer_cls = get_tokenizer_cls(model_path)
+    model_kwargs = {"use_cache": True}
 
     if speculative:
+        invalidInputError(load_low_bit_model is not True,
+                          "Self-Speculative currently does not support loading low-bit format models")
         invalidInputError(low_bit == "fp16" or low_bit == "bf16",
                           "Self-Speculative only supports low_bit fp16 or bf16")
         model_kwargs["speculative"] = True
 
-    # Load tokenizer
-    tokenizer = tokenizer_cls.from_pretrained(model_path, trust_remote_code=True)
-    model = model_cls.from_pretrained(model_path, **model_kwargs)
+    if trust_remote_code:
+        model_kwargs["trust_remote_code"] = True
+
+    if load_low_bit_model:
+        # After save_low_bit, the from_pretrained interface does not accept trust_remote_code=True
+        tokenizer = tokenizer_cls.from_pretrained(model_path)
+        model = model_cls.load_low_bit(model_path, **model_kwargs)
+    else:
+        if trust_remote_code:
+            tokenizer = tokenizer_cls.from_pretrained(model_path, trust_remote_code=True)
+        else:
+            tokenizer = tokenizer_cls.from_pretrained(model_path)
+        if low_bit == "bf16":
+            model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": torch.bfloat16})
+        elif low_bit == "fp16":
+            model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": torch.float16})
+        else:
+            model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": 'auto'})
+
+        model = model_cls.from_pretrained(model_path, **model_kwargs)
 
     if not get_enable_ipex():
         model = model.eval()
@@ -83,13 +96,14 @@ def load_model(
     return model, tokenizer
 
 
-def try_run_test_generation(local_model_hub, model_path, device, low_bit):
+def try_run_test_generation(local_model_hub, model_path, device, low_bit, load_low_bit_model):
     path = get_model_path(model_path, local_model_hub)
     try:
-        run_test_generation(path, device, low_bit)
+        run_test_generation(path, device, low_bit, load_low_bit_model)
     except:
         print(f"Loading model failed for model {model_path} \
-              with device:{device} and low_bit:{low_bit}")
+              with device:{device} and low_bit:{low_bit} \
+              and load_low_bit_model {load_low_bit_model}")
         return "False"
     return "True"
 
@@ -105,11 +119,11 @@ def get_model_path(repo_id, local_model_hub):
     return repo_id
 
 
-def run_test_generation(model_path, device, low_bit):
-    model, tokenizer = load_model(model_path, device, low_bit, True)
+def run_test_generation(model_path, device, low_bit, load_low_bit_model):
+    # Disable speculative by default
+    model, tokenizer = load_model(model_path, device, low_bit, True, False, load_low_bit_model)
     with torch.inference_mode():
         prompt = "What is AI?"
-        # TODO: if gpu, will need to move the tensor to xpu
         input_ids = tokenizer.encode(prompt, return_tensors="pt")
         if device == 'xpu':
             input_ids = input_ids.to('xpu')
@@ -133,7 +147,6 @@ def run_test_generation(model_path, device, low_bit):
 # Note that this only test loading models instead of generation correctness
 if __name__ == '__main__':
     import os
-    # TODO: move config.yaml to a different folder
     current_dir = os.path.dirname(os.path.realpath(__file__))
     results = []
     from omegaconf import OmegaConf
@@ -144,9 +157,15 @@ if __name__ == '__main__':
     for model in conf.repo_id:
         for low_bit in conf.low_bit:
             for device in conf.device:
-                result = try_run_test_generation(conf['local_model_hub'], model, device, low_bit)
-                results.append([model, device, low_bit, result])
+                result = try_run_test_generation(conf['local_model_hub'],
+                                                 model,
+                                                 device,
+                                                 low_bit,
+                                                 conf["load_low_bit_model"]
+                                                 )
+                results.append([model, device, low_bit, conf["load_low_bit_model"], result])
 
-    df = pd.DataFrame(results, columns=['model', 'device', 'low_bit', 'result'])
+    df = pd.DataFrame(results,
+                      columns=['model', 'device', 'low_bit', 'use_low_bit_model', 'result'])
     df.to_csv(csv_name)
     results = []
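
For reference, a minimal sketch of how a low-bit model directory for the new `--load-low-bit-model` flag can be produced, assuming the `ipex_llm.transformers` AutoModel API described in the Save-Load document linked in the README change above; the checkpoint name and the output path `/Low/Bit/Model/Path` are placeholders only:

```python
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "lmsys/vicuna-7b-v1.5"   # original Hugging Face checkpoint (placeholder)
save_path = "/Low/Bit/Model/Path"     # output directory (placeholder)

# Convert the model to a low-bit format once and save the converted weights to disk.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_low_bit="sym_int4",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model.save_low_bit(save_path)
tokenizer.save_pretrained(save_path)
```

The saved directory can then be passed to the worker as in the README example added by this patch, e.g. `python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path /Low/Bit/Model/Path --load-low-bit-model --trust-remote-code --device "xpu"`.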