Adding load_low_bit interface for ipex_llm_worker (#11000)
* initial implementation, need tests
* fix
* fix baichuan issue
* fix typo
parent 1b3c7a6928
commit 74997a3ed1
4 changed files with 81 additions and 38 deletions
@@ -46,7 +46,7 @@ pip install --pre --upgrade ipex-llm[xpu,serving] --extra-index-url https://pyto
 You need first run the fastchat controller

 ```bash
-python3 -m fastchat.serve.controller
+python -m fastchat.serve.controller
 ```

 ### Launch model worker(s) and load models
@@ -63,14 +63,22 @@ To run the `ipex_llm_worker` on CPU, using the following code:
 source ipex-llm-init -t

 # Available low_bit format including sym_int4, sym_int8, bf16 etc.
-python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "cpu"
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "cpu"
 ```

 For GPU example:

 ```bash
 # Available low_bit format including sym_int4, sym_int8, fp16 etc.
-python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "xpu"
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "sym_int4" --trust-remote-code --device "xpu"
+```
+
+We have also provided an option `--load-low-bit-model` to load models that have been converted and saved into disk using the `save_low_bit` interface as introduced in this [document](https://github.com/intel-analytics/ipex-llm/blob/main/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/README.md).
+
+Check the following examples:
+
+```bash
+# Or --device "cpu"
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path /Low/Bit/Model/Path --trust-remote-code --device "xpu"
 ```

 #### For self-speculative decoding example:
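As background for the `--load-low-bit-model` option introduced in the hunk above, the following is a minimal sketch of how such a checkpoint could be produced with ipex-llm's `save_low_bit` interface. The model id, output path, and `sym_int4` format here are placeholders; the linked Save-Load document is the authoritative reference.

```python
# Sketch: convert a Hugging Face model to a low-bit format and save it so the
# worker can later load it with --load-low-bit-model. Paths and the low-bit
# format below are placeholders.
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_id = "lmsys/vicuna-7b-v1.5"   # original model
save_path = "/Low/Bit/Model/Path"   # directory later passed via --model-path

model = AutoModelForCausalLM.from_pretrained(
    model_id, load_in_low_bit="sym_int4", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

model.save_low_bit(save_path)         # writes the already-quantized weights
tokenizer.save_pretrained(save_path)  # keep the tokenizer alongside them
```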
@@ -80,14 +88,14 @@ You can use IPEX-LLM to run `self-speculative decoding` example. Refer to [here]
 ```bash
 # Available low_bit format only including bf16 on CPU.
 source ipex-llm-init -t
-python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "bf16" --trust-remote-code --device "cpu" --speculative
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "bf16" --trust-remote-code --device "cpu" --speculative

 # Available low_bit format only including fp16 on GPU.
 source /opt/intel/oneapi/setvars.sh
 export ENABLE_SDP_FUSION=1
 export SYCL_CACHE_PERSISTENT=1
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "fp16" --trust-remote-code --device "xpu" --speculative
+python -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "fp16" --trust-remote-code --device "xpu" --speculative
 ```

 For a full list of accepted arguments, you can refer to the main method of the `ipex_llm_worker.py`
@@ -100,16 +108,16 @@ To run using the `vLLM_worker`, we don't need to change model name, just simply

 ```bash
 # On CPU
-python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device cpu
+python -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device cpu

 # On GPU
-python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu
+python -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu
 ```

 ### Launch Gradio web server

 ```bash
-python3 -m fastchat.serve.gradio_web_server
+python -m fastchat.serve.gradio_web_server
 ```

 This is the user interface that users will interact with.
@@ -121,5 +129,5 @@ By following these steps, you will be able to serve your models using the web UI
 To start an OpenAI API server that provides compatible APIs using IPEX-LLM backend, you can launch the `openai_api_server` and follow this [doc](https://github.com/lm-sys/FastChat/blob/main/docs/openai_api.md) to use it.

 ```bash
-python3 -m fastchat.serve.openai_api_server --host localhost --port 8000
+python -m fastchat.serve.openai_api_server --host localhost --port 8000
 ```
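Once the controller, an `ipex_llm_worker`, and the `openai_api_server` are running, any OpenAI-compatible client can talk to the endpoint. Below is a minimal `requests` sketch, assuming the defaults used in this README (`localhost:8000`) and a worker serving `lmsys/vicuna-7b-v1.5`, which FastChat typically registers under the name `vicuna-7b-v1.5`.

```python
# Sketch: query the OpenAI-compatible endpoint started above. Host, port, and
# model name are assumptions taken from the examples in this README.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "vicuna-7b-v1.5",
        "messages": [{"role": "user", "content": "What is AI?"}],
        "temperature": 0.7,
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```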
@@ -69,6 +69,7 @@ class BigDLLLMWorker(BaseModelWorker):
         trust_remote_code: bool = False,
         embed_in_truncate: bool = False,
         speculative: bool = False,
+        load_low_bit_model: bool = False,
         stream_interval: int = 4,
     ):
         super().__init__(
@@ -82,6 +83,7 @@ class BigDLLLMWorker(BaseModelWorker):
         )

         self.load_in_low_bit = load_in_low_bit
+        self.load_low_bit_model = load_low_bit_model
         logger.info(
             f"Loading the model {self.model_names} on worker {worker_id},"
             f" worker type: BigDLLLM worker..."
@@ -94,7 +96,12 @@ class BigDLLLMWorker(BaseModelWorker):
         self.device = device
         self.speculative = speculative
         self.model, self.tokenizer = load_model(
-            model_path, device, self.load_in_low_bit, trust_remote_code, speculative
+            model_path,
+            device,
+            self.load_in_low_bit,
+            trust_remote_code,
+            speculative,
+            load_low_bit_model,
         )
         self.stream_interval = stream_interval
         self.context_len = get_context_length(self.model.config)
@@ -495,6 +502,12 @@ if __name__ == "__main__":
         help="Trust remote code (e.g., from HuggingFace) when"
         "downloading the model and tokenizer.",
     )
+    parser.add_argument(
+        "--load-low-bit-model",
+        action="store_true",
+        default=False,
+        help="Load models that have been converted/saved using ipex-llm's save_low_bit interface",
+    )
     parser.add_argument("--embed-in-truncate", action="store_true")

     args = parser.parse_args()
@@ -512,5 +525,7 @@ if __name__ == "__main__":
         args.trust_remote_code,
         args.embed_in_truncate,
         args.speculative,
+        args.load_low_bit_model,
+        args.stream_interval,
     )
     uvicorn.run(app, host=args.host, port=args.port, log_level="info")
@@ -3,13 +3,14 @@ repo_id:
 # - 'THUDM/chatglm-6b'
 # - 'THUDM/chatglm2-6b'
   - 'meta-llama/Llama-2-7b-chat-hf'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-  - 'Qwen/Qwen-7B-Chat'
+# - 'baichuan-inc/Baichuan2-7B-Chat'
+# - 'Qwen/Qwen-7B-Chat'
 # - 'liuhaotian/llava-v1.5-7b' # requires a LLAVA_REPO_DIR env variables pointing to the llava dir; added only for gpu win related test_api now
-local_model_hub: 'path to your local model hub'
+local_model_hub: '/mnt/disk1/models'
 low_bit:
   - 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
   - 'bf16'
 device:
-  - 'cpu'
-# - 'xpu'
+#- 'cpu'
+  - 'xpu'
+load_low_bit_model: False
@@ -46,6 +46,7 @@ def load_model(
     low_bit: str = 'sym_int4',
     trust_remote_code: bool = True,
     speculative: bool = False,
+    load_low_bit_model: bool = False,
 ):
     """Load a model using BigDL LLM backend."""

@@ -53,11 +54,30 @@ def load_model(
     invalidInputError(device == 'cpu' or device == 'xpu',
                       "BigDL-LLM only supports device cpu or xpu")

-    tokenizer_cls = get_tokenizer_cls(model_path)
     model_cls = get_model_cls(model_path, low_bit)
+    # Load tokenizer
+    tokenizer_cls = get_tokenizer_cls(model_path)
+
     model_kwargs = {"use_cache": True}
+    if speculative:
+        invalidInputError(load_low_bit_model is not True,
+                          "Self-Speculative currently do not support load low-bit format models")
+        invalidInputError(low_bit == "fp16" or low_bit == "bf16",
+                          "Self-Speculative only supports low_bit fp16 or bf16")
+        model_kwargs["speculative"] = True
+
     if trust_remote_code:
         model_kwargs["trust_remote_code"] = True
+
+    if load_low_bit_model:
+        # After save_low_bit, the from_pretrained interface does not accept trust_remote_code=True
+        tokenizer = tokenizer_cls.from_pretrained(model_path)
+        model = model_cls.load_low_bit(model_path, **model_kwargs)
+    else:
+        if trust_remote_code:
+            tokenizer = tokenizer_cls.from_pretrained(model_path, trust_remote_code=True)
+        else:
+            tokenizer = tokenizer_cls.from_pretrained(model_path)
     if low_bit == "bf16":
         model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": torch.bfloat16})
     elif low_bit == "fp16":
@@ -65,13 +85,6 @@ def load_model(
     else:
         model_kwargs.update({"load_in_low_bit": low_bit, "torch_dtype": 'auto'})

-    if speculative:
-        invalidInputError(low_bit == "fp16" or low_bit == "bf16",
-                          "Self-Speculative only supports low_bit fp16 or bf16")
-        model_kwargs["speculative"] = True
-
-    # Load tokenizer
-    tokenizer = tokenizer_cls.from_pretrained(model_path, trust_remote_code=True)
     model = model_cls.from_pretrained(model_path, **model_kwargs)
     if not get_enable_ipex():
         model = model.eval()
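Taken out of the worker, the new low-bit load path boils down to roughly the following. This is a minimal sketch using ipex-llm's Auto classes directly; the checkpoint path is a placeholder and must point to a directory previously written by `save_low_bit`.

```python
# Sketch of the load-low-bit path in isolation. The path below is a placeholder
# and must contain weights previously written by save_low_bit.
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

low_bit_path = "/Low/Bit/Model/Path"

# load_low_bit restores already-converted weights, so no quantization happens here.
model = AutoModelForCausalLM.load_low_bit(low_bit_path, use_cache=True)
# Per the comment in the hunk above, the tokenizer saved alongside the low-bit
# weights is loaded without trust_remote_code.
tokenizer = AutoTokenizer.from_pretrained(low_bit_path)
model = model.eval()
# model = model.to('xpu')  # move to an Intel GPU when serving with --device "xpu"
```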
@@ -83,13 +96,14 @@ def load_model(
     return model, tokenizer


-def try_run_test_generation(local_model_hub, model_path, device, low_bit):
+def try_run_test_generation(local_model_hub, model_path, device, low_bit, load_low_bit_model):
     path = get_model_path(model_path, local_model_hub)
     try:
-        run_test_generation(path, device, low_bit)
+        run_test_generation(path, device, low_bit, load_low_bit_model)
     except:
         print(f"Loading model failed for model {model_path} \
-              with device:{device} and low_bit:{low_bit}")
+              with device:{device} and low_bit:{low_bit} \
+              and load_low_bit_model {load_low_bit_model}")
         return "False"
     return "True"

@@ -105,11 +119,11 @@ def get_model_path(repo_id, local_model_hub):
     return repo_id


-def run_test_generation(model_path, device, low_bit):
-    model, tokenizer = load_model(model_path, device, low_bit, True)
+def run_test_generation(model_path, device, low_bit, load_low_bit_model):
+    # Disable speculative by default
+    model, tokenizer = load_model(model_path, device, low_bit, True, False, load_low_bit_model)
     with torch.inference_mode():
         prompt = "What is AI?"
-        # TODO: if gpu, will need to move the tensor to xpu
         input_ids = tokenizer.encode(prompt, return_tensors="pt")
         if device == 'xpu':
             input_ids = input_ids.to('xpu')
@@ -133,7 +147,6 @@ def run_test_generation(model_path, device, low_bit):
 # Note that this only test loading models instead of generation correctness
 if __name__ == '__main__':
     import os
-    # TODO: move config.yaml to a different folder
     current_dir = os.path.dirname(os.path.realpath(__file__))
     results = []
     from omegaconf import OmegaConf
@@ -144,9 +157,15 @@ if __name__ == '__main__':
     for model in conf.repo_id:
         for low_bit in conf.low_bit:
             for device in conf.device:
-                result = try_run_test_generation(conf['local_model_hub'], model, device, low_bit)
-                results.append([model, device, low_bit, result])
+                result = try_run_test_generation(conf['local_model_hub'],
+                                                 model,
+                                                 device,
+                                                 low_bit,
+                                                 conf["load_low_bit_model"]
+                                                 )
+                results.append([model, device, low_bit, conf["load_low_bit_model"], result])

-    df = pd.DataFrame(results, columns=['model', 'device', 'low_bit', 'result'])
+    df = pd.DataFrame(results,
+                      columns=['model', 'device', 'low_bit', 'use_low_bit_model', 'result'])
     df.to_csv(csv_name)
     results = []