Upgrade to vLLM 0.6.6 (#12796)

* init

* update engine init

* fix serving load_in_low_bit problem

* temp

* temp

* temp

* temp

* temp

* fix

* fixed

* done

* fix

* fix all arguments

* fix

* fix throughput script

* fix

* fix

* use official ipex-llm

* Fix readme

* fix

---------

Co-authored-by: hzjane <a1015616934@qq.com>
Guancheng Fu 2025-02-12 16:47:51 +08:00 committed by GitHub
parent f8ab833f74
commit af693425f1
14 changed files with 1003 additions and 910 deletions

View file

@ -1,27 +1,17 @@
FROM intel/oneapi-basekit:2024.1.1-devel-ubuntu22.04
# First stage: build oneccl
FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04 AS build
ARG http_proxy
ARG https_proxy
ENV TZ=Asia/Shanghai
ENV PYTHONUNBUFFERED=1
# To prevent RPC_TIMEOUT ERROR for the first request
ENV VLLM_RPC_TIMEOUT=100000
# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false
ADD ./gradio_web_server.patch /tmp/gradio_web_server.patch
ADD ./oneccl-binding.patch /tmp/oneccl-binding.patch
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg && \
apt-get update && \
ADD ./ccl_torch.patch /tmp/
RUN apt-get update && \
apt-get install -y --no-install-recommends curl wget git libunwind8-dev vim less && \
ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
env DEBIAN_FRONTEND=noninteractive apt-get update && \
@ -39,65 +29,105 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
python3 get-pip.py && \
rm get-pip.py && \
pip install --upgrade requests argparse urllib3 && \
pip install --pre --upgrade ipex-llm[xpu,serving] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev numactl && \
# If we do not install this compute-runtime, we will fail the build later
mkdir -p /tmp/neo && \
cd /tmp/neo && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-core-2_2.5.6+18417_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-opencl-2_2.5.6+18417_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu-dbgsym_1.6.32224.5_amd64.ddeb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu_1.6.32224.5_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd-dbgsym_24.52.32224.5_amd64.ddeb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd_24.52.32224.5_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/libigdgmm12_22.5.5_amd64.deb && \
dpkg -i *.deb && \
pip install --pre --upgrade ipex-llm[xpu_2.6] --extra-index-url https://download.pytorch.org/whl/test/xpu && \
mkdir /build && \
cd /build && \
git clone https://github.com/intel/torch-ccl.git && \
cd torch-ccl && \
git checkout ccl_torch2.5.0+xpu && \
git submodule sync && \
git submodule update --init --recursive && \
# This patch will enable build torch-ccl with pytorch 2.6 environment
git apply /tmp/ccl_torch.patch && \
USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py bdist_wheel
# File path: /build/torch-ccl/dist/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl
FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04
COPY --from=build /build/torch-ccl/dist/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl /opt/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl
ARG http_proxy
ARG https_proxy
ENV TZ=Asia/Shanghai
ENV PYTHONUNBUFFERED=1
# To prevent RPC_TIMEOUT ERROR for the first request
ENV VLLM_RPC_TIMEOUT=100000
# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false
RUN apt-get update && \
apt-get install -y --no-install-recommends curl wget git libunwind8-dev vim less && \
ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
env DEBIAN_FRONTEND=noninteractive apt-get update && \
apt-get install -y --no-install-recommends gnupg gpg-agent software-properties-common kmod && \
# Add Python 3.11 PPA repository
add-apt-repository ppa:deadsnakes/ppa -y && \
apt-get install -y --no-install-recommends python3.11 git curl wget && \
rm /usr/bin/python3 && \
ln -s /usr/bin/python3.11 /usr/bin/python3 && \
ln -s /usr/bin/python3 /usr/bin/python && \
apt-get install -y --no-install-recommends python3-pip python3.11-dev python3-wheel python3.11-distutils && \
wget https://bootstrap.pypa.io/get-pip.py -O get-pip.py && \
python3 get-pip.py && \
rm get-pip.py && \
pip install --upgrade requests argparse urllib3 && \
pip install --pre --upgrade ipex-llm[xpu_2.6] --extra-index-url https://download.pytorch.org/whl/test/xpu && \
pip install transformers_stream_generator einops tiktoken && \
pip install --upgrade colorama && \
# Download all-in-one benchmark and examples
git clone https://github.com/intel-analytics/ipex-llm && \
# The following comment segment is used when building from source...
# cd ipex-llm && \
# git fetch origin pull/12338/head:local_pr && \
# git checkout local_pr && \
# pip uninstall -y ipex-llm && \
# cd python/llm && \
# python setup.py install && \
# cd ../../../ && \
git clone https://github.com/intel/ipex-llm.git && \
cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \
cp -r ./ipex-llm/python/llm/example/GPU/HuggingFace/LLM ./examples && \
cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
rm -rf ./ipex-llm && \
# Install vllm dependencies
pip install --upgrade fastapi && \
pip install --upgrade "uvicorn[standard]" && \
# Download vLLM-Serving
cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
rm -rf ./ipex-llm && \
# Install torch-ccl
cd /tmp/ && \
pip install torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 intel-extension-for-pytorch==2.1.30.post0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
# Internal oneccl
wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.6.5-release/oneccl_wks_installer_2024.0.0.6.5.sh && \
bash oneccl_wks_installer_2024.0.0.6.5.sh && \
git clone https://github.com/intel/torch-ccl -b v2.1.300+xpu && \
cd torch-ccl && \
patch -p1 < /tmp/oneccl-binding.patch && \
USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py install && \
pip install /opt/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl && \
# install Internal oneccl
cd /opt && \
wget https://sourceforge.net/projects/oneccl-wks/files/2025.0.0.6.6-release/oneccl_wks_installer_2025.0.0.6.6.sh && \
bash oneccl_wks_installer_2025.0.0.6.6.sh && \
apt-get update && \
apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev numactl && \
# apt-get install -y intel-opencl-icd intel-level-zero-gpu=1.3.26241.33-647~22.04 level-zero level-zero-dev --allow-downgrades && \
# Install compute runtime
mkdir -p /tmp/neo && \
cd /tmp/neo && \
wget https://github.com/oneapi-src/level-zero/releases/download/v1.18.5/level-zero_1.18.5+u22.04_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17791.9/intel-igc-core_1.0.17791.9_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17791.9/intel-igc-opencl_1.0.17791.9_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.39.31294.12/intel-level-zero-gpu_1.6.31294.12_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.39.31294.12/intel-opencl-icd_24.39.31294.12_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.39.31294.12/libigdgmm12_22.5.2_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-core-2_2.5.6+18417_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-opencl-2_2.5.6+18417_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu-dbgsym_1.6.32224.5_amd64.ddeb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu_1.6.32224.5_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd-dbgsym_24.52.32224.5_amd64.ddeb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd_24.52.32224.5_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/libigdgmm12_22.5.5_amd64.deb && \
dpkg -i *.deb && \
rm -rf /tmp/neo && \
mkdir -p /llm && \
cd /llm && \
git clone -b 0.6.2 https://github.com/analytics-zoo/vllm.git /llm/vllm && \
rm -rf /tmp/neo && \
# Install vllm
git clone -b 0.6.6-pre https://github.com/analytics-zoo/vllm.git /llm/vllm && \
cd /llm/vllm && \
pip install setuptools-scm && \
pip install --upgrade cmake && \
VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -v /llm/vllm && \
# pip install -r /llm/vllm/requirements-xpu.txt && \
# VLLM_TARGET_DEVICE=xpu python setup.py install && \
pip install mpi4py fastapi uvicorn openai && \
pip install gradio==4.43.0 && \
# pip install transformers==4.44.2 && \
# patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch && \
pip install ray && \
patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch
pip install ray
COPY ./vllm_online_benchmark.py /llm/
COPY ./vllm_offline_inference.py /llm/
@ -106,10 +136,6 @@ COPY ./payload-1024.lua /llm/
COPY ./start-vllm-service.sh /llm/
COPY ./benchmark_vllm_throughput.py /llm/
COPY ./benchmark_vllm_latency.py /llm/
COPY ./start-fastchat-service.sh /llm/
COPY ./start-pp_serving-service.sh /llm/
COPY ./start-lightweight_serving-service.sh /llm/
ENV LD_LIBRARY_PATH /usr/local/lib/python3.11/dist-packages/intel_extension_for_pytorch/lib/:/opt/intel/oneapi/tbb/2021.12/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/2021.12/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.12/lib:/opt/intel/oneapi/mkl/2024.1/lib:/opt/intel/oneapi/ippcp/2021.11/lib/:/opt/intel/oneapi/ipp/2021.11/lib:/opt/intel/oneapi/dpl/2022.5/lib:/opt/intel/oneapi/dnnl/2024.1/lib:/opt/intel/oneapi/debugger/2024.1/opt/debugger/lib:/opt/intel/oneapi/dal/2024.2/lib:/opt/intel/oneapi/compiler/2024.1/opt/oclfpga/host/linux64/lib:/opt/intel/oneapi/compiler/2024.1/opt/compiler/lib:/opt/intel/oneapi/compiler/2024.1/lib:/opt/intel/oneapi/ccl/2021.12/lib/
WORKDIR /llm/
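
The Dockerfile is now a two-stage build: the first stage compiles the oneccl_bind_pt (torch-ccl) wheel against the PyTorch 2.6 XPU stack using ccl_torch.patch, and the second stage installs ipex-llm[xpu_2.6], the prebuilt wheel, the oneCCL workspace runtime, the newer compute runtime, and vLLM 0.6.6-pre. For reference, the key vLLM build step is condensed below into a standalone sketch; it assumes the ipex-llm[xpu_2.6] packages and oneAPI toolchain are already installed as in the second stage above.

```bash
# Build and install the XPU build of vLLM 0.6.6-pre, mirroring the Dockerfile above.
git clone -b 0.6.6-pre https://github.com/analytics-zoo/vllm.git /llm/vllm
cd /llm/vllm
pip install setuptools-scm
pip install --upgrade cmake
# VLLM_TARGET_DEVICE=xpu selects the XPU backend for the source build.
VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -v /llm/vllm
```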

View file

@ -18,7 +18,7 @@ To map the `xpu` into the container, you need to specify `--device=/dev/dri` whe
An example could be:
```bash
#!/bin/bash
export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:2.2.0-SNAPSHOT
export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:latest
sudo docker run -itd \
--net=host \
@ -59,86 +59,6 @@ To run Pipeline parallel serving using `IPEX-LLM` as backend, you can refer to t
For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image.
#### FastChat serving engine
To set up model serving with FastChat using `IPEX-LLM` as the backend, you can refer to this [quickstart](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/fastchat_quickstart.html#) or follow these quick steps to deploy a demo.
##### Quick Setup for FastChat with IPEX-LLM
1. **Start the Docker Container**
Run the following command to launch a Docker container with device access:
```bash
#!/bin/bash
export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:latest
# The -v option maps the host model directory into the container;
# set the proxy variables only if needed.
sudo docker run -itd \
--net=host \
--device=/dev/dri \
--name=demo-container \
-v /LLM_MODELS/:/llm/models/ \
--shm-size="16g" \
-e http_proxy=... \
-e https_proxy=... \
-e no_proxy="127.0.0.1,localhost" \
$DOCKER_IMAGE
```
2. **Start the FastChat Service**
Enter the container and start the FastChat service:
```bash
#!/bin/bash
# This command assumes that you have mapped the host model directory to the container
# and the model directory is /llm/models/
# We take Yi-1.5-34B as an example; replace it with your own model
ps -ef | grep "fastchat" | awk '{print $2}' | xargs kill -9
pip install -U gradio==4.43.0
# start controller
python -m fastchat.serve.controller &
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
export TORCH_LLM_ALLREDUCE=0
export CCL_DG2_ALLREDUCE=1
# CCL needed environment variables
export CCL_WORKER_COUNT=4
# pin ccl worker to cores
# export CCL_WORKER_AFFINITY=32,33,34,35
export FI_PROVIDER=shm
export CCL_ATL_TRANSPORT=ofi
export CCL_ZE_IPC_EXCHANGE=sockets
export CCL_ATL_SHM=1
source /opt/intel/1ccl-wks/setvars.sh
python -m ipex_llm.serving.fastchat.vllm_worker \
--model-path /llm/models/Yi-1.5-34B \
--device xpu \
--enforce-eager \
--disable-async-output-proc \
--distributed-executor-backend ray \
--dtype float16 \
--load-in-low-bit fp8 \
--tensor-parallel-size 4 \
--gpu-memory-utilization 0.9 \
--max-model-len 4096 \
--max-num-batched-tokens 8000 &
sleep 120
python -m fastchat.serve.gradio_web_server &
```
This quick setup allows you to deploy FastChat with IPEX-LLM efficiently.
#### vLLM serving engine
To run the vLLM engine using `IPEX-LLM` as the backend, you can refer to this [document](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/DockerGuides/vllm_docker_quickstart.md).
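For reference, a minimal sketch of launching the OpenAI-compatible vLLM service inside this container is shown below. The model path, served model name, and port are placeholders, and the `--load-in-low-bit` option is assumed to be exposed by the bundled api_server entrypoint; check `/llm/start-vllm-service.sh` in the image for the exact flags.
```bash
#!/bin/bash
source /opt/intel/1ccl-wks/setvars.sh
python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
  --served-model-name Llama-2-7b-chat-hf \
  --model /llm/models/Llama-2-7b-chat-hf \
  --device xpu \
  --dtype float16 \
  --enforce-eager \
  --load-in-low-bit fp8 \
  --gpu-memory-utilization 0.9 \
  --max-model-len 4096 \
  --port 8000
```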

View file

@ -1,5 +1,6 @@
"""Benchmark the latency of processing a single batch of requests."""
import argparse
import dataclasses
import json
import time
from pathlib import Path
@ -9,55 +10,26 @@ import numpy as np
import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs
from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
from vllm.inputs import PromptInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.inputs import PromptType
from vllm.utils import FlexibleArgumentParser
from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
def main(args: argparse.Namespace):
print(args)
engine_args = EngineArgs.from_cli_args(args)
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(
model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
speculative_draft_tensor_parallel_size=\
args.speculative_draft_tensor_parallel_size,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
max_model_len=args.max_model_len,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization,
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend,
otlp_traces_endpoint=args.otlp_traces_endpoint,
enable_prefix_caching=args.enable_prefix_caching,
load_in_low_bit=args.load_in_low_bit,
max_num_batched_tokens=args.max_num_batched_tokens,
max_num_seqs=args.max_num_seqs,
)
llm = LLM(**dataclasses.asdict(engine_args), load_in_low_bit = args.load_in_low_bit)
sampling_params = SamplingParams(
n=args.n,
temperature=0.0 if args.use_beam_search else 1.0,
temperature=1.0,
top_p=1.0,
use_beam_search=args.use_beam_search,
ignore_eos=True,
max_tokens=args.output_len,
)
@ -65,37 +37,26 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_inputs: List[PromptInputs] = [{
dummy_prompts: List[PromptType] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
if args.device == "xpu":
with torch.autograd.profiler_legacy.profile(enabled=True, use_xpu=True) as p:
llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
print("Sort by CPU time total...")
print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1))
print("Sort by XPU time total...")
print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1))
else:
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
# torch.profiler.ProfilerActivity.XPU,
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
llm.generate(dummy_inputs,
llm.generate(dummy_prompts,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
else:
start_time = time.perf_counter()
llm.generate(dummy_inputs,
llm.generate(dummy_prompts,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
@ -142,19 +103,6 @@ if __name__ == '__main__':
parser = FlexibleArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m')
parser.add_argument('--speculative-model', type=str, default=None)
parser.add_argument('--num-speculative-tokens', type=int, default=None)
parser.add_argument('--speculative-draft-tensor-parallel-size',
'-spec-draft-tp',
type=int,
default=None)
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=int, default=32)
parser.add_argument('--output-len', type=int, default=128)
parser.add_argument('--batch-size', type=int, default=8)
@ -171,45 +119,12 @@ if __name__ == '__main__':
type=int,
default=30,
help='Number of iterations to run.')
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
"--load-in-low-bit",
type=str,
default='auto',
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--enforce-eager',
action='store_true',
help='enforce eager mode and disable CUDA graph')
parser.add_argument(
'--kv-cache-dtype',
type=str,
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default="auto",
help='Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
parser.add_argument(
'--quantization-param-path',
type=str,
default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
choices=["sym_int4", "fp8", "fp8_e4m3", "fp16", "fp6"],
default="sym_int4",
help="Low-bit format quantization with IPEX-LLM")
parser.add_argument(
'--profile',
action='store_true',
@ -220,96 +135,12 @@ if __name__ == '__main__':
default=None,
help=('path to save the pytorch profiler output. Can be visualized '
'with ui.perfetto.dev or Tensorboard.'))
parser.add_argument(
"--device",
type=str,
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument('--block-size',
type=int,
default=16,
help='block size of key/value cache')
parser.add_argument(
'--enable-chunked-prefill',
action='store_true',
help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens')
parser.add_argument("--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching")
parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
help="If specified, use nsight to profile ray workers",
)
parser.add_argument('--download-dir',
type=str,
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the latency results in JSON format.')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=[
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
'bitsandbytes'
],
help='The format of the model weights to load.\n\n'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.\n'
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.\n'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.\n'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
parser.add_argument(
'--distributed-executor-backend',
choices=['ray', 'mp'],
default=None,
help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--otlp-traces-endpoint',
type=str,
default=None,
help='Target URL to which OpenTelemetry traces will be sent.')
parser.add_argument(
"--load-in-low-bit",
type=str,
choices=["sym_int4", "fp8", "fp8_e4m3", "fp16", "fp6"],
default="sym_int4",
help="Low-bit format quantization with IPEX-LLM")
parser.add_argument('--max-num-batched-tokens',
type=int,
default=4096,
help='maximum number of batched tokens per iteration')
parser.add_argument('--max-num-seqs',
type=int,
default=256,
help='Maximum number of sequences per iteration.')
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
main(args)
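
With the script now delegating engine construction to `EngineArgs.from_cli_args`, most options (`--model`, `--device`, `--dtype`, `--tensor-parallel-size`, ...) come from vLLM's own argument parser; only the benchmark-specific flags and `--load-in-low-bit` remain local. A hypothetical invocation (the model path is a placeholder):

```bash
python /llm/benchmark_vllm_latency.py \
  --model /llm/models/Llama-2-7b-chat-hf \
  --device xpu \
  --enforce-eager \
  --load-in-low-bit sym_int4 \
  --input-len 1024 \
  --output-len 128 \
  --batch-size 8
```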

View file

@ -4,27 +4,96 @@ import dataclasses
import json
import random
import time
from typing import List, Optional, Tuple
from functools import cache
from typing import Dict, List, Optional, Tuple
import torch
import uvloop
from PIL import Image
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
from ipex_llm.vllm.xpu.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
# from vllm.sampling_params import BeamSearchParams
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
@dataclasses.dataclass
class SampleRequest:
"""A class representing a single inference request for benchmarking.
Attributes:
prompt: The input text prompt for the model.
prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
lora_request: Optional LoRARequest specifying the LoRA to use.
"""
prompt: str
prompt_len: int
expected_output_len: int
multi_modal_data: Optional[MultiModalDataDict] = None
lora_request: Optional[LoRARequest] = None
def _get_prompt_for_image_model(question: str, *, model: str) -> str:
"""Prepend and append special tokens around the question to form a prompt.
Args:
question: The input question text to wrap with special tokens
model: The name of the model being used, to determine which special
tokens to add
Returns:
The formatted prompt string with appropriate special tokens for the
model
Raises:
ValueError: If an unsupported model name is provided
"""
model = model.lower()
if "pixtral" in model:
return f"<s>[INST]{question}\n[IMG][/INST]"
raise ValueError(f"Unsupported model {model}")
@cache
def lora_path_on_disk(lora_path: str) -> str:
return get_adapter_absolute_path(lora_path)
lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}
def get_random_lora_request(
args: argparse.Namespace
) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
global lora_tokenizer_cache
lora_id = random.randint(1, args.max_loras)
lora_request = LoRARequest(lora_name=str(lora_id),
lora_int_id=lora_id,
lora_path=lora_path_on_disk(args.lora_path))
if lora_id not in lora_tokenizer_cache:
lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
return lora_request, lora_tokenizer_cache[lora_id]
def sample_requests(tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace) -> List[SampleRequest]:
dataset_path: str = args.dataset
num_requests: int = args.num_prompts
fixed_output_len: Optional[int] = args.output_len
model: str = args.model
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@ -33,24 +102,46 @@ def sample_requests(
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
filtered_dataset: List[SampleRequest] = []
for data in tqdm(dataset,
total=len(filtered_dataset),
desc="sampling requests"):
if len(filtered_dataset) == num_requests:
break
# Only keep the first two turns of each conversation.
prompt = data["conversations"][0]["value"]
completion = data["conversations"][1]["value"]
multi_modal_data: Optional[MultiModalDataDict] = None
if "image" in data:
multi_modal_data = multi_modal_data or {}
image_path = data["image"]
# TODO(vllm-project/vllm/issues/9778): Support multiple images.
assert isinstance(image_path,
str), "Only support single image input"
try:
multi_modal_data["image"] = Image.open(image_path).convert(
"RGB")
except FileNotFoundError:
# Ignore datapoint where asset is missing
continue
prompt = _get_prompt_for_image_model(question=prompt, model=model)
request_tokenizer = tokenizer
lora_request: Optional[LoRARequest] = None
if args.enable_lora:
lora_request, lora_tokenizer = get_random_lora_request(args)
if lora_tokenizer:
request_tokenizer = lora_tokenizer
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_token_ids = request_tokenizer(prompt).input_ids
completion_token_ids = request_tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
@ -60,104 +151,110 @@ def sample_requests(
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len))
filtered_dataset.append(
SampleRequest(prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=multi_modal_data,
lora_request=lora_request))
return filtered_dataset
def run_vllm(
requests: List[Tuple[str, int, int]],
requests: List[SampleRequest],
n: int,
low_bit: str,
engine_args: EngineArgs,
) -> float:
from vllm import SamplingParams
from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
llm = LLM(**dataclasses.asdict(engine_args), load_in_low_bit=low_bit)
llm = LLM(**dataclasses.asdict(engine_args), load_in_low_bit = args.load_in_low_bit)
# Add the requests to the engine.
warm_prompt = "hi " * (1024 - 1)
warm_requests = [(warm_prompt, 1024, 1024)
for _ in range(8)]
prompts: List[str] = []
prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in warm_requests:
prompts.append(prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=0.0,
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
))
llm.generate(prompts, sampling_params, use_tqdm=True)
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
prompts.append(prompt)
for request in requests:
prompts.append(
TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data))
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
max_tokens=request.expected_output_len,
))
lora_requests: Optional[List[LoRARequest]] = None
if engine_args.enable_lora:
lora_requests = [request.lora_request for request in requests]
use_beam_search = False
if not use_beam_search:
start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True)
llm.generate(prompts,
sampling_params,
lora_request=lora_requests,
use_tqdm=True)
end = time.perf_counter()
else:
prompts = [prompt for prompt, _, _ in requests]
assert lora_requests is None, "BeamSearch API does not support LoRA"
prompts = [request.prompt for request in requests]
# output_len should be the same for all requests.
output_len = requests[0][2]
for prompt, input_len, _output_len in requests:
assert _output_len == output_len
for request in requests:
assert request.expected_output_len == output_len
start = time.perf_counter()
llm.beam_search(prompts,
llm.beam_search(
prompts,
BeamSearchParams(
beam_width=n,
max_tokens=output_len,
ignore_eos=True)
ignore_eos=True,
))
end = time.perf_counter()
return end - start
async def run_vllm_async(
requests: List[Tuple[str, int, int]],
requests: List[SampleRequest],
n: int,
engine_args: AsyncEngineArgs,
disable_frontend_multiprocessing: bool = False,
load_in_low_bit: str = "sym_int4",
) -> float:
from vllm import SamplingParams
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
engine_args, disable_frontend_multiprocessing, load_in_low_bit=load_in_low_bit) as llm:
# Add the requests to the engine.
prompts: List[str] = []
prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
prompts.append(prompt)
lora_requests: List[Optional[LoRARequest]] = []
for request in requests:
prompts.append(
TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data))
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
max_tokens=request.expected_output_len,
))
lora_requests.append(request.lora_request)
generators = []
start = time.perf_counter()
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
for i, (prompt, sp,
lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
generator = llm.generate(prompt,
sp,
lora_request=lr,
request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
@ -167,7 +264,7 @@ async def run_vllm_async(
def run_hf(
requests: List[Tuple[str, int, int]],
requests: List[SampleRequest],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
@ -225,14 +322,14 @@ def run_hf(
def run_mii(
requests: List[Tuple[str, int, int]],
requests: List[SampleRequest],
model: str,
tensor_parallel_size: int,
output_len: int,
) -> float:
from mii import client, serve
llm = serve(model, tensor_parallel=tensor_parallel_size)
prompts = [prompt for prompt, _, _ in requests]
prompts = [request.prompt for request in requests]
start = time.perf_counter()
llm.generate(prompts, max_new_tokens=output_len)
@ -250,23 +347,50 @@ def main(args: argparse.Namespace):
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
vocab_size = tokenizer.vocab_size
requests = []
for _ in range(args.num_prompts):
request_tokenizer = tokenizer
lora_request: Optional[LoRARequest] = None
if args.enable_lora:
lora_request, lora_tokenizer = get_random_lora_request(args)
if lora_tokenizer:
request_tokenizer = lora_tokenizer
# Synthesize a prompt with the given input length.
candidate_ids = [
random.randint(0, vocab_size - 1)
for _ in range(args.input_len)
]
# As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length.
for i in range(-10, 10):
prompt = "hi " * (args.input_len + i)
tokenized_prompt = tokenizer(prompt).input_ids
if len(tokenized_prompt) == args.input_len:
break
else:
raise ValueError(
f"Failed to synthesize a prompt with {args.input_len} tokens.")
requests = [(prompt, args.input_len, args.output_len)
for _ in range(args.num_prompts)]
else:
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
args.output_len)
for _ in range(5): # Max attempts to correct
candidate_prompt = request_tokenizer.decode(candidate_ids)
tokenized_len = len(request_tokenizer.encode(candidate_prompt))
if tokenized_len == args.input_len:
break
# Adjust length based on difference
diff = args.input_len - tokenized_len
if diff > 0:
candidate_ids.extend([
random.randint(100, vocab_size - 100)
for _ in range(diff)
])
else:
candidate_ids = candidate_ids[:diff]
requests.append(
SampleRequest(prompt=candidate_prompt,
prompt_len=args.input_len,
expected_output_len=args.output_len,
lora_request=lora_request))
else:
requests = sample_requests(tokenizer, args)
is_multi_modal = any(request.multi_modal_data is not None
for request in requests)
if args.backend == "vllm":
if args.async_engine:
elapsed_time = uvloop.run(
@ -275,9 +399,10 @@ def main(args: argparse.Namespace):
args.n,
AsyncEngineArgs.from_cli_args(args),
args.disable_frontend_multiprocessing,
args.load_in_low_bit,
))
else:
elapsed_time = run_vllm(requests, args.n, args.load_in_low_bit,
elapsed_time = run_vllm(requests, args.n,
EngineArgs.from_cli_args(args))
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
@ -288,9 +413,15 @@ def main(args: argparse.Namespace):
args.output_len)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len
for _, prompt_len, output_len in requests)
total_output_tokens = sum(output_len for _, _, output_len in requests)
total_num_tokens = sum(request.prompt_len + request.expected_output_len
for request in requests)
total_output_tokens = sum(request.expected_output_len
for request in requests)
if is_multi_modal:
print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
"following metrics are not accurate because image tokens are not"
" counted. See vllm-project/vllm/issues/9778 for details.")
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
@ -317,7 +448,9 @@ if __name__ == "__main__":
parser.add_argument("--dataset",
type=str,
default=None,
help="Path to the dataset.")
help="Path to the dataset. The dataset is expected to "
"be a json in form of List[Dict[..., conversations: "
"List[Dict[..., value: <prompt_or_response>]]]]")
parser.add_argument("--input-len",
type=int,
default=None,
@ -352,12 +485,21 @@ if __name__ == "__main__":
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
# IPEX-LLM optimization
parser.add_argument(
"--load-in-low-bit",
type=str,
choices=["sym_int4", "fp8", "fp8_e4m3", "fp16", "fp6"],
choices=["sym_int4", "woq_int4", "fp8", "fp8_e4m3", "fp16", "fp6"],
default="sym_int4",
help="Low-bit format quantization with IPEX-LLM")
# LoRA
parser.add_argument(
"--lora-path",
type=str,
default=None,
help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.")
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
if args.tokenizer is None:
@ -367,6 +509,8 @@ if __name__ == "__main__":
assert args.output_len is not None
else:
assert args.input_len is None
if args.enable_lora:
assert args.lora_path is not None
if args.backend == "vllm":
if args.hf_max_batch_size is not None:
@ -376,6 +520,9 @@ if __name__ == "__main__":
raise ValueError("HF max batch size is required for HF backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.enable_lora is not None:
raise ValueError("LoRA benchmarking is only supported for vLLM"
" backend")
elif args.backend == "mii":
if args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
@ -388,5 +535,7 @@ if __name__ == "__main__":
if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII "
"backend.")
if args.enable_lora is not None:
raise ValueError("LoRA benchmarking is only supported for vLLM"
" backend")
main(args)
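
A hypothetical run of the updated throughput benchmark; the model and ShareGPT-style dataset paths are placeholders. Omitting `--dataset` and passing `--input-len`/`--output-len` switches to synthetic prompts, `--async-engine` benchmarks through the async front-end, and `--enable-lora --lora-path <adapter>` exercises the new LoRA path (vLLM backend only).

```bash
python /llm/benchmark_vllm_throughput.py \
  --backend vllm \
  --model /llm/models/Llama-2-7b-chat-hf \
  --device xpu \
  --dataset /llm/ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 1000 \
  --load-in-low-bit fp8 \
  --gpu-memory-utilization 0.9
```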

View file

@ -0,0 +1,47 @@
diff --git a/setup.py b/setup.py
index 67016cb..305be5c 100644
--- a/setup.py
+++ b/setup.py
@@ -118,8 +118,8 @@ class BuildCMakeExt(BuildExtension):
if compute_backend == 'dpcpp':
runtime = 'dpcpp'
build_options['COMPUTE_BACKEND'] = compute_backend
- import intel_extension_for_pytorch
- build_options['CMAKE_PREFIX_PATH'] += ";" + intel_extension_for_pytorch.cmake_prefix_path
+ # import intel_extension_for_pytorch
+ # build_options['CMAKE_PREFIX_PATH'] += ";" + intel_extension_for_pytorch.cmake_prefix_path
if "DPCPP_GCC_INSTALL_DIR" in my_env:
exist_cflags = "CFLAGS" in my_env
cflags = ""
diff --git a/src/gpu/CMakeLists.txt b/src/gpu/CMakeLists.txt
index 5628f31..d1589c4 100644
--- a/src/gpu/CMakeLists.txt
+++ b/src/gpu/CMakeLists.txt
@@ -1,4 +1,4 @@
-find_package(IPEX REQUIRED)
+# find_package(IPEX REQUIRED)
set(CCL_DPCPP_SRCS dpcpp_ccl.cpp ze_exception.hpp allreduce.h sycl_misc.hpp runtime.hpp cxxopts.hpp)
@@ -9,7 +9,7 @@ add_library(oneccl_bindings_for_pytorch_xpu SHARED ${CCL_DPCPP_SRCS})
target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC ${DEPENDS_LIB})
target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC oneccl_bindings_for_pytorch)
-target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC intel-ext-pt-gpu)
+# target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC intel-ext-pt-gpu)
foreach(RPATH ${CMAKE_INSTALL_RPATH})
set_target_properties(oneccl_bindings_for_pytorch_xpu PROPERTIES LINK_FLAGS "-Wl,-rpath,${RPATH}")
diff --git a/src/gpu/dpcpp_ccl.cpp b/src/gpu/dpcpp_ccl.cpp
index 1631b85..0945031 100644
--- a/src/gpu/dpcpp_ccl.cpp
+++ b/src/gpu/dpcpp_ccl.cpp
@@ -32,7 +32,7 @@
#include <ATen/record_function.h>
#include <ProcessGroupCCL.hpp>
#include <dispatch_stub.h>
-#include <ipex.h>
+// #include <ipex.h>
#include <sycl/sycl.hpp>
//#include "allreduce.h"

View file

@ -1,209 +0,0 @@
--- a/gradio_web_server.py
+++ b/gradio_web_server_new.py
@@ -9,8 +9,10 @@ import hashlib
import json
import os
import random
+import pandas as pd
import time
import uuid
+import numpy as np
import gradio as gr
import requests
@@ -241,7 +243,7 @@ def clear_history(request: gr.Request):
ip = get_ip(request)
logger.info(f"clear_history. ip: {ip}")
state = None
- return (state, [], "", None) + (disable_btn,) * 5
+ return (state, [], "", None, "", "", "", "") + (disable_btn,) * 5
def get_ip(request: gr.Request):
@@ -354,6 +356,18 @@ def is_limit_reached(model_name, ip):
return None
+def handle_latency_metrics(first_token_time, next_token_time):
+ # next token time is a numpy array...
+ # first token time might be None
+ first_token_latency = "None"
+ next_token_latency = "None"
+ if first_token_time is not None:
+ first_token_latency = f"{first_token_time * 1000 :.2f} ms"
+ if next_token_time.size > 0:
+ next_token_latency = f"{np.mean(next_token_time) * 1000 :.2f} ms"
+ return first_token_latency, next_token_latency
+
+
def bot_response(
state,
temperature,
@@ -372,7 +386,7 @@ def bot_response(
if state.skip_next:
# This generate call is skipped due to invalid inputs
state.skip_next = False
- yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
return
if apply_rate_limit:
@@ -381,7 +395,7 @@ def bot_response(
error_msg = RATE_LIMIT_MSG + "\n\n" + ret["reason"]
logger.info(f"rate limit reached. ip: {ip}. error_msg: {ret['reason']}")
state.conv.update_last_message(error_msg)
- yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
return
conv, model_name = state.conv, state.model_name
@@ -404,6 +418,10 @@ def bot_response(
yield (
state,
state.to_gradio_chatbot(),
+ "None",
+ "None",
+ "None",
+ "None",
disable_btn,
disable_btn,
disable_btn,
@@ -444,18 +462,32 @@ def bot_response(
)
conv.update_last_message("▌")
- yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+ # We probably need to change this method
+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (disable_btn,) * 5
+ prompt_tokens = 0
+ generated_tokens = 0
+ first_token_latency = None
+ next_token_latencies = np.array([])
+ start_time = time.time()
try:
for i, data in enumerate(stream_iter):
if data["error_code"] == 0:
+ prompt_tokens = data["usage"]["prompt_tokens"]
+ generated_tokens = data["usage"]["completion_tokens"]
output = data["text"].strip()
conv.update_last_message(output + "▌")
- yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+ if first_token_latency is None:
+ first_token_latency = time.time() - start_time
+ else:
+ next_token_latencies = np.append(next_token_latencies, time.time() - start_time)
+ start_time = time.time()
+ first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
+ yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (disable_btn,) * 5
else:
output = data["text"] + f"\n\n(error_code: {data['error_code']})"
conv.update_last_message(output)
- yield (state, state.to_gradio_chatbot()) + (
+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
disable_btn,
disable_btn,
disable_btn,
@@ -465,13 +497,14 @@ def bot_response(
return
output = data["text"].strip()
conv.update_last_message(output)
- yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
+ first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
+ yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (enable_btn,) * 5
except requests.exceptions.RequestException as e:
conv.update_last_message(
f"{SERVER_ERROR_MSG}\n\n"
f"(error_code: {ErrorCode.GRADIO_REQUEST_ERROR}, {e})"
)
- yield (state, state.to_gradio_chatbot()) + (
+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
disable_btn,
disable_btn,
disable_btn,
@@ -484,7 +517,7 @@ def bot_response(
f"{SERVER_ERROR_MSG}\n\n"
f"(error_code: {ErrorCode.GRADIO_STREAM_UNKNOWN_ERROR}, {e})"
)
- yield (state, state.to_gradio_chatbot()) + (
+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
disable_btn,
disable_btn,
disable_btn,
@@ -646,7 +679,8 @@ def build_single_model_ui(models, add_promotion_links=False):
)
notice_markdown = f"""
-# 🏔️ Chat with Open Large Language Models
+# 🏔️ ChatBot based Xeon-W & Arc GPUs
+### Deployed with IPEX-LLM
{promotion}
"""
@@ -717,6 +751,22 @@ def build_single_model_ui(models, add_promotion_links=False):
label="Max output tokens",
)
+ with gr.Row():
+ with gr.Column():
+ gr.Markdown("### Performance Metrics")
+ prompt_token = gr.Label(
+ label="Prompt token length:",
+ )
+ next_token = gr.Label(
+ label="Generated token length:",
+ )
+ first_token_latency = gr.Label(
+ label="First token Latency:",
+ )
+ next_token_latency = gr.Label(
+ label="Next token Latency:",
+ )
+
if add_promotion_links:
gr.Markdown(acknowledgment_md, elem_id="ack_markdown")
@@ -743,9 +793,9 @@ def build_single_model_ui(models, add_promotion_links=False):
).then(
bot_response,
[state, temperature, top_p, max_output_tokens],
- [state, chatbot] + btn_list,
+ [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
)
- clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox] + btn_list)
+ clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list)
model_selector.change(
clear_history, None, [state, chatbot, textbox, imagebox] + btn_list
@@ -758,7 +808,7 @@ def build_single_model_ui(models, add_promotion_links=False):
).then(
bot_response,
[state, temperature, top_p, max_output_tokens],
- [state, chatbot] + btn_list,
+ [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
)
send_btn.click(
add_text,
@@ -767,7 +817,7 @@ def build_single_model_ui(models, add_promotion_links=False):
).then(
bot_response,
[state, temperature, top_p, max_output_tokens],
- [state, chatbot] + btn_list,
+ [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
)
return [state, model_selector]
@@ -775,7 +825,7 @@ def build_single_model_ui(models, add_promotion_links=False):
def build_demo(models):
with gr.Blocks(
- title="Chat with Open Large Language Models",
+ title="ChatBot based Xeon-W & Arc GPUs",
theme=gr.themes.Default(),
css=block_css,
) as demo:
@@ -885,3 +935,4 @@ if __name__ == "__main__":
auth=auth,
root_path=args.gradio_root_path,
)
+
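
The removed gradio_web_server.patch added per-token performance metrics to the FastChat web UI. Its core computation is small; a standalone sketch of the same logic (names follow the patch above) is:

```python
import numpy as np

def handle_latency_metrics(first_token_time, next_token_time):
    """Format first-token latency and mean next-token latency in milliseconds.

    first_token_time is the delay in seconds until the first streamed chunk (or None);
    next_token_time is a numpy array of per-chunk gaps for the later chunks.
    """
    first_token_latency = "None"
    next_token_latency = "None"
    if first_token_time is not None:
        first_token_latency = f"{first_token_time * 1000:.2f} ms"
    if next_token_time.size > 0:
        next_token_latency = f"{np.mean(next_token_time) * 1000:.2f} ms"
    return first_token_latency, next_token_latency

# Example: 0.35 s to the first streamed chunk, then four ~45 ms gaps.
print(handle_latency_metrics(0.35, np.array([0.044, 0.046, 0.045, 0.045])))
# -> ('350.00 ms', '45.00 ms')
```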

View file

@ -1,125 +0,0 @@
#!/bin/bash
usage() {
echo "Usage: $0 [-w --worker <model_worker|vllm_worker>] [--help]"
echo "--help: Print help message."
echo "The following environment variables can be set."
echo "MODEL_PATH (default: empty)."
echo "LOW_BIT_FORMAT (default: sym_int4)"
echo "CONTROLLER_HOST (default: localhost)."
echo "CONTROLLER_PORT (default: 21001)."
echo "WORKER_HOST (default: localhost)."
echo "WORKER_PORT (default: 21002)."
echo "API_HOST (default: localhost)."
echo "API_PORT (default: 8000)."
exit 1
}
# Default values
controller_host="localhost"
controller_port="21001"
worker_host="localhost"
worker_port="21002"
api_host="localhost"
api_port="8000"
model_path=""
mode=""
dispatch_method="shortest_queue" # shortest_queue or lottery
stream_interval=1
worker_type="model_worker"
low_bit_format="sym_int4"
# We do not have any arguments, just run bash
# Parse command-line options
options=$(getopt -o "hw:" --long "help,worker:" -n "$0" -- "$@")
if [ $? != 0 ]; then
usage
fi
eval set -- "$options"
while true; do
case "$1" in
-w|--worker)
worker_type="$2"
[[ $worker_type == "model_worker" || $worker_type == "vllm_worker" ]] || usage
shift 2
;;
-h|--help)
usage
;;
--)
shift
break
;;
*)
usage
;;
esac
done
if [ "$worker_type" == "model_worker" ]; then
worker_type="ipex_llm.serving.fastchat.ipex_llm_worker"
elif [ "$worker_type" == "vllm_worker" ]; then
worker_type="ipex_llm.serving.fastchat.vllm_worker"
fi
if [[ -n $CONTROLLER_HOST ]]; then
controller_host=$CONTROLLER_HOST
fi
if [[ -n $CONTROLLER_PORT ]]; then
controller_port=$CONTROLLER_PORT
fi
if [[ -n $LOW_BIT_FORMAT ]]; then
low_bit_format=$LOW_BIT_FORMAT
fi
if [[ -n $WORKER_HOST ]]; then
worker_host=$WORKER_HOST
fi
if [[ -n $WORKER_PORT ]]; then
worker_port=$WORKER_PORT
fi
if [[ -n $MODEL_PATH ]]; then
model_path=$MODEL_PATH
fi
if [[ -n $API_HOST ]]; then
api_host=$API_HOST
fi
if [[ -n $API_PORT ]]; then
api_port=$API_PORT
fi
if [[ -n $DISPATCH_METHOD ]]; then
dispatch_method=$DISPATCH_METHOD
fi
if [[ -n $STREAM_INTERVAL ]]; then
stream_interval=$STREAM_INTERVAL
fi
controller_address="http://$controller_host:$controller_port"
echo "Controller address: $controller_address"
python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
worker_address="http://$worker_host:$worker_port"
echo "Worker type: $worker_type"
echo "Worker address: $worker_address"
if [ "$worker_type" == "ipex_llm.serving.fastchat.ipex_llm_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device xpu --low-bit $low_bit_format --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval &
elif [ "$worker_type" == "ipex_llm.serving.fastchat.vllm_worker" ]; then
python3 -m "$worker_type" --model-path $model_path --device xpu --load-in-low-bit $low_bit_format --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --enforce-eager --gpu-memory-utilization 0.85 &
fi
sleep 10
api_address="http://$api_host:$api_port"
echo "OpenAI API address: $api_address"
python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address
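
The removed start-fastchat-service.sh wired up the FastChat controller, worker, and OpenAI API server in one go, configured entirely through environment variables. Based on its usage() text above, a typical invocation looked like this (the model path is a placeholder):

```bash
export MODEL_PATH=/llm/models/Llama-2-7b-chat-hf
export LOW_BIT_FORMAT=sym_int4
bash /llm/start-fastchat-service.sh -w vllm_worker
# Defaults: controller on localhost:21001, worker on localhost:21002,
# OpenAI-compatible API on localhost:8000.
```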

View file

@ -1,4 +0,0 @@
cd /llm/lightweight_serving
model_path="/llm/models/Llama-2-7b-chat-hf"
low_bit="sym_int4"
python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit

View file

@ -667,7 +667,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
out_features,
mp_group,
None,
None,
optimize_lm_head,
None
)

View file

@ -13,18 +13,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Dict, Optional
from vllm.logger import init_logger
from typing import Dict, Optional, Any, Union, Type
from vllm.engine.llm_engine import LLMEngine
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.llm import LLM
from vllm.utils import Counter
from vllm.config import EngineConfig
from vllm.config import VllmConfig
from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
from vllm.usage.usage_lib import UsageContext
from vllm.engine.metrics import StatLoggerBase
from vllm.engine.multiprocessing.engine import MQLLMEngine
import signal
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
TaskOption)
from vllm.config import CompilationConfig
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
from vllm import envs
from vllm.v1.engine.async_llm import AsyncLLM
import os
logger = init_logger(__name__)
class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
@ -35,7 +45,7 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
def from_engine_args(
cls,
engine_args: AsyncEngineArgs,
engine_config: Optional[EngineConfig] = None,
engine_config: Optional[VllmConfig] = None,
start_engine_loop: bool = True,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
load_in_low_bit: str = "sym_int4",
@ -49,6 +59,27 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
usage_context=usage_context, stat_loggers=stat_loggers)
class IPEXLLMAsyncV1Engine(AsyncLLM):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@classmethod
def from_engine_args(
cls,
engine_args: AsyncEngineArgs,
engine_config: Optional[VllmConfig] = None,
start_engine_loop: bool = True,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
load_in_low_bit: str = "sym_int4",
stat_loggers: Optional[Dict[str, StatLoggerBase]]=None, # noqa
) -> "AsyncLLM":
_ipex_llm_convert(load_in_low_bit)
return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
start_engine_loop=start_engine_loop,
usage_context=usage_context, stat_loggers=stat_loggers)
class IPEXLLMClass(LLM):
def __init__(
self,
@ -57,6 +88,7 @@ class IPEXLLMClass(LLM):
tokenizer_mode: str = "auto",
skip_tokenizer_init: bool = False,
trust_remote_code: bool = False,
allowed_local_media_path: str = "",
tensor_parallel_size: int = 1,
dtype: str = "auto",
quantization: Optional[str] = None,
@ -64,28 +96,48 @@ class IPEXLLMClass(LLM):
tokenizer_revision: Optional[str] = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: int = 4,
swap_space: float = 4,
cpu_offload_gb: float = 0,
enforce_eager: bool = False,
max_context_len_to_capture: Optional[int] = None,
enforce_eager: Optional[bool] = None,
max_seq_len_to_capture: int = 8192,
disable_custom_all_reduce: bool = False,
disable_async_output_proc: bool = True,
hf_overrides: Optional[HfOverrides] = None,
mm_processor_kwargs: Optional[Dict[str, Any]]=None,
# After positional args are removed, move this right below `model`
task: TaskOption = "auto",
override_pooler_config: Optional[PoolerConfig] = None,
compilation_config: Optional[Union[int, Dict[str, Any]]]=None,
load_in_low_bit: str = "sym_int4",
**kwargs,
) -> None:
'''
LLM constructor.
Note: if enforce_eager is unset (enforce_eager is None)
it defaults to False.
'''
if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True
removed_vision_keys = ("image_token_id", "image_feature_size",
"image_input_shape", "image_input_type")
if any(k in kwargs for k in removed_vision_keys):
raise TypeError( # noqa
"There is no need to pass vision-related arguments anymore.")
if compilation_config is not None:
if isinstance(compilation_config, (int, dict)):
compilation_config_instance = CompilationConfig.from_cli(
str(compilation_config))
else:
compilation_config_instance = compilation_config
else:
compilation_config_instance = None
engine_args = EngineArgs(
model=model,
task=task,
tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode,
skip_tokenizer_init=skip_tokenizer_init,
trust_remote_code=trust_remote_code,
allowed_local_media_path=allowed_local_media_path,
tensor_parallel_size=tensor_parallel_size,
dtype=dtype,
quantization=quantization,
@ -96,16 +148,53 @@ class IPEXLLMClass(LLM):
swap_space=swap_space,
cpu_offload_gb=cpu_offload_gb,
enforce_eager=enforce_eager,
max_context_len_to_capture=max_context_len_to_capture,
max_seq_len_to_capture=max_seq_len_to_capture,
disable_custom_all_reduce=disable_custom_all_reduce,
disable_async_output_proc=disable_async_output_proc,
hf_overrides=hf_overrides,
mm_processor_kwargs=mm_processor_kwargs,
override_pooler_config=override_pooler_config,
compilation_config=compilation_config_instance,
**kwargs,
)
self.llm_engine = IPEXLLMLLMEngine.from_engine_args(
# Logic to switch between engines is done at runtime instead of import
# to avoid import order issues
self.engine_class = self.get_engine_class()
self.llm_engine = self.engine_class.from_engine_args(
engine_args, usage_context=UsageContext.LLM_CLASS,
load_in_low_bit=load_in_low_bit)
self.request_counter = Counter()
@staticmethod
def get_engine_class() -> Type[LLMEngine]:
if envs.VLLM_USE_V1:
return IPEXLLMLLMV1Engine
return IPEXLLMLLMEngine
class IPEXLLMLLMV1Engine(V1LLMEngine):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@classmethod
def from_engine_args(
cls,
engine_args: EngineArgs,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
enable_multiprocessing: bool = False,
load_in_low_bit: str = "sym_int4",
) -> "LLMEngine":
"""Creates an LLM engine from the engine arguments."""
# Create the engine configs.
_ipex_llm_convert(load_in_low_bit)
return super().from_engine_args(engine_args,
usage_context,
stat_loggers,
enable_multiprocessing)
class IPEXLLMLLMEngine(LLMEngine):
def __init__(self, *args, **kwargs):
@ -134,12 +223,13 @@ class IPEXLLMMQLLMEngine(MQLLMEngine):
def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
ipc_path: str, load_in_low_bit: str):
ipc_path: str, load_in_low_bit: str, engine_alive):
def signal_handler(*_) -> None:
# Interrupt server on sigterm
raise KeyboardInterrupt("MQLLMEngine terminated") # noqa
try:
signal.signal(signal.SIGTERM, signal_handler)
engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
@ -147,3 +237,10 @@ def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
ipc_path=ipc_path,
load_in_low_bit=load_in_low_bit)
engine.start()
except BaseException as e:
logger.exception(e)
engine_alive.value = False
raise e # noqa
if os.getenv("VLLM_USE_V1"):
IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine
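Since the alias above is evaluated at import time, VLLM_USE_V1 has to be set before this module is imported. A hedged sketch (engine arguments are placeholders):

# Hedged sketch: opting in to the V1 async engine. The model path and device
# are illustrative; load_in_low_bit is forwarded to the IPEX-LLM conversion.
import os
os.environ["VLLM_USE_V1"] = "1"   # must be set before importing the engine module

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.usage.usage_lib import UsageContext
from ipex_llm.vllm.xpu.engine import IPEXLLMAsyncLLMEngine

engine = IPEXLLMAsyncLLMEngine.from_engine_args(
    AsyncEngineArgs(model="/llm/models/Llama-2-7b-chat-hf",  # assumed path
                    device="xpu"),
    usage_context=UsageContext.OPENAI_API_SERVER,
    load_in_low_bit="sym_int4",
)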

View file

@ -1,4 +1,5 @@
import asyncio
import atexit
import importlib
import inspect
import multiprocessing
@ -7,11 +8,12 @@ import re
import signal
import socket
import tempfile
import uuid
from argparse import Namespace
from contextlib import asynccontextmanager
from functools import partial
from http import HTTPStatus
from typing import AsyncIterator, Set
from typing import AsyncIterator, Optional, Set, Tuple
import uvloop
from fastapi import APIRouter, FastAPI, Request
@ -29,9 +31,13 @@ from ipex_llm.vllm.xpu.engine import IPEXLLMAsyncLLMEngine as AsyncLLMEngine
from vllm.engine.multiprocessing.client import MQLLMEngineClient
from ipex_llm.vllm.xpu.engine import run_mp_engine
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.launcher import serve_http
from vllm.entrypoints.logger import RequestLogger
from ipex_llm.vllm.xpu.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.cli_args import (make_arg_parser,
validate_parsed_serve_args)
# from ipex_llm.vllm.xpu.entrypoints.openai.cli_args import make_arg_parser
# yapf conflicts with isort for this block
# yapf: disable
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
@ -41,8 +47,12 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DetokenizeRequest,
DetokenizeResponse,
EmbeddingRequest,
EmbeddingResponse, ErrorResponse,
EmbeddingResponse,
EmbeddingResponseData,
ErrorResponse,
LoadLoraAdapterRequest,
PoolingRequest, PoolingResponse,
ScoreRequest, ScoreResponse,
TokenizeRequest,
TokenizeResponse,
UnloadLoraAdapterRequest)
@ -50,12 +60,20 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
from vllm.entrypoints.openai.serving_engine import BaseModelPath
from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels)
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
from vllm.entrypoints.openai.serving_score import OpenAIServingScores
from vllm.entrypoints.openai.serving_tokenization import (
OpenAIServingTokenization)
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.entrypoints.utils import with_cancellation
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path
from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
is_valid_ipv6_address, set_ulimit)
from vllm.version import __version__ as VLLM_VERSION
TIMEOUT_KEEP_ALIVE = 5 # seconds
@ -111,7 +129,7 @@ async def build_async_engine_client(
async def build_async_engine_client_from_engine_args(
engine_args: AsyncEngineArgs,
disable_frontend_multiprocessing: bool = False,
load_in_low_bit: str = 'sym_int4',
load_in_low_bit: str = "sym_int4",
) -> AsyncIterator[EngineClient]:
"""
Create EngineClient, either:
@ -124,25 +142,19 @@ async def build_async_engine_client_from_engine_args(
# Fall back
# TODO: fill out feature matrix.
if (MQLLMEngineClient.is_unsupported_config(engine_args)
or disable_frontend_multiprocessing):
engine_config = engine_args.create_engine_config()
uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config),
"uses_ray", False)
build_engine = partial(AsyncLLMEngine.from_engine_args,
or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):
engine_client: Optional[EngineClient] = None
try:
# When VLLM_USE_V1 is set, AsyncLLMEngine here resolves to the V1 engine,
# so IPEX-LLM needs to handle this engine selection as well
engine_client = AsyncLLMEngine.from_engine_args(
engine_args=engine_args,
load_in_low_bit=load_in_low_bit,
engine_config=engine_config,
usage_context=UsageContext.OPENAI_API_SERVER)
if uses_ray:
# Must run in main thread with ray for its signal handlers to work
engine_client = build_engine()
else:
engine_client = await asyncio.get_running_loop().run_in_executor(
None, build_engine)
usage_context=UsageContext.OPENAI_API_SERVER,
load_in_low_bit=load_in_low_bit)
yield engine_client
return
finally:
if engine_client and hasattr(engine_client, "shutdown"):
engine_client.shutdown()
# Otherwise, use the multiprocessing AsyncLLMEngine.
else:
@ -163,7 +175,7 @@ async def build_async_engine_client_from_engine_args(
# Select random path for IPC.
ipc_path = get_open_zmq_ipc_path()
logger.info("Multiprocessing frontend to use %s for IPC Path.",
logger.debug("Multiprocessing frontend to use %s for IPC Path.",
ipc_path)
# Start RPCServer in separate process (holds the LLMEngine).
@ -171,37 +183,52 @@ async def build_async_engine_client_from_engine_args(
# so we need to spawn a new process
context = multiprocessing.get_context("spawn")
# The Process can raise an exception during startup, which may
# not actually result in an exitcode being reported. As a result
# we use a shared variable to communicate the information.
engine_alive = multiprocessing.Value('b', True, lock=False)
engine_process = context.Process(target=run_mp_engine,
args=(engine_args,
UsageContext.OPENAI_API_SERVER,
ipc_path,
load_in_low_bit))
ipc_path, load_in_low_bit, engine_alive))
engine_process.start()
logger.info("Started engine process with PID %d", engine_process.pid)
engine_pid = engine_process.pid
assert engine_pid is not None, "Engine process failed to start."
logger.info("Started engine process with PID %d", engine_pid)
def _cleanup_ipc_path():
socket_path = ipc_path.replace("ipc://", "")
if os.path.exists(socket_path):
os.remove(socket_path)
# Ensure we clean up the local IPC socket file on exit.
atexit.register(_cleanup_ipc_path)
# Build RPCClient, which conforms to EngineClient Protocol.
# NOTE: Actually, this is not true yet. We still need to support
# embedding models via RPC (see TODO above)
engine_config = engine_args.create_engine_config()
mp_engine_client = MQLLMEngineClient(ipc_path, engine_config)
build_client = partial(MQLLMEngineClient, ipc_path, engine_config,
engine_pid)
mq_engine_client = await asyncio.get_running_loop().run_in_executor(
None, build_client)
try:
while True:
try:
await mp_engine_client.setup()
await mq_engine_client.setup()
break
except TimeoutError:
if not engine_process.is_alive():
if (not engine_process.is_alive()
or not engine_alive.value):
raise RuntimeError(
"Engine process failed to start") from None
"Engine process failed to start. See stack "
"trace for the root cause.") from None
yield mp_engine_client # type: ignore[misc]
yield mq_engine_client # type: ignore[misc]
finally:
# Ensure rpc server process was terminated
engine_process.terminate()
# Close all open connections to the backend
mp_engine_client.close()
mq_engine_client.close()
# Wait for engine process to join
engine_process.join(4)
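The engine_alive shared value above matters because a spawned process can die during startup without reporting a usable exit code. A self-contained sketch of the same pattern (names are illustrative and independent of vLLM):

# Standalone sketch of the engine_alive pattern: the child clears a shared
# boolean on failure so the parent can tell "crashed during startup" apart
# from "still starting". Runnable without any vLLM imports.
import multiprocessing
import time

def child(alive):
    try:
        raise RuntimeError("simulated startup failure")
    except BaseException:
        alive.value = False
        raise

if __name__ == "__main__":
    alive = multiprocessing.Value('b', True, lock=False)
    ctx = multiprocessing.get_context("spawn")
    proc = ctx.Process(target=child, args=(alive,))
    proc.start()
    while proc.is_alive() and alive.value:
        time.sleep(0.1)
    if not alive.value:
        print("worker failed to start; see its traceback for the root cause")
    proc.join()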
@ -230,7 +257,7 @@ def mount_metrics(app: FastAPI):
prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None)
if prometheus_multiproc_dir_path is not None:
logger.info("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR",
logger.debug("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR",
prometheus_multiproc_dir_path)
registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)
@ -246,22 +273,35 @@ def mount_metrics(app: FastAPI):
app.routes.append(metrics_route)
def chat(request: Request) -> OpenAIServingChat:
def base(request: Request) -> OpenAIServing:
# Reuse the existing instance
return tokenization(request)
def chat(request: Request) -> Optional[OpenAIServingChat]:
return request.app.state.openai_serving_chat
def completion(request: Request) -> OpenAIServingCompletion:
def completion(request: Request) -> Optional[OpenAIServingCompletion]:
return request.app.state.openai_serving_completion
def pooling(request: Request) -> Optional[OpenAIServingPooling]:
return request.app.state.openai_serving_pooling
def embedding(request: Request) -> Optional[OpenAIServingEmbedding]:
return request.app.state.openai_serving_embedding
def score(request: Request) -> Optional[OpenAIServingScores]:
return request.app.state.openai_serving_scores
def tokenization(request: Request) -> OpenAIServingTokenization:
return request.app.state.openai_serving_tokenization
def embedding(request: Request) -> OpenAIServingEmbedding:
return request.app.state.openai_serving_embedding
def engine_client(request: Request) -> EngineClient:
return request.app.state.engine_client
@ -274,8 +314,11 @@ async def health(raw_request: Request) -> Response:
@router.post("/tokenize")
@with_cancellation
async def tokenize(request: TokenizeRequest, raw_request: Request):
generator = await tokenization(raw_request).create_tokenize(request)
handler = tokenization(raw_request)
generator = await handler.create_tokenize(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
@ -286,8 +329,11 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
@router.post("/detokenize")
@with_cancellation
async def detokenize(request: DetokenizeRequest, raw_request: Request):
generator = await tokenization(raw_request).create_detokenize(request)
handler = tokenization(raw_request)
generator = await handler.create_detokenize(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
@ -299,7 +345,9 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
@router.get("/v1/models")
async def show_available_models(raw_request: Request):
models = await completion(raw_request).show_available_models()
handler = base(raw_request)
models = await handler.show_available_models()
return JSONResponse(content=models.model_dump())
@ -310,11 +358,15 @@ async def show_version():
@router.post("/v1/chat/completions")
@with_cancellation
async def create_chat_completion(request: ChatCompletionRequest,
raw_request: Request):
handler = chat(raw_request)
if handler is None:
return base(raw_request).create_error_response(
message="The model does not support Chat Completions API")
generator = await chat(raw_request).create_chat_completion(
request, raw_request)
generator = await handler.create_chat_completion(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
@ -327,9 +379,14 @@ async def create_chat_completion(request: ChatCompletionRequest,
@router.post("/v1/completions")
@with_cancellation
async def create_completion(request: CompletionRequest, raw_request: Request):
generator = await completion(raw_request).create_completion(
request, raw_request)
handler = completion(raw_request)
if handler is None:
return base(raw_request).create_error_response(
message="The model does not support Completions API")
generator = await handler.create_completion(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
@ -340,9 +397,40 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
@router.post("/v1/embeddings")
@with_cancellation
async def create_embedding(request: EmbeddingRequest, raw_request: Request):
generator = await embedding(raw_request).create_embedding(
request, raw_request)
handler = embedding(raw_request)
if handler is None:
fallback_handler = pooling(raw_request)
if fallback_handler is None:
return base(raw_request).create_error_response(
message="The model does not support Embeddings API")
logger.warning(
"Embeddings API will become exclusive to embedding models "
"in a future release. To return the hidden states directly, "
"use the Pooling API (`/pooling`) instead.")
res = await fallback_handler.create_pooling(request, raw_request)
if isinstance(res, PoolingResponse):
generator = EmbeddingResponse(
id=res.id,
object=res.object,
created=res.created,
model=res.model,
data=[
EmbeddingResponseData(
index=d.index,
embedding=d.data, # type: ignore
) for d in res.data
],
usage=res.usage,
)
else:
generator = res
else:
generator = await handler.create_embedding(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
@ -352,6 +440,52 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
assert_never(generator)
@router.post("/pooling")
@with_cancellation
async def create_pooling(request: PoolingRequest, raw_request: Request):
handler = pooling(raw_request)
if handler is None:
return base(raw_request).create_error_response(
message="The model does not support Pooling API")
generator = await handler.create_pooling(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
elif isinstance(generator, PoolingResponse):
return JSONResponse(content=generator.model_dump())
assert_never(generator)
@router.post("/score")
@with_cancellation
async def create_score(request: ScoreRequest, raw_request: Request):
handler = score(raw_request)
if handler is None:
return base(raw_request).create_error_response(
message="The model does not support Score API")
generator = await handler.create_score(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
elif isinstance(generator, ScoreResponse):
return JSONResponse(content=generator.model_dump())
assert_never(generator)
@router.post("/v1/score")
@with_cancellation
async def create_score_v1(request: ScoreRequest, raw_request: Request):
logger.warning(
"To indicate that Score API is not part of standard OpenAI API, we "
"have moved it to `/score`. Please update your client accordingly.")
return await create_score(request, raw_request)
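With the routes above, the server stays OpenAI-compatible while adding /pooling and /score. A hedged client-side sketch (host, port, model names, and API key are placeholders for your deployment):

# Hedged client sketch against the endpoints wired up above.
import requests

base = "http://localhost:8000"
headers = {"Authorization": "Bearer your-api-key"}  # only if an API key is configured

# Chat Completions (generation models)
resp = requests.post(f"{base}/v1/chat/completions", headers=headers, json={
    "model": "Llama-2-7b-chat-hf",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 32,
})
print(resp.json())

# Embeddings (embedding models); the route above falls back to the Pooling
# handler when only a pooling model is being served.
resp = requests.post(f"{base}/v1/embeddings", headers=headers, json={
    "model": "bge-large-en-v1.5",
    "input": ["hello world"],
})
print(resp.json())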
if envs.VLLM_TORCH_PROFILER_DIR:
logger.warning(
"Torch Profiler is enabled in the API server. This should ONLY be "
@ -380,12 +514,10 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
@router.post("/v1/load_lora_adapter")
async def load_lora_adapter(request: LoadLoraAdapterRequest,
raw_request: Request):
response = await chat(raw_request).load_lora_adapter(request)
if isinstance(response, ErrorResponse):
return JSONResponse(content=response.model_dump(),
status_code=response.code)
response = await completion(raw_request).load_lora_adapter(request)
for route in [chat, completion, embedding]:
handler = route(raw_request)
if handler is not None:
response = await handler.load_lora_adapter(request)
if isinstance(response, ErrorResponse):
return JSONResponse(content=response.model_dump(),
status_code=response.code)
@ -395,12 +527,10 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
@router.post("/v1/unload_lora_adapter")
async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
raw_request: Request):
response = await chat(raw_request).unload_lora_adapter(request)
if isinstance(response, ErrorResponse):
return JSONResponse(content=response.model_dump(),
status_code=response.code)
response = await completion(raw_request).unload_lora_adapter(request)
for route in [chat, completion, embedding]:
handler = route(raw_request)
if handler is not None:
response = await handler.unload_lora_adapter(request)
if isinstance(response, ErrorResponse):
return JSONResponse(content=response.model_dump(),
status_code=response.code)
@ -431,8 +561,9 @@ def build_app(args: Namespace) -> FastAPI:
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
chat = app.state.openai_serving_chat
err = chat.create_error_response(message=str(exc))
err = ErrorResponse(message=str(exc),
type="BadRequestError",
code=HTTPStatus.BAD_REQUEST)
return JSONResponse(err.model_dump(),
status_code=HTTPStatus.BAD_REQUEST)
@ -440,16 +571,31 @@ def build_app(args: Namespace) -> FastAPI:
@app.middleware("http")
async def authentication(request: Request, call_next):
root_path = "" if args.root_path is None else args.root_path
if request.method == "OPTIONS":
return await call_next(request)
if not request.url.path.startswith(f"{root_path}/v1"):
url_path = request.url.path
if app.root_path and url_path.startswith(app.root_path):
url_path = url_path[len(app.root_path):]
if not url_path.startswith("/v1"):
return await call_next(request)
if request.headers.get("Authorization") != "Bearer " + token:
return JSONResponse(content={"error": "Unauthorized"},
status_code=401)
return await call_next(request)
if args.enable_request_id_headers:
logger.warning(
"CAUTION: Enabling X-Request-Id headers in the API Server. "
"This can harm performance at high QPS.")
@app.middleware("http")
async def add_request_id(request: Request, call_next):
request_id = request.headers.get(
"X-Request-Id") or uuid.uuid4().hex
response = await call_next(request)
response.headers["X-Request-Id"] = request_id
return response
for middleware in args.middleware:
module_path, object_name = middleware.rsplit(".", 1)
imported = getattr(importlib.import_module(module_path), object_name)
@ -488,49 +634,179 @@ def init_app_state(
state.engine_client = engine_client
state.log_stats = not args.disable_log_stats
resolved_chat_template = load_chat_template(args.chat_template)
logger.info("Using supplied chat template:\n%s", resolved_chat_template)
state.openai_serving_models = OpenAIServingModels(
model_config=model_config,
base_model_paths=base_model_paths,
lora_modules=args.lora_modules,
prompt_adapters=args.prompt_adapters,
)
# TODO: The chat template is now broken for lora adapters :(
state.openai_serving_chat = OpenAIServingChat(
engine_client,
model_config,
base_model_paths,
state.openai_serving_models,
args.response_role,
lora_modules=args.lora_modules,
prompt_adapters=args.prompt_adapters,
request_logger=request_logger,
chat_template=args.chat_template,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
tool_parser=args.tool_call_parser)
tool_parser=args.tool_call_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
) if model_config.runner_type == "generate" else None
state.openai_serving_completion = OpenAIServingCompletion(
engine_client,
model_config,
base_model_paths,
lora_modules=args.lora_modules,
prompt_adapters=args.prompt_adapters,
state.openai_serving_models,
request_logger=request_logger,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
)
) if model_config.runner_type == "generate" else None
state.openai_serving_pooling = OpenAIServingPooling(
engine_client,
model_config,
state.openai_serving_models,
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
) if model_config.runner_type == "pooling" else None
state.openai_serving_embedding = OpenAIServingEmbedding(
engine_client,
model_config,
base_model_paths,
state.openai_serving_models,
request_logger=request_logger,
)
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
) if model_config.task == "embed" else None
state.openai_serving_scores = OpenAIServingScores(
engine_client,
model_config,
state.openai_serving_models,
request_logger=request_logger
) if model_config.task == "score" else None
state.openai_serving_tokenization = OpenAIServingTokenization(
engine_client,
model_config,
base_model_paths,
lora_modules=args.lora_modules,
state.openai_serving_models,
request_logger=request_logger,
chat_template=args.chat_template,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
)
state.task = model_config.task
# if args.served_model_name is not None:
# served_model_names = args.served_model_name
# else:
# served_model_names = [args.model]
# if args.disable_log_requests:
# request_logger = None
# else:
# request_logger = RequestLogger(max_log_len=args.max_log_len)
# base_model_paths = [
# BaseModelPath(name=name, model_path=args.model)
# for name in served_model_names
# ]
# state.engine_client = engine_client
# state.log_stats = not args.disable_log_stats
# resolved_chat_template = load_chat_template(args.chat_template)
# logger.info("Using supplied chat template:\n%s", resolved_chat_template)
# state.openai_serving_chat = OpenAIServingChat(
# engine_client,
# model_config,
# base_model_paths,
# args.response_role,
# lora_modules=args.lora_modules,
# prompt_adapters=args.prompt_adapters,
# request_logger=request_logger,
# chat_template=resolved_chat_template,
# chat_template_content_format=args.chat_template_content_format,
# return_tokens_as_token_ids=args.return_tokens_as_token_ids,
# enable_auto_tools=args.enable_auto_tool_choice,
# tool_parser=args.tool_call_parser,
# enable_prompt_tokens_details=args.enable_prompt_tokens_details,
# ) if model_config.runner_type == "generate" else None
# state.openai_serving_completion = OpenAIServingCompletion(
# engine_client,
# model_config,
# base_model_paths,
# lora_modules=args.lora_modules,
# prompt_adapters=args.prompt_adapters,
# request_logger=request_logger,
# return_tokens_as_token_ids=args.return_tokens_as_token_ids,
# ) if model_config.runner_type == "generate" else None
# state.openai_serving_pooling = OpenAIServingPooling(
# engine_client,
# model_config,
# base_model_paths,
# request_logger=request_logger,
# chat_template=resolved_chat_template,
# chat_template_content_format=args.chat_template_content_format,
# ) if model_config.runner_type == "pooling" else None
# state.openai_serving_embedding = OpenAIServingEmbedding(
# engine_client,
# model_config,
# base_model_paths,
# request_logger=request_logger,
# chat_template=resolved_chat_template,
# chat_template_content_format=args.chat_template_content_format,
# ) if model_config.task == "embed" else None
# state.openai_serving_scores = OpenAIServingScores(
# engine_client,
# model_config,
# base_model_paths,
# request_logger=request_logger
# ) if model_config.task == "score" else None
# state.openai_serving_tokenization = OpenAIServingTokenization(
# engine_client,
# model_config,
# base_model_paths,
# lora_modules=args.lora_modules,
# request_logger=request_logger,
# chat_template=resolved_chat_template,
# chat_template_content_format=args.chat_template_content_format,
# )
def create_server_socket(addr: Tuple[str, int]) -> socket.socket:
family = socket.AF_INET
if is_valid_ipv6_address(addr[0]):
family = socket.AF_INET6
sock = socket.socket(family=family, type=socket.SOCK_STREAM)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.bind(addr)
return sock
async def run_server(args, **uvicorn_kwargs) -> None:
logger.info("vLLM API server version %s", VLLM_VERSION)
logger.info("args: %s", args)
temp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
temp_socket.bind(("", args.port))
if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
ToolParserManager.import_tool_parser(args.tool_parser_plugin)
valid_tool_parsers = ToolParserManager.tool_parsers.keys()
if args.enable_auto_tool_choice \
and args.tool_call_parser not in valid_tool_parsers:
raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
f"(choose from {{ {','.join(valid_tool_parsers)} }})")
# workaround to make sure that we bind the port before the engine is set up.
# This avoids race conditions with ray.
# see https://github.com/vllm-project/vllm/issues/8204
sock_addr = (args.host or "", args.port)
sock = create_server_socket(sock_addr)
# workaround to avoid footguns where uvicorn drops requests with too
# many concurrent requests active
set_ulimit()
def signal_handler(*_) -> None:
# Interrupt server on sigterm while initializing
@ -544,8 +820,6 @@ async def run_server(args, **uvicorn_kwargs) -> None:
model_config = await engine_client.get_model_config()
init_app_state(engine_client, model_config, app.state, args)
temp_socket.close()
shutdown_task = await serve_http(
app,
host=args.host,
@ -562,13 +836,23 @@ async def run_server(args, **uvicorn_kwargs) -> None:
# NB: Await server shutdown only after the backend context is exited
await shutdown_task
sock.close()
if __name__ == "__main__":
# NOTE(simon):
# This section should be in sync with vllm/scripts.py for CLI entrypoints.
logger.warning("Warning: Please use `ipex_llm.vllm.xpu.entrypoints.openai.api_server` "
"instead of `vllm.entrypoints.openai.api_server` to start the API server")
parser = FlexibleArgumentParser(
description="vLLM OpenAI-Compatible RESTful API server.")
parser = make_arg_parser(parser)
parser.add_argument(
"--load-in-low-bit",
type=str,
default="sym_int4",
help="Low-bit quantization for IPEX-LLM models")
args = parser.parse_args()
validate_parsed_serve_args(args)
uvloop.run(run_server(args))
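As the warning above notes, the server should be started through the IPEX-LLM entrypoint rather than vLLM's. A minimal launch sketch (model path, port, and low-bit format are illustrative):

# Hedged launch sketch using the IPEX-LLM entrypoint named in the warning above.
import subprocess

subprocess.run([
    "python", "-m", "ipex_llm.vllm.xpu.entrypoints.openai.api_server",
    "--model", "/llm/models/Llama-2-7b-chat-hf",   # assumed local model path
    "--served-model-name", "Llama-2-7b-chat-hf",
    "--device", "xpu",
    "--load-in-low-bit", "sym_int4",
    "--port", "8000",
])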

View file

@ -7,11 +7,14 @@ purposes.
import argparse
import json
import ssl
from typing import List, Optional, Sequence, Union
from typing import List, Optional, Sequence, Union, get_args
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
validate_chat_template)
from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
PromptAdapterPath)
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.utils import FlexibleArgumentParser
@ -130,10 +133,23 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help="The file path to the chat template, "
"or the template in single-line form "
"for the specified model")
parser.add_argument(
'--chat-template-content-format',
type=str,
default="auto",
choices=get_args(ChatTemplateContentFormatOption),
help='The format to render message content within a chat template.'
'\n\n'
'* "string" will render the content as a string. '
'Example: "Hello World"\n'
'* "openai" will render the content as a list of dictionaries, '
'similar to OpenAI schema. '
'Example: [{"type": "text", "text": "Hello world!"}]')
parser.add_argument("--response-role",
type=nullable_str,
default="assistant",
help="The role name to return if `request.add_generation_prompt=true`.")
help="The role name to return if "
"`request.add_generation_prompt=true`.")
parser.add_argument("--ssl-keyfile",
type=nullable_str,
default=None,
@ -180,7 +196,11 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
action="store_true",
help="If specified, will run the OpenAI frontend server in the same "
"process as the model serving engine.")
parser.add_argument(
"--enable-request-id-headers",
action="store_true",
help="If specified, API server will add X-Request-Id header to "
"responses. Caution: this hurts performance at high QPS.")
parser.add_argument(
"--enable-auto-tool-choice",
action="store_true",
@ -188,20 +208,24 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help="Enable auto tool choice for supported models. Use --tool-call-parser"
" to specify which parser to use")
valid_tool_parsers = ToolParserManager.tool_parsers.keys()
parser.add_argument(
"--tool-call-parser",
type=str,
choices=["mistral", "hermes"],
metavar="{" + ",".join(valid_tool_parsers) + "} or name registered in "
"--tool-parser-plugin",
default=None,
help="Select the tool call parser depending on the model that you're using."
" This is used to parse the model-generated tool call into OpenAI API "
"format. Required for --enable-auto-tool-choice.")
parser.add_argument(
"--load-in-low-bit",
"--tool-parser-plugin",
type=str,
default="sym_int4",
help="Low-bit quantization for IPEX-LLM models")
default="",
help="Special the tool parser plugin write to parse the model-generated tool"
" into OpenAI API format, the name register in this plugin can be used "
"in --tool-call-parser.")
parser = AsyncEngineArgs.add_cli_args(parser)
@ -218,10 +242,35 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=False,
help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint"
)
parser.add_argument(
"--enable-prompt-tokens-details",
action='store_true',
default=False,
help="If set to True, enable prompt_tokens_details in usage.")
parser.add_argument(
"--load-in-low-bit",
type=str,
default="sym_int4",
help="Low-bit quantization for IPEX-LLM models")
return parser
def validate_parsed_serve_args(args: argparse.Namespace):
"""Quick checks for model serve args that raise prior to loading.""" # noqa
if hasattr(args, "subparser") and args.subparser != "serve":
return
# Ensure that the chat template is valid; raises if it likely isn't
validate_chat_template(args.chat_template)
# Enable auto tool needs a tool call parser to be valid
if args.enable_auto_tool_choice and not args.tool_call_parser:
raise TypeError("Error: --enable-auto-tool-choice requires " # noqa
"--tool-call-parser")
def create_parser_for_docs() -> FlexibleArgumentParser:
parser_for_docs = FlexibleArgumentParser(
prog="-m vllm.entrypoints.openai.api_server")

View file

@ -0,0 +1,23 @@
from vllm.logger import init_logger
from vllm.v1.executor.ray_utils import RayWorkerWrapper
logger = init_logger(__name__)
class IPEXLLMV1Wrapper(RayWorkerWrapper):
def __init__(self, load_in_low_bit="sym_int4", *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
_ipex_llm_convert(load_in_low_bit=load_in_low_bit)
self.compiled_dag_cuda_device_set = False
def get_ipex_llm_v1_wrapper(load_in_low_bit):
# We avoid functools.partial here because ray does not appear to
# work well with it.
class WrapperWithLoadBit(IPEXLLMV1Wrapper):
def __init__(self, *args, **kwargs) -> None:
super().__init__(load_in_low_bit=load_in_low_bit, *args, **kwargs)
return WrapperWithLoadBit
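The factory above captures load_in_low_bit in a subclass rather than in functools.partial, which (per the comment) does not play well with ray's worker construction. The same pattern in isolation, with a plain base class so it runs without ray:

# Standalone sketch of the "subclass captures the argument" pattern used by
# get_ipex_llm_v1_wrapper. Names here are illustrative.
class _BaseWorker:
    def __init__(self, *args, load_in_low_bit="sym_int4", **kwargs):
        self.load_in_low_bit = load_in_low_bit

def make_wrapper(load_in_low_bit):
    class WrapperWithLoadBit(_BaseWorker):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, load_in_low_bit=load_in_low_bit, **kwargs)
    return WrapperWithLoadBit

Wrapper = make_wrapper("fp8")
assert Wrapper().load_in_low_bit == "fp8"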

View file

@ -65,9 +65,14 @@ def _model_sample_convert():
def _ipex_llm_convert(load_in_low_bit):
from vllm.worker.xpu_model_runner import XPUModelRunner
from ipex_llm.vllm.xpu.ipex_llm_wrapper import get_ipex_llm_wrapper
import vllm.executor.ray_utils as ray_utils
from ipex_llm.vllm.xpu.ipex_llm_v1_wrapper import get_ipex_llm_v1_wrapper
import vllm.executor.ray_utils as ray_utils_v0
import vllm.v1.executor.ray_utils as ray_utils_v1
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
setattr(XPUModelRunner, "load_model", get_load_function(load_in_low_bit))
setattr(ray_utils, "RayWorkerWrapper", get_ipex_llm_wrapper(load_in_low_bit))
setattr(GPUModelRunner, "load_model", get_load_function(load_in_low_bit))
setattr(ray_utils_v0, "RayWorkerWrapper", get_ipex_llm_wrapper(load_in_low_bit))
setattr(ray_utils_v1, "RayWorkerWrapper", get_ipex_llm_v1_wrapper(load_in_low_bit))
def get_load_function(low_bit):
@ -77,19 +82,16 @@ def get_load_function(low_bit):
# from vllm.utils import measure_device_memory
from vllm.utils import DeviceMemoryProfiler
with DeviceMemoryProfiler() as m:
from dataclasses import replace
new_device_config = DeviceConfig("cpu")
new_vllm_config = replace(self.vllm_config, device_config=new_device_config)
self.model = get_model(
model_config=self.model_config,
device_config=DeviceConfig("cpu"),
load_config=self.load_config,
lora_config=self.lora_config,
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config,
cache_config=self.cache_config,
vllm_config=new_vllm_config
)
if "qwen" in self.model_config.model.lower() or \
"baichuan" in self.model_config.model.lower() or \
"codegeex4-all" in self.model_config.model.lower() or \
"chatglm" in self.model_config.model.lower():
if "qwen" in self.vllm_config.model_config.model.lower() or \
"baichuan" in self.vllm_config.model_config.model.lower() or \
"codegeex4-all" in self.vllm_config.model_config.model.lower() or \
"chatglm" in self.vllm_config.model_config.model.lower():
self.model.apply(padding_mlp)
from ipex_llm import optimize_model
import os
@ -99,18 +101,22 @@ def get_load_function(low_bit):
modules = ["35.mlp", "36.mlp", "37.mlp", "38.mlp", "39.mlp"]
else:
modules = None
if "minicpm" in self.model_config.model.lower():
if "minicpm" in self.vllm_config.model_config.model.lower():
modules = ["vpm", "resampler"]
# only for minicpm_2_6
if "minicpm-v" in self.model_config.model.lower():
if "minicpm-v" in self.vllm_config.model_config.model.lower():
from ipex_llm.transformers.models.minicpmv import merge_qkv
self.model.vpm.apply(merge_qkv)
if "internvl2" in self.model_config.model.lower():
if "internvl2" in self.vllm_config.model_config.model.lower():
modules = ["vision_model", "mlp1"]
optimize_model(self.model, low_bit=low_bit, torch_dtype=self.model_config.dtype,
if "deepseek-v2" in self.vllm_config.model_config.model.lower():
modules = ["down_proj"]
optimize_model(self.model,
low_bit=low_bit,
torch_dtype=self.vllm_config.model_config.dtype,
modules_to_not_convert=modules)
self.model = self.model.to(device=self.device_config.device,
dtype=self.model_config.dtype)
self.model = self.model.to(device=self.vllm_config.device_config.device,
dtype=self.vllm_config.model_config.dtype)
self.model_memory_usage = m.consumed_memory
logger = init_logger(__name__)
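The load hook above ultimately delegates to ipex_llm.optimize_model. A hedged standalone sketch of the same conversion outside vLLM (assumes ipex-llm[xpu] and an XPU-enabled PyTorch stack; the model path is illustrative):

# Hedged sketch: apply the same low-bit conversion directly to a HF model,
# mirroring the patched load path (convert on CPU, then move to the XPU).
import torch
from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model

model = AutoModelForCausalLM.from_pretrained(
    "/llm/models/Qwen2-7B-Instruct",          # assumed local model path
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model = optimize_model(model, low_bit="sym_int4",
                       modules_to_not_convert=None)
model = model.to("xpu")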