From af693425f124a34d2bc12e2407f82d51a37c72d0 Mon Sep 17 00:00:00 2001 From: Guancheng Fu <110874468+gc-fu@users.noreply.github.com> Date: Wed, 12 Feb 2025 16:47:51 +0800 Subject: [PATCH] Upgrade to vLLM 0.6.6 (#12796) * init * update engine init * fix serving load_in_low_bit problem * temp * temp * temp * temp * temp * fix * fixed * done * fix * fix all arguments * fix * fix throughput script * fix * fix * use official ipex-llm * Fix readme * fix --------- Co-authored-by: hzjane --- docker/llm/serving/xpu/docker/Dockerfile | 154 +++--- docker/llm/serving/xpu/docker/README.md | 82 +-- .../xpu/docker/benchmark_vllm_latency.py | 223 +------- .../xpu/docker/benchmark_vllm_throughput.py | 323 ++++++++---- docker/llm/serving/xpu/docker/ccl_torch.patch | 47 ++ .../xpu/docker/gradio_web_server.patch | 209 -------- .../xpu/docker/start-fastchat-service.sh | 125 ----- .../start-lightweight_serving-service.sh | 4 - .../llm/src/ipex_llm/transformers/convert.py | 1 - .../src/ipex_llm/vllm/xpu/engine/engine.py | 137 ++++- .../vllm/xpu/entrypoints/openai/api_server.py | 476 ++++++++++++++---- .../vllm/xpu/entrypoints/openai/cli_args.py | 65 ++- .../ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py | 23 + .../src/ipex_llm/vllm/xpu/model_convert.py | 44 +- 14 files changed, 1003 insertions(+), 910 deletions(-) create mode 100644 docker/llm/serving/xpu/docker/ccl_torch.patch delete mode 100644 docker/llm/serving/xpu/docker/gradio_web_server.patch delete mode 100644 docker/llm/serving/xpu/docker/start-fastchat-service.sh delete mode 100644 docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh create mode 100644 python/llm/src/ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile index 8e56921d..6b517eda 100644 --- a/docker/llm/serving/xpu/docker/Dockerfile +++ b/docker/llm/serving/xpu/docker/Dockerfile @@ -1,27 +1,17 @@ -FROM intel/oneapi-basekit:2024.1.1-devel-ubuntu22.04 +# First stage: build oneccl +FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04 AS build ARG http_proxy ARG https_proxy ENV TZ=Asia/Shanghai ENV PYTHONUNBUFFERED=1 -# To prevent RPC_TIMEOUT ERROR for the first request -ENV VLLM_RPC_TIMEOUT=100000 - -# Disable pip's cache behavior ARG PIP_NO_CACHE_DIR=false -ADD ./gradio_web_server.patch /tmp/gradio_web_server.patch -ADD ./oneccl-binding.patch /tmp/oneccl-binding.patch -RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ - echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ - chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ - rm /etc/apt/sources.list.d/intel-graphics.list && \ - wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \ - echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ - chmod 644 /usr/share/keyrings/intel-graphics.gpg && \ - apt-get update && \ +ADD ./ccl_torch.patch /tmp/ + +RUN apt-get update && \ apt-get install -y --no-install-recommends curl wget git libunwind8-dev vim less && \ ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \ env 
DEBIAN_FRONTEND=noninteractive apt-get update && \ @@ -39,77 +29,113 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO python3 get-pip.py && \ rm get-pip.py && \ pip install --upgrade requests argparse urllib3 && \ - pip install --pre --upgrade ipex-llm[xpu,serving] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \ + apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev numactl && \ + # If we do not install this compute-runtime, we will fail the build later + mkdir -p /tmp/neo && \ + cd /tmp/neo && \ + wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-core-2_2.5.6+18417_amd64.deb && \ + wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-opencl-2_2.5.6+18417_amd64.deb && \ + wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu-dbgsym_1.6.32224.5_amd64.ddeb && \ + wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu_1.6.32224.5_amd64.deb && \ + wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd-dbgsym_24.52.32224.5_amd64.ddeb && \ + wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd_24.52.32224.5_amd64.deb && \ + wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/libigdgmm12_22.5.5_amd64.deb && \ + dpkg -i *.deb && \ + pip install --pre --upgrade ipex-llm[xpu_2.6] --extra-index-url https://download.pytorch.org/whl/test/xpu && \ + mkdir /build && \ + cd /build && \ + git clone https://github.com/intel/torch-ccl.git && \ + cd torch-ccl && \ + git checkout ccl_torch2.5.0+xpu && \ + git submodule sync && \ + git submodule update --init --recursive && \ + # This patch will enable build torch-ccl with pytorch 2.6 environment + git apply /tmp/ccl_torch.patch && \ + USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py bdist_wheel + # File path: /build/torch-ccl/dist/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl + +FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04 + +COPY --from=build /build/torch-ccl/dist/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl /opt/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl + +ARG http_proxy +ARG https_proxy + +ENV TZ=Asia/Shanghai +ENV PYTHONUNBUFFERED=1 +# To prevent RPC_TIMEOUT ERROR for the first request +ENV VLLM_RPC_TIMEOUT=100000 + + +# Disable pip's cache behavior +ARG PIP_NO_CACHE_DIR=false + +RUN apt-get update && \ + apt-get install -y --no-install-recommends curl wget git libunwind8-dev vim less && \ + ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \ + env DEBIAN_FRONTEND=noninteractive apt-get update && \ + apt-get install -y --no-install-recommends gnupg gpg-agent software-properties-common kmod && \ + # Add Python 3.11 PPA repository + add-apt-repository ppa:deadsnakes/ppa -y && \ + apt-get install -y --no-install-recommends python3.11 git curl wget && \ + rm /usr/bin/python3 && \ + ln -s /usr/bin/python3.11 /usr/bin/python3 && \ + ln -s /usr/bin/python3 /usr/bin/python && \ + apt-get install -y --no-install-recommends python3-pip python3.11-dev python3-wheel python3.11-distutils && \ + wget https://bootstrap.pypa.io/get-pip.py -O get-pip.py && \ + python3 get-pip.py && \ + rm get-pip.py && \ + pip install --upgrade requests argparse urllib3 && \ + pip install --pre --upgrade ipex-llm[xpu_2.6] --extra-index-url 
https://download.pytorch.org/whl/test/xpu && \ pip install transformers_stream_generator einops tiktoken && \ pip install --upgrade colorama && \ - # Download all-in-one benchmark and examples - git clone https://github.com/intel-analytics/ipex-llm && \ - # The following comment segment is used when building from source... - # cd ipex-llm && \ - # git fetch origin pull/12338/head:local_pr && \ - # git checkout local_pr && \ - # pip uninstall -y ipex-llm && \ - # cd python/llm && \ - # python setup.py install && \ - # cd ../../../ && \ + git clone https://github.com/intel/ipex-llm.git && \ cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \ cp -r ./ipex-llm/python/llm/example/GPU/HuggingFace/LLM ./examples && \ + cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \ + rm -rf ./ipex-llm && \ # Install vllm dependencies pip install --upgrade fastapi && \ pip install --upgrade "uvicorn[standard]" && \ - # Download vLLM-Serving - cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \ - rm -rf ./ipex-llm && \ # Install torch-ccl - cd /tmp/ && \ - pip install torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 intel-extension-for-pytorch==2.1.30.post0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \ - # Internal oneccl - wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.6.5-release/oneccl_wks_installer_2024.0.0.6.5.sh && \ - bash oneccl_wks_installer_2024.0.0.6.5.sh && \ - git clone https://github.com/intel/torch-ccl -b v2.1.300+xpu && \ - cd torch-ccl && \ - patch -p1 < /tmp/oneccl-binding.patch && \ - USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py install && \ + pip install /opt/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl && \ + # install Internal oneccl + cd /opt && \ + wget https://sourceforge.net/projects/oneccl-wks/files/2025.0.0.6.6-release/oneccl_wks_installer_2025.0.0.6.6.sh && \ + bash oneccl_wks_installer_2025.0.0.6.6.sh && \ apt-get update && \ apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev numactl && \ - # apt-get install -y intel-opencl-icd intel-level-zero-gpu=1.3.26241.33-647~22.04 level-zero level-zero-dev --allow-downgrades && \ + # Install compute runtime mkdir -p /tmp/neo && \ cd /tmp/neo && \ - wget https://github.com/oneapi-src/level-zero/releases/download/v1.18.5/level-zero_1.18.5+u22.04_amd64.deb && \ - wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17791.9/intel-igc-core_1.0.17791.9_amd64.deb && \ - wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17791.9/intel-igc-opencl_1.0.17791.9_amd64.deb && \ - wget https://github.com/intel/compute-runtime/releases/download/24.39.31294.12/intel-level-zero-gpu_1.6.31294.12_amd64.deb && \ - wget https://github.com/intel/compute-runtime/releases/download/24.39.31294.12/intel-opencl-icd_24.39.31294.12_amd64.deb && \ - wget https://github.com/intel/compute-runtime/releases/download/24.39.31294.12/libigdgmm12_22.5.2_amd64.deb && \ + wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-core-2_2.5.6+18417_amd64.deb && \ + wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.5.6/intel-igc-opencl-2_2.5.6+18417_amd64.deb && \ + wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu-dbgsym_1.6.32224.5_amd64.ddeb && \ + wget 
https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-level-zero-gpu_1.6.32224.5_amd64.deb && \ + wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd-dbgsym_24.52.32224.5_amd64.ddeb && \ + wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/intel-opencl-icd_24.52.32224.5_amd64.deb && \ + wget https://github.com/intel/compute-runtime/releases/download/24.52.32224.5/libigdgmm12_22.5.5_amd64.deb && \ dpkg -i *.deb && \ - rm -rf /tmp/neo && \ mkdir -p /llm && \ cd /llm && \ - git clone -b 0.6.2 https://github.com/analytics-zoo/vllm.git /llm/vllm && \ + rm -rf /tmp/neo && \ + # Install vllm + git clone -b 0.6.6-pre https://github.com/analytics-zoo/vllm.git /llm/vllm && \ cd /llm/vllm && \ pip install setuptools-scm && \ pip install --upgrade cmake && \ VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -v /llm/vllm && \ - # pip install -r /llm/vllm/requirements-xpu.txt && \ - # VLLM_TARGET_DEVICE=xpu python setup.py install && \ pip install mpi4py fastapi uvicorn openai && \ pip install gradio==4.43.0 && \ - # pip install transformers==4.44.2 && \ - # patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch && \ - pip install ray && \ - patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch + pip install ray -COPY ./vllm_online_benchmark.py /llm/ -COPY ./vllm_offline_inference.py /llm/ +COPY ./vllm_online_benchmark.py /llm/ +COPY ./vllm_offline_inference.py /llm/ COPY ./vllm_offline_inference_vision_language.py /llm/ -COPY ./payload-1024.lua /llm/ -COPY ./start-vllm-service.sh /llm/ -COPY ./benchmark_vllm_throughput.py /llm/ -COPY ./benchmark_vllm_latency.py /llm/ -COPY ./start-fastchat-service.sh /llm/ -COPY ./start-pp_serving-service.sh /llm/ -COPY ./start-lightweight_serving-service.sh /llm/ - -ENV LD_LIBRARY_PATH /usr/local/lib/python3.11/dist-packages/intel_extension_for_pytorch/lib/:/opt/intel/oneapi/tbb/2021.12/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/2021.12/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.12/lib:/opt/intel/oneapi/mkl/2024.1/lib:/opt/intel/oneapi/ippcp/2021.11/lib/:/opt/intel/oneapi/ipp/2021.11/lib:/opt/intel/oneapi/dpl/2022.5/lib:/opt/intel/oneapi/dnnl/2024.1/lib:/opt/intel/oneapi/debugger/2024.1/opt/debugger/lib:/opt/intel/oneapi/dal/2024.2/lib:/opt/intel/oneapi/compiler/2024.1/opt/oclfpga/host/linux64/lib:/opt/intel/oneapi/compiler/2024.1/opt/compiler/lib:/opt/intel/oneapi/compiler/2024.1/lib:/opt/intel/oneapi/ccl/2021.12/lib/ +COPY ./payload-1024.lua /llm/ +COPY ./start-vllm-service.sh /llm/ +COPY ./benchmark_vllm_throughput.py /llm/ +COPY ./benchmark_vllm_latency.py /llm/ +COPY ./start-pp_serving-service.sh /llm/ WORKDIR /llm/ diff --git a/docker/llm/serving/xpu/docker/README.md b/docker/llm/serving/xpu/docker/README.md index 64ef9d3f..ba71fb4a 100644 --- a/docker/llm/serving/xpu/docker/README.md +++ b/docker/llm/serving/xpu/docker/README.md @@ -18,7 +18,7 @@ To map the `xpu` into the container, you need to specify `--device=/dev/dri` whe An example could be: ```bash #/bin/bash -export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:2.2.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:latest sudo docker run -itd \ --net=host \ @@ -59,86 +59,6 @@ To run Pipeline parallel serving using `IPEX-LLM` as backend, you can refer to t For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image. 
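A quick way to try it, shown as a hedged sketch rather than an official command: exec into the running container and launch the bundled script. The container name `demo-container` and invoking the script directly with `bash` are assumptions here; use whatever `--name` you passed to `docker run`, and check the script itself for any environment variables it expects.

```bash
# Sketch only: enter the running container and start pipeline-parallel serving.
# "demo-container" is an assumed name; replace it with your own --name value.
sudo docker exec -it demo-container bash -c "cd /llm && bash start-pp_serving-service.sh"
```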
-#### FastChat serving engine - -To set up model serving using `IPEX-LLM` as backend using FastChat, you can refer to this [quickstart](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/fastchat_quickstart.html#) or follow these quick steps to deploy a demo. - -##### Quick Setup for FastChat with IPEX-LLM - -1. **Start the Docker Container** - - Run the following command to launch a Docker container with device access: - - ```bash - #/bin/bash - export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:latest - - sudo docker run -itd \ - --net=host \ - --device=/dev/dri \ - --name=demo-container \ - # Example: map host model directory to container - -v /LLM_MODELS/:/llm/models/ \ - --shm-size="16g" \ - # Optional: set proxy if needed - -e http_proxy=... \ - -e https_proxy=... \ - -e no_proxy="127.0.0.1,localhost" \ - $DOCKER_IMAGE - ``` - -2. **Start the FastChat Service** - - Enter the container and start the FastChat service: - ```bash - #/bin/bash - - # This command assumes that you have mapped the host model directory to the container - # and the model directory is /llm/models/ - # we take Yi-1.5-34B as an example, and you can replace it with your own model - - ps -ef | grep "fastchat" | awk '{print $2}' | xargs kill -9 - pip install -U gradio==4.43.0 - - # start controller - python -m fastchat.serve.controller & - - export USE_XETLA=OFF - export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2 - - export TORCH_LLM_ALLREDUCE=0 - export CCL_DG2_ALLREDUCE=1 - # CCL needed environment variables - export CCL_WORKER_COUNT=4 - # pin ccl worker to cores - # export CCL_WORKER_AFFINITY=32,33,34,35 - export FI_PROVIDER=shm - export CCL_ATL_TRANSPORT=ofi - export CCL_ZE_IPC_EXCHANGE=sockets - export CCL_ATL_SHM=1 - - source /opt/intel/1ccl-wks/setvars.sh - - python -m ipex_llm.serving.fastchat.vllm_worker \ - --model-path /llm/models/Yi-1.5-34B \ - --device xpu \ - --enforce-eager \ - --disable-async-output-proc \ - --distributed-executor-backend ray \ - --dtype float16 \ - --load-in-low-bit fp8 \ - --tensor-parallel-size 4 \ - --gpu-memory-utilization 0.9 \ - --max-model-len 4096 \ - --max-num-batched-tokens 8000 & - - sleep 120 - - python -m fastchat.serve.gradio_web_server & - ``` - -This quick setup allows you to deploy FastChat with IPEX-LLM efficiently. - #### vLLM serving engine To run vLLM engine using `IPEX-LLM` as backend, you can refer to this [document](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/DockerGuides/vllm_docker_quickstart.md). 
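Once that service is up, a plain `curl` request against the OpenAI-compatible completions endpoint is an easy smoke test. The sketch below assumes the server started by `/llm/start-vllm-service.sh` listens on port 8000 and that you substitute the model name you actually serve; both are assumptions to adjust, not defaults guaranteed by the image.

```bash
# Sketch of a smoke test against the vLLM OpenAI-compatible API.
# Port 8000 and the served model name are assumptions; match them to
# the settings in your start-vllm-service.sh.
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "YOUR_SERVED_MODEL_NAME",
        "prompt": "San Francisco is a",
        "max_tokens": 64,
        "temperature": 0.0
      }'
```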
diff --git a/docker/llm/serving/xpu/docker/benchmark_vllm_latency.py b/docker/llm/serving/xpu/docker/benchmark_vllm_latency.py index 3a80fcbb..05bc4dc2 100644 --- a/docker/llm/serving/xpu/docker/benchmark_vllm_latency.py +++ b/docker/llm/serving/xpu/docker/benchmark_vllm_latency.py @@ -1,5 +1,6 @@ """Benchmark the latency of processing a single batch of requests.""" import argparse +import dataclasses import json import time from pathlib import Path @@ -9,55 +10,26 @@ import numpy as np import torch from tqdm import tqdm -from vllm import LLM, SamplingParams +from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs -from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM -from vllm.inputs import PromptInputs -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.inputs import PromptType from vllm.utils import FlexibleArgumentParser +from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM def main(args: argparse.Namespace): print(args) + engine_args = EngineArgs.from_cli_args(args) + # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. - llm = LLM( - model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - speculative_draft_tensor_parallel_size=\ - args.speculative_draft_tensor_parallel_size, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - max_model_len=args.max_model_len, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - use_v2_block_manager=args.use_v2_block_manager, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend, - otlp_traces_endpoint=args.otlp_traces_endpoint, - enable_prefix_caching=args.enable_prefix_caching, - load_in_low_bit=args.load_in_low_bit, - max_num_batched_tokens=args.max_num_batched_tokens, - max_num_seqs=args.max_num_seqs, - ) + llm = LLM(**dataclasses.asdict(engine_args), load_in_low_bit = args.load_in_low_bit) sampling_params = SamplingParams( n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, + temperature=1.0, top_p=1.0, - use_beam_search=args.use_beam_search, ignore_eos=True, max_tokens=args.output_len, ) @@ -65,37 +37,26 @@ def main(args: argparse.Namespace): dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_inputs: List[PromptInputs] = [{ + dummy_prompts: List[PromptType] = [{ "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: - if args.device == "xpu": - with torch.autograd.profiler_legacy.profile(enabled=True, use_xpu=True) as p: - llm.generate(dummy_inputs, - sampling_params=sampling_params, - use_tqdm=False) - print("Sort by CPU time total...") - print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1)) - print("Sort by XPU time total...") - print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1)) - else: - with torch.profiler.profile( - activities=[ - 
torch.profiler.ProfilerActivity.CPU, - # torch.profiler.ProfilerActivity.XPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir))) as p: - llm.generate(dummy_inputs, - sampling_params=sampling_params, - use_tqdm=False) - print(p.key_averages()) + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_dir))) as p: + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + print(p.key_averages()) else: start_time = time.perf_counter() - llm.generate(dummy_inputs, + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) end_time = time.perf_counter() @@ -142,19 +103,6 @@ if __name__ == '__main__': parser = FlexibleArgumentParser( description='Benchmark the latency of processing a single batch of ' 'requests till completion.') - parser.add_argument('--model', type=str, default='facebook/opt-125m') - parser.add_argument('--speculative-model', type=str, default=None) - parser.add_argument('--num-speculative-tokens', type=int, default=None) - parser.add_argument('--speculative-draft-tensor-parallel-size', - '-spec-draft-tp', - type=int, - default=None) - parser.add_argument('--tokenizer', type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) parser.add_argument('--input-len', type=int, default=32) parser.add_argument('--output-len', type=int, default=128) parser.add_argument('--batch-size', type=int, default=8) @@ -171,45 +119,12 @@ if __name__ == '__main__': type=int, default=30, help='Number of iterations to run.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', + "--load-in-low-bit", type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--enforce-eager', - action='store_true', - help='enforce eager mode and disable CUDA graph') - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. 
On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') + choices=["sym_int4", "fp8", "fp8_e4m3", "fp16", "fp6"], + default="sym_int4", + help="Low-bit format quantization with IPEX-LLM") parser.add_argument( '--profile', action='store_true', @@ -220,96 +135,12 @@ if __name__ == '__main__': default=None, help=('path to save the pytorch profiler output. Can be visualized ' 'with ui.perfetto.dev or Tensorboard.')) - parser.add_argument( - "--device", - type=str, - default="auto", - choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA, OpenVINO and ' - 'CPU.') - parser.add_argument('--block-size', - type=int, - default=16, - help='block size of key/value cache') - parser.add_argument( - '--enable-chunked-prefill', - action='store_true', - help='If True, the prefill requests can be chunked based on the ' - 'max_num_batched_tokens') - parser.add_argument("--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching") - parser.add_argument('--use-v2-block-manager', action='store_true') - parser.add_argument( - "--ray-workers-use-nsight", - action='store_true', - help="If specified, use nsight to profile ray workers", - ) - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the latency results in JSON format.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. 
When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--otlp-traces-endpoint', - type=str, - default=None, - help='Target URL to which OpenTelemetry traces will be sent.') - parser.add_argument( - "--load-in-low-bit", - type=str, - choices=["sym_int4", "fp8", "fp8_e4m3", "fp16", "fp6"], - default="sym_int4", - help="Low-bit format quantization with IPEX-LLM") - parser.add_argument('--max-num-batched-tokens', - type=int, - default=4096, - help='maximum number of batched tokens per iteration') - parser.add_argument('--max-num-seqs', - type=int, - default=256, - help='Maximum number of sequences per iteration.') + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() - main(args) + main(args) \ No newline at end of file diff --git a/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py b/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py index fe035865..e112ec9c 100644 --- a/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py +++ b/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py @@ -4,27 +4,96 @@ import dataclasses import json import random import time -from typing import List, Optional, Tuple +from functools import cache +from typing import Dict, List, Optional, Tuple import torch import uvloop +from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.entrypoints.openai.api_server import ( +from ipex_llm.vllm.xpu.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) -# from vllm.sampling_params import BeamSearchParams +from vllm.inputs import TextPrompt +from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path +from vllm.multimodal import MultiModalDataDict +from vllm.sampling_params import BeamSearchParams +from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer from vllm.utils import FlexibleArgumentParser, merge_async_iterators -def sample_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + lora_request: Optional LoRARequest specifying the LoRA to use. + """ + prompt: str + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[MultiModalDataDict] = None + lora_request: Optional[LoRARequest] = None + + +def _get_prompt_for_image_model(question: str, *, model: str) -> str: + """Prepend and append special tokens around the question to form a prompt. 
+ + Args: + question: The input question text to wrap with special tokens + model: The name of the model being used, to determine which special + tokens to add + + Returns: + The formatted prompt string with appropriate special tokens for the + model + + Raises: + ValueError: If an unsupported model name is provided + """ + model = model.lower() + if "pixtral" in model: + return f"[INST]{question}\n[IMG][/INST]" + raise ValueError(f"Unsupported model {model}") + + +@cache +def lora_path_on_disk(lora_path: str) -> str: + return get_adapter_absolute_path(lora_path) + + +lora_tokenizer_cache: Dict[int, AnyTokenizer] = {} + + +def get_random_lora_request( + args: argparse.Namespace +) -> Tuple[LoRARequest, Optional[AnyTokenizer]]: + global lora_tokenizer_cache + lora_id = random.randint(1, args.max_loras) + lora_request = LoRARequest(lora_name=str(lora_id), + lora_int_id=lora_id, + lora_path=lora_path_on_disk(args.lora_path)) + if lora_id not in lora_tokenizer_cache: + lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) + return lora_request, lora_tokenizer_cache[lora_id] + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + + dataset_path: str = args.dataset + num_requests: int = args.num_prompts + fixed_output_len: Optional[int] = args.output_len + model: str = args.model if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -33,24 +102,46 @@ def sample_requests( dataset = json.load(f) # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - # Shuffle the dataset. random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] - for i in range(len(dataset)): + filtered_dataset: List[SampleRequest] = [] + for data in tqdm(dataset, + total=len(filtered_dataset), + desc="sampling requests"): if len(filtered_dataset) == num_requests: break + # Only keep the first two turns of each conversation. + prompt = data["conversations"][0]["value"] + completion = data["conversations"][1]["value"] + + multi_modal_data: Optional[MultiModalDataDict] = None + if "image" in data: + multi_modal_data = multi_modal_data or {} + image_path = data["image"] + # TODO(vllm-project/vllm/issues/9778): Support multiple images. + assert isinstance(image_path, + str), "Only support single image input" + try: + multi_modal_data["image"] = Image.open(image_path).convert( + "RGB") + except FileNotFoundError: + # Ignore datapoint where asset is missing + continue + prompt = _get_prompt_for_image_model(question=prompt, model=model) + + request_tokenizer = tokenizer + lora_request: Optional[LoRARequest] = None + if args.enable_lora: + lora_request, lora_tokenizer = get_random_lora_request(args) + if lora_tokenizer: + request_tokenizer = lora_tokenizer + # Tokenize the prompts and completions. 
- prompt = dataset[i][0] - prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] - completion_token_ids = tokenizer(completion).input_ids + prompt_token_ids = request_tokenizer(prompt).input_ids + completion_token_ids = request_tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) output_len = len(completion_token_ids ) if fixed_output_len is None else fixed_output_len @@ -60,104 +151,110 @@ def sample_requests( if prompt_len > 1024 or prompt_len + output_len > 2048: # Prune too long sequences. continue - filtered_dataset.append((prompt, prompt_len, output_len)) + filtered_dataset.append( + SampleRequest(prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=multi_modal_data, + lora_request=lora_request)) return filtered_dataset def run_vllm( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], n: int, - low_bit: str, engine_args: EngineArgs, ) -> float: from vllm import SamplingParams from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM - llm = LLM(**dataclasses.asdict(engine_args), load_in_low_bit=low_bit) + llm = LLM(**dataclasses.asdict(engine_args), load_in_low_bit = args.load_in_low_bit) # Add the requests to the engine. - warm_prompt = "hi " * (1024 - 1) - warm_requests = [(warm_prompt, 1024, 1024) - for _ in range(8)] - - prompts: List[str] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in warm_requests: - prompts.append(prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=0.0, - top_p=1.0, - ignore_eos=True, - max_tokens=output_len, - )) - llm.generate(prompts, sampling_params, use_tqdm=True) - - # Add the requests to the engine. - prompts: List[str] = [] - sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + for request in requests: + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) + lora_requests: Optional[List[LoRARequest]] = None + if engine_args.enable_lora: + lora_requests = [request.lora_request for request in requests] use_beam_search = False if not use_beam_search: start = time.perf_counter() - llm.generate(prompts, sampling_params, use_tqdm=True) + llm.generate(prompts, + sampling_params, + lora_request=lora_requests, + use_tqdm=True) end = time.perf_counter() else: - prompts = [prompt for prompt, _, _ in requests] + assert lora_requests is None, "BeamSearch API does not support LoRA" + prompts = [request.prompt for request in requests] # output_len should be the same for all requests. 
output_len = requests[0][2] - for prompt, input_len, _output_len in requests: - assert _output_len == output_len + for request in requests: + assert request.expected_output_len == output_len start = time.perf_counter() - llm.beam_search(prompts, + llm.beam_search( + prompts, + BeamSearchParams( beam_width=n, max_tokens=output_len, - ignore_eos=True) + ignore_eos=True, + )) end = time.perf_counter() return end - start async def run_vllm_async( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], n: int, engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, + load_in_low_bit: str = "sym_int4", ) -> float: from vllm import SamplingParams async with build_async_engine_client_from_engine_args( - engine_args, disable_frontend_multiprocessing) as llm: + engine_args, disable_frontend_multiprocessing, load_in_low_bit=load_in_low_bit) as llm: # Add the requests to the engine. - prompts: List[str] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + lora_requests: List[Optional[LoRARequest]] = [] + for request in requests: + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) + lora_requests.append(request.lora_request) generators = [] start = time.perf_counter() - for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): - generator = llm.generate(prompt, sp, request_id=f"test{i}") + for i, (prompt, sp, + lr) in enumerate(zip(prompts, sampling_params, lora_requests)): + generator = llm.generate(prompt, + sp, + lora_request=lr, + request_id=f"test{i}") generators.append(generator) all_gens = merge_async_iterators(*generators) async for i, res in all_gens: @@ -167,7 +264,7 @@ async def run_vllm_async( def run_hf( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], model: str, tokenizer: PreTrainedTokenizerBase, n: int, @@ -225,14 +322,14 @@ def run_hf( def run_mii( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], model: str, tensor_parallel_size: int, output_len: int, ) -> float: from mii import client, serve llm = serve(model, tensor_parallel=tensor_parallel_size) - prompts = [prompt for prompt, _, _ in requests] + prompts = [request.prompt for request in requests] start = time.perf_counter() llm.generate(prompts, max_new_tokens=output_len) @@ -250,23 +347,50 @@ def main(args: argparse.Namespace): tokenizer = AutoTokenizer.from_pretrained( args.tokenizer, trust_remote_code=args.trust_remote_code) if args.dataset is None: - # Synthesize a prompt with the given input length. - # As tokenizer may add additional tokens like BOS, we need to try - # different lengths to get the desired input length. 
- for i in range(-10, 10): - prompt = "hi " * (args.input_len + i) - tokenized_prompt = tokenizer(prompt).input_ids - if len(tokenized_prompt) == args.input_len: - break - else: - raise ValueError( - f"Failed to synthesize a prompt with {args.input_len} tokens.") - requests = [(prompt, args.input_len, args.output_len) - for _ in range(args.num_prompts)] - else: - requests = sample_requests(args.dataset, args.num_prompts, tokenizer, - args.output_len) + vocab_size = tokenizer.vocab_size + requests = [] + for _ in range(args.num_prompts): + request_tokenizer = tokenizer + lora_request: Optional[LoRARequest] = None + if args.enable_lora: + lora_request, lora_tokenizer = get_random_lora_request(args) + if lora_tokenizer: + request_tokenizer = lora_tokenizer + + # Synthesize a prompt with the given input length. + candidate_ids = [ + random.randint(0, vocab_size - 1) + for _ in range(args.input_len) + ] + # As tokenizer may add additional tokens like BOS, we need to try + # different lengths to get the desired input length. + for _ in range(5): # Max attempts to correct + candidate_prompt = request_tokenizer.decode(candidate_ids) + tokenized_len = len(request_tokenizer.encode(candidate_prompt)) + + if tokenized_len == args.input_len: + break + + # Adjust length based on difference + diff = args.input_len - tokenized_len + if diff > 0: + candidate_ids.extend([ + random.randint(100, vocab_size - 100) + for _ in range(diff) + ]) + else: + candidate_ids = candidate_ids[:diff] + requests.append( + SampleRequest(prompt=candidate_prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len, + lora_request=lora_request)) + else: + requests = sample_requests(tokenizer, args) + + is_multi_modal = any(request.multi_modal_data is not None + for request in requests) if args.backend == "vllm": if args.async_engine: elapsed_time = uvloop.run( @@ -275,9 +399,10 @@ def main(args: argparse.Namespace): args.n, AsyncEngineArgs.from_cli_args(args), args.disable_frontend_multiprocessing, + args.load_in_low_bit, )) else: - elapsed_time = run_vllm(requests, args.n, args.load_in_low_bit, + elapsed_time = run_vllm(requests, args.n, EngineArgs.from_cli_args(args)) elif args.backend == "hf": assert args.tensor_parallel_size == 1 @@ -288,9 +413,15 @@ def main(args: argparse.Namespace): args.output_len) else: raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum(prompt_len + output_len - for _, prompt_len, output_len in requests) - total_output_tokens = sum(output_len for _, _, output_len in requests) + total_num_tokens = sum(request.prompt_len + request.expected_output_len + for request in requests) + total_output_tokens = sum(request.expected_output_len + for request in requests) + if is_multi_modal: + print("\033[91mWARNING\033[0m: Multi-modal request detected. The " + "following metrics are not accurate because image tokens are not" + " counted. See vllm-project/vllm/issues/9778 for details.") + # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s") @@ -317,7 +448,9 @@ if __name__ == "__main__": parser.add_argument("--dataset", type=str, default=None, - help="Path to the dataset.") + help="Path to the dataset. 
The dataset is expected to " + "be a json in form of List[Dict[..., conversations: " + "List[Dict[..., value: ]]]]") parser.add_argument("--input-len", type=int, default=None, @@ -352,12 +485,21 @@ if __name__ == "__main__": action='store_true', default=False, help="Disable decoupled async engine frontend.") + # IPEX-LLM optimization parser.add_argument( "--load-in-low-bit", type=str, - choices=["sym_int4", "fp8", "fp8_e4m3", "fp16", "fp6"], + choices=["sym_int4", "woq_int4", "fp8", "fp8_e4m3", "fp16", "fp6"], default="sym_int4", help="Low-bit format quantization with IPEX-LLM") + # LoRA + parser.add_argument( + "--lora-path", + type=str, + default=None, + help="Path to the lora adapters to use. This can be an absolute path, " + "a relative path, or a Hugging Face model identifier.") + parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: @@ -367,6 +509,8 @@ if __name__ == "__main__": assert args.output_len is not None else: assert args.input_len is None + if args.enable_lora: + assert args.lora_path is not None if args.backend == "vllm": if args.hf_max_batch_size is not None: @@ -376,6 +520,9 @@ if __name__ == "__main__": raise ValueError("HF max batch size is required for HF backend.") if args.quantization is not None: raise ValueError("Quantization is only for vLLM backend.") + if args.enable_lora is not None: + raise ValueError("LoRA benchmarking is only supported for vLLM" + " backend") elif args.backend == "mii": if args.dtype != "auto": raise ValueError("dtype must be auto for MII backend.") @@ -388,5 +535,7 @@ if __name__ == "__main__": if args.tokenizer != args.model: raise ValueError("Tokenizer must be the same as the model for MII " "backend.") - main(args) - + if args.enable_lora is not None: + raise ValueError("LoRA benchmarking is only supported for vLLM" + " backend") + main(args) \ No newline at end of file diff --git a/docker/llm/serving/xpu/docker/ccl_torch.patch b/docker/llm/serving/xpu/docker/ccl_torch.patch new file mode 100644 index 00000000..c3a658df --- /dev/null +++ b/docker/llm/serving/xpu/docker/ccl_torch.patch @@ -0,0 +1,47 @@ +diff --git a/setup.py b/setup.py +index 67016cb..305be5c 100644 +--- a/setup.py ++++ b/setup.py +@@ -118,8 +118,8 @@ class BuildCMakeExt(BuildExtension): + if compute_backend == 'dpcpp': + runtime = 'dpcpp' + build_options['COMPUTE_BACKEND'] = compute_backend +- import intel_extension_for_pytorch +- build_options['CMAKE_PREFIX_PATH'] += ";" + intel_extension_for_pytorch.cmake_prefix_path ++ # import intel_extension_for_pytorch ++ # build_options['CMAKE_PREFIX_PATH'] += ";" + intel_extension_for_pytorch.cmake_prefix_path + if "DPCPP_GCC_INSTALL_DIR" in my_env: + exist_cflags = "CFLAGS" in my_env + cflags = "" +diff --git a/src/gpu/CMakeLists.txt b/src/gpu/CMakeLists.txt +index 5628f31..d1589c4 100644 +--- a/src/gpu/CMakeLists.txt ++++ b/src/gpu/CMakeLists.txt +@@ -1,4 +1,4 @@ +-find_package(IPEX REQUIRED) ++# find_package(IPEX REQUIRED) + + set(CCL_DPCPP_SRCS dpcpp_ccl.cpp ze_exception.hpp allreduce.h sycl_misc.hpp runtime.hpp cxxopts.hpp) + +@@ -9,7 +9,7 @@ add_library(oneccl_bindings_for_pytorch_xpu SHARED ${CCL_DPCPP_SRCS}) + + target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC ${DEPENDS_LIB}) + target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC oneccl_bindings_for_pytorch) +-target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC intel-ext-pt-gpu) ++# target_link_libraries(oneccl_bindings_for_pytorch_xpu PUBLIC intel-ext-pt-gpu) + + foreach(RPATH 
${CMAKE_INSTALL_RPATH}) + set_target_properties(oneccl_bindings_for_pytorch_xpu PROPERTIES LINK_FLAGS "-Wl,-rpath,${RPATH}") +diff --git a/src/gpu/dpcpp_ccl.cpp b/src/gpu/dpcpp_ccl.cpp +index 1631b85..0945031 100644 +--- a/src/gpu/dpcpp_ccl.cpp ++++ b/src/gpu/dpcpp_ccl.cpp +@@ -32,7 +32,7 @@ + #include + #include + #include +-#include ++// #include + + #include + //#include "allreduce.h" diff --git a/docker/llm/serving/xpu/docker/gradio_web_server.patch b/docker/llm/serving/xpu/docker/gradio_web_server.patch deleted file mode 100644 index dbc77c0d..00000000 --- a/docker/llm/serving/xpu/docker/gradio_web_server.patch +++ /dev/null @@ -1,209 +0,0 @@ ---- a/gradio_web_server.py -+++ b/gradio_web_server_new.py -@@ -9,8 +9,10 @@ import hashlib - import json - import os - import random -+import pandas as pd - import time - import uuid -+import numpy as np - - import gradio as gr - import requests -@@ -241,7 +243,7 @@ def clear_history(request: gr.Request): - ip = get_ip(request) - logger.info(f"clear_history. ip: {ip}") - state = None -- return (state, [], "", None) + (disable_btn,) * 5 -+ return (state, [], "", None, "", "", "", "") + (disable_btn,) * 5 - - - def get_ip(request: gr.Request): -@@ -354,6 +356,18 @@ def is_limit_reached(model_name, ip): - return None - - -+def handle_latency_metrics(first_token_time, next_token_time): -+ # next token time is a numpy array... -+ # first token time might be None -+ first_token_latency = "None" -+ next_token_latency = "None" -+ if first_token_time is not None: -+ first_token_latency = f"{first_token_time * 1000 :.2f} ms" -+ if next_token_time.size > 0: -+ next_token_latency = f"{np.mean(next_token_time) * 1000 :.2f} ms" -+ return first_token_latency, next_token_latency -+ -+ - def bot_response( - state, - temperature, -@@ -372,7 +386,7 @@ def bot_response( - if state.skip_next: - # This generate call is skipped due to invalid inputs - state.skip_next = False -- yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5 -+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5 - return - - if apply_rate_limit: -@@ -381,7 +395,7 @@ def bot_response( - error_msg = RATE_LIMIT_MSG + "\n\n" + ret["reason"] - logger.info(f"rate limit reached. ip: {ip}. 
error_msg: {ret['reason']}") - state.conv.update_last_message(error_msg) -- yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5 -+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5 - return - - conv, model_name = state.conv, state.model_name -@@ -404,6 +418,10 @@ def bot_response( - yield ( - state, - state.to_gradio_chatbot(), -+ "None", -+ "None", -+ "None", -+ "None", - disable_btn, - disable_btn, - disable_btn, -@@ -444,18 +462,32 @@ def bot_response( - ) - - conv.update_last_message("▌") -- yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5 -+ # We probably need to change this method -+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (disable_btn,) * 5 -+ prompt_tokens = 0 -+ generated_tokens = 0 -+ first_token_latency = None -+ next_token_latencies = np.array([]) -+ start_time = time.time() - - try: - for i, data in enumerate(stream_iter): - if data["error_code"] == 0: -+ prompt_tokens = data["usage"]["prompt_tokens"] -+ generated_tokens = data["usage"]["completion_tokens"] - output = data["text"].strip() - conv.update_last_message(output + "▌") -- yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5 -+ if first_token_latency is None: -+ first_token_latency = time.time() - start_time -+ else: -+ next_token_latencies = np.append(next_token_latencies, time.time() - start_time) -+ start_time = time.time() -+ first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies) -+ yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (disable_btn,) * 5 - else: - output = data["text"] + f"\n\n(error_code: {data['error_code']})" - conv.update_last_message(output) -- yield (state, state.to_gradio_chatbot()) + ( -+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + ( - disable_btn, - disable_btn, - disable_btn, -@@ -465,13 +497,14 @@ def bot_response( - return - output = data["text"].strip() - conv.update_last_message(output) -- yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5 -+ first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies) -+ yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (enable_btn,) * 5 - except requests.exceptions.RequestException as e: - conv.update_last_message( - f"{SERVER_ERROR_MSG}\n\n" - f"(error_code: {ErrorCode.GRADIO_REQUEST_ERROR}, {e})" - ) -- yield (state, state.to_gradio_chatbot()) + ( -+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + ( - disable_btn, - disable_btn, - disable_btn, -@@ -484,7 +517,7 @@ def bot_response( - f"{SERVER_ERROR_MSG}\n\n" - f"(error_code: {ErrorCode.GRADIO_STREAM_UNKNOWN_ERROR}, {e})" - ) -- yield (state, state.to_gradio_chatbot()) + ( -+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + ( - disable_btn, - disable_btn, - disable_btn, -@@ -646,7 +679,8 @@ def build_single_model_ui(models, add_promotion_links=False): - ) - - notice_markdown = f""" --# 🏔️ Chat with Open Large Language Models -+# 🏔️ ChatBot based Xeon-W & Arc GPUs -+### Deployed with IPEX-LLM - {promotion} - """ - -@@ -717,6 +751,22 @@ def build_single_model_ui(models, add_promotion_links=False): - label="Max output tokens", - ) - -+ with gr.Row(): -+ with gr.Column(): -+ gr.Markdown("### Performance Metrics") -+ prompt_token = gr.Label( -+ label="Prompt token length:", -+ ) -+ next_token = gr.Label( -+ 
label="Generated token length:", -+ ) -+ first_token_latency = gr.Label( -+ label="First token Latency:", -+ ) -+ next_token_latency = gr.Label( -+ label="Next token Latency:", -+ ) -+ - if add_promotion_links: - gr.Markdown(acknowledgment_md, elem_id="ack_markdown") - -@@ -743,9 +793,9 @@ def build_single_model_ui(models, add_promotion_links=False): - ).then( - bot_response, - [state, temperature, top_p, max_output_tokens], -- [state, chatbot] + btn_list, -+ [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list, - ) -- clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox] + btn_list) -+ clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list) - - model_selector.change( - clear_history, None, [state, chatbot, textbox, imagebox] + btn_list -@@ -758,7 +808,7 @@ def build_single_model_ui(models, add_promotion_links=False): - ).then( - bot_response, - [state, temperature, top_p, max_output_tokens], -- [state, chatbot] + btn_list, -+ [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list, - ) - send_btn.click( - add_text, -@@ -767,7 +817,7 @@ def build_single_model_ui(models, add_promotion_links=False): - ).then( - bot_response, - [state, temperature, top_p, max_output_tokens], -- [state, chatbot] + btn_list, -+ [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list, - ) - - return [state, model_selector] -@@ -775,7 +825,7 @@ def build_single_model_ui(models, add_promotion_links=False): - - def build_demo(models): - with gr.Blocks( -- title="Chat with Open Large Language Models", -+ title="ChatBot based Xeon-W & Arc GPUs", - theme=gr.themes.Default(), - css=block_css, - ) as demo: -@@ -885,3 +935,4 @@ if __name__ == "__main__": - auth=auth, - root_path=args.gradio_root_path, - ) -+ diff --git a/docker/llm/serving/xpu/docker/start-fastchat-service.sh b/docker/llm/serving/xpu/docker/start-fastchat-service.sh deleted file mode 100644 index 8d8ca975..00000000 --- a/docker/llm/serving/xpu/docker/start-fastchat-service.sh +++ /dev/null @@ -1,125 +0,0 @@ -#!/bin/bash - -usage() { - echo "Usage: $0 [-w --worker ] [--help]" - echo "--help: Print help message." - echo "The following environment variables can be set." - echo "MODEL_PATH (default: empty)." - echo "LOW_BIT_FORMAT (default: sym_int4)" - echo "CONTROLLER_HOST (default: localhost)." - echo "CONTROLLER_PORT (default: 21001)." - echo "WORKER_HOST (default: localhost)." - echo "WORKER_PORT (default: 21002)." - echo "API_HOST (default: localhost)." - echo "API_PORT (default: 8000)." - exit 1 -} - -# Default values -controller_host="localhost" -controller_port="21001" -worker_host="localhost" -worker_port="21002" -api_host="localhost" -api_port="8000" -model_path="" -mode="" -dispatch_method="shortest_queue" # shortest_queue or lottery -stream_interval=1 -worker_type="model_worker" -low_bit_format="sym_int4" - -# We do not have any arguments, just run bash - -# Parse command-line options -options=$(getopt -o "hw:" --long "help,worker:" -n "$0" -- "$@") -if [ $? 
!= 0 ]; then - usage -fi -eval set -- "$options" - -while true; do - case "$1" in - -w|--worker) - worker_type="$2" - [[ $worker_type == "model_worker" || $worker_type == "vllm_worker" ]] || usage - shift 2 - ;; - -h|--help) - usage - ;; - --) - shift - break - ;; - *) - usage - ;; - esac -done - -if [ "$worker_type" == "model_worker" ]; then - worker_type="ipex_llm.serving.fastchat.ipex_llm_worker" -elif [ "$worker_type" == "vllm_worker" ]; then - worker_type="ipex_llm.serving.fastchat.vllm_worker" -fi - -if [[ -n $CONTROLLER_HOST ]]; then - controller_host=$CONTROLLER_HOST -fi - -if [[ -n $CONTROLLER_PORT ]]; then - controller_port=$CONTROLLER_PORT -fi - -if [[ -n $LOW_BIT_FORMAT ]]; then - low_bit_format=$LOW_BIT_FORMAT -fi - -if [[ -n $WORKER_HOST ]]; then - worker_host=$WORKER_HOST -fi - -if [[ -n $WORKER_PORT ]]; then - worker_port=$WORKER_PORT -fi - -if [[ -n $MODEL_PATH ]]; then - model_path=$MODEL_PATH -fi - -if [[ -n $API_HOST ]]; then - api_host=$API_HOST -fi - -if [[ -n $API_PORT ]]; then - api_port=$API_PORT -fi - -if [[ -n $DISPATCH_METHOD ]]; then - dispatch_method=$DISPATCH_METHOD -fi - -if [[ -n $STREAM_INTERVAL ]]; then - stream_interval=$STREAM_INTERVAL -fi - -controller_address="http://$controller_host:$controller_port" -echo "Controller address: $controller_address" -python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method & - -worker_address="http://$worker_host:$worker_port" -echo "Worker type: $worker_type" -echo "Worker address: $worker_address" - -if [ "$worker_type" == "ipex_llm.serving.fastchat.ipex_llm_worker" ]; then - python3 -m "$worker_type" --model-path $model_path --device xpu --low-bit $low_bit_format --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval & -elif [ "$worker_type" == "ipex_llm.serving.fastchat.vllm_worker" ]; then - python3 -m "$worker_type" --model-path $model_path --device xpu --load-in-low-bit $low_bit_format --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --enforce-eager --gpu-memory-utilization 0.85 & -fi - -sleep 10 - -api_address="http://$api_host:$api_port" -echo "OpenAI API address: $api_address" -python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address diff --git a/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh b/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh deleted file mode 100644 index b51e4fc3..00000000 --- a/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh +++ /dev/null @@ -1,4 +0,0 @@ -cd /llm/lightweight_serving -model_path="/llm/models/Llama-2-7b-chat-hf" -low_bit="sym_int4" -python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit \ No newline at end of file diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index 994c822b..ef3f6cf7 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -667,7 +667,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, out_features, mp_group, None, - None, optimize_lm_head, None ) diff --git a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py index c4bed0c5..e6424493 100644 --- 
a/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py +++ b/python/llm/src/ipex_llm/vllm/xpu/engine/engine.py @@ -13,18 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from typing import Dict, Optional +from vllm.logger import init_logger +from typing import Dict, Optional, Any, Union, Type from vllm.engine.llm_engine import LLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.llm import LLM from vllm.utils import Counter -from vllm.config import EngineConfig +from vllm.config import VllmConfig from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert from vllm.usage.usage_lib import UsageContext from vllm.engine.metrics import StatLoggerBase from vllm.engine.multiprocessing.engine import MQLLMEngine import signal +from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig, + TaskOption) +from vllm.config import CompilationConfig +from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine +from vllm import envs +from vllm.v1.engine.async_llm import AsyncLLM +import os + +logger = init_logger(__name__) class IPEXLLMAsyncLLMEngine(AsyncLLMEngine): @@ -35,7 +45,7 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine): def from_engine_args( cls, engine_args: AsyncEngineArgs, - engine_config: Optional[EngineConfig] = None, + engine_config: Optional[VllmConfig] = None, start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, load_in_low_bit: str = "sym_int4", @@ -49,6 +59,27 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine): usage_context=usage_context, stat_loggers=stat_loggers) +class IPEXLLMAsyncV1Engine(AsyncLLM): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @classmethod + def from_engine_args( + cls, + engine_args: AsyncEngineArgs, + engine_config: Optional[VllmConfig] = None, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + load_in_low_bit: str = "sym_int4", + stat_loggers: Optional[Dict[str, StatLoggerBase]]=None, # noqa + ) -> "AsyncLLM": + _ipex_llm_convert(load_in_low_bit) + return super().from_engine_args(engine_args=engine_args, engine_config=engine_config, + start_engine_loop=start_engine_loop, + usage_context=usage_context, stat_loggers=stat_loggers) + + class IPEXLLMClass(LLM): def __init__( self, @@ -57,6 +88,7 @@ class IPEXLLMClass(LLM): tokenizer_mode: str = "auto", skip_tokenizer_init: bool = False, trust_remote_code: bool = False, + allowed_local_media_path: str = "", tensor_parallel_size: int = 1, dtype: str = "auto", quantization: Optional[str] = None, @@ -64,28 +96,48 @@ class IPEXLLMClass(LLM): tokenizer_revision: Optional[str] = None, seed: int = 0, gpu_memory_utilization: float = 0.9, - swap_space: int = 4, + swap_space: float = 4, cpu_offload_gb: float = 0, - enforce_eager: bool = False, - max_context_len_to_capture: Optional[int] = None, + enforce_eager: Optional[bool] = None, max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, + disable_async_output_proc: bool = True, + hf_overrides: Optional[HfOverrides] = None, + mm_processor_kwargs: Optional[Dict[str, Any]]=None, + # After positional args are removed, move this right below `model` + task: TaskOption = "auto", + override_pooler_config: Optional[PoolerConfig] = None, + compilation_config: Optional[Union[int, Dict[str, Any]]]=None, load_in_low_bit: str = "sym_int4", **kwargs, ) -> None: + ''' + LLM 
constructor. + + Note: if enforce_eager is unset (enforce_eager is None) + it defaults to False. + ''' + if "disable_log_stats" not in kwargs: kwargs["disable_log_stats"] = True - removed_vision_keys = ("image_token_id", "image_feature_size", - "image_input_shape", "image_input_type") - if any(k in kwargs for k in removed_vision_keys): - raise TypeError( # noqa - "There is no need to pass vision-related arguments anymore.") + + if compilation_config is not None: + if isinstance(compilation_config, (int, dict)): + compilation_config_instance = CompilationConfig.from_cli( + str(compilation_config)) + else: + compilation_config_instance = compilation_config + else: + compilation_config_instance = None + engine_args = EngineArgs( model=model, + task=task, tokenizer=tokenizer, tokenizer_mode=tokenizer_mode, skip_tokenizer_init=skip_tokenizer_init, trust_remote_code=trust_remote_code, + allowed_local_media_path=allowed_local_media_path, tensor_parallel_size=tensor_parallel_size, dtype=dtype, quantization=quantization, @@ -96,16 +148,53 @@ class IPEXLLMClass(LLM): swap_space=swap_space, cpu_offload_gb=cpu_offload_gb, enforce_eager=enforce_eager, - max_context_len_to_capture=max_context_len_to_capture, max_seq_len_to_capture=max_seq_len_to_capture, disable_custom_all_reduce=disable_custom_all_reduce, + disable_async_output_proc=disable_async_output_proc, + hf_overrides=hf_overrides, + mm_processor_kwargs=mm_processor_kwargs, + override_pooler_config=override_pooler_config, + compilation_config=compilation_config_instance, **kwargs, ) - self.llm_engine = IPEXLLMLLMEngine.from_engine_args( + # Logic to switch between engines is done at runtime instead of import + # to avoid import order issues + self.engine_class = self.get_engine_class() + self.llm_engine = self.engine_class.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS, load_in_low_bit=load_in_low_bit) + self.request_counter = Counter() + @staticmethod + def get_engine_class() -> Type[LLMEngine]: + if envs.VLLM_USE_V1: + return IPEXLLMLLMV1Engine + return IPEXLLMLLMEngine + + +class IPEXLLMLLMV1Engine(V1LLMEngine): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @classmethod + def from_engine_args( + cls, + engine_args: EngineArgs, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[Dict[str, StatLoggerBase]]=None, + enable_multiprocessing: bool = False, + load_in_low_bit: str = "sym_int4", + ) -> "LLMEngine": + """Creates an LLM engine from the engine arguments.""" + # Create the engine configs. 
+ + _ipex_llm_convert(load_in_low_bit) + return super().from_engine_args(engine_args, + usage_context, + stat_loggers, + enable_multiprocessing) + class IPEXLLMLLMEngine(LLMEngine): def __init__(self, *args, **kwargs): @@ -134,16 +223,24 @@ class IPEXLLMMQLLMEngine(MQLLMEngine): def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext, - ipc_path: str, load_in_low_bit: str): + ipc_path: str, load_in_low_bit: str, engine_alive): def signal_handler(*_) -> None: # Interrupt server on sigterm raise KeyboardInterrupt("MQLLMEngine terminated") # noqa - signal.signal(signal.SIGTERM, signal_handler) + try: + signal.signal(signal.SIGTERM, signal_handler) - engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args, - usage_context=usage_context, - ipc_path=ipc_path, - load_in_low_bit=load_in_low_bit) - engine.start() + engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args, + usage_context=usage_context, + ipc_path=ipc_path, + load_in_low_bit=load_in_low_bit) + engine.start() + except BaseException as e: + logger.exception(e) + engine_alive.value = False + raise e # noqa + +if os.getenv("VLLM_USE_V1"): + IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine diff --git a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py index 1bd9835f..71460db9 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +++ b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py @@ -1,4 +1,5 @@ import asyncio +import atexit import importlib import inspect import multiprocessing @@ -7,11 +8,12 @@ import re import signal import socket import tempfile +import uuid from argparse import Namespace from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import AsyncIterator, Set +from typing import AsyncIterator, Optional, Set, Tuple import uvloop from fastapi import APIRouter, FastAPI, Request @@ -29,9 +31,13 @@ from ipex_llm.vllm.xpu.engine import IPEXLLMAsyncLLMEngine as AsyncLLMEngine from vllm.engine.multiprocessing.client import MQLLMEngineClient from ipex_llm.vllm.xpu.engine import run_mp_engine from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.logger import RequestLogger -from ipex_llm.vllm.xpu.entrypoints.openai.cli_args import make_arg_parser +from vllm.entrypoints.openai.cli_args import (make_arg_parser, + validate_parsed_serve_args) + +# from ipex_llm.vllm.xpu.entrypoints.openai.cli_args import make_arg_parser # yapf conflicts with isort for this block # yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -41,8 +47,12 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DetokenizeRequest, DetokenizeResponse, EmbeddingRequest, - EmbeddingResponse, ErrorResponse, + EmbeddingResponse, + EmbeddingResponseData, + ErrorResponse, LoadLoraAdapterRequest, + PoolingRequest, PoolingResponse, + ScoreRequest, ScoreResponse, TokenizeRequest, TokenizeResponse, UnloadLoraAdapterRequest) @@ -50,12 +60,20 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding -from vllm.entrypoints.openai.serving_engine import 
BaseModelPath +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) + +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling +from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) +from vllm.entrypoints.openai.tool_parsers import ToolParserManager +from vllm.entrypoints.utils import with_cancellation from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path +from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, + is_valid_ipv6_address, set_ulimit) from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -111,7 +129,7 @@ async def build_async_engine_client( async def build_async_engine_client_from_engine_args( engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, - load_in_low_bit: str = 'sym_int4', + load_in_low_bit: str = "sym_int4", ) -> AsyncIterator[EngineClient]: """ Create EngineClient, either: @@ -124,25 +142,19 @@ async def build_async_engine_client_from_engine_args( # Fall back # TODO: fill out feature matrix. if (MQLLMEngineClient.is_unsupported_config(engine_args) - or disable_frontend_multiprocessing): - engine_config = engine_args.create_engine_config() - uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config), - "uses_ray", False) - - build_engine = partial(AsyncLLMEngine.from_engine_args, - engine_args=engine_args, - load_in_low_bit=load_in_low_bit, - engine_config=engine_config, - usage_context=UsageContext.OPENAI_API_SERVER) - if uses_ray: - # Must run in main thread with ray for its signal handlers to work - engine_client = build_engine() - else: - engine_client = await asyncio.get_running_loop().run_in_executor( - None, build_engine) - - yield engine_client - return + or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): + engine_client: Optional[EngineClient] = None + try: + # When starting this, we are actually starting with the V1Engine + # Here we are doing a classification, we will need to do this in IPEX-LLM + engine_client = AsyncLLMEngine.from_engine_args( + engine_args=engine_args, + usage_context=UsageContext.OPENAI_API_SERVER, + load_in_low_bit=load_in_low_bit) + yield engine_client + finally: + if engine_client and hasattr(engine_client, "shutdown"): + engine_client.shutdown() # Otherwise, use the multiprocessing AsyncLLMEngine. else: @@ -163,45 +175,60 @@ async def build_async_engine_client_from_engine_args( # Select random path for IPC. ipc_path = get_open_zmq_ipc_path() - logger.info("Multiprocessing frontend to use %s for IPC Path.", - ipc_path) + logger.debug("Multiprocessing frontend to use %s for IPC Path.", + ipc_path) # Start RPCServer in separate process (holds the LLMEngine). # the current process might have CUDA context, # so we need to spawn a new process context = multiprocessing.get_context("spawn") + # The Process can raise an exception during startup, which may + # not actually result in an exitcode being reported. As a result + # we use a shared variable to communicate the information. 
+ engine_alive = multiprocessing.Value('b', True, lock=False) engine_process = context.Process(target=run_mp_engine, args=(engine_args, UsageContext.OPENAI_API_SERVER, - ipc_path, - load_in_low_bit)) + ipc_path, load_in_low_bit, engine_alive)) engine_process.start() - logger.info("Started engine process with PID %d", engine_process.pid) + engine_pid = engine_process.pid + assert engine_pid is not None, "Engine process failed to start." + logger.info("Started engine process with PID %d", engine_pid) + + def _cleanup_ipc_path(): + socket_path = ipc_path.replace("ipc://", "") + if os.path.exists(socket_path): + os.remove(socket_path) + + # Ensure we clean up the local IPC socket file on exit. + atexit.register(_cleanup_ipc_path) # Build RPCClient, which conforms to EngineClient Protocol. - # NOTE: Actually, this is not true yet. We still need to support - # embedding models via RPC (see TODO above) engine_config = engine_args.create_engine_config() - mp_engine_client = MQLLMEngineClient(ipc_path, engine_config) - + build_client = partial(MQLLMEngineClient, ipc_path, engine_config, + engine_pid) + mq_engine_client = await asyncio.get_running_loop().run_in_executor( + None, build_client) try: while True: try: - await mp_engine_client.setup() + await mq_engine_client.setup() break except TimeoutError: - if not engine_process.is_alive(): + if (not engine_process.is_alive() + or not engine_alive.value): raise RuntimeError( - "Engine process failed to start") from None + "Engine process failed to start. See stack " + "trace for the root cause.") from None - yield mp_engine_client # type: ignore[misc] + yield mq_engine_client # type: ignore[misc] finally: # Ensure rpc server process was terminated engine_process.terminate() # Close all open connections to the backend - mp_engine_client.close() + mq_engine_client.close() # Wait for engine process to join engine_process.join(4) @@ -230,8 +257,8 @@ def mount_metrics(app: FastAPI): prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None) if prometheus_multiproc_dir_path is not None: - logger.info("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR", - prometheus_multiproc_dir_path) + logger.debug("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR", + prometheus_multiproc_dir_path) registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) @@ -246,22 +273,35 @@ def mount_metrics(app: FastAPI): app.routes.append(metrics_route) -def chat(request: Request) -> OpenAIServingChat: +def base(request: Request) -> OpenAIServing: + # Reuse the existing instance + return tokenization(request) + + +def chat(request: Request) -> Optional[OpenAIServingChat]: return request.app.state.openai_serving_chat -def completion(request: Request) -> OpenAIServingCompletion: +def completion(request: Request) -> Optional[OpenAIServingCompletion]: return request.app.state.openai_serving_completion +def pooling(request: Request) -> Optional[OpenAIServingPooling]: + return request.app.state.openai_serving_pooling + + +def embedding(request: Request) -> Optional[OpenAIServingEmbedding]: + return request.app.state.openai_serving_embedding + + +def score(request: Request) -> Optional[OpenAIServingScores]: + return request.app.state.openai_serving_scores + + def tokenization(request: Request) -> OpenAIServingTokenization: return request.app.state.openai_serving_tokenization -def embedding(request: Request) -> OpenAIServingEmbedding: - return request.app.state.openai_serving_embedding - - def engine_client(request: Request) -> EngineClient: return 
request.app.state.engine_client @@ -274,8 +314,11 @@ async def health(raw_request: Request) -> Response: @router.post("/tokenize") +@with_cancellation async def tokenize(request: TokenizeRequest, raw_request: Request): - generator = await tokenization(raw_request).create_tokenize(request) + handler = tokenization(raw_request) + + generator = await handler.create_tokenize(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -286,8 +329,11 @@ async def tokenize(request: TokenizeRequest, raw_request: Request): @router.post("/detokenize") +@with_cancellation async def detokenize(request: DetokenizeRequest, raw_request: Request): - generator = await tokenization(raw_request).create_detokenize(request) + handler = tokenization(raw_request) + + generator = await handler.create_detokenize(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -299,7 +345,9 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request): @router.get("/v1/models") async def show_available_models(raw_request: Request): - models = await completion(raw_request).show_available_models() + handler = base(raw_request) + + models = await handler.show_available_models() return JSONResponse(content=models.model_dump()) @@ -310,11 +358,15 @@ async def show_version(): @router.post("/v1/chat/completions") +@with_cancellation async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): + handler = chat(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Chat Completions API") - generator = await chat(raw_request).create_chat_completion( - request, raw_request) + generator = await handler.create_chat_completion(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -327,9 +379,14 @@ async def create_chat_completion(request: ChatCompletionRequest, @router.post("/v1/completions") +@with_cancellation async def create_completion(request: CompletionRequest, raw_request: Request): - generator = await completion(raw_request).create_completion( - request, raw_request) + handler = completion(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Completions API") + + generator = await handler.create_completion(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -340,9 +397,40 @@ async def create_completion(request: CompletionRequest, raw_request: Request): @router.post("/v1/embeddings") +@with_cancellation async def create_embedding(request: EmbeddingRequest, raw_request: Request): - generator = await embedding(raw_request).create_embedding( - request, raw_request) + handler = embedding(raw_request) + if handler is None: + fallback_handler = pooling(raw_request) + if fallback_handler is None: + return base(raw_request).create_error_response( + message="The model does not support Embeddings API") + + logger.warning( + "Embeddings API will become exclusive to embedding models " + "in a future release. 
To return the hidden states directly, " + "use the Pooling API (`/pooling`) instead.") + + res = await fallback_handler.create_pooling(request, raw_request) + if isinstance(res, PoolingResponse): + generator = EmbeddingResponse( + id=res.id, + object=res.object, + created=res.created, + model=res.model, + data=[ + EmbeddingResponseData( + index=d.index, + embedding=d.data, # type: ignore + ) for d in res.data + ], + usage=res.usage, + ) + else: + generator = res + else: + generator = await handler.create_embedding(request, raw_request) + if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -352,6 +440,52 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): assert_never(generator) +@router.post("/pooling") +@with_cancellation +async def create_pooling(request: PoolingRequest, raw_request: Request): + handler = pooling(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Pooling API") + + generator = await handler.create_pooling(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, PoolingResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post("/score") +@with_cancellation +async def create_score(request: ScoreRequest, raw_request: Request): + handler = score(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Score API") + + generator = await handler.create_score(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, ScoreResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post("/v1/score") +@with_cancellation +async def create_score_v1(request: ScoreRequest, raw_request: Request): + logger.warning( + "To indicate that Score API is not part of standard OpenAI API, we " + "have moved it to `/score`. Please update your client accordingly.") + + return await create_score(request, raw_request) + + if envs.VLLM_TORCH_PROFILER_DIR: logger.warning( "Torch Profiler is enabled in the API server. 
This should ONLY be " @@ -380,30 +514,26 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: @router.post("/v1/load_lora_adapter") async def load_lora_adapter(request: LoadLoraAdapterRequest, raw_request: Request): - response = await chat(raw_request).load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) - - response = await completion(raw_request).load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + for route in [chat, completion, embedding]: + handler = route(raw_request) + if handler is not None: + response = await handler.load_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @router.post("/v1/unload_lora_adapter") async def unload_lora_adapter(request: UnloadLoraAdapterRequest, raw_request: Request): - response = await chat(raw_request).unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) - - response = await completion(raw_request).unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + for route in [chat, completion, embedding]: + handler = route(raw_request) + if handler is not None: + response = await handler.unload_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @@ -431,8 +561,9 @@ def build_app(args: Namespace) -> FastAPI: @app.exception_handler(RequestValidationError) async def validation_exception_handler(_, exc): - chat = app.state.openai_serving_chat - err = chat.create_error_response(message=str(exc)) + err = ErrorResponse(message=str(exc), + type="BadRequestError", + code=HTTPStatus.BAD_REQUEST) return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) @@ -440,16 +571,31 @@ def build_app(args: Namespace) -> FastAPI: @app.middleware("http") async def authentication(request: Request, call_next): - root_path = "" if args.root_path is None else args.root_path if request.method == "OPTIONS": return await call_next(request) - if not request.url.path.startswith(f"{root_path}/v1"): + url_path = request.url.path + if app.root_path and url_path.startswith(app.root_path): + url_path = url_path[len(app.root_path):] + if not url_path.startswith("/v1"): return await call_next(request) if request.headers.get("Authorization") != "Bearer " + token: return JSONResponse(content={"error": "Unauthorized"}, status_code=401) return await call_next(request) + if args.enable_request_id_headers: + logger.warning( + "CAUTION: Enabling X-Request-Id headers in the API Server. 
" + "This can harm performance at high QPS.") + + @app.middleware("http") + async def add_request_id(request: Request, call_next): + request_id = request.headers.get( + "X-Request-Id") or uuid.uuid4().hex + response = await call_next(request) + response.headers["X-Request-Id"] = request_id + return response + for middleware in args.middleware: module_path, object_name = middleware.rsplit(".", 1) imported = getattr(importlib.import_module(module_path), object_name) @@ -488,49 +634,179 @@ def init_app_state( state.engine_client = engine_client state.log_stats = not args.disable_log_stats + resolved_chat_template = load_chat_template(args.chat_template) + logger.info("Using supplied chat template:\n%s", resolved_chat_template) + + state.openai_serving_models = OpenAIServingModels( + model_config=model_config, + base_model_paths=base_model_paths, + lora_modules=args.lora_modules, + prompt_adapters=args.prompt_adapters, + ) + # TODO: The chat template is now broken for lora adapters :( state.openai_serving_chat = OpenAIServingChat( engine_client, model_config, - base_model_paths, + state.openai_serving_models, args.response_role, - lora_modules=args.lora_modules, - prompt_adapters=args.prompt_adapters, request_logger=request_logger, - chat_template=args.chat_template, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, - tool_parser=args.tool_call_parser) + tool_parser=args.tool_call_parser, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + ) if model_config.runner_type == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, model_config, - base_model_paths, - lora_modules=args.lora_modules, - prompt_adapters=args.prompt_adapters, + state.openai_serving_models, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, - ) + ) if model_config.runner_type == "generate" else None + state.openai_serving_pooling = OpenAIServingPooling( + engine_client, + model_config, + state.openai_serving_models, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + ) if model_config.runner_type == "pooling" else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, - base_model_paths, + state.openai_serving_models, request_logger=request_logger, - ) + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + ) if model_config.task == "embed" else None + state.openai_serving_scores = OpenAIServingScores( + engine_client, + model_config, + state.openai_serving_models, + request_logger=request_logger + ) if model_config.task == "score" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, - base_model_paths, - lora_modules=args.lora_modules, + state.openai_serving_models, request_logger=request_logger, - chat_template=args.chat_template, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, ) + state.task = model_config.task + # if args.served_model_name is not None: + # served_model_names = args.served_model_name + # else: + # served_model_names = [args.model] + + # if args.disable_log_requests: + # request_logger = None + # else: + # request_logger = RequestLogger(max_log_len=args.max_log_len) + + 
# base_model_paths = [ + # BaseModelPath(name=name, model_path=args.model) + # for name in served_model_names + # ] + + # state.engine_client = engine_client + # state.log_stats = not args.disable_log_stats + + # resolved_chat_template = load_chat_template(args.chat_template) + # logger.info("Using supplied chat template:\n%s", resolved_chat_template) + + # state.openai_serving_chat = OpenAIServingChat( + # engine_client, + # model_config, + # base_model_paths, + # args.response_role, + # lora_modules=args.lora_modules, + # prompt_adapters=args.prompt_adapters, + # request_logger=request_logger, + # chat_template=resolved_chat_template, + # chat_template_content_format=args.chat_template_content_format, + # return_tokens_as_token_ids=args.return_tokens_as_token_ids, + # enable_auto_tools=args.enable_auto_tool_choice, + # tool_parser=args.tool_call_parser, + # enable_prompt_tokens_details=args.enable_prompt_tokens_details, + # ) if model_config.runner_type == "generate" else None + # state.openai_serving_completion = OpenAIServingCompletion( + # engine_client, + # model_config, + # base_model_paths, + # lora_modules=args.lora_modules, + # prompt_adapters=args.prompt_adapters, + # request_logger=request_logger, + # return_tokens_as_token_ids=args.return_tokens_as_token_ids, + # ) if model_config.runner_type == "generate" else None + # state.openai_serving_pooling = OpenAIServingPooling( + # engine_client, + # model_config, + # base_model_paths, + # request_logger=request_logger, + # chat_template=resolved_chat_template, + # chat_template_content_format=args.chat_template_content_format, + # ) if model_config.runner_type == "pooling" else None + # state.openai_serving_embedding = OpenAIServingEmbedding( + # engine_client, + # model_config, + # base_model_paths, + # request_logger=request_logger, + # chat_template=resolved_chat_template, + # chat_template_content_format=args.chat_template_content_format, + # ) if model_config.task == "embed" else None + # state.openai_serving_scores = OpenAIServingScores( + # engine_client, + # model_config, + # base_model_paths, + # request_logger=request_logger + # ) if model_config.task == "score" else None + # state.openai_serving_tokenization = OpenAIServingTokenization( + # engine_client, + # model_config, + # base_model_paths, + # lora_modules=args.lora_modules, + # request_logger=request_logger, + # chat_template=resolved_chat_template, + # chat_template_content_format=args.chat_template_content_format, + # ) + + +def create_server_socket(addr: Tuple[str, int]) -> socket.socket: + family = socket.AF_INET + if is_valid_ipv6_address(addr[0]): + family = socket.AF_INET6 + + sock = socket.socket(family=family, type=socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(addr) + + return sock async def run_server(args, **uvicorn_kwargs) -> None: logger.info("vLLM API server version %s", VLLM_VERSION) logger.info("args: %s", args) - temp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - temp_socket.bind(("", args.port)) + if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: + ToolParserManager.import_tool_parser(args.tool_parser_plugin) + + valide_tool_parses = ToolParserManager.tool_parsers.keys() + if args.enable_auto_tool_choice \ + and args.tool_call_parser not in valide_tool_parses: + raise KeyError(f"invalid tool call parser: {args.tool_call_parser} " + f"(chose from {{ {','.join(valide_tool_parses)} }})") + + # workaround to make sure that we bind the port before the engine is set up. 
+ # This avoids race conditions with ray. + # see https://github.com/vllm-project/vllm/issues/8204 + sock_addr = (args.host or "", args.port) + sock = create_server_socket(sock_addr) + + # workaround to avoid footguns where uvicorn drops requests with too + # many concurrent requests active + set_ulimit() def signal_handler(*_) -> None: # Interrupt server on sigterm while initializing @@ -544,8 +820,6 @@ async def run_server(args, **uvicorn_kwargs) -> None: model_config = await engine_client.get_model_config() init_app_state(engine_client, model_config, app.state, args) - temp_socket.close() - shutdown_task = await serve_http( app, host=args.host, @@ -562,13 +836,23 @@ async def run_server(args, **uvicorn_kwargs) -> None: # NB: Await server shutdown only after the backend context is exited await shutdown_task + sock.close() + if __name__ == "__main__": # NOTE(simon): # This section should be in sync with vllm/scripts.py for CLI entrypoints. + logger.warning("Warning: Please use `ipex_llm.vllm.xpu.entrypoints.openai.api_server` " + "instead of `vllm.entrypoints.openai.api_server` to start the API server") parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible RESTful API server.") parser = make_arg_parser(parser) + parser.add_argument( + "--load-in-low-bit", + type=str, + default="sym_int4", + help="Low-bit quantization for IPEX-LLM models") args = parser.parse_args() + validate_parsed_serve_args(args) uvloop.run(run_server(args)) diff --git a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py index 4110f11b..92afa12d 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py +++ b/python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py @@ -7,11 +7,14 @@ purposes. import argparse import json import ssl -from typing import List, Optional, Sequence, Union +from typing import List, Optional, Sequence, Union, get_args from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str +from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, + validate_chat_template) from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, PromptAdapterPath) +from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.utils import FlexibleArgumentParser @@ -130,10 +133,23 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help="The file path to the chat template, " "or the template in single-line form " "for the specified model") + parser.add_argument( + '--chat-template-content-format', + type=str, + default="auto", + choices=get_args(ChatTemplateContentFormatOption), + help='The format to render message content within a chat template.' + '\n\n' + '* "string" will render the content as a string. ' + 'Example: "Hello World"\n' + '* "openai" will render the content as a list of dictionaries, ' + 'similar to OpenAI schema. 
' + 'Example: [{"type": "text", "text": "Hello world!"}]') parser.add_argument("--response-role", type=nullable_str, default="assistant", - help="The role name to return if `request.add_generation_prompt=true`.") + help="The role name to return if " + "`request.add_generation_prompt=true`.") parser.add_argument("--ssl-keyfile", type=nullable_str, default=None, @@ -180,28 +196,36 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: action="store_true", help="If specified, will run the OpenAI frontend server in the same " "process as the model serving engine.") - + parser.add_argument( + "--enable-request-id-headers", + action="store_true", + help="If specified, API server will add X-Request-Id header to " + "responses. Caution: this hurts performance at high QPS.") parser.add_argument( "--enable-auto-tool-choice", action="store_true", default=False, help="Enable auto tool choice for supported models. Use --tool-call-parser" - "to specify which parser to use") + " to specify which parser to use") + valid_tool_parsers = ToolParserManager.tool_parsers.keys() parser.add_argument( "--tool-call-parser", type=str, - choices=["mistral", "hermes"], + metavar="{" + ",".join(valid_tool_parsers) + "} or name registered in " + "--tool-parser-plugin", default=None, help="Select the tool call parser depending on the model that you're using." " This is used to parse the model-generated tool call into OpenAI API " "format. Required for --enable-auto-tool-choice.") parser.add_argument( - "--load-in-low-bit", + "--tool-parser-plugin", type=str, - default="sym_int4", - help="Low-bit quantization for IPEX-LLM models") + default="", + help="Special the tool parser plugin write to parse the model-generated tool" + " into OpenAI API format, the name register in this plugin can be used " + "in --tool-call-parser.") parser = AsyncEngineArgs.add_cli_args(parser) @@ -218,10 +242,35 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=False, help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint" ) + parser.add_argument( + "--enable-prompt-tokens-details", + action='store_true', + default=False, + help="If set to True, enable prompt_tokens_details in usage.") + + parser.add_argument( + "--load-in-low-bit", + type=str, + default="sym_int4", + help="Low-bit quantization for IPEX-LLM models") return parser +def validate_parsed_serve_args(args: argparse.Namespace): + """Quick checks for model serve args that raise prior to loading.""" # noqa + if hasattr(args, "subparser") and args.subparser != "serve": + return + + # Ensure that the chat template is valid; raises if it likely isn't + validate_chat_template(args.chat_template) + + # Enable auto tool needs a tool call parser to be valid + if args.enable_auto_tool_choice and not args.tool_call_parser: + raise TypeError("Error: --enable-auto-tool-choice requires " # noqa + "--tool-call-parser") + + def create_parser_for_docs() -> FlexibleArgumentParser: parser_for_docs = FlexibleArgumentParser( prog="-m vllm.entrypoints.openai.api_server") diff --git a/python/llm/src/ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py b/python/llm/src/ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py new file mode 100644 index 00000000..b8a04cb4 --- /dev/null +++ b/python/llm/src/ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py @@ -0,0 +1,23 @@ +from vllm.logger import init_logger +from vllm.v1.executor.ray_utils import RayWorkerWrapper + + +logger = init_logger(__name__) + + +class IPEXLLMV1Wrapper(RayWorkerWrapper): + def __init__(self, 
load_in_low_bit="sym_int4", *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert + _ipex_llm_convert(load_in_low_bit=load_in_low_bit) + self.compiled_dag_cuda_device_set = False + + +def get_ipex_llm_v1_wrapper(load_in_low_bit): + # The reason why we not using functools.partial is that + # ray seems not work well with it. + class WrapperWithLoadBit(IPEXLLMV1Wrapper): + def __init__(self, *args, **kwargs) -> None: + super().__init__(load_in_low_bit=load_in_low_bit, *args, **kwargs) + + return WrapperWithLoadBit diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py index 3d88d8f9..7cce3c38 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py +++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py @@ -65,9 +65,14 @@ def _model_sample_convert(): def _ipex_llm_convert(load_in_low_bit): from vllm.worker.xpu_model_runner import XPUModelRunner from ipex_llm.vllm.xpu.ipex_llm_wrapper import get_ipex_llm_wrapper - import vllm.executor.ray_utils as ray_utils + from ipex_llm.vllm.xpu.ipex_llm_v1_wrapper import get_ipex_llm_v1_wrapper + import vllm.executor.ray_utils as ray_utils_v0 + import vllm.v1.executor.ray_utils as ray_utils_v1 + from vllm.v1.worker.gpu_model_runner import GPUModelRunner setattr(XPUModelRunner, "load_model", get_load_function(load_in_low_bit)) - setattr(ray_utils, "RayWorkerWrapper", get_ipex_llm_wrapper(load_in_low_bit)) + setattr(GPUModelRunner, "load_model", get_load_function(load_in_low_bit)) + setattr(ray_utils_v0, "RayWorkerWrapper", get_ipex_llm_wrapper(load_in_low_bit)) + setattr(ray_utils_v1, "RayWorkerWrapper", get_ipex_llm_v1_wrapper(load_in_low_bit)) def get_load_function(low_bit): @@ -77,19 +82,16 @@ def get_load_function(low_bit): # from vllm.utils import measure_device_memory from vllm.utils import DeviceMemoryProfiler with DeviceMemoryProfiler() as m: + from dataclasses import replace + new_device_config = DeviceConfig("cpu") + new_vllm_config = replace(self.vllm_config, device_config=new_device_config) self.model = get_model( - model_config=self.model_config, - device_config=DeviceConfig("cpu"), - load_config=self.load_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - cache_config=self.cache_config, + vllm_config=new_vllm_config ) - if "qwen" in self.model_config.model.lower() or \ - "baichuan" in self.model_config.model.lower() or \ - "codegeex4-all" in self.model_config.model.lower() or \ - "chatglm" in self.model_config.model.lower(): + if "qwen" in self.vllm_config.model_config.model.lower() or \ + "baichuan" in self.vllm_config.model_config.model.lower() or \ + "codegeex4-all" in self.vllm_config.model_config.model.lower() or \ + "chatglm" in self.vllm_config.model_config.model.lower(): self.model.apply(padding_mlp) from ipex_llm import optimize_model import os @@ -99,18 +101,22 @@ def get_load_function(low_bit): modules = ["35.mlp", "36.mlp", "37.mlp", "38.mlp", "39.mlp"] else: modules = None - if "minicpm" in self.model_config.model.lower(): + if "minicpm" in self.vllm_config.model_config.model.lower(): modules = ["vpm", "resampler"] # only for minicpm_2_6 - if "minicpm-v" in self.model_config.model.lower(): + if "minicpm-v" in self.vllm_config.model_config.model.lower(): from ipex_llm.transformers.models.minicpmv import merge_qkv self.model.vpm.apply(merge_qkv) - if "internvl2" in self.model_config.model.lower(): + if "internvl2" in 
self.vllm_config.model_config.model.lower(): modules = ["vision_model", "mlp1"] - optimize_model(self.model, low_bit=low_bit, torch_dtype=self.model_config.dtype, + if "deepseek-v2" in self.vllm_config.model_config.model.lower(): + modules = ["down_proj"] + optimize_model(self.model, + low_bit=low_bit, + torch_dtype=self.vllm_config.model_config.dtype, modules_to_not_convert=modules) - self.model = self.model.to(device=self.device_config.device, - dtype=self.model_config.dtype) + self.model = self.model.to(device=self.vllm_config.device_config.device, + dtype=self.vllm_config.model_config.dtype) self.model_memory_usage = m.consumed_memory logger = init_logger(__name__)
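
Usage note (editor's addition, not part of the patch): the sketch below shows how the upgraded offline entry point defined in python/llm/src/ipex_llm/vllm/xpu/engine/engine.py above might be exercised after this upgrade. It is a minimal sketch under stated assumptions: the model path and sampling settings are placeholders, and the import path assumes the engine package re-exports IPEXLLMClass in the same way api_server.py imports IPEXLLMAsyncLLMEngine from it. Setting VLLM_USE_V1=1 in the environment before import would route construction through the new IPEXLLMLLMV1Engine / IPEXLLMAsyncV1Engine paths added in this patch; otherwise the classic engines are used.

    # Minimal sketch; model path and sampling settings are placeholders.
    # Optionally export VLLM_USE_V1=1 beforehand to exercise the new V1 engine path.
    from vllm import SamplingParams
    from ipex_llm.vllm.xpu.engine import IPEXLLMClass  # assumed re-export; the class lives in engine/engine.py

    llm = IPEXLLMClass(
        model="/llm/models/Llama-2-7b-chat-hf",  # hypothetical local path
        device="xpu",
        dtype="float16",
        enforce_eager=True,
        load_in_low_bit="sym_int4",              # IPEX-LLM low-bit format handled by this patch
        gpu_memory_utilization=0.85,
    )

    outputs = llm.generate(
        ["What is AI?"],
        SamplingParams(temperature=0.0, max_tokens=32),
    )
    for out in outputs:
        print(out.outputs[0].text)

For online serving, the patched server would be started through the IPEX-LLM entrypoint rather than the stock vLLM one, e.g. python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server --model <model path> --load-in-low-bit sym_int4, matching the --load-in-low-bit argument registered in both api_server.py and cli_args.py above.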