From 7e1e51d91a0e8851ce6686e45372ca1f603caa84 Mon Sep 17 00:00:00 2001
From: Shaojun Liu <61072813+liu-shaojun@users.noreply.github.com>
Date: Wed, 11 Sep 2024 11:45:08 +0800
Subject: [PATCH] Update vllm setting (#12059)

* revert

* update

* update

* update
---
 docker/llm/serving/xpu/docker/Dockerfile      |  1 -
 .../serving/xpu/docker/start-vllm-service.sh  | 19 ++++++++++++++-----
 .../xpu/docker/vllm_online_benchmark.py       |  4 ++--
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile
index bdaeac50..c34e3b29 100644
--- a/docker/llm/serving/xpu/docker/Dockerfile
+++ b/docker/llm/serving/xpu/docker/Dockerfile
@@ -97,6 +97,5 @@ COPY ./start-pp_serving-service.sh /llm/
 COPY ./start-lightweight_serving-service.sh /llm/
 
 ENV LD_LIBRARY_PATH /usr/local/lib/python3.11/dist-packages/intel_extension_for_pytorch/lib/:/opt/intel/oneapi/tbb/2021.12/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/2021.12/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.12/lib:/opt/intel/oneapi/mkl/2024.1/lib:/opt/intel/oneapi/ippcp/2021.11/lib/:/opt/intel/oneapi/ipp/2021.11/lib:/opt/intel/oneapi/dpl/2022.5/lib:/opt/intel/oneapi/dnnl/2024.1/lib:/opt/intel/oneapi/debugger/2024.1/opt/debugger/lib:/opt/intel/oneapi/dal/2024.2/lib:/opt/intel/oneapi/compiler/2024.1/opt/oclfpga/host/linux64/lib:/opt/intel/oneapi/compiler/2024.1/opt/compiler/lib:/opt/intel/oneapi/compiler/2024.1/lib:/opt/intel/oneapi/ccl/2021.12/lib/
-ENV BIGDL_LLM_SDP_IGNORE_MASK 0
 
 WORKDIR /llm/
diff --git a/docker/llm/serving/xpu/docker/start-vllm-service.sh b/docker/llm/serving/xpu/docker/start-vllm-service.sh
index a7860a91..749dbcd7 100644
--- a/docker/llm/serving/xpu/docker/start-vllm-service.sh
+++ b/docker/llm/serving/xpu/docker/start-vllm-service.sh
@@ -1,21 +1,30 @@
 #!/bin/bash
 model="YOUR_MODEL_PATH"
 served_model_name="YOUR_MODEL_NAME"
+
+export CCL_WORKER_COUNT=2
+export FI_PROVIDER=shm
+export CCL_ATL_TRANSPORT=ofi
+export CCL_ZE_IPC_EXCHANGE=sockets
+export CCL_ATL_SHM=1
+
+export USE_XETLA=OFF
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+export TORCH_LLM_ALLREDUCE=0
 
 source /opt/intel/1ccl-wks/setvars.sh
-export BIGDL_LLM_SDP_IGNORE_MASK=0
 
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --served-model-name $served_model_name \
   --port 8000 \
   --model $model \
   --trust-remote-code \
-  --gpu-memory-utilization 0.75 \
+  --gpu-memory-utilization 0.9 \
   --device xpu \
   --dtype float16 \
   --enforce-eager \
-  --load-in-low-bit sym_int4 \
-  --max-model-len 4096 \
-  --max-num-batched-tokens 10240 \
+  --load-in-low-bit fp8 \
+  --max-model-len 2048 \
+  --max-num-batched-tokens 4000 \
   --max-num-seqs 12 \
   --tensor-parallel-size 1
diff --git a/docker/llm/serving/xpu/docker/vllm_online_benchmark.py b/docker/llm/serving/xpu/docker/vllm_online_benchmark.py
index 429a9a18..77bd498e 100644
--- a/docker/llm/serving/xpu/docker/vllm_online_benchmark.py
+++ b/docker/llm/serving/xpu/docker/vllm_online_benchmark.py
@@ -176,7 +176,7 @@ def benchmark(llm_urls, model, prompt, num_requests, max_concurrent_requests, ma
     cur_len = len(cur_llm_urls)
     if dataset is None:
         payload = {
-            "model": model,
+            "model": model_name,
             "prompt": prompt,
             "n": 1,
             "best_of": 1,
@@ -193,7 +193,7 @@ def benchmark(llm_urls, model, prompt, num_requests, max_concurrent_requests, ma
         for index in range(num_requests):
             prompt, prompt_len, output_len = sampled_requests[index]
             payload = {
-                "model": model,
+                "model": model_name,
                 "prompt": prompt,
                 "n": 1,
                 "best_of": 1,
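
A minimal sketch, not part of the patch itself, of how the updated service could be smoke-tested once the container is running start-vllm-service.sh: it assumes the default port 8000 from the script, that YOUR_MODEL_NAME has been replaced with the actual served model name, and that the vLLM OpenAI-compatible /v1/completions route is used; the payload fields mirror those in vllm_online_benchmark.py, while the example prompt and generation parameters below are illustrative assumptions.

import requests

# Assumed endpoint: the OpenAI-compatible completions route served on port 8000
# by start-vllm-service.sh inside the container.
payload = {
    "model": "YOUR_MODEL_NAME",  # must match served_model_name in start-vllm-service.sh
    "prompt": "What is AI?",     # illustrative prompt, not from the patch
    "n": 1,
    "best_of": 1,
    "max_tokens": 32,
    "temperature": 0.0,
}

resp = requests.post("http://localhost:8000/v1/completions", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])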