Update docs and scripts to align with new Docker image release (#13156)

* Update vllm_docker_quickstart.md

* Update start-vllm-service.sh

* Update vllm_docker_quickstart.md

* Update start-vllm-service.sh
This commit is contained in:
Shaojun Liu 2025-05-13 17:06:29 +08:00 committed by GitHub
parent f6441b4e3d
commit bd71739e64
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 18 additions and 16 deletions

View file

@@ -18,22 +18,22 @@ echo "Max model length: $MAX_MODEL_LEN"
echo "Load in low bit: $LOAD_IN_LOW_BIT" echo "Load in low bit: $LOAD_IN_LOW_BIT"
echo "Port: $PORT" echo "Port: $PORT"
export CCL_WORKER_COUNT=2 export USE_XETLA=OFF
export SYCL_CACHE_PERSISTENT=1 export SYCL_CACHE_PERSISTENT=1
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
export FI_PROVIDER=shm export FI_PROVIDER=shm
export TORCH_LLM_ALLREDUCE=0
export CCL_WORKER_COUNT=2 # On BMG, set CCL_WORKER_COUNT=1; otherwise, internal-oneccl will not function properly
export CCL_ATL_TRANSPORT=ofi export CCL_ATL_TRANSPORT=ofi
export CCL_ZE_IPC_EXCHANGE=sockets export CCL_ZE_IPC_EXCHANGE=sockets
export CCL_ATL_SHM=1 export CCL_ATL_SHM=1
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
export TORCH_LLM_ALLREDUCE=0
export CCL_SAME_STREAM=1 export CCL_SAME_STREAM=1
export CCL_BLOCKING_WAIT=0 export CCL_BLOCKING_WAIT=0
# export CCL_DG2_USM=1 # Needed on Core to enable USM (Shared Memory GPUDirect). Xeon supports P2P and doesn't need this.
export VLLM_USE_V1=0 export VLLM_USE_V1=0 # Used to select between V0 and V1 engine
export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT # Ensures low-bit info is used for MoE; otherwise, IPEX's default MoE will be used
source /opt/intel/1ccl-wks/setvars.sh source /opt/intel/1ccl-wks/setvars.sh

View file

@@ -23,7 +23,7 @@ You can either pull a prebuilt Docker image from DockerHub, depending on your ha
* **For Intel Arc BMG GPUs**, use: * **For Intel Arc BMG GPUs**, use:
```bash ```bash
docker pull intelanalytics/multi-arc-serving:0.2.0-b1 docker pull intelanalytics/ipex-llm-serving-xpu:0.2.0-b2
``` ```
@@ -116,20 +116,22 @@ root@ws-arc-001:/llm# sycl-ls
```bash ```bash
export USE_XETLA=OFF export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
export TORCH_LLM_ALLREDUCE=0
export SYCL_CACHE_PERSISTENT=1 export SYCL_CACHE_PERSISTENT=1
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
export FI_PROVIDER=shm export FI_PROVIDER=shm
export CCL_WORKER_COUNT=2 export TORCH_LLM_ALLREDUCE=0
export CCL_WORKER_COUNT=2 # On BMG, set CCL_WORKER_COUNT=1; otherwise, internal-oneccl will not function properly
export CCL_ATL_TRANSPORT=ofi export CCL_ATL_TRANSPORT=ofi
export CCL_ZE_IPC_EXCHANGE=sockets export CCL_ZE_IPC_EXCHANGE=sockets
export CCL_ATL_SHM=1 export CCL_ATL_SHM=1
export CCL_SAME_STREAM=1 export CCL_SAME_STREAM=1
export CCL_BLOCKING_WAIT=0 export CCL_BLOCKING_WAIT=0
# export CCL_DG2_USM=1 # Needed on Core to enable USM (Shared Memory GPUDirect). Xeon supports P2P and doesn't need this.
export VLLM_USE_V1=0 export LOAD_IN_LOW_BIT="fp8"
export IPEX_LLM_LOWBIT="fp8" export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT # Ensures low-bit info is used for MoE; otherwise, IPEX's default MoE will be used
export VLLM_USE_V1=0 # Used to select between V0 and V1 engine
source /opt/intel/1ccl-wks/setvars.sh source /opt/intel/1ccl-wks/setvars.sh
@@ -142,7 +144,7 @@ numactl -C 0-11 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
--device xpu \ --device xpu \
--dtype float16 \ --dtype float16 \
--enforce-eager \ --enforce-eager \
--load-in-low-bit "fp8" \ --load-in-low-bit $LOAD_IN_LOW_BIT \
--max-model-len "2000" \ --max-model-len "2000" \
--max-num-batched-tokens "3000" \ --max-num-batched-tokens "3000" \
--max-num-seqs "256" \ --max-num-seqs "256" \