Update docs and scripts to align with new Docker image release (#13156)
* Update vllm_docker_quickstart.md
* Update start-vllm-service.sh
* Update vllm_docker_quickstart.md
* Update start-vllm-service.sh
This commit is contained in:
parent
f6441b4e3d
commit
bd71739e64
2 changed files with 18 additions and 16 deletions
|
|
@@ -18,22 +18,22 @@ echo "Max model length: $MAX_MODEL_LEN"
|
||||||
echo "Load in low bit: $LOAD_IN_LOW_BIT"
|
echo "Load in low bit: $LOAD_IN_LOW_BIT"
|
||||||
echo "Port: $PORT"
|
echo "Port: $PORT"
|
||||||
|
|
||||||
export CCL_WORKER_COUNT=2
|
export USE_XETLA=OFF
|
||||||
export SYCL_CACHE_PERSISTENT=1
|
export SYCL_CACHE_PERSISTENT=1
|
||||||
|
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
|
||||||
export FI_PROVIDER=shm
|
export FI_PROVIDER=shm
|
||||||
|
export TORCH_LLM_ALLREDUCE=0
|
||||||
|
|
||||||
|
export CCL_WORKER_COUNT=2 # On BMG, set CCL_WORKER_COUNT=1; otherwise, internal-oneccl will not function properly
|
||||||
export CCL_ATL_TRANSPORT=ofi
|
export CCL_ATL_TRANSPORT=ofi
|
||||||
export CCL_ZE_IPC_EXCHANGE=sockets
|
export CCL_ZE_IPC_EXCHANGE=sockets
|
||||||
export CCL_ATL_SHM=1
|
export CCL_ATL_SHM=1
|
||||||
|
|
||||||
export USE_XETLA=OFF
|
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
|
|
||||||
export TORCH_LLM_ALLREDUCE=0
|
|
||||||
|
|
||||||
export CCL_SAME_STREAM=1
|
export CCL_SAME_STREAM=1
|
||||||
export CCL_BLOCKING_WAIT=0
|
export CCL_BLOCKING_WAIT=0
|
||||||
|
# export CCL_DG2_USM=1 # Needed on Core to enable USM (Shared Memory GPUDirect). Xeon supports P2P and doesn't need this.
|
||||||
|
|
||||||
export VLLM_USE_V1=0
|
export VLLM_USE_V1=0 # Used to select between V0 and V1 engine
|
||||||
export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT
|
export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT # Ensures low-bit info is used for MoE; otherwise, IPEX's default MoE will be used
|
||||||
|
|
||||||
source /opt/intel/1ccl-wks/setvars.sh
|
source /opt/intel/1ccl-wks/setvars.sh
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -23,7 +23,7 @@ You can either pull a prebuilt Docker image from DockerHub, depending on your ha
|
||||||
* **For Intel Arc BMG GPUs**, use:
|
* **For Intel Arc BMG GPUs**, use:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker pull intelanalytics/multi-arc-serving:0.2.0-b1
|
docker pull intelanalytics/ipex-llm-serving-xpu:0.2.0-b2
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -116,20 +116,22 @@ root@ws-arc-001:/llm# sycl-ls
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export USE_XETLA=OFF
|
export USE_XETLA=OFF
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
|
|
||||||
export TORCH_LLM_ALLREDUCE=0
|
|
||||||
export SYCL_CACHE_PERSISTENT=1
|
export SYCL_CACHE_PERSISTENT=1
|
||||||
|
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
|
||||||
export FI_PROVIDER=shm
|
export FI_PROVIDER=shm
|
||||||
export CCL_WORKER_COUNT=2
|
export TORCH_LLM_ALLREDUCE=0
|
||||||
|
|
||||||
|
export CCL_WORKER_COUNT=2 # On BMG, set CCL_WORKER_COUNT=1; otherwise, internal-oneccl will not function properly
|
||||||
export CCL_ATL_TRANSPORT=ofi
|
export CCL_ATL_TRANSPORT=ofi
|
||||||
export CCL_ZE_IPC_EXCHANGE=sockets
|
export CCL_ZE_IPC_EXCHANGE=sockets
|
||||||
export CCL_ATL_SHM=1
|
export CCL_ATL_SHM=1
|
||||||
export CCL_SAME_STREAM=1
|
export CCL_SAME_STREAM=1
|
||||||
export CCL_BLOCKING_WAIT=0
|
export CCL_BLOCKING_WAIT=0
|
||||||
|
# export CCL_DG2_USM=1 # Needed on Core to enable USM (Shared Memory GPUDirect). Xeon supports P2P and doesn't need this.
|
||||||
|
|
||||||
export VLLM_USE_V1=0
|
export LOAD_IN_LOW_BIT="fp8"
|
||||||
export IPEX_LLM_LOWBIT="fp8"
|
export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT # Ensures low-bit info is used for MoE; otherwise, IPEX's default MoE will be used
|
||||||
|
export VLLM_USE_V1=0 # Used to select between V0 and V1 engine
|
||||||
|
|
||||||
source /opt/intel/1ccl-wks/setvars.sh
|
source /opt/intel/1ccl-wks/setvars.sh
|
||||||
|
|
||||||
|
|
@@ -142,7 +144,7 @@ numactl -C 0-11 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
|
||||||
--device xpu \
|
--device xpu \
|
||||||
--dtype float16 \
|
--dtype float16 \
|
||||||
--enforce-eager \
|
--enforce-eager \
|
||||||
--load-in-low-bit "fp8" \
|
--load-in-low-bit $LOAD_IN_LOW_BIT \
|
||||||
--max-model-len "2000" \
|
--max-model-len "2000" \
|
||||||
--max-num-batched-tokens "3000" \
|
--max-num-batched-tokens "3000" \
|
||||||
--max-num-seqs "256" \
|
--max-num-seqs "256" \
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue