Update docs and scripts to align with new Docker image release (#13156)
* Update vllm_docker_quickstart.md * Update start-vllm-service.sh * Update vllm_docker_quickstart.md * Update start-vllm-service.sh
This commit is contained in:
		
							parent
							
								
									f6441b4e3d
								
							
						
					
					
						commit
						bd71739e64
					
				
					 2 changed files with 18 additions and 16 deletions
				
			
		| 
						 | 
					@ -18,22 +18,22 @@ echo "Max model length: $MAX_MODEL_LEN"
 | 
				
			||||||
echo "Load in low bit: $LOAD_IN_LOW_BIT"
 | 
					echo "Load in low bit: $LOAD_IN_LOW_BIT"
 | 
				
			||||||
echo "Port: $PORT"
 | 
					echo "Port: $PORT"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
export CCL_WORKER_COUNT=2
 | 
					export USE_XETLA=OFF
 | 
				
			||||||
export SYCL_CACHE_PERSISTENT=1
 | 
					export SYCL_CACHE_PERSISTENT=1
 | 
				
			||||||
 | 
					export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 | 
				
			||||||
export FI_PROVIDER=shm
 | 
					export FI_PROVIDER=shm
 | 
				
			||||||
 | 
					export TORCH_LLM_ALLREDUCE=0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					export CCL_WORKER_COUNT=2        # On BMG, set CCL_WORKER_COUNT=1; otherwise, internal-oneccl will not function properly
 | 
				
			||||||
export CCL_ATL_TRANSPORT=ofi
 | 
					export CCL_ATL_TRANSPORT=ofi
 | 
				
			||||||
export CCL_ZE_IPC_EXCHANGE=sockets
 | 
					export CCL_ZE_IPC_EXCHANGE=sockets
 | 
				
			||||||
export CCL_ATL_SHM=1
 | 
					export CCL_ATL_SHM=1
 | 
				
			||||||
 | 
					 | 
				
			||||||
export USE_XETLA=OFF
 | 
					 | 
				
			||||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 | 
					 | 
				
			||||||
export TORCH_LLM_ALLREDUCE=0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
export CCL_SAME_STREAM=1
 | 
					export CCL_SAME_STREAM=1
 | 
				
			||||||
export CCL_BLOCKING_WAIT=0
 | 
					export CCL_BLOCKING_WAIT=0
 | 
				
			||||||
 | 
					# export CCL_DG2_USM=1         # Needed on Core to enable USM (Shared Memory GPUDirect). Xeon supports P2P and doesn't need this.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
export VLLM_USE_V1=0
 | 
					export VLLM_USE_V1=0       # Used to select between V0 and V1 engine
 | 
				
			||||||
export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT
 | 
					export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT        # Ensures low-bit info is used for MoE; otherwise, IPEX's default MoE will be used
 | 
				
			||||||
 | 
					
 | 
				
			||||||
source /opt/intel/1ccl-wks/setvars.sh
 | 
					source /opt/intel/1ccl-wks/setvars.sh
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -23,7 +23,7 @@ You can either pull a prebuilt Docker image from DockerHub, depending on your ha
 | 
				
			||||||
* **For Intel Arc BMG GPUs**, use:
 | 
					* **For Intel Arc BMG GPUs**, use:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ```bash
 | 
					  ```bash
 | 
				
			||||||
  docker pull intelanalytics/multi-arc-serving:0.2.0-b1
 | 
					  docker pull intelanalytics/ipex-llm-serving-xpu:0.2.0-b2
 | 
				
			||||||
  ```
 | 
					  ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -116,20 +116,22 @@ root@ws-arc-001:/llm# sycl-ls
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```bash
 | 
					```bash
 | 
				
			||||||
export USE_XETLA=OFF
 | 
					export USE_XETLA=OFF
 | 
				
			||||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 | 
					 | 
				
			||||||
export TORCH_LLM_ALLREDUCE=0
 | 
					 | 
				
			||||||
export SYCL_CACHE_PERSISTENT=1
 | 
					export SYCL_CACHE_PERSISTENT=1
 | 
				
			||||||
 | 
					export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 | 
				
			||||||
export FI_PROVIDER=shm
 | 
					export FI_PROVIDER=shm
 | 
				
			||||||
export CCL_WORKER_COUNT=2
 | 
					export TORCH_LLM_ALLREDUCE=0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					export CCL_WORKER_COUNT=2        # On BMG, set CCL_WORKER_COUNT=1; otherwise, internal-oneccl will not function properly
 | 
				
			||||||
export CCL_ATL_TRANSPORT=ofi
 | 
					export CCL_ATL_TRANSPORT=ofi
 | 
				
			||||||
export CCL_ZE_IPC_EXCHANGE=sockets
 | 
					export CCL_ZE_IPC_EXCHANGE=sockets
 | 
				
			||||||
export CCL_ATL_SHM=1
 | 
					export CCL_ATL_SHM=1
 | 
				
			||||||
export CCL_SAME_STREAM=1
 | 
					export CCL_SAME_STREAM=1
 | 
				
			||||||
export CCL_BLOCKING_WAIT=0
 | 
					export CCL_BLOCKING_WAIT=0
 | 
				
			||||||
 | 
					# export CCL_DG2_USM=1         # Needed on Core to enable USM (Shared Memory GPUDirect). Xeon supports P2P and doesn't need this.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
export VLLM_USE_V1=0
 | 
					export LOAD_IN_LOW_BIT="fp8"
 | 
				
			||||||
export IPEX_LLM_LOWBIT="fp8"
 | 
					export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT        # Ensures low-bit info is used for MoE; otherwise, IPEX's default MoE will be used
 | 
				
			||||||
 | 
					export VLLM_USE_V1=0        # Used to select between V0 and V1 engine
 | 
				
			||||||
 | 
					
 | 
				
			||||||
source /opt/intel/1ccl-wks/setvars.sh
 | 
					source /opt/intel/1ccl-wks/setvars.sh
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -142,7 +144,7 @@ numactl -C 0-11 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
 | 
				
			||||||
  --device xpu \
 | 
					  --device xpu \
 | 
				
			||||||
  --dtype float16 \
 | 
					  --dtype float16 \
 | 
				
			||||||
  --enforce-eager \
 | 
					  --enforce-eager \
 | 
				
			||||||
  --load-in-low-bit "fp8" \
 | 
					  --load-in-low-bit $LOAD_IN_LOW_BIT \
 | 
				
			||||||
  --max-model-len "2000" \
 | 
					  --max-model-len "2000" \
 | 
				
			||||||
  --max-num-batched-tokens "3000" \
 | 
					  --max-num-batched-tokens "3000" \
 | 
				
			||||||
  --max-num-seqs "256" \
 | 
					  --max-num-seqs "256" \
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue