Update start-vllm-service.sh (#13109)
commit cf97d8f1d7
parent 9808fb1ac2

1 changed file with 19 additions and 8 deletions
			
start-vllm-service.sh

@@ -1,11 +1,22 @@
 #!/bin/bash
 MODEL_PATH=${MODEL_PATH:-"default_model_path"}
 SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"default_model_name"}
-TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-1}  # Default to 1 if not set
-
+TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-1}
+
+MAX_NUM_SEQS=${MAX_NUM_SEQS:-256}
+MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-3000}
+MAX_MODEL_LEN=${MAX_MODEL_LEN:-2000}
+LOAD_IN_LOW_BIT=${LOAD_IN_LOW_BIT:-"fp8"}
+PORT=${PORT:-8000}
+
 echo "Starting service with model: $MODEL_PATH"
 echo "Served model name: $SERVED_MODEL_NAME"
 echo "Tensor parallel size: $TENSOR_PARALLEL_SIZE"
-
+echo "Max num sequences: $MAX_NUM_SEQS"
+echo "Max num batched tokens: $MAX_NUM_BATCHED_TOKENS"
+echo "Max model length: $MAX_MODEL_LEN"
+echo "Load in low bit: $LOAD_IN_LOW_BIT"
+echo "Port: $PORT"
+
 export CCL_WORKER_COUNT=2
 export SYCL_CACHE_PERSISTENT=1
@@ -25,7 +36,7 @@ source /opt/intel/1ccl-wks/setvars.sh
 
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --served-model-name $SERVED_MODEL_NAME \
-  --port 8000 \
+  --port $PORT \
   --model $MODEL_PATH \
   --trust-remote-code \
   --block-size 8 \
@@ -33,10 +44,10 @@ python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --device xpu \
   --dtype float16 \
   --enforce-eager \
-  --load-in-low-bit fp8 \
-  --max-model-len 2000 \
-  --max-num-batched-tokens 3000 \
-  --max-num-seqs 256 \
+  --load-in-low-bit $LOAD_IN_LOW_BIT \
+  --max-model-len $MAX_MODEL_LEN \
+  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
+  --max-num-seqs $MAX_NUM_SEQS \
   --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
   --disable-async-output-proc \
   --distributed-executor-backend ray
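The practical effect of the change: every tunable that was hard-coded (port, low-bit format, max model length, batched-token and sequence limits) now comes from an environment variable with the old value as its default, using the standard ${VAR:-default} fallback. A minimal usage sketch; the model path, served name, and override values below are illustrative, not from the commit:

  # Override any subset of the knobs at launch; unset ones keep their defaults.
  MODEL_PATH=/llm/models/my-model \
  SERVED_MODEL_NAME=my-model \
  TENSOR_PARALLEL_SIZE=2 \
  MAX_MODEL_LEN=4096 \
  PORT=8001 \
  bash start-vllm-service.sh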
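Since the script launches an OpenAI-compatible api_server, a quick smoke test against the configured port should work once the service is up. A sketch, assuming the default PORT of 8000, that the ipex-llm entrypoint exposes vLLM's standard /v1 routes, and that SERVED_MODEL_NAME was left at its default:

  # Confirm the server is up and reports the served model name.
  curl -s http://localhost:8000/v1/models

  # Minimal completion request; "model" must match SERVED_MODEL_NAME.
  curl -s http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "default_model_name", "prompt": "Hello", "max_tokens": 16}'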