Update start-vllm-service.sh (#13109)
parent 9808fb1ac2
commit cf97d8f1d7
1 changed file with 19 additions and 8 deletions
start-vllm-service.sh

@@ -1,11 +1,22 @@
 #!/bin/bash
 MODEL_PATH=${MODEL_PATH:-"default_model_path"}
 SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"default_model_name"}
-TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-1} # Default to 1 if not set
+TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-1}
+MAX_NUM_SEQS=${MAX_NUM_SEQS:-256}
+MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-3000}
+MAX_MODEL_LEN=${MAX_MODEL_LEN:-2000}
+LOAD_IN_LOW_BIT=${LOAD_IN_LOW_BIT:-"fp8"}
+PORT=${PORT:-8000}
 
 echo "Starting service with model: $MODEL_PATH"
 echo "Served model name: $SERVED_MODEL_NAME"
 echo "Tensor parallel size: $TENSOR_PARALLEL_SIZE"
+echo "Max num sequences: $MAX_NUM_SEQS"
+echo "Max num batched tokens: $MAX_NUM_BATCHED_TOKENS"
+echo "Max model length: $MAX_MODEL_LEN"
+echo "Load in low bit: $LOAD_IN_LOW_BIT"
+echo "Port: $PORT"
 
 export CCL_WORKER_COUNT=2
 export SYCL_CACHE_PERSISTENT=1
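Every variable above uses bash's ${VAR:-default} expansion, so each setting can be overridden from the caller's environment without editing the script. A minimal sketch of the pattern (the echoed output shown is illustrative):

  PORT=${PORT:-8000}   # keep PORT if the caller already set/exported it, else fall back to 8000
  echo "Port: $PORT"   # prints 8000 unless a different PORT was exported before running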
@@ -25,7 +36,7 @@ source /opt/intel/1ccl-wks/setvars.sh
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --served-model-name $SERVED_MODEL_NAME \
-  --port 8000 \
+  --port $PORT \
   --model $MODEL_PATH \
   --trust-remote-code \
   --block-size 8 \
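With the hard-coded --port 8000 replaced by $PORT, the listen port follows the environment. A quick smoke test once the service is up might look like this (assumes the server runs locally, PORT matches the launch value, and that this entrypoint exposes vLLM's standard OpenAI-compatible model-listing route):

  curl http://localhost:${PORT:-8000}/v1/models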
@@ -33,10 +44,10 @@ python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --device xpu \
   --dtype float16 \
   --enforce-eager \
-  --load-in-low-bit fp8 \
-  --max-model-len 2000 \
-  --max-num-batched-tokens 3000 \
-  --max-num-seqs 256 \
+  --load-in-low-bit $LOAD_IN_LOW_BIT \
+  --max-model-len $MAX_MODEL_LEN \
+  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
+  --max-num-seqs $MAX_NUM_SEQS \
   --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
   --disable-async-output-proc \
   --distributed-executor-backend ray
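With every tunable now read from the environment, the script can be launched with overrides and no edits. A hypothetical invocation (the model path and served name are placeholders, not values from this commit):

  MODEL_PATH=/llm/models/my-model \
  SERVED_MODEL_NAME=my-model \
  TENSOR_PARALLEL_SIZE=2 \
  MAX_MODEL_LEN=4096 \
  PORT=8001 \
  bash start-vllm-service.sh

Any variable left unset falls back to the defaults defined at the top of the script.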