parent: 30a8680645
commit: 7e1e51d91a
3 changed files with 16 additions and 8 deletions
@@ -97,6 +97,5 @@ COPY ./start-pp_serving-service.sh /llm/
 COPY ./start-lightweight_serving-service.sh /llm/
 
 ENV LD_LIBRARY_PATH /usr/local/lib/python3.11/dist-packages/intel_extension_for_pytorch/lib/:/opt/intel/oneapi/tbb/2021.12/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/2021.12/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.12/lib:/opt/intel/oneapi/mkl/2024.1/lib:/opt/intel/oneapi/ippcp/2021.11/lib/:/opt/intel/oneapi/ipp/2021.11/lib:/opt/intel/oneapi/dpl/2022.5/lib:/opt/intel/oneapi/dnnl/2024.1/lib:/opt/intel/oneapi/debugger/2024.1/opt/debugger/lib:/opt/intel/oneapi/dal/2024.2/lib:/opt/intel/oneapi/compiler/2024.1/opt/oclfpga/host/linux64/lib:/opt/intel/oneapi/compiler/2024.1/opt/compiler/lib:/opt/intel/oneapi/compiler/2024.1/lib:/opt/intel/oneapi/ccl/2021.12/lib/
-ENV BIGDL_LLM_SDP_IGNORE_MASK 0
 
 WORKDIR /llm/
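Note: this commit drops BIGDL_LLM_SDP_IGNORE_MASK from both the image (above) and the start script (below), so nothing sets it anymore. A quick sanity check after rebuilding, with a hypothetical image tag substituted for whatever this Dockerfile is built into:

# Hypothetical tag; replace with your actual build.
docker run --rm ipex-llm-serving-xpu:test bash -lc \
    'printenv BIGDL_LLM_SDP_IGNORE_MASK || echo "unset (as expected after this change)"'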
@@ -1,21 +1,30 @@
 #!/bin/bash
 model="YOUR_MODEL_PATH"
 served_model_name="YOUR_MODEL_NAME"
 
+export CCL_WORKER_COUNT=2
+export FI_PROVIDER=shm
+export CCL_ATL_TRANSPORT=ofi
+export CCL_ZE_IPC_EXCHANGE=sockets
+export CCL_ATL_SHM=1
+
+export USE_XETLA=OFF
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+export TORCH_LLM_ALLREDUCE=0
+
 source /opt/intel/1ccl-wks/setvars.sh
-export BIGDL_LLM_SDP_IGNORE_MASK=0
 
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --served-model-name $served_model_name \
   --port 8000 \
   --model $model \
   --trust-remote-code \
-  --gpu-memory-utilization 0.75 \
+  --gpu-memory-utilization 0.9 \
   --device xpu \
   --dtype float16 \
   --enforce-eager \
-  --load-in-low-bit sym_int4 \
-  --max-model-len 4096 \
-  --max-num-batched-tokens 10240 \
+  --load-in-low-bit fp8 \
+  --max-model-len 2048 \
+  --max-num-batched-tokens 4000 \
   --max-num-seqs 12 \
   --tensor-parallel-size 1
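With the tuned values above (fp8 low-bit weights, a 2048-token context, and a 0.9 GPU memory fraction), the script still launches vLLM's OpenAI-compatible server on port 8000. A minimal smoke test against it, assuming the default /v1/completions route and the placeholder name from the script:

curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "YOUR_MODEL_NAME",
          "prompt": "San Francisco is a",
          "max_tokens": 64,
          "temperature": 0
        }'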
@@ -176,7 +176,7 @@ def benchmark(llm_urls, model, prompt, num_requests, max_concurrent_requests, ma
     cur_len = len(cur_llm_urls)
     if dataset is None:
         payload = {
-            "model": model,
+            "model": model_name,
             "prompt": prompt,
             "n": 1,
             "best_of": 1,
@@ -193,7 +193,7 @@ def benchmark(llm_urls, model, prompt, num_requests, max_concurrent_requests, ma
     for index in range(num_requests):
         prompt, prompt_len, output_len = sampled_requests[index]
         payload = {
-            "model": model,
+            "model": model_name,
             "prompt": prompt,
             "n": 1,
             "best_of": 1,
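Both payload fixes swap the model path (model) for the registered serving name (model_name): vLLM's OpenAI server rejects requests whose "model" field does not match a name registered via --served-model-name, so benchmarks sending the raw path would fail. When in doubt, the server can be asked directly which names it accepts (standard OpenAI-style route, sketch only):

curl http://localhost:8000/v1/models
# Each "id" in the response is a name registered with --served-model-name;
# that is the value the benchmark payloads must carry in "model".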