parent 30a8680645
commit 7e1e51d91a
3 changed files with 16 additions and 8 deletions

@@ -97,6 +97,5 @@ COPY ./start-pp_serving-service.sh /llm/
 COPY ./start-lightweight_serving-service.sh /llm/
 
 ENV LD_LIBRARY_PATH /usr/local/lib/python3.11/dist-packages/intel_extension_for_pytorch/lib/:/opt/intel/oneapi/tbb/2021.12/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/2021.12/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.12/lib:/opt/intel/oneapi/mkl/2024.1/lib:/opt/intel/oneapi/ippcp/2021.11/lib/:/opt/intel/oneapi/ipp/2021.11/lib:/opt/intel/oneapi/dpl/2022.5/lib:/opt/intel/oneapi/dnnl/2024.1/lib:/opt/intel/oneapi/debugger/2024.1/opt/debugger/lib:/opt/intel/oneapi/dal/2024.2/lib:/opt/intel/oneapi/compiler/2024.1/opt/oclfpga/host/linux64/lib:/opt/intel/oneapi/compiler/2024.1/opt/compiler/lib:/opt/intel/oneapi/compiler/2024.1/lib:/opt/intel/oneapi/ccl/2021.12/lib/
-ENV BIGDL_LLM_SDP_IGNORE_MASK 0
 
 WORKDIR /llm/

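As a quick sanity check (not part of this commit), the oneAPI runtime directories set by the ENV above can be verified inside the running container; a minimal hypothetical Python sketch:

import os

# Hypothetical check: list the oneAPI entries on LD_LIBRARY_PATH to confirm
# the Dockerfile ENV took effect inside the container.
paths = os.environ.get("LD_LIBRARY_PATH", "").split(":")
print([p for p in paths if p.startswith("/opt/intel/oneapi/")])
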
@@ -1,21 +1,30 @@
 #!/bin/bash
 model="YOUR_MODEL_PATH"
+served_model_name="YOUR_MODEL_NAME"
 
+export CCL_WORKER_COUNT=2
+export FI_PROVIDER=shm
+export CCL_ATL_TRANSPORT=ofi
+export CCL_ZE_IPC_EXCHANGE=sockets
+export CCL_ATL_SHM=1
+
 export USE_XETLA=OFF
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=0
 
+source /opt/intel/1ccl-wks/setvars.sh
+export BIGDL_LLM_SDP_IGNORE_MASK=0
 
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
+  --served-model-name $served_model_name \
   --port 8000 \
   --model $model \
   --trust-remote-code \
-  --gpu-memory-utilization 0.75 \
+  --gpu-memory-utilization 0.9 \
   --device xpu \
   --dtype float16 \
   --enforce-eager \
-  --load-in-low-bit sym_int4 \
-  --max-model-len 4096 \
-  --max-num-batched-tokens 10240 \
+  --load-in-low-bit fp8 \
+  --max-model-len 2048 \
+  --max-num-batched-tokens 4000 \
+  --max-num-seqs 12 \
   --tensor-parallel-size 1

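Once the updated script is running, the server exposes vLLM's OpenAI-compatible REST API. A minimal sketch of a completion request against it, assuming the server listens on localhost:8000 and that "YOUR_MODEL_NAME" was replaced with the value passed via --served-model-name:

import requests

# Query the OpenAI-compatible /v1/completions endpoint served by the script above.
payload = {
    "model": "YOUR_MODEL_NAME",      # must match --served-model-name
    "prompt": "San Francisco is a",  # example prompt (hypothetical)
    "max_tokens": 64,
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
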
@@ -176,7 +176,7 @@ def benchmark(llm_urls, model, prompt, num_requests, max_concurrent_requests, ma
     cur_len = len(cur_llm_urls)
     if dataset is None:
         payload = {
-            "model": model,
+            "model": model_name,
             "prompt": prompt,
             "n": 1,
             "best_of": 1,

@@ -193,7 +193,7 @@ def benchmark(llm_urls, model, prompt, num_requests, max_concurrent_requests, ma
     for index in range(num_requests):
         prompt, prompt_len, output_len = sampled_requests[index]
         payload = {
-            "model": model,
+            "model": model_name,
             "prompt": prompt,
             "n": 1,
             "best_of": 1,

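A payload like the one assembled above is ultimately POSTed to each URL in llm_urls. The following is a hedged sketch of that request using the synchronous requests library; the benchmark itself may use a different HTTP client and concurrency mechanism:

import requests

def send_completion(llm_url, payload):
    # Hypothetical helper: POST one completion payload and return the text.
    # The real benchmark's client, timeouts, and error handling may differ.
    resp = requests.post(llm_url, json=payload, timeout=600)
    resp.raise_for_status()
    return resp.json()["choices"][0]["text"]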