Upgrade oneccl version to 0.0.6.3 (#12560)
* Update Dockerfile
* Update start-vllm-service.sh
parent 47da3c999f
commit 51ff9ebd8a

2 changed files with 8 additions and 4 deletions
Dockerfile
@@ -64,8 +64,8 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
     cd /tmp/ && \
     pip install torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 intel-extension-for-pytorch==2.1.30.post0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
     # Internal oneccl
-    wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.6.2-release/oneccl_wks_installer_2024.0.0.6.2.sh && \
-    bash oneccl_wks_installer_2024.0.0.6.2.sh && \
+    wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.6.3-release/oneccl_wks_installer_2024.0.0.6.3.sh && \
+    bash oneccl_wks_installer_2024.0.0.6.3.sh && \
     git clone https://github.com/intel/torch-ccl -b v2.1.300+xpu && \
     cd torch-ccl && \
     patch -p1 < /tmp/oneccl-binding.patch && \
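A quick way to confirm the bumped oneccl workspace landed in the rebuilt image is to source its setvars script (the /opt/intel/1ccl-wks path comes from start-vllm-service.sh below). A minimal sketch, assuming a locally built image tagged ipex-llm-serving-xpu (hypothetical tag, substitute your own):

    # Hypothetical image tag; use whatever tag your docker build produced.
    docker run --rm ipex-llm-serving-xpu bash -c \
        'source /opt/intel/1ccl-wks/setvars.sh && ls /opt/intel/1ccl-wks'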
start-vllm-service.sh
@@ -3,6 +3,7 @@ model="YOUR_MODEL_PATH"
 served_model_name="YOUR_MODEL_NAME"
 
 export CCL_WORKER_COUNT=2
+export SYCL_CACHE_PERSISTENT=1
 export FI_PROVIDER=shm
 export CCL_ATL_TRANSPORT=ofi
 export CCL_ZE_IPC_EXCHANGE=sockets
@@ -12,6 +13,9 @@ export USE_XETLA=OFF
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=0
 
+export CCL_SAME_STREAM=1
+export CCL_BLOCKING_WAIT=0
+
 source /opt/intel/1ccl-wks/setvars.sh
 
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
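The three new exports (SYCL_CACHE_PERSISTENT, CCL_SAME_STREAM, CCL_BLOCKING_WAIT) are ordinary environment variables, so it is easy to sanity-check that they will be inherited by the server process before launching it. A minimal sketch; the variable names are taken from the script above, the grep pattern itself is only illustrative:

    # Run after the export lines above; lists the CCL/SYCL knobs the
    # vLLM process will inherit from this shell.
    env | grep -E '^(CCL_|SYCL_|FI_PROVIDER|TORCH_LLM_ALLREDUCE|USE_XETLA)'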
@@ -20,14 +24,14 @@ python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --model $model \
   --trust-remote-code \
   --block-size 8 \
-  --gpu-memory-utilization 0.9 \
+  --gpu-memory-utilization 0.95 \
   --device xpu \
   --dtype float16 \
   --enforce-eager \
   --load-in-low-bit fp8 \
   --max-model-len 2048 \
   --max-num-batched-tokens 4000 \
-  --max-num-seqs 12 \
+  --max-num-seqs 256 \
   --tensor-parallel-size 1 \
   --disable-async-output-proc \
   --distributed-executor-backend ray
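Once the service is up, the OpenAI-compatible endpoint can be smoke-tested with curl. A minimal sketch, assuming vLLM's default port 8000 (the port flag is outside this hunk) and the served_model_name placeholder from the script; both values are assumptions to be replaced with your actual settings:

    # Assumed values: port 8000 is vLLM's default, and "model" must match
    # whatever served_model_name was set to in start-vllm-service.sh.
    curl http://localhost:8000/v1/completions \
        -H "Content-Type: application/json" \
        -d '{"model": "YOUR_MODEL_NAME", "prompt": "Hello", "max_tokens": 32}'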