Reenable pp serving and lightweight serving on 0.6.6 (#12814)

* reenable pp and lightweight serving on 0.6.6

* update readme

* update

* update tag
Wang, Jian4 2025-02-13 10:16:00 +08:00 committed by GitHub
parent af693425f1
commit 1083fe5508
5 changed files with 22 additions and 5 deletions


@@ -93,6 +93,12 @@ RUN apt-get update && \
 cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \
 cp -r ./ipex-llm/python/llm/example/GPU/HuggingFace/LLM ./examples && \
 cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
+# Download pp_serving
+mkdir -p /llm/pp_serving && \
+cp ./ipex-llm/python/llm/example/GPU/Pipeline-Parallel-Serving/*.py /llm/pp_serving/ && \
+# Download lightweight_serving
+mkdir -p /llm/lightweight_serving && \
+cp ./ipex-llm/python/llm/example/GPU/Lightweight-Serving/*.py /llm/lightweight_serving/ && \
 rm -rf ./ipex-llm && \
 # Install vllm dependencies
 pip install --upgrade fastapi && \
@@ -120,7 +126,7 @@ RUN apt-get update && \
 cd /llm && \
 rm -rf /tmp/neo && \
 # Install vllm
-git clone -b 0.6.6-pre https://github.com/analytics-zoo/vllm.git /llm/vllm && \
+git clone -b 0.6.6 https://github.com/analytics-zoo/vllm.git /llm/vllm && \
 cd /llm/vllm && \
 pip install setuptools-scm && \
 pip install --upgrade cmake && \
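A minimal usage sketch (not part of this commit): inside a container built from this Dockerfile, the hunk above leaves the copied example scripts under /llm. The image tag and the Intel GPU device mapping below are assumptions.

    docker run --rm --device=/dev/dri <your-serving-image-tag> \
        ls /llm/pp_serving /llm/lightweight_serving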


@@ -49,14 +49,14 @@ Currently, we provide two different serving engines in the image, which are Fast
 To run Lightweight serving on one intel gpu using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Lightweight-Serving).
-For convenience, we have included a file `/llm/start-lightweight_serving-service` in the image.
+For convenience, we have included a file `/llm/start-lightweight_serving-service` in the image. You need to install the appropriate transformers version first, e.g. `pip install transformers==4.37.0`.
 #### Pipeline parallel serving engine
 To run Pipeline parallel serving using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Pipeline-Parallel-FastAPI).
-For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image.
+For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image. You need to install the appropriate transformers version first, e.g. `pip install transformers==4.37.0`.
 #### vLLM serving engine
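For reference, a short sketch of what the updated README text amounts to when run inside the container (the transformers version and the start-script paths are taken from the lines above):

    pip install transformers==4.37.0              # version named in the README
    bash /llm/start-lightweight_serving-service   # lightweight serving on one Intel GPU
    # or, for pipeline parallel serving:
    # bash /llm/start-pp_serving-service.sh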


@@ -0,0 +1,7 @@
+# need to update transformers version first
+# pip install transformers==4.37.0
+cd /llm/lightweight_serving
+export IPEX_LLM_NOT_USE_VLLM=True
+model_path="/llm/models/Llama-2-7b-chat-hf"
+low_bit="sym_int4"
+python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit
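The defaults in this new script can be overridden via its own flags; a hedged variant, with a placeholder model directory and assuming other ipex-llm low-bit formats (for example sym_int8) are accepted by the script:

    cd /llm/lightweight_serving
    export IPEX_LLM_NOT_USE_VLLM=True
    python lightweight_serving.py \
        --repo-id-or-model-path /llm/models/<your-model-dir> \
        --low-bit sym_int8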


@@ -1,9 +1,12 @@
+# update transformers version first
+# pip install transformers==4.37.0
 source /opt/intel/oneapi/setvars.sh --force
+export IPEX_LLM_NOT_USE_VLLM=True
 export no_proxy=localhost
 export FI_PROVIDER=tcp
 export OMP_NUM_THREADS=32
-export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
+#export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
 # source $basekit_root/ccl/latest/env/vars.sh --force
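If tcmalloc preloading is still wanted, the line this hunk comments out can be exported manually before launching, assuming libtcmalloc.so is present under the conda environment in the image:

    export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so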


@@ -70,8 +70,9 @@ def is_auto_awq_available():
 def is_vllm_available():
     global _IS_VLLM_AVAILABLE
+    _IS_VLLM_AVAILABLE = os.getenv("IPEX_LLM_NOT_USE_VLLM", None)
     if _IS_VLLM_AVAILABLE is not None:
-        return _IS_VLLM_AVAILABLE
+        return False
     import sys
     original_path = sys.path
     # Temporally remove current directory
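A minimal sketch of how the new gate is exercised (standalone shell, not the ipex-llm code itself): with the variable exported, the check above returns False, so the lightweight / pipeline-parallel serving paths are taken instead of the vLLM path.

    # the start scripts above export this before launching
    export IPEX_LLM_NOT_USE_VLLM=True
    # same condition as in the hunk, reproduced without importing ipex_llm
    python -c 'import os; print("vLLM disabled" if os.getenv("IPEX_LLM_NOT_USE_VLLM") is not None else "vLLM detection proceeds")'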