Reenable pp and lightweight serving on 0.6.6 (#12814)
* reenable pp and lightweight serving on 0.6.6
* update readme
* update
* update tag
parent af693425f1
commit 1083fe5508

5 changed files with 22 additions and 5 deletions
@@ -93,6 +93,12 @@ RUN apt-get update && \
     cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \
     cp -r ./ipex-llm/python/llm/example/GPU/HuggingFace/LLM ./examples && \
     cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
+    # Download pp_serving
+    mkdir -p /llm/pp_serving && \
+    cp ./ipex-llm/python/llm/example/GPU/Pipeline-Parallel-Serving/*.py /llm/pp_serving/ && \
+    # Download lightweight_serving
+    mkdir -p /llm/lightweight_serving && \
+    cp ./ipex-llm/python/llm/example/GPU/Lightweight-Serving/*.py /llm/lightweight_serving/ && \
     rm -rf ./ipex-llm && \
     # Install vllm dependencies
     pip install --upgrade fastapi && \

@@ -120,7 +126,7 @@ RUN apt-get update && \
     cd /llm && \
     rm -rf /tmp/neo && \
     # Install vllm
-    git clone -b 0.6.6-pre https://github.com/analytics-zoo/vllm.git /llm/vllm && \
+    git clone -b 0.6.6 https://github.com/analytics-zoo/vllm.git /llm/vllm && \
     cd /llm/vllm && \
     pip install setuptools-scm && \
     pip install --upgrade cmake && \
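
The two hunks above change the serving image build: the Pipeline-Parallel-Serving and Lightweight-Serving example scripts are now copied into /llm/pp_serving and /llm/lightweight_serving, and the vLLM clone switches from the 0.6.6-pre branch to 0.6.6. A quick way to confirm the new directories exist in a built image (the image name is a placeholder, and depending on the image's ENTRYPOINT you may need the override shown here):

    docker run --rm --entrypoint ls <ipex-llm-serving-xpu-image> /llm/pp_serving /llm/lightweight_serving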
@@ -49,14 +49,14 @@ Currently, we provide two different serving engines in the image, which are Fast
 
 To run Lightweight serving on one intel gpu using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Lightweight-Serving).
 
-For convenience, we have included a file `/llm/start-lightweight_serving-service` in the image.
+For convenience, we have included a file `/llm/start-lightweight_serving-service` in the image. You need to install the appropriate transformers version first, e.g. `pip install transformers==4.37.0`.
 
 
 #### Pipeline parallel serving engine
 
 To run Pipeline parallel serving using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Pipeline-Parallel-FastAPI).
 
-For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image.
+For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image. You need to install the appropriate transformers version first, e.g. `pip install transformers==4.37.0`.
 
 
 #### vLLM serving engine
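
With the Dockerfile changes above, both start scripts referenced in this README ship in the image. A minimal launch sequence from inside the serving container, assuming the model paths expected by the scripts are mounted (the script names and the pip command are taken from the README lines above):

    pip install transformers==4.37.0
    bash /llm/start-lightweight_serving-service      # lightweight serving on one GPU
    # or, for pipeline parallel serving:
    bash /llm/start-pp_serving-service.sh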
@@ -0,0 +1,7 @@
+# need to update transformers version first
+# pip install transformers==4.37.0
+cd /llm/lightweight_serving
+export IPEX_LLM_NOT_USE_VLLM=True
+model_path="/llm/models/Llama-2-7b-chat-hf"
+low_bit="sym_int4"
+python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit
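
This new start script pins the served checkpoint and quantization format in two variables, so serving a different model only requires editing those lines. A sketch of such an edit (the model path is hypothetical; fp8 is another low-bit format ipex-llm accepts):

    model_path="/llm/models/Qwen2-7B-Instruct"   # hypothetical path; mount your own model here
    low_bit="fp8"
    python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit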
@@ -1,9 +1,12 @@
+# update transformers version first
+# pip install transformers==4.37.0
 source /opt/intel/oneapi/setvars.sh --force
+export IPEX_LLM_NOT_USE_VLLM=True
 export no_proxy=localhost
 export FI_PROVIDER=tcp
 export OMP_NUM_THREADS=32
 
-export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
+#export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
 # source $basekit_root/ccl/latest/env/vars.sh --force
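
Note that this hunk comments out the tcmalloc preload by default. If you want it back, re-export the same line (copied verbatim from the script) before starting the service:

    export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so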
@@ -70,8 +70,9 @@ def is_auto_awq_available():
 
 def is_vllm_available():
     global _IS_VLLM_AVAILABLE
+    _IS_VLLM_AVAILABLE = os.getenv("IPEX_LLM_NOT_USE_VLLM", None)
     if _IS_VLLM_AVAILABLE is not None:
-        return _IS_VLLM_AVAILABLE
+        return False
     import sys
     original_path = sys.path
     # Temporally remove current directory
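
This is the switch the two start scripts rely on: when IPEX_LLM_NOT_USE_VLLM is set to any value, is_vllm_available() returns False, so ipex-llm skips its vLLM integration even though vLLM is installed in the same image. A minimal check, assuming the helper is importable from ipex_llm.transformers.convert (the module path is an assumption, not stated in this diff):

    export IPEX_LLM_NOT_USE_VLLM=True
    python -c "from ipex_llm.transformers.convert import is_vllm_available; print(is_vllm_available())"  # expected: False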