From 1083fe55083f7b48b58b8e9eebe3c605dd2ada71 Mon Sep 17 00:00:00 2001
From: "Wang, Jian4" <61138589+hzjane@users.noreply.github.com>
Date: Thu, 13 Feb 2025 10:16:00 +0800
Subject: [PATCH] Reenable pp and lightweight-serving serving on 0.6.6 (#12814)

* reenable pp ang lightweight serving on 066

* update readme

* updat

* update tag
---
 docker/llm/serving/xpu/docker/Dockerfile                   | 8 +++++++-
 docker/llm/serving/xpu/docker/README.md                    | 4 ++--
 .../xpu/docker/start-lightweight_serving-service.sh        | 7 +++++++
 docker/llm/serving/xpu/docker/start-pp_serving-service.sh  | 5 ++++-
 python/llm/src/ipex_llm/transformers/convert.py            | 3 ++-
 5 files changed, 22 insertions(+), 5 deletions(-)
 create mode 100644 docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh

diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile
index 6b517eda..fd6ee473 100644
--- a/docker/llm/serving/xpu/docker/Dockerfile
+++ b/docker/llm/serving/xpu/docker/Dockerfile
@@ -93,6 +93,12 @@ RUN apt-get update && \
     cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \
     cp -r ./ipex-llm/python/llm/example/GPU/HuggingFace/LLM ./examples && \
     cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
+    # Download pp_serving
+    mkdir -p /llm/pp_serving && \
+    cp ./ipex-llm/python/llm/example/GPU/Pipeline-Parallel-Serving/*.py /llm/pp_serving/ && \
+    # Download lightweight_serving
+    mkdir -p /llm/lightweight_serving && \
+    cp ./ipex-llm/python/llm/example/GPU/Lightweight-Serving/*.py /llm/lightweight_serving/ && \
     rm -rf ./ipex-llm && \
     # Install vllm dependencies
     pip install --upgrade fastapi && \
@@ -120,7 +126,7 @@ RUN apt-get update && \
     cd /llm && \
     rm -rf /tmp/neo && \
     # Install vllm
-    git clone -b 0.6.6-pre https://github.com/analytics-zoo/vllm.git /llm/vllm && \
+    git clone -b 0.6.6 https://github.com/analytics-zoo/vllm.git /llm/vllm && \
     cd /llm/vllm && \
     pip install setuptools-scm && \
     pip install --upgrade cmake && \
diff --git a/docker/llm/serving/xpu/docker/README.md b/docker/llm/serving/xpu/docker/README.md
index ba71fb4a..665067b1 100644
--- a/docker/llm/serving/xpu/docker/README.md
+++ b/docker/llm/serving/xpu/docker/README.md
@@ -49,14 +49,14 @@ Currently, we provide two different serving engines in the image, which are Fast
 
 To run Lightweight serving on one intel gpu using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Lightweight-Serving).
 
-For convenience, we have included a file `/llm/start-lightweight_serving-service` in the image.
+For convenience, we have included a file `/llm/start-lightweight_serving-service` in the image. And need to install the appropriate transformers version first, like `pip install transformers==4.37.0`.
 
 #### Pipeline parallel serving engine
 
 To run Pipeline parallel serving using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Pipeline-Parallel-FastAPI).
 
-For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image.
+For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image. And need to install the appropriate transformers version first, like `pip install transformers==4.37.0`.
 
 #### vLLM serving engine
diff --git a/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh b/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh
new file mode 100644
index 00000000..ce03dbc9
--- /dev/null
+++ b/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh
@@ -0,0 +1,7 @@
+# need to update transformers version first
+# pip install transformers==4.37.0
+cd /llm/lightweight_serving
+export IPEX_LLM_NOT_USE_VLLM=True
+model_path="/llm/models/Llama-2-7b-chat-hf"
+low_bit="sym_int4"
+python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit
\ No newline at end of file
diff --git a/docker/llm/serving/xpu/docker/start-pp_serving-service.sh b/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
index 588f2922..8bb20a9d 100644
--- a/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
+++ b/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
@@ -1,9 +1,12 @@
+# update transformers version first
+# pip install transformers==4.37.0
 source /opt/intel/oneapi/setvars.sh --force
+export IPEX_LLM_NOT_USE_VLLM=True
 export no_proxy=localhost
 export FI_PROVIDER=tcp
 export OMP_NUM_THREADS=32
-export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
+#export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
 # source $basekit_root/ccl/latest/env/vars.sh --force
diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
index ef3f6cf7..168b5ab9 100644
--- a/python/llm/src/ipex_llm/transformers/convert.py
+++ b/python/llm/src/ipex_llm/transformers/convert.py
@@ -70,8 +70,9 @@ def is_auto_awq_available():
 
 def is_vllm_available():
     global _IS_VLLM_AVAILABLE
+    _IS_VLLM_AVAILABLE = os.getenv("IPEX_LLM_NOT_USE_VLLM", None)
     if _IS_VLLM_AVAILABLE is not None:
-        return _IS_VLLM_AVAILABLE
+        return False
     import sys
     original_path = sys.path
     # Temporally remove current directory
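
Note on the convert.py hunk above: the new `IPEX_LLM_NOT_USE_VLLM` environment variable, exported by both start scripts, makes `is_vllm_available()` short-circuit and report vLLM as unavailable, so the pipeline-parallel and lightweight serving modes can run without the vLLM integration. Below is a minimal standalone sketch of that gating pattern; the fallback import check is a simplification for illustration, not the actual ipex-llm implementation.

    import importlib.util
    import os

    def is_vllm_available() -> bool:
        # If IPEX_LLM_NOT_USE_VLLM is set (to any value), treat vLLM as unavailable,
        # mirroring the gate added in convert.py.
        if os.getenv("IPEX_LLM_NOT_USE_VLLM", None) is not None:
            return False
        # Simplified fallback: check whether the vllm package can be found at all.
        return importlib.util.find_spec("vllm") is not None

    if __name__ == "__main__":
        os.environ["IPEX_LLM_NOT_USE_VLLM"] = "True"  # as exported by the start scripts
        print(is_vllm_available())  # False: the vLLM code path is disabled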