From 1083fe55083f7b48b58b8e9eebe3c605dd2ada71 Mon Sep 17 00:00:00 2001
From: "Wang, Jian4" <61138589+hzjane@users.noreply.github.com>
Date: Thu, 13 Feb 2025 10:16:00 +0800
Subject: [PATCH] Reenable pp and lightweight-serving serving on 0.6.6 (#12814)

* reenable pp ang lightweight serving on 066

* update readme

* updat

* update tag
---
 docker/llm/serving/xpu/docker/Dockerfile                   | 8 +++++++-
 docker/llm/serving/xpu/docker/README.md                    | 4 ++--
 .../xpu/docker/start-lightweight_serving-service.sh        | 7 +++++++
 docker/llm/serving/xpu/docker/start-pp_serving-service.sh  | 5 ++++-
 python/llm/src/ipex_llm/transformers/convert.py            | 3 ++-
 5 files changed, 22 insertions(+), 5 deletions(-)
 create mode 100644 docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh

diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile
index 6b517eda..fd6ee473 100644
--- a/docker/llm/serving/xpu/docker/Dockerfile
+++ b/docker/llm/serving/xpu/docker/Dockerfile
@@ -93,6 +93,12 @@ RUN apt-get update && \
     cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \
     cp -r ./ipex-llm/python/llm/example/GPU/HuggingFace/LLM ./examples && \
     cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
+    # Download pp_serving
+    mkdir -p /llm/pp_serving && \
+    cp ./ipex-llm/python/llm/example/GPU/Pipeline-Parallel-Serving/*.py /llm/pp_serving/ && \
+    # Download lightweight_serving
+    mkdir -p /llm/lightweight_serving && \
+    cp ./ipex-llm/python/llm/example/GPU/Lightweight-Serving/*.py /llm/lightweight_serving/ && \
     rm -rf ./ipex-llm && \
     # Install vllm dependencies
     pip install --upgrade fastapi && \
@@ -120,7 +126,7 @@ RUN apt-get update && \
     cd /llm && \
     rm -rf /tmp/neo && \
     # Install vllm
-    git clone -b 0.6.6-pre https://github.com/analytics-zoo/vllm.git /llm/vllm && \
+    git clone -b 0.6.6 https://github.com/analytics-zoo/vllm.git /llm/vllm && \
     cd /llm/vllm && \
     pip install setuptools-scm && \
     pip install --upgrade cmake && \
diff --git a/docker/llm/serving/xpu/docker/README.md b/docker/llm/serving/xpu/docker/README.md
index ba71fb4a..665067b1 100644
--- a/docker/llm/serving/xpu/docker/README.md
+++ b/docker/llm/serving/xpu/docker/README.md
@@ -49,14 +49,14 @@ Currently, we provide two different serving engines in the image, which are Fast
 
 To run Lightweight serving on one intel gpu using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Lightweight-Serving).
 
-For convenience, we have included a file `/llm/start-lightweight_serving-service` in the image.
+For convenience, we have included a file `/llm/start-lightweight_serving-service` in the image. And need to install the appropriate transformers version first, like `pip install transformers==4.37.0`.
 
 #### Pipeline parallel serving engine
 
 To run Pipeline parallel serving using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Pipeline-Parallel-FastAPI).
 
-For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image.
+For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image. And need to install the appropriate transformers version first, like `pip install transformers==4.37.0`.
 
 #### vLLM serving engine
diff --git a/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh b/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh
new file mode 100644
index 00000000..ce03dbc9
--- /dev/null
+++ b/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh
@@ -0,0 +1,7 @@
+# need to update transformers version first
+# pip install transformers==4.37.0
+cd /llm/lightweight_serving
+export IPEX_LLM_NOT_USE_VLLM=True
+model_path="/llm/models/Llama-2-7b-chat-hf"
+low_bit="sym_int4"
+python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit
\ No newline at end of file
diff --git a/docker/llm/serving/xpu/docker/start-pp_serving-service.sh b/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
index 588f2922..8bb20a9d 100644
--- a/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
+++ b/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
@@ -1,9 +1,12 @@
+# update transformers version first
+# pip install transformers==4.37.0
 source /opt/intel/oneapi/setvars.sh --force
+export IPEX_LLM_NOT_USE_VLLM=True
 export no_proxy=localhost
 export FI_PROVIDER=tcp
 export OMP_NUM_THREADS=32
-export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
+#export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
 # source $basekit_root/ccl/latest/env/vars.sh --force
diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
index ef3f6cf7..168b5ab9 100644
--- a/python/llm/src/ipex_llm/transformers/convert.py
+++ b/python/llm/src/ipex_llm/transformers/convert.py
@@ -70,8 +70,9 @@ def is_auto_awq_available():
 
 def is_vllm_available():
     global _IS_VLLM_AVAILABLE
+    _IS_VLLM_AVAILABLE = os.getenv("IPEX_LLM_NOT_USE_VLLM", None)
     if _IS_VLLM_AVAILABLE is not None:
-        return _IS_VLLM_AVAILABLE
+        return False
     import sys
     original_path = sys.path
     # Temporally remove current directory
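
Note on the convert.py hunk above: the new `IPEX_LLM_NOT_USE_VLLM` environment variable, exported by both start scripts, makes `is_vllm_available()` short-circuit and report vLLM as unavailable, so the pipeline-parallel and lightweight serving modes can run without the vLLM integration. Below is a minimal standalone sketch of that gating pattern; the fallback import check is a simplification for illustration, not the actual ipex-llm implementation.

    import importlib.util
    import os

    def is_vllm_available() -> bool:
        # If IPEX_LLM_NOT_USE_VLLM is set (to any value), treat vLLM as unavailable,
        # mirroring the gate added in convert.py.
        if os.getenv("IPEX_LLM_NOT_USE_VLLM", None) is not None:
            return False
        # Simplified fallback: check whether the vllm package can be found at all.
        return importlib.util.find_spec("vllm") is not None

    if __name__ == "__main__":
        os.environ["IPEX_LLM_NOT_USE_VLLM"] = "True"  # as exported by the start scripts
        print(is_vllm_available())  # False: the vLLM code path is disabled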