Reenable pp and lightweight serving on 0.6.6 (#12814)
* reenable pp and lightweight serving on 0.6.6
* update readme
* update
* update tag
parent af693425f1
commit 1083fe5508
5 changed files with 22 additions and 5 deletions
@@ -93,6 +93,12 @@ RUN apt-get update && \
 cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \
 cp -r ./ipex-llm/python/llm/example/GPU/HuggingFace/LLM ./examples && \
 cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
+# Download pp_serving
+mkdir -p /llm/pp_serving && \
+cp ./ipex-llm/python/llm/example/GPU/Pipeline-Parallel-Serving/*.py /llm/pp_serving/ && \
+# Download lightweight_serving
+mkdir -p /llm/lightweight_serving && \
+cp ./ipex-llm/python/llm/example/GPU/Lightweight-Serving/*.py /llm/lightweight_serving/ && \
 rm -rf ./ipex-llm && \
 # Install vllm dependencies
 pip install --upgrade fastapi && \
@@ -120,7 +126,7 @@ RUN apt-get update && \
 cd /llm && \
 rm -rf /tmp/neo && \
 # Install vllm
-git clone -b 0.6.6-pre https://github.com/analytics-zoo/vllm.git /llm/vllm && \
+git clone -b 0.6.6 https://github.com/analytics-zoo/vllm.git /llm/vllm && \
 cd /llm/vllm && \
 pip install setuptools-scm && \
 pip install --upgrade cmake && \
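Taken together, the two Dockerfile hunks above stage the pipeline-parallel and lightweight serving example scripts under /llm/ and pin the vLLM checkout to the analytics-zoo 0.6.6 branch. Below is a minimal sketch of rebuilding the serving image from this Dockerfile; the Dockerfile location and image tag are placeholders chosen for illustration, not values taken from this commit:

    # run from the directory containing the updated Dockerfile (path and tag are assumptions)
    docker build \
        -f Dockerfile \
        -t ipex-llm-serving-xpu:0.6.6 \
        .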
@@ -49,14 +49,14 @@ Currently, we provide two different serving engines in the image, which are Fast
 
 To run Lightweight serving on one intel gpu using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Lightweight-Serving).
 
-For convenience, we have included a file `/llm/start-lightweight_serving-service` in the image.
+For convenience, we have included a file `/llm/start-lightweight_serving-service` in the image. You need to install the appropriate transformers version first, e.g. `pip install transformers==4.37.0`.
 
 
 #### Pipeline parallel serving engine
 
 To run Pipeline parallel serving using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Pipeline-Parallel-FastAPI).
 
-For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image.
+For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image. You need to install the appropriate transformers version first, e.g. `pip install transformers==4.37.0`.
 
 
 #### vLLM serving engine
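The README additions above amount to one extra preparation step before either start script is used: install a matching transformers version inside the container. A short sketch of starting each engine, assuming a shell inside the running serving container (the script paths are the ones the README cites):

    # inside the container: install the transformers version the README suggests
    pip install transformers==4.37.0

    # lightweight serving on a single Intel GPU
    bash /llm/start-lightweight_serving-service

    # or: pipeline-parallel serving
    bash /llm/start-pp_serving-service.sh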
@@ -0,0 +1,7 @@
+# need to update transformers version first
+# pip install transformers==4.37.0
+cd /llm/lightweight_serving
+export IPEX_LLM_NOT_USE_VLLM=True
+model_path="/llm/models/Llama-2-7b-chat-hf"
+low_bit="sym_int4"
+python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit
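Once lightweight_serving.py is running, the service can be exercised over HTTP. The port, route, and payload below are illustrative assumptions, not details confirmed by this diff; the Lightweight-Serving readme linked above documents the actual API:

    # hypothetical request: port 8000 and the /generate route are assumptions
    curl -X POST http://localhost:8000/generate \
        -H "Content-Type: application/json" \
        -d '{"prompt": "What is AI?", "n_predict": 32}'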
@@ -1,9 +1,12 @@
+# update transformers version first
+# pip install transformers==4.37.0
 source /opt/intel/oneapi/setvars.sh --force
+export IPEX_LLM_NOT_USE_VLLM=True
 export no_proxy=localhost
 export FI_PROVIDER=tcp
 export OMP_NUM_THREADS=32
 
-export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
+#export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
 # source $basekit_root/ccl/latest/env/vars.sh --force
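The script change above opts out of the vLLM code path via IPEX_LLM_NOT_USE_VLLM and leaves the tcmalloc preload commented out by default. If tcmalloc is still wanted, it can be re-exported manually before launching; this is a local tweak on top of the shipped script, not part of the commit:

    # optional: restore the tcmalloc preload that the script now ships commented out
    export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
    bash /llm/start-pp_serving-service.sh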
@@ -70,8 +70,9 @@ def is_auto_awq_available():
 
 def is_vllm_available():
     global _IS_VLLM_AVAILABLE
+    _IS_VLLM_AVAILABLE = os.getenv("IPEX_LLM_NOT_USE_VLLM", None)
     if _IS_VLLM_AVAILABLE is not None:
-        return _IS_VLLM_AVAILABLE
+        return False
     import sys
     original_path = sys.path
     # Temporally remove current directory
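The Python change makes is_vllm_available() return False whenever IPEX_LLM_NOT_USE_VLLM is set, which is exactly what the two start scripts export; without the variable the function falls through to its usual import-based probe. A quick shell-level sanity check; the module path ipex_llm.transformers.utils is an assumption based on the surrounding helpers, not something shown in this diff:

    # with the variable set, vLLM detection is skipped and the call returns False
    export IPEX_LLM_NOT_USE_VLLM=True
    python -c "from ipex_llm.transformers.utils import is_vllm_available; print(is_vllm_available())"

    # unset it to fall back to the normal vLLM import probe
    unset IPEX_LLM_NOT_USE_VLLM
    python -c "from ipex_llm.transformers.utils import is_vllm_available; print(is_vllm_available())"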