Add pp_serving example to serving image (#11433)

* init pp

* update

* update

* no clone ipex-llm again
Wang, Jian4 2024-06-28 16:45:25 +08:00 committed by GitHub
parent fd933c92d8
commit e000ac90c4
4 changed files with 40 additions and 1 deletion


@@ -58,6 +58,9 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
pip install --upgrade "uvicorn[standard]" && \
# Download vLLM-Serving
cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
# Download pp_serving
mkdir -p /llm/pp_serving && \
cp ./ipex-llm/python/llm/example/GPU/Pipeline-Parallel-FastAPI/*.py /llm/pp_serving/ && \
# Install related library of benchmarking
pip install pandas omegaconf && \
chmod +x /llm/benchmark.sh && \
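
As a quick sanity check after building the image, the files copied above can be listed from a throwaway container. This is a minimal sketch; the image tag below is an assumption and should be replaced with whatever tag you built or pulled.

```bash
# Hypothetical image tag; substitute the tag of your local build.
IMAGE=intelanalytics/ipex-llm-serving-xpu:latest

# List the pipeline-parallel example files that the Dockerfile copied into /llm/pp_serving.
docker run --rm "$IMAGE" ls -l /llm/pp_serving
```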


@@ -21,7 +21,10 @@ RUN apt-get update && \
pip install outlines==0.0.34 --no-deps && \
pip install interegular cloudpickle diskcache joblib lark nest-asyncio numba scipy && \
# For Qwen series models support
pip install transformers_stream_generator einops tiktoken && \
# For pipeline serving support
pip install mpi4py fastapi uvicorn openai && \
pip install gradio # for gradio web UI
COPY ./vllm_online_benchmark.py /llm/
COPY ./vllm_offline_inference.py /llm/
@@ -29,5 +32,7 @@ COPY ./payload-1024.lua /llm/
COPY ./start-vllm-service.sh /llm/
COPY ./benchmark_vllm_throughput.py /llm/
COPY ./start-fastchat-service.sh /llm/
COPY ./start-pp_serving-service.sh /llm/
WORKDIR /llm/
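
To confirm that the extra Python dependencies and the copied start script made it into the image, something like the following can be run. The image tag is again an assumption, and `python` is assumed to resolve to the environment bundled in the image.

```bash
# Hypothetical image tag; adjust to your build.
IMAGE=intelanalytics/ipex-llm-serving-xpu:latest

# Import the newly added pipeline-serving dependencies inside the container.
docker run --rm "$IMAGE" python -c "import mpi4py, fastapi, uvicorn, openai, gradio; print('pipeline serving deps OK')"

# Check that the start script was copied into /llm/.
docker run --rm "$IMAGE" ls -l /llm/start-pp_serving-service.sh
```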


@@ -45,6 +45,13 @@ After the container is booted, you could get into the container through `docker
Currently, we provide two different serving engines in the image, which are FastChat serving engine and vLLM serving engine.
#### Pipeline parallel serving engine
To run Pipeline parallel serving using `IPEX-LLM` as backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Pipeline-Parallel-FastAPI).
For convenience, we have included a file `/llm/start-pp_serving-service.sh` in the image.
#### FastChat serving engine
To run model-serving using `IPEX-LLM` as backend using FastChat, you can refer to this [quickstart](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/fastchat_quickstart.html#).
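
A minimal sketch of how the convenience script might be used from a running serving container; the container name is hypothetical, and the model directory must be mounted so that it matches the `model_path` set inside `/llm/start-pp_serving-service.sh` (see the script below).

```bash
# Hypothetical container name; use the name you gave `docker run --name ...`.
CONTAINER=ipex-llm-serving-xpu-container

# Launch the pipeline-parallel FastAPI service with the defaults baked into the script.
docker exec -it "$CONTAINER" bash /llm/start-pp_serving-service.sh
```

The defaults in the script (two GPUs, `fp8` weights, `/llm/models/Llama-2-7b-chat-hf`) can be changed by editing the exported variables; the linked Pipeline-Parallel-FastAPI readme is the reference for how to query the resulting service.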


@@ -0,0 +1,24 @@
source /opt/intel/oneapi/setvars.sh --force
export no_proxy=localhost
export FI_PROVIDER=tcp
export OMP_NUM_THREADS=32
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
basekit_root=/opt/intel/oneapi
source $basekit_root/setvars.sh --force
source $basekit_root/ccl/latest/env/vars.sh --force
export USE_XETLA=OFF
if [[ $KERNEL_VERSION != *"6.5"* ]]; then
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
fi
export TORCH_LLM_ALLREDUCE=0
export IPEX_LLM_QUANTIZE_KV_CACHE=1
export num_gpus=2
export model_path="/llm/models/Llama-2-7b-chat-hf"
export low_bit="fp8"
# max requests = max_num_seqs * rank_num
export max_num_seqs="4"
cd /llm/pp_serving
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $num_gpus pipeline_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit --max-num-seqs $max_num_seqs
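
For illustration, the launch line above could be adapted to a different checkpoint, stage count, or quantization by changing the exported variables. The Qwen model path and the `sym_int4` setting below are assumptions, not part of this commit, and the flags simply mirror the torchrun invocation in the script.

```bash
# Sketch of an adapted launch: 4 pipeline stages, a hypothetical Qwen checkpoint, sym_int4 weights.
export num_gpus=4
export model_path="/llm/models/Qwen1.5-7B-Chat"   # hypothetical path; point this at your own model
export low_bit="sym_int4"
export max_num_seqs="8"

cd /llm/pp_serving
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $num_gpus \
  pipeline_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit --max-num-seqs $max_num_seqs
```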