diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile
index 87e1d585..325239b0 100644
--- a/docker/llm/serving/xpu/docker/Dockerfile
+++ b/docker/llm/serving/xpu/docker/Dockerfile
@@ -27,10 +27,10 @@ RUN cd /llm &&\
     # For Qwen series models support
     pip install transformers_stream_generator einops tiktoken
 
-ADD ./offline_inference.py /llm/vllm-examples/
-ADD ./payload-1024.lua /llm/vllm-examples/
-ADD ./start-vllm-service.sh /llm/vllm-examples/
-ADD ./benchmark_throughput.py /llm/vllm-examples/
-ADD ./start-fastchat-service.sh /llm/fastchat-examples/
+ADD ./vllm_offline_inference.py /llm/
+ADD ./payload-1024.lua /llm/
+ADD ./start-vllm-service.sh /llm/
+ADD ./benchmark_vllm_throughput.py /llm/
+ADD ./start-fastchat-service.sh /llm/
 
 WORKDIR /llm/
diff --git a/docker/llm/serving/xpu/docker/README.md b/docker/llm/serving/xpu/docker/README.md
index 6ddf2d82..c14aa3ab 100644
--- a/docker/llm/serving/xpu/docker/README.md
+++ b/docker/llm/serving/xpu/docker/README.md
@@ -57,9 +57,9 @@ You can modify this script to using fastchat with either `ipex_llm_worker` or `v
 
 To run vLLM engine using `IPEX-LLM` as backend, you can refer to this [document](https://github.com/intel-analytics/ipex-llm/blob/main/python/llm/example/GPU/vLLM-Serving/README.md).
 
-We have included multiple example files in `/llm/vllm-examples`:
-1. `offline_inference.py`: Used for offline inference example
-2. `benchmark_throughput.py`: Used for benchmarking throughput
+We have included multiple example files in `/llm/`:
+1. `vllm_offline_inference.py`: Used for vLLM offline inference example
+2. `benchmark_vllm_throughput.py`: Used for benchmarking throughput
 3. `payload-1024.lua`: Used for testing request per second using 1k-128 request
 4. `start-vllm-service.sh`: Used for template for starting vLLM service
 
@@ -69,19 +69,19 @@ We can benchmark the api_server to get an estimation about TPS (transactions per
 
 In container, do the following:
 
-1. modify the `/llm/vllm-examples/payload-1024.lua` so that the "model" attribute is correct. By default, we use a prompt that is roughly 1024 token long, you can change it if needed.
+1. modify the `/llm/payload-1024.lua` so that the "model" attribute is correct. By default, we use a prompt that is roughly 1024 token long, you can change it if needed.
 2. Start the benchmark using `wrk` using the script below:
 
 ```bash
-cd /llm/vllm-examples
+cd /llm
 # You can change -t and -c to control the concurrency.
 # By default, we use 12 connections to benchmark the service.
 wrk -t12 -c12 -d15m -s payload-1024.lua http://localhost:8000/v1/completions --timeout 1h
 ```
 
-#### Offline benchmark through benchmark_throughput.py
+#### Offline benchmark through benchmark_vllm_throughput.py
 
-We have included the benchmark_throughput script provied by `vllm` in our image as `/llm/benchmark_throughput.py`. To use the benchmark_throughput script, you will need to download the test dataset through:
+We have included the benchmark_throughput script provided by `vllm` in our image as `/llm/benchmark_vllm_throughput.py`. To use the benchmark_throughput script, you will need to download the test dataset through:
 
 ```bash
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
@@ -89,7 +89,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 
 The full example looks like this:
 ```bash
-cd /llm/vllm-examples
+cd /llm/
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 
 
@@ -97,9 +97,9 @@ export MODEL="YOUR_MODEL"
 
 # You can change load-in-low-bit from values in [sym_int4, fp8, fp16]
 
-python3 /llm/vllm-examples/benchmark_throughput.py \
+python3 /llm/benchmark_vllm_throughput.py \
   --backend vllm \
-  --dataset /llm/vllm-examples/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --dataset /llm/ShareGPT_V3_unfiltered_cleaned_split.json \
   --model $MODEL \
   --num-prompts 1000 \
   --seed 42 \
@@ -147,9 +147,9 @@ for MODEL in "${MODELS[@]}"; do
   # Execute the command and redirect output to the log file
   # Sometimes you might need to set --max-model-len if memory is not enough
   # load-in-low-bit accepts inputs [sym_int4, fp8, fp16]
-  python3 /llm/vllm-examples/benchmark_throughput.py \
+  python3 /llm/benchmark_vllm_throughput.py \
     --backend vllm \
-    --dataset /llm/vllm-examples/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --dataset /llm/ShareGPT_V3_unfiltered_cleaned_split.json \
     --model $MODEL \
     --num-prompts 1000 \
     --seed 42 \
diff --git a/docker/llm/serving/xpu/docker/benchmark_throughput.py b/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py
similarity index 100%
rename from docker/llm/serving/xpu/docker/benchmark_throughput.py
rename to docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py
diff --git a/docker/llm/serving/xpu/docker/offline_inference.py b/docker/llm/serving/xpu/docker/vllm_offline_inference.py
similarity index 100%
rename from docker/llm/serving/xpu/docker/offline_inference.py
rename to docker/llm/serving/xpu/docker/vllm_offline_inference.py
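
For reference, a minimal usage sketch of the renamed example files at their new `/llm/` locations inside the container, mirroring the README commands touched by this change; `YOUR_MODEL` is a placeholder, and additional tuning flags from the full README example (such as the low-bit and max-model-len options) are left out here:

```bash
# Run inside the serving container; example files now live directly under /llm/.
cd /llm

# Download the ShareGPT test dataset used by the throughput benchmark.
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# Placeholder: set this to the model you want to benchmark.
export MODEL="YOUR_MODEL"

# Offline throughput benchmark via the renamed script at its new path.
python3 /llm/benchmark_vllm_throughput.py \
  --backend vllm \
  --dataset /llm/ShareGPT_V3_unfiltered_cleaned_split.json \
  --model $MODEL \
  --num-prompts 1000 \
  --seed 42
```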