Add Kubernetes support for BigDL-LLM-serving CPU. (#9071)

This commit is contained in:
ZehuaCao 2023-10-07 09:37:48 +08:00 committed by GitHub
parent 36dd4afd61
commit b773d67dd4
6 changed files with 580 additions and 1 deletions


@@ -12,6 +12,7 @@ on:
- all
- bigdl-llm-xpu
- bigdl-llm-cpu
- bigdl-llm-serving-cpu
- bigdl-ppml-gramine-base
- bigdl-ppml-trusted-bigdl-llm-gramine-base
- bigdl-ppml-trusted-bigdl-llm-gramine-ref
@@ -114,6 +115,32 @@ jobs:
sudo docker push 10.239.45.10/arda/${image}:${TAG}
sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
bigdl-llm-serving-cpu:
if: ${{ github.event.inputs.artifact == 'bigdl-llm-serving-cpu' || github.event.inputs.artifact == 'all' }}
runs-on: [self-hosted, Shire]
steps:
- uses: actions/checkout@v3
- name: docker login
run: |
docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
- name: bigdl-llm-serving-cpu
run: |
echo "##############################################################"
echo "####### bigdl-llm-serving-cpu ########"
echo "##############################################################"
export image=intelanalytics/bigdl-llm-serving-cpu
cd docker/llm/serving/cpu/docker
sudo docker build \
--no-cache=true \
--build-arg http_proxy=${HTTP_PROXY} \
--build-arg https_proxy=${HTTPS_PROXY} \
--build-arg no_proxy=${NO_PROXY} \
-t ${image}:${TAG} -f ./Dockerfile .
sudo docker push ${image}:${TAG}
sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
sudo docker push 10.239.45.10/arda/${image}:${TAG}
sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
bigdl-ppml-gramine-base:
if: ${{ github.event.inputs.artifact == 'bigdl-ppml-gramine-base' || github.event.inputs.artifact == 'all' }}
runs-on: [self-hosted, Shire]


@@ -2,10 +2,13 @@ FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT
ARG http_proxy
ARG https_proxy
ARG TINI_VERSION=v0.18.0
# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false
ADD ./entrypoint.sh /opt/entrypoint.sh
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
# Install Serving Dependencies
RUN mkdir /llm && \
cd /llm && \
@@ -13,7 +16,11 @@ RUN mkdir /llm && \
cd FastChat && \
git checkout dev-2023-09-22 && \
pip3 install -e ".[model_worker,webui]" && \
cd /llm
cd /llm && \
chmod +x /opt/entrypoint.sh && \
chmod +x /sbin/tini && \
cp /sbin/tini /usr/bin/tini
WORKDIR /llm/
ENTRYPOINT [ "/opt/entrypoint.sh" ]


@@ -0,0 +1,200 @@
#!/bin/bash
usage() {
echo "Usage: $0 [-m --mode <controller|worker>] [-h --help]"
echo "-h: Print help message."
echo "Controller mode reads the following env:"
echo "CONTROLLER_HOST (default: localhost)."
echo "CONTROLLER_PORT (default: 21001)."
echo "API_HOST (default: localhost)."
echo "API_PORT (default: 8000)."
echo "Worker mode reads the following env:"
echo "CONTROLLER_HOST (default: localhost)."
echo "CONTROLLER_PORT (default: 21001)."
echo "WORKER_HOST (default: localhost)."
echo "WORKER_PORT (default: 21002)."
echo "MODEL_PATH (default: empty)."
exit 1
}
# Determine the number of cores available to this container from its cpuset.
# The cgroup v1 path is checked first, then the cgroup v2 path that Kubernetes
# core-binding uses. Prints -1 if neither file exists.
calculate_total_cores() {
  local cpuset_file
  for cpuset_file in /sys/fs/cgroup/cpuset/cpuset.cpus /sys/fs/cgroup/cpuset.cpus; do
    if [[ -f "$cpuset_file" ]]; then
      local cpuset_cpus
      cpuset_cpus=$(tr -d '\n' < "$cpuset_file")
      local total_cores=0
      local cpu_list cpu
      IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
      for cpu in "${cpu_list[@]}"; do
        if [[ $cpu =~ - ]]; then
          # Range of CPUs, e.g. 0-15
          local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
          local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
          total_cores=$((total_cores + end_cpu - start_cpu + 1))
        else
          # Single CPU
          total_cores=$((total_cores + 1))
        fi
      done
      echo $total_cores
      return
    fi
  done
  # Neither cpuset file exists
  echo -1
}
# Default values
controller_host="localhost"
controller_port="21001"
api_host="localhost"
api_port="8000"
worker_host="localhost"
worker_port="21002"
model_path=""
mode=""
omp_num_threads=""
dispatch_method="shortest_queue" # shortest_queue or lottery
# Update rootCA config if needed
update-ca-certificates
# Remember the value of `OMP_NUM_THREADS`:
if [[ -n "${OMP_NUM_THREADS}" ]]; then
omp_num_threads="${OMP_NUM_THREADS}"
fi
# No arguments were passed in, just run bash
if [ "$#" == 0 ]; then
    echo "[INFO] no command was passed in"
    echo "[INFO] entering pass-through mode"
exec /usr/bin/tini -s -- "bash"
else
# Parse command-line options
options=$(getopt -o "m:h" --long "mode:,help" -n "$0" -- "$@")
if [ $? != 0 ]; then
usage
fi
eval set -- "$options"
while true; do
case "$1" in
-m|--mode)
mode="$2"
[[ $mode == "controller" || $mode == "worker" ]] || usage
shift 2
;;
-h|--help)
usage
;;
--)
shift
break
;;
*)
usage
;;
esac
done
if [[ -n $CONTROLLER_HOST ]]; then
controller_host=$CONTROLLER_HOST
fi
if [[ -n $CONTROLLER_PORT ]]; then
controller_port=$CONTROLLER_PORT
fi
if [[ -n $API_HOST ]]; then
api_host=$API_HOST
fi
if [[ -n $API_PORT ]]; then
api_port=$API_PORT
fi
if [[ -n $WORKER_HOST ]]; then
worker_host=$WORKER_HOST
fi
if [[ -n $WORKER_PORT ]]; then
worker_port=$WORKER_PORT
fi
if [[ -n $MODEL_PATH ]]; then
model_path=$MODEL_PATH
fi
if [[ -n $DISPATCH_METHOD ]]; then
dispatch_method=$DISPATCH_METHOD
fi
controller_address="http://$controller_host:$controller_port"
# Execute logic based on options
if [[ $mode == "controller" ]]; then
# Logic for controller mode
# Boot Controller
api_address="http://$api_host:$api_port"
echo "Controller address: $controller_address"
echo "OpenAI API address: $api_address"
python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
# Boot openai api server
python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address
else
# Logic for worker (non-controller) mode
worker_address="http://$worker_host:$worker_port"
# Apply optimizations from bigdl-nano
source bigdl-nano-init -t
# First check whether the user has set OMP_NUM_THREADS themselves
if [[ -n "${omp_num_threads}" ]]; then
echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads"
export OMP_NUM_THREADS=$omp_num_threads
else
# Use calculate_total_cores to read the cpuset settings
# and set OMP_NUM_THREADS to the correct number
cores=$(calculate_total_cores)
if [[ $cores == -1 || $cores == 0 ]]; then
echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS"
else
echo "Setting OMP_NUM_THREADS to $cores"
export OMP_NUM_THREADS=$cores
fi
fi
if [[ -z "${model_path}" ]]; then
echo "Please set env MODEL_PATH used for worker"
usage
fi
echo "Worker address: $worker_address"
echo "Controller address: $controller_address"
python3 -m fastchat.serve.model_worker --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
fi
fi


@@ -0,0 +1,235 @@
## Deploying the BigDL-LLM serving service in a Kubernetes (K8s) environment
## Image
To deploy BigDL-LLM-serving CPU in a Kubernetes environment, please use this image: `intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT`.
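For example, you can pre-pull the image on every node that will run serving pods so that the pods start faster (a minimal sketch; adjust it if your nodes use a different container runtime or a registry mirror):
```bash
# Pre-pull the serving image on each Kubernetes node
docker pull intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
```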
## Before deployment
### Models
In this document, we will use `vicuna-7b-v1.5` as the deployment model.
After downloading the model, please rename the directory from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-bigdl` so that `bigdl-llm` is used as the backend. The `bigdl-llm` backend is selected whenever the model path contains `bigdl`; otherwise, the original Transformers backend is used.
You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5).
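As an illustration, the download and rename can be done with `git lfs` (a sketch; it assumes `git-lfs` is installed and that `/home/llm/models` is the host directory you will later mount into the worker pods):
```bash
# Download vicuna-7b-v1.5 and rename it so the bigdl-llm backend is selected
cd /home/llm/models
git lfs install
git clone https://huggingface.co/lmsys/vicuna-7b-v1.5
mv vicuna-7b-v1.5 vicuna-7b-v1.5-bigdl
```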
### Kubernetes config
We recommend setting up your Kubernetes cluster before deployment. Most importantly, set the `cpu-management-policy` to `static` by following this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). It is also recommended to set the `topology management policy` to `single-numa-node`.
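As a rough sketch, you can check (and, if needed, change) the relevant kubelet settings on each worker node; the config file path below is a common default rather than a guarantee, so adapt it to how your cluster was installed:
```bash
# Inspect the kubelet configuration for the CPU and topology manager policies
# (/var/lib/kubelet/config.yaml is a common default path, not universal)
grep -iE 'cpuManagerPolicy|topologyManagerPolicy' /var/lib/kubelet/config.yaml

# If you switch cpuManagerPolicy to "static", the old CPU manager state file
# usually has to be removed before restarting the kubelet
sudo rm -f /var/lib/kubelet/cpu_manager_state
sudo systemctl restart kubelet
```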
### Machine config
Turn hyper-threading off to ensure that only physical cores are used during deployment.
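You can verify the hyper-threading status on a node with `lscpu`, for example:
```bash
# "Thread(s) per core: 1" means hyper-threading is off (or disabled in the BIOS)
lscpu | grep -E 'Thread\(s\) per core|Core\(s\) per socket|Socket\(s\)'
```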
## Deployment
### Reminder on `OMP_NUM_THREADS`
The entrypoint of the image tries to set `OMP_NUM_THREADS` to the correct number by reading the cpuset configuration from the container runtime. However, this only works correctly when the core-binding feature is enabled. If it is not, please set the `OMP_NUM_THREADS` environment variable manually in the yaml file.
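Once a worker pod is running, you can confirm which value the entrypoint picked by checking its logs (the `fastchat=worker` label comes from the worker deployment shown below):
```bash
# The entrypoint logs the OMP_NUM_THREADS value it decided to use
kubectl logs -l fastchat=worker | grep OMP_NUM_THREADS
```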
### Controller
We use the following yaml file for controller deployment:
```yaml
apiVersion: v1
kind: Pod
metadata:
name: bigdl-fschat-a1234bd-controller
labels:
fastchat-appid: a1234bd
fastchat-app-type: controller
spec:
dnsPolicy: "ClusterFirst"
containers:
- name: fastchat-controller # fixed
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
imagePullPolicy: IfNotPresent
env:
- name: CONTROLLER_HOST # fixed
value: "0.0.0.0"
- name: CONTROLLER_PORT # fixed
value: "21005"
- name: API_HOST # fixed
value: "0.0.0.0"
- name: API_PORT # fixed
value: "8000"
ports:
- containerPort: 21005
name: con-port
- containerPort: 8000
name: api-port
resources:
requests:
memory: 16Gi
cpu: 4
limits:
memory: 16Gi
cpu: 4
args: ["-m", "controller"]
restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
name: bigdl-a1234bd-fschat-controller-service
spec:
# You may also want to change this to use the cluster's feature
type: NodePort
selector:
fastchat-appid: a1234bd
fastchat-app-type: controller
ports:
- name: cont-port
protocol: TCP
port: 21005
targetPort: 21005
- name: api-port
protocol: TCP
port: 8000
targetPort: 8000
```
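To deploy, save the manifest above (the file name `controller.yaml` is just an example) and apply it:
```bash
kubectl apply -f controller.yaml
# Wait for the controller pod to become ready and note the NodePorts of its service
kubectl get pod bigdl-fschat-a1234bd-controller
kubectl get svc bigdl-a1234bd-fschat-controller-service
```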
### Worker
We use the following yaml file to deploy the workers:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: bigdl-fschat-a1234bd-worker-deployment
spec:
# Change this to the number you want
replicas: 1
selector:
matchLabels:
fastchat: worker
template:
metadata:
labels:
fastchat: worker
spec:
dnsPolicy: "ClusterFirst"
containers:
- name: fastchat-worker # fixed
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
imagePullPolicy: IfNotPresent
env:
- name: CONTROLLER_HOST # fixed
value: bigdl-a1234bd-fschat-controller-service
- name: CONTROLLER_PORT # fixed
value: "21005"
- name: WORKER_HOST # fixed
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: WORKER_PORT # fixed
value: "21841"
- name: MODEL_PATH # Change this
value: "/llm/models/vicuna-7b-v1.5-bigdl/"
- name: OMP_NUM_THREADS
value: "16"
resources:
requests:
memory: 32Gi
cpu: 16
limits:
memory: 32Gi
cpu: 16
args: ["-m", "worker"]
volumeMounts:
- name: llm-models
mountPath: /llm/models/
restartPolicy: "Always"
volumes:
- name: llm-models
hostPath:
path: /home/llm/models # change this in other envs
```
You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly.
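After adjusting the paths, apply the worker manifest in the same way (again, `worker.yaml` is just an example file name) and check that the workers come up and register with the controller:
```bash
kubectl apply -f worker.yaml
# The worker entrypoint prints the worker and controller addresses on startup
kubectl logs -l fastchat=worker --tail=50
```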
### Testing
#### Using openai-python
First, install openai-python:
```bash
pip install --upgrade openai
```
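Note that the snippet below uses the pre-1.0 `openai` Python interface (`openai.api_base`, `openai.Completion.create`); with `openai>=1.0` these module-level calls were removed, so you may need to pin an older release:
```bash
pip install "openai<1.0"
```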
Then, interact with the `vicuna-7b-v1.5-bigdl` model:
```python
import openai
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"
model = "vicuna-7b-v1.5-bigdl"
prompt = "Once upon a time"
# create a completion
completion = openai.Completion.create(model=model, prompt=prompt, max_tokens=64)
# print the completion
print(prompt + completion.choices[0].text)
# create a chat completion
completion = openai.ChatCompletion.create(
model=model,
messages=[{"role": "user", "content": "Hello! What is your name?"}]
)
# print the completion
print(completion.choices[0].message.content)
```
#### cURL
cURL is another handy tool for inspecting the output of the API.
For the following examples, you may need to change the address to match your service deployment.
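If you are testing from outside the cluster, one convenient option is to forward the controller service's API port to your local machine (a sketch; adjust the service name, namespace, and local port to your setup):
```bash
# Forward local port 8000 to the OpenAI-compatible API behind the controller service
kubectl port-forward svc/bigdl-a1234bd-fschat-controller-service 8000:8000
```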
List Models:
```bash
curl http://localhost:8000/v1/models
```
If you have `jq` installed, you can use it to format the output like this:
```bash
curl http://localhost:8000/v1/models | jq
```
Chat Completions:
```bash
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "YOUR_MODEL",
"messages": [{"role": "user", "content": "Hello! What is your name?"}]
}'
```
Text Completions:
```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "YOUR_MODEL",
"prompt": "Once upon a time",
"max_tokens": 41,
"temperature": 0.5
}'
```
Embeddings:
```bash
curl http://localhost:8000/v1/embeddings \
-H "Content-Type: application/json" \
-d '{
"model": "YOUR_MODEL",
"input": "Hello world!"
}'
```
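When you are finished, you can tear the deployment down by deleting the resources you applied (using the example file names from above):
```bash
kubectl delete -f worker.yaml
kubectl delete -f controller.yaml
```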


@@ -0,0 +1 @@
kubectl delete -f deployment.yaml


@@ -0,0 +1,109 @@
apiVersion: v1
kind: Pod
metadata:
name: bigdl-fschat-a1234bd-controller
labels:
fastchat-appid: a1234bd
fastchat-app-type: controller
spec:
dnsPolicy: "ClusterFirst"
containers:
- name: fastchat-controller # fixed
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
imagePullPolicy: IfNotPresent
env:
- name: CONTROLLER_HOST # fixed
value: "0.0.0.0"
- name: CONTROLLER_PORT # fixed
value: "21005"
- name: API_HOST # fixed
value: "0.0.0.0"
- name: API_PORT # fixed
value: "8000"
ports:
- containerPort: 21005
name: con-port
- containerPort: 8000
name: api-port
resources:
requests:
memory: 16Gi
cpu: 4
limits:
memory: 16Gi
cpu: 4
args: ["-m", "controller"]
restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
name: bigdl-a1234bd-fschat-controller-service
spec:
# You may also want to change this to use the cluster's feature
type: NodePort
selector:
fastchat-appid: a1234bd
fastchat-app-type: controller
ports:
- name: cont-port
protocol: TCP
port: 21005
targetPort: 21005
- name: api-port
protocol: TCP
port: 8000
targetPort: 8000
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: bigdl-fschat-a1234bd-worker-deployment
spec:
# Change this to the number you want
replicas: 1
selector:
matchLabels:
fastchat: worker
template:
metadata:
labels:
fastchat: worker
spec:
dnsPolicy: "ClusterFirst"
containers:
- name: fastchat-worker # fixed
image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
imagePullPolicy: IfNotPresent
env:
- name: CONTROLLER_HOST # fixed
value: bigdl-a1234bd-fschat-controller-service
- name: CONTROLLER_PORT # fixed
value: "21005"
- name: WORKER_HOST # fixed
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: WORKER_PORT # fixed
value: "21841"
- name: MODEL_PATH # Change this
value: "/llm/models/vicuna-7b-v1.5-bigdl/"
- name: OMP_NUM_THREADS
value: "16"
resources:
requests:
memory: 32Gi
cpu: 16
limits:
memory: 32Gi
cpu: 16
args: ["-m", "worker"]
volumeMounts:
- name: llm-models
mountPath: /llm/models/
restartPolicy: "Always"
volumes:
- name: llm-models
hostPath:
path: /home/llm/models # change this in other envs