From b773d67dd460876bdae2835fa8646b37f486dc19 Mon Sep 17 00:00:00 2001
From: ZehuaCao <47251317+Romanticoseu@users.noreply.github.com>
Date: Sat, 7 Oct 2023 09:37:48 +0800
Subject: [PATCH] Add Kubernetes support for BigDL-LLM-serving CPU. (#9071)

---
 .github/workflows/manually_build.yml        |  27 ++
 docker/llm/serving/cpu/docker/Dockerfile    |   9 +-
 docker/llm/serving/cpu/docker/entrypoint.sh | 200 +++++++++++++++
 docker/llm/serving/cpu/kubernetes/README.md | 235 ++++++++++++++++++
 docker/llm/serving/cpu/kubernetes/clean.sh  |   1 +
 .../serving/cpu/kubernetes/deployment.yaml  | 109 ++++++++
 6 files changed, 580 insertions(+), 1 deletion(-)
 create mode 100644 docker/llm/serving/cpu/docker/entrypoint.sh
 create mode 100644 docker/llm/serving/cpu/kubernetes/README.md
 create mode 100644 docker/llm/serving/cpu/kubernetes/clean.sh
 create mode 100644 docker/llm/serving/cpu/kubernetes/deployment.yaml

diff --git a/.github/workflows/manually_build.yml b/.github/workflows/manually_build.yml
index e5fe603d..5b2cc7af 100644
--- a/.github/workflows/manually_build.yml
+++ b/.github/workflows/manually_build.yml
@@ -12,6 +12,7 @@ on:
         - all
         - bigdl-llm-xpu
         - bigdl-llm-cpu
+        - bigdl-llm-serving-cpu
         - bigdl-ppml-gramine-base
         - bigdl-ppml-trusted-bigdl-llm-gramine-base
         - bigdl-ppml-trusted-bigdl-llm-gramine-ref
@@ -114,6 +115,32 @@ jobs:
         sudo docker push 10.239.45.10/arda/${image}:${TAG}
         sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
 
+  bigdl-llm-serving-cpu:
+    if: ${{ github.event.inputs.artifact == 'bigdl-llm-serving-cpu' || github.event.inputs.artifact == 'all' }}
+    runs-on: [self-hosted, Shire]
+    steps:
+    - uses: actions/checkout@v3
+    - name: docker login
+      run: |
+        docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
+    - name: bigdl-llm-serving-cpu
+      run: |
+        echo "##############################################################"
+        echo "####### bigdl-llm-serving-cpu ########"
+        echo "##############################################################"
+        export image=intelanalytics/bigdl-llm-serving-cpu
+        cd docker/llm/serving/cpu/docker
+        sudo docker build \
+          --no-cache=true \
+          --build-arg http_proxy=${HTTP_PROXY} \
+          --build-arg https_proxy=${HTTPS_PROXY} \
+          --build-arg no_proxy=${NO_PROXY} \
+          -t ${image}:${TAG} -f ./Dockerfile .
+        sudo docker push ${image}:${TAG}
+        sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
+        sudo docker push 10.239.45.10/arda/${image}:${TAG}
+        sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
+
   bigdl-ppml-gramine-base:
     if: ${{ github.event.inputs.artifact == 'bigdl-ppml-gramine-base' || github.event.inputs.artifact == 'all' }}
     runs-on: [self-hosted, Shire]
diff --git a/docker/llm/serving/cpu/docker/Dockerfile b/docker/llm/serving/cpu/docker/Dockerfile
index ede2b733..e058bed5 100644
--- a/docker/llm/serving/cpu/docker/Dockerfile
+++ b/docker/llm/serving/cpu/docker/Dockerfile
@@ -2,10 +2,13 @@ FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT
 
 ARG http_proxy
 ARG https_proxy
+ARG TINI_VERSION=v0.18.0
 
 # Disable pip's cache behavior
 ARG PIP_NO_CACHE_DIR=false
 
+ADD ./entrypoint.sh /opt/entrypoint.sh
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
 # Install Serving Dependencies
 RUN mkdir /llm && \
     cd /llm && \
@@ -13,7 +16,11 @@ RUN mkdir /llm && \
     cd FastChat && \
     git checkout dev-2023-09-22 && \
     pip3 install -e ".[model_worker,webui]" && \
-    cd /llm
+    cd /llm && \
+    chmod +x /opt/entrypoint.sh && \
+    chmod +x /sbin/tini && \
+    cp /sbin/tini /usr/bin/tini
 
 WORKDIR /llm/
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
\ No newline at end of file
diff --git a/docker/llm/serving/cpu/docker/entrypoint.sh b/docker/llm/serving/cpu/docker/entrypoint.sh
new file mode 100644
index 00000000..99370654
--- /dev/null
+++ b/docker/llm/serving/cpu/docker/entrypoint.sh
@@ -0,0 +1,200 @@
+#!/bin/bash
+
+usage() {
+  echo "Usage: $0 [-m|--mode <controller|worker>] [-h|--help]"
+  echo "-h: Print help message."
+  echo "Controller mode reads the following env:"
+  echo "CONTROLLER_HOST (default: localhost)."
+  echo "CONTROLLER_PORT (default: 21001)."
+  echo "API_HOST (default: localhost)."
+  echo "API_PORT (default: 8000)."
+  echo "Worker mode reads the following env:"
+  echo "CONTROLLER_HOST (default: localhost)."
+  echo "CONTROLLER_PORT (default: 21001)."
+  echo "WORKER_HOST (default: localhost)."
+  echo "WORKER_PORT (default: 21002)."
+  echo "MODEL_PATH (default: empty)."
+  exit 1
+}
+
+# Acquire the correct number of cores when cpuset-cpus is used; return -1 if the cpuset file does not exist
+calculate_total_cores() {
+  local cpuset_file="/sys/fs/cgroup/cpuset/cpuset.cpus"
+
+  if [[ -f "$cpuset_file" ]]; then
+    local cpuset_cpus=$(cat "$cpuset_file")
+    cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')
+
+    local total_cores=0
+    IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
+    for cpu in "${cpu_list[@]}"; do
+      if [[ $cpu =~ - ]]; then
+        # Range of CPUs
+        local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
+        local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
+        local range_cores=$((end_cpu - start_cpu + 1))
+        total_cores=$((total_cores + range_cores))
+      else
+        # Single CPU
+        total_cores=$((total_cores + 1))
+      fi
+    done
+
+    echo $total_cores
+    return
+  fi
+  # Kubernetes core-binding will use this file
+  cpuset_file="/sys/fs/cgroup/cpuset.cpus"
+  if [[ -f "$cpuset_file" ]]; then
+    local cpuset_cpus=$(cat "$cpuset_file")
+    cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')
+
+    local total_cores=0
+    IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
+    for cpu in "${cpu_list[@]}"; do
+      if [[ $cpu =~ - ]]; then
+        # Range of CPUs
+        local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
+        local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
+        local range_cores=$((end_cpu - start_cpu + 1))
+        total_cores=$((total_cores + range_cores))
+      else
+        # Single CPU
+        total_cores=$((total_cores + 1))
+      fi
+    done
+
+    echo $total_cores
+    return
+  else
+    echo -1
+    return
+  fi
+}
+
+# Default values
+controller_host="localhost"
+controller_port="21001"
+api_host="localhost"
+api_port="8000"
+worker_host="localhost"
+worker_port="21002"
+model_path=""
+mode=""
+omp_num_threads=""
+dispatch_method="shortest_queue" # shortest_queue or lottery
+
+# Update rootCA config if needed
+update-ca-certificates
+
+# Remember the value of `OMP_NUM_THREADS`:
+if [[ -n "${OMP_NUM_THREADS}" ]]; then
+  omp_num_threads="${OMP_NUM_THREADS}"
+fi
+
+# If no command is passed in, just run bash
+if [ "$#" == 0 ]; then
+  echo "[INFO] no command is passed in"
+  echo "[INFO] enter pass-through mode"
+  exec /usr/bin/tini -s -- "bash"
+else
+  # Parse command-line options
+  options=$(getopt -o "m:h" --long "mode:,help" -n "$0" -- "$@")
+  if [ $? != 0 ]; then
+    usage
+  fi
+  eval set -- "$options"
+
+  while true; do
+    case "$1" in
+      -m|--mode)
+        mode="$2"
+        [[ $mode == "controller" || $mode == "worker" ]] || usage
+        shift 2
+        ;;
+      -h|--help)
+        usage
+        ;;
+      --)
+        shift
+        break
+        ;;
+      *)
+        usage
+        ;;
+    esac
+  done
+
+  if [[ -n $CONTROLLER_HOST ]]; then
+    controller_host=$CONTROLLER_HOST
+  fi
+
+  if [[ -n $CONTROLLER_PORT ]]; then
+    controller_port=$CONTROLLER_PORT
+  fi
+
+  if [[ -n $API_HOST ]]; then
+    api_host=$API_HOST
+  fi
+
+  if [[ -n $API_PORT ]]; then
+    api_port=$API_PORT
+  fi
+
+  if [[ -n $WORKER_HOST ]]; then
+    worker_host=$WORKER_HOST
+  fi
+
+  if [[ -n $WORKER_PORT ]]; then
+    worker_port=$WORKER_PORT
+  fi
+
+  if [[ -n $MODEL_PATH ]]; then
+    model_path=$MODEL_PATH
+  fi
+
+  if [[ -n $DISPATCH_METHOD ]]; then
+    dispatch_method=$DISPATCH_METHOD
+  fi
+
+  controller_address="http://$controller_host:$controller_port"
+  # Execute logic based on options
+  if [[ $mode == "controller" ]]; then
+    # Logic for controller mode
+    # Boot the controller
+    api_address="http://$api_host:$api_port"
+    echo "Controller address: $controller_address"
+    echo "OpenAI API address: $api_address"
+    python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
+    # Boot the OpenAI API server
+    python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address
+  else
+    # Logic for worker mode
+    worker_address="http://$worker_host:$worker_port"
+    # Apply optimizations from bigdl-nano
+    source bigdl-nano-init -t
+    # First check whether the user has set OMP_NUM_THREADS themselves
+    if [[ -n "${omp_num_threads}" ]]; then
+      echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads"
+      export OMP_NUM_THREADS=$omp_num_threads
+    else
+      # Use calculate_total_cores to acquire the cpuset settings
+      # and set OMP_NUM_THREADS to the correct number
+      cores=$(calculate_total_cores)
+      if [[ $cores == -1 || $cores == 0 ]]; then
+        echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS"
+      else
+        echo "Setting OMP_NUM_THREADS to $cores"
+        export OMP_NUM_THREADS=$cores
+      fi
+    fi
+    if [[ -z "${model_path}" ]]; then
+      echo "Please set the env MODEL_PATH for the worker"
+      usage
+    fi
+    echo "Worker address: $worker_address"
+    echo "Controller address: $controller_address"
+    python3 -m fastchat.serve.model_worker --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
+  fi
+fi
+
diff --git a/docker/llm/serving/cpu/kubernetes/README.md b/docker/llm/serving/cpu/kubernetes/README.md
new file mode 100644
index 00000000..b0027f12
--- /dev/null
+++ b/docker/llm/serving/cpu/kubernetes/README.md
@@ -0,0 +1,235 @@
+## Deploy the BigDL-LLM serving service in a K8s environment
+
+
+## Image
+
+To deploy BigDL-LLM serving on CPU in a Kubernetes environment, please use this image: `intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT`
+
+## Before deployment
+
+### Models
+
+In this document, we will use `vicuna-7b-v1.5` as the deployment model.
+
+After downloading the model, please rename the directory from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-bigdl` so that `bigdl-llm` is used as the backend. The `bigdl-llm` backend is selected whenever the model path contains `bigdl`; otherwise, the original transformers backend is used.
+
+You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5).
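+
+For example, assuming `git` and `git-lfs` are available on the node, one possible way to fetch and rename the model is sketched below (the target directory `/home/llm/models` is only an illustration and should match the host volume path used in the worker deployment later in this document):
+
+```bash
+# Fetch vicuna-7b-v1.5 from Hugging Face (git-lfs is needed for the weight files)
+git lfs install
+git clone https://huggingface.co/lmsys/vicuna-7b-v1.5 /home/llm/models/vicuna-7b-v1.5
+# Rename the directory so that the serving image selects the bigdl-llm backend
+mv /home/llm/models/vicuna-7b-v1.5 /home/llm/models/vicuna-7b-v1.5-bigdl
+```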
+
+### Kubernetes config
+
+We recommend setting up your Kubernetes cluster before deployment. Most importantly, please set the `cpu-management-policy` to `static` by following this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). We also recommend setting the `topology management policy` to `single-numa-node`.
+
+### Machine config
+
+Turn hyper-threading off to ensure that only physical cores are used during deployment.
+
+## Deployment
+
+### Reminder on `OMP_NUM_THREADS`
+
+The entrypoint of the image will try to set `OMP_NUM_THREADS` to the correct number by reading the cpuset configuration from the container runtime. However, this only works correctly if the core-binding feature is enabled. If it is not, please set the environment variable `OMP_NUM_THREADS` manually in the yaml file.
+
+
+### Controller
+
+We use the following yaml file for the controller deployment:
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: bigdl-fschat-a1234bd-controller
+  labels:
+    fastchat-appid: a1234bd
+    fastchat-app-type: controller
+spec:
+  dnsPolicy: "ClusterFirst"
+  containers:
+  - name: fastchat-controller # fixed
+    image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
+    imagePullPolicy: IfNotPresent
+    env:
+    - name: CONTROLLER_HOST # fixed
+      value: "0.0.0.0"
+    - name: CONTROLLER_PORT # fixed
+      value: "21005"
+    - name: API_HOST # fixed
+      value: "0.0.0.0"
+    - name: API_PORT # fixed
+      value: "8000"
+    ports:
+      - containerPort: 21005
+        name: con-port
+      - containerPort: 8000
+        name: api-port
+    resources:
+      requests:
+        memory: 16Gi
+        cpu: 4
+      limits:
+        memory: 16Gi
+        cpu: 4
+    args: ["-m", "controller"]
+  restartPolicy: "Never"
+---
+# Service for the controller
+apiVersion: v1
+kind: Service
+metadata:
+  name: bigdl-a1234bd-fschat-controller-service
+spec:
+  # You may want to change this to a service type suited to your cluster
+  type: NodePort
+  selector:
+    fastchat-appid: a1234bd
+    fastchat-app-type: controller
+  ports:
+    - name: cont-port
+      protocol: TCP
+      port: 21005
+      targetPort: 21005
+    - name: api-port
+      protocol: TCP
+      port: 8000
+      targetPort: 8000
+```
+
+### Worker
+
+We use the following Deployment for the workers:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: bigdl-fschat-a1234bd-worker-deployment
+spec:
+  # Change this to the number you want
+  replicas: 1
+  selector:
+    matchLabels:
+      fastchat: worker
+  template:
+    metadata:
+      labels:
+        fastchat: worker
+    spec:
+      dnsPolicy: "ClusterFirst"
+      containers:
+      - name: fastchat-worker # fixed
+        image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
+        imagePullPolicy: IfNotPresent
+        env:
+        - name: CONTROLLER_HOST # fixed
+          value: bigdl-a1234bd-fschat-controller-service
+        - name: CONTROLLER_PORT # fixed
+          value: "21005"
+        - name: WORKER_HOST # fixed
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP
+        - name: WORKER_PORT # fixed
+          value: "21841"
+        - name: MODEL_PATH # Change this
+          value: "/llm/models/vicuna-7b-v1.5-bigdl/"
+        - name: OMP_NUM_THREADS
+          value: "16"
+        resources:
+          requests:
+            memory: 32Gi
+            cpu: 16
+          limits:
+            memory: 32Gi
+            cpu: 16
+        args: ["-m", "worker"]
+        volumeMounts:
+          - name: llm-models
+            mountPath: /llm/models/
+      restartPolicy: "Always"
+      volumes:
+        - name: llm-models
+          hostPath:
+            path: /home/llm/models # change this in other envs
+```
+
+You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly.
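+
+Once the yaml is ready, a minimal deployment flow could look like the sketch below. This assumes `kubectl` is configured for the target cluster and uses the combined `deployment.yaml` shipped in this folder; the service name matches the controller manifest above:
+
+```bash
+# Create the controller pod, the controller service, and the worker deployment
+kubectl apply -f deployment.yaml
+# Watch until the controller and worker pods reach the Running state
+kubectl get pods -w
+# Forward the OpenAI-compatible API port to the local machine for testing
+kubectl port-forward service/bigdl-a1234bd-fschat-controller-service 8000:8000
+```
+
+With the port-forward in place, the `http://localhost:8000` endpoints used in the testing section below become reachable from the local machine. Alternatively, since the service is of type `NodePort`, the API can also be reached through a node's address and the assigned node port.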
+
+
+### Testing
+
+#### Using openai-python
+
+First, install openai-python:
+```bash
+pip install --upgrade openai
+```
+
+Then, interact with the model `vicuna-7b-v1.5-bigdl`:
+```python
+import openai
+openai.api_key = "EMPTY"
+openai.api_base = "http://localhost:8000/v1"
+
+model = "vicuna-7b-v1.5-bigdl"
+prompt = "Once upon a time"
+
+# create a completion
+completion = openai.Completion.create(model=model, prompt=prompt, max_tokens=64)
+# print the completion
+print(prompt + completion.choices[0].text)
+
+# create a chat completion
+completion = openai.ChatCompletion.create(
+  model=model,
+  messages=[{"role": "user", "content": "Hello! What is your name?"}]
+)
+# print the completion
+print(completion.choices[0].message.content)
+```
+
+#### cURL
+
+cURL is another good tool for observing the output of the API.
+
+For the following examples, you may need to change the service address to match your deployment.
+
+List Models:
+```bash
+curl http://localhost:8000/v1/models
+```
+
+If you have `jq` installed, you can use it to format the output like this:
+```bash
+curl http://localhost:8000/v1/models | jq
+```
+
+Chat Completions:
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "YOUR_MODEL",
+    "messages": [{"role": "user", "content": "Hello! What is your name?"}]
+  }'
+```
+
+Text Completions:
+```bash
+curl http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "YOUR_MODEL",
+    "prompt": "Once upon a time",
+    "max_tokens": 41,
+    "temperature": 0.5
+  }'
+```
+
+Embeddings:
+```bash
+curl http://localhost:8000/v1/embeddings \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "YOUR_MODEL",
+    "input": "Hello world!"
+  }'
+```
\ No newline at end of file
diff --git a/docker/llm/serving/cpu/kubernetes/clean.sh b/docker/llm/serving/cpu/kubernetes/clean.sh
new file mode 100644
index 00000000..d5d1729d
--- /dev/null
+++ b/docker/llm/serving/cpu/kubernetes/clean.sh
@@ -0,0 +1 @@
+kubectl delete -f deployment.yaml
\ No newline at end of file
diff --git a/docker/llm/serving/cpu/kubernetes/deployment.yaml b/docker/llm/serving/cpu/kubernetes/deployment.yaml
new file mode 100644
index 00000000..bd659fd4
--- /dev/null
+++ b/docker/llm/serving/cpu/kubernetes/deployment.yaml
@@ -0,0 +1,109 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: bigdl-fschat-a1234bd-controller
+  labels:
+    fastchat-appid: a1234bd
+    fastchat-app-type: controller
+spec:
+  dnsPolicy: "ClusterFirst"
+  containers:
+  - name: fastchat-controller # fixed
+    image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
+    imagePullPolicy: IfNotPresent
+    env:
+    - name: CONTROLLER_HOST # fixed
+      value: "0.0.0.0"
+    - name: CONTROLLER_PORT # fixed
+      value: "21005"
+    - name: API_HOST # fixed
+      value: "0.0.0.0"
+    - name: API_PORT # fixed
+      value: "8000"
+    ports:
+      - containerPort: 21005
+        name: con-port
+      - containerPort: 8000
+        name: api-port
+    resources:
+      requests:
+        memory: 16Gi
+        cpu: 4
+      limits:
+        memory: 16Gi
+        cpu: 4
+    args: ["-m", "controller"]
+  restartPolicy: "Never"
+---
+# Service for the controller
+apiVersion: v1
+kind: Service
+metadata:
+  name: bigdl-a1234bd-fschat-controller-service
+spec:
+  # You may want to change this to a service type suited to your cluster
+  type: NodePort
+  selector:
+    fastchat-appid: a1234bd
+    fastchat-app-type: controller
+  ports:
+    - name: cont-port
+      protocol: TCP
+      port: 21005
+      targetPort: 21005
+    - name: api-port
+      protocol: TCP
+      port: 8000
+      targetPort: 8000
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: bigdl-fschat-a1234bd-worker-deployment
+spec:
+  # Change this to the number you want
+  replicas: 1
+  selector:
+    matchLabels:
+      fastchat: worker
+  template:
+    metadata:
+      labels:
+        fastchat: worker
+    spec:
+      dnsPolicy: "ClusterFirst"
+      containers:
+      - name: fastchat-worker # fixed
+        image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
+        imagePullPolicy: IfNotPresent
+        env:
+        - name: CONTROLLER_HOST # fixed
+          value: bigdl-a1234bd-fschat-controller-service
+        - name: CONTROLLER_PORT # fixed
+          value: "21005"
+        - name: WORKER_HOST # fixed
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP
+        - name: WORKER_PORT # fixed
+          value: "21841"
+        - name: MODEL_PATH # Change this
+          value: "/llm/models/vicuna-7b-v1.5-bigdl/"
+        - name: OMP_NUM_THREADS
+          value: "16"
+        resources:
+          requests:
+            memory: 32Gi
+            cpu: 16
+          limits:
+            memory: 32Gi
+            cpu: 16
+        args: ["-m", "worker"]
+        volumeMounts:
+          - name: llm-models
+            mountPath: /llm/models/
+      restartPolicy: "Always"
+      volumes:
+        - name: llm-models
+          hostPath:
+            path: /home/llm/models # change this in other envs
\ No newline at end of file