Add Kubernetes support for BigDL-LLM-serving CPU. (#9071)
parent 36dd4afd61
commit b773d67dd4

6 changed files with 580 additions and 1 deletion
27  .github/workflows/manually_build.yml (vendored)

@@ -12,6 +12,7 @@ on:
         - all
         - bigdl-llm-xpu
         - bigdl-llm-cpu
+        - bigdl-llm-serving-cpu
         - bigdl-ppml-gramine-base
         - bigdl-ppml-trusted-bigdl-llm-gramine-base
         - bigdl-ppml-trusted-bigdl-llm-gramine-ref
@@ -114,6 +115,32 @@ jobs:
         sudo docker push 10.239.45.10/arda/${image}:${TAG}
         sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
 
+  bigdl-llm-serving-cpu:
+    if: ${{ github.event.inputs.artifact == 'bigdl-llm-serving-cpu' || github.event.inputs.artifact == 'all' }}
+    runs-on: [self-hosted, Shire]
+    steps:
+    - uses: actions/checkout@v3
+    - name: docker login
+      run: |
+        docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
+    - name: bigdl-llm-serving-cpu
+      run: |
+        echo "##############################################################"
+        echo "####### bigdl-llm-serving-cpu ########"
+        echo "##############################################################"
+        export image=intelanalytics/bigdl-llm-serving-cpu
+        cd docker/llm/serving/cpu/docker
+        sudo docker build \
+          --no-cache=true \
+          --build-arg http_proxy=${HTTP_PROXY} \
+          --build-arg https_proxy=${HTTPS_PROXY} \
+          --build-arg no_proxy=${NO_PROXY} \
+          -t ${image}:${TAG} -f ./Dockerfile .
+        sudo docker push ${image}:${TAG}
+        sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
+        sudo docker push 10.239.45.10/arda/${image}:${TAG}
+        sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
+
   bigdl-ppml-gramine-base:
     if: ${{ github.event.inputs.artifact == 'bigdl-ppml-gramine-base' || github.event.inputs.artifact == 'all' }}
     runs-on: [self-hosted, Shire]
docker/llm/serving/cpu/docker/Dockerfile

@@ -2,10 +2,13 @@ FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT
 
 ARG http_proxy
 ARG https_proxy
+ARG TINI_VERSION=v0.18.0
 
 # Disable pip's cache behavior
 ARG PIP_NO_CACHE_DIR=false
 
+ADD ./entrypoint.sh /opt/entrypoint.sh
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
 # Install Serving Dependencies
 RUN mkdir /llm && \
     cd /llm && \
@@ -13,7 +16,11 @@ RUN mkdir /llm && \
     cd FastChat && \
     git checkout dev-2023-09-22 && \
     pip3 install -e ".[model_worker,webui]" && \
-    cd /llm
+    cd /llm && \
+    chmod +x /opt/entrypoint.sh && \
+    chmod +x /sbin/tini && \
+    cp /sbin/tini /usr/bin/tini
 
 
 WORKDIR /llm/
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
200  docker/llm/serving/cpu/docker/entrypoint.sh (new file)

@@ -0,0 +1,200 @@
#!/bin/bash

usage() {
  echo "Usage: $0 [-m --mode <controller|worker>] [-h --help]"
  echo "-h: Print help message."
  echo "Controller mode reads the following env:"
  echo "CONTROLLER_HOST (default: localhost)."
  echo "CONTROLLER_PORT (default: 21001)."
  echo "API_HOST (default: localhost)."
  echo "API_PORT (default: 8000)."
  echo "Worker mode reads the following env:"
  echo "CONTROLLER_HOST (default: localhost)."
  echo "CONTROLLER_PORT (default: 21001)."
  echo "WORKER_HOST (default: localhost)."
  echo "WORKER_PORT (default: 21002)."
  echo "MODEL_PATH (default: empty)."
  exit 1
}

# Acquire the correct core count when --cpuset-cpus is used; echo -1 if neither cgroup file exists
calculate_total_cores() {
  local cpuset_file="/sys/fs/cgroup/cpuset/cpuset.cpus"

  if [[ -f "$cpuset_file" ]]; then
    local cpuset_cpus=$(cat "$cpuset_file")
    cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')

    local total_cores=0
    IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
    for cpu in "${cpu_list[@]}"; do
      if [[ $cpu =~ - ]]; then
        # Range of CPUs
        local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
        local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
        local range_cores=$((end_cpu - start_cpu + 1))
        total_cores=$((total_cores + range_cores))
      else
        # Single CPU
        total_cores=$((total_cores + 1))
      fi
    done

    echo $total_cores
    return
  fi
  # Kubernetes core-binding will use this file
  cpuset_file="/sys/fs/cgroup/cpuset.cpus"
  if [[ -f "$cpuset_file" ]]; then
    local cpuset_cpus=$(cat "$cpuset_file")
    cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')

    local total_cores=0
    IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
    for cpu in "${cpu_list[@]}"; do
      if [[ $cpu =~ - ]]; then
        # Range of CPUs
        local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
        local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
        local range_cores=$((end_cpu - start_cpu + 1))
        total_cores=$((total_cores + range_cores))
      else
        # Single CPU
        total_cores=$((total_cores + 1))
      fi
    done

    echo $total_cores
    return
  else
    echo -1
    return
  fi
}

# Default values
controller_host="localhost"
controller_port="21001"
api_host="localhost"
api_port="8000"
worker_host="localhost"
worker_port="21002"
model_path=""
mode=""
omp_num_threads=""
dispatch_method="shortest_queue" # shortest_queue or lottery

# Update rootCA config if needed
update-ca-certificates

# Remember the value of `OMP_NUM_THREADS`:
if [[ -n "${OMP_NUM_THREADS}" ]]; then
  omp_num_threads="${OMP_NUM_THREADS}"
fi

# We do not have any arguments, just run bash
if [ "$#" == 0 ]; then
  echo "[INFO] no command is passed in"
  echo "[INFO] enter pass-through mode"
  exec /usr/bin/tini -s -- "bash"
else
  # Parse command-line options
  options=$(getopt -o "m:h" --long "mode:,help" -n "$0" -- "$@")
  if [ $? != 0 ]; then
    usage
  fi
  eval set -- "$options"

  while true; do
    case "$1" in
      -m|--mode)
        mode="$2"
        [[ $mode == "controller" || $mode == "worker" ]] || usage
        shift 2
        ;;
      -h|--help)
        usage
        ;;
      --)
        shift
        break
        ;;
      *)
        usage
        ;;
    esac
  done

  if [[ -n $CONTROLLER_HOST ]]; then
    controller_host=$CONTROLLER_HOST
  fi

  if [[ -n $CONTROLLER_PORT ]]; then
    controller_port=$CONTROLLER_PORT
  fi

  if [[ -n $API_HOST ]]; then
    api_host=$API_HOST
  fi

  if [[ -n $API_PORT ]]; then
    api_port=$API_PORT
  fi

  if [[ -n $WORKER_HOST ]]; then
    worker_host=$WORKER_HOST
  fi

  if [[ -n $WORKER_PORT ]]; then
    worker_port=$WORKER_PORT
  fi

  if [[ -n $MODEL_PATH ]]; then
    model_path=$MODEL_PATH
  fi

  if [[ -n $DISPATCH_METHOD ]]; then
    dispatch_method=$DISPATCH_METHOD
  fi

  controller_address="http://$controller_host:$controller_port"
  # Execute logic based on options
  if [[ $mode == "controller" ]]; then
    # Logic for controller mode
    # Boot Controller
    api_address="http://$api_host:$api_port"
    echo "Controller address: $controller_address"
    echo "OpenAI API address: $api_address"
    python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
    # Boot openai api server
    python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address
  else
    # Logic for non-controller (worker) mode
    worker_address="http://$worker_host:$worker_port"
    # Apply optimizations from bigdl-nano
    source bigdl-nano-init -t
    # First check if the user has set OMP_NUM_THREADS themselves
    if [[ -n "${omp_num_threads}" ]]; then
      echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads"
      export OMP_NUM_THREADS=$omp_num_threads
    else
      # Use calculate_total_cores to acquire cpuset settings
      # Set OMP_NUM_THREADS to the correct number
      cores=$(calculate_total_cores)
      if [[ $cores == -1 || $cores == 0 ]]; then
        echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS"
      else
        echo "Setting OMP_NUM_THREADS to $cores"
        export OMP_NUM_THREADS=$cores
      fi
    fi
    if [[ -z "${model_path}" ]]; then
      echo "Please set env MODEL_PATH used for worker"
      usage
    fi
    echo "Worker address: $worker_address"
    echo "Controller address: $controller_address"
    python3 -m fastchat.serve.model_worker --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
  fi
fi
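The entrypoint above also lends itself to a quick local smoke test outside Kubernetes. The sketch below is illustrative only (the host paths, cpuset range, and use of host networking are assumptions, not part of this commit); it simply exercises the `-m` flag and the environment variables documented in `usage()`, plus `--cpuset-cpus`, which is the runtime setting the core-counting logic reads:

```bash
# Hypothetical local run; adjust paths and ports to your environment.
# Controller + OpenAI API server:
sudo docker run -itd --net=host \
  -e CONTROLLER_HOST=localhost -e CONTROLLER_PORT=21001 \
  -e API_HOST=localhost -e API_PORT=8000 \
  intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT -m controller

# Worker pinned to 16 cores; the entrypoint derives OMP_NUM_THREADS from the cpuset.
sudo docker run -itd --net=host --cpuset-cpus="0-15" \
  -e CONTROLLER_HOST=localhost -e CONTROLLER_PORT=21001 \
  -e WORKER_HOST=localhost -e WORKER_PORT=21002 \
  -e MODEL_PATH=/llm/models/vicuna-7b-v1.5-bigdl \
  -v /home/llm/models:/llm/models \
  intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT -m worker
```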
235  docker/llm/serving/cpu/kubernetes/README.md (new file)

@@ -0,0 +1,235 @@
## Deploying the bigdl-llm serving service in a K8s environment

## Image

To deploy BigDL-LLM serving on CPU in a Kubernetes environment, please use this image: `intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT`
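If you want the image present on the nodes before the pods are scheduled (the manifests below use `imagePullPolicy: IfNotPresent`), you can optionally pre-pull it; this is just a convenience, not a required step:

```bash
docker pull intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
```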
## Before deployment

### Models

In this document, we will use `vicuna-7b-v1.5` as the deployment model.

After downloading the model, please rename the model directory from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-bigdl` so that `bigdl-llm` is used as the backend. The `bigdl-llm` backend is used whenever the model path contains `bigdl`; otherwise, the original transformers backend is used.

You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5).
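The rename is just a directory move. A minimal sketch, assuming the model was downloaded into the host directory that is later mounted into the worker pods (the local path is an assumption matching the `hostPath` used below):

```bash
cd /home/llm/models
# Path now contains "bigdl", so the bigdl-llm backend is selected.
mv vicuna-7b-v1.5 vicuna-7b-v1.5-bigdl
```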
### Kubernetes config

We recommend setting up your Kubernetes cluster before deployment. Most importantly, please set the `cpu-management-policy` to `static` by following this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). It is also best to set the `topology management policy` to `single-numa-node`.
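As a rough sketch only (the exact mechanism depends on how kubelet is provisioned in your cluster, and the reserved CPU range below is a placeholder), these are the kubelet settings the paragraph above refers to:

```bash
# Illustrative kubelet flags; a KubeletConfiguration file can express the same settings.
# The static CPU manager additionally requires some CPUs to be reserved for system daemons.
kubelet --cpu-manager-policy=static \
        --topology-manager-policy=single-numa-node \
        --reserved-cpus=0-1
```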
### Machine config

Turn hyper-threading off so that only physical cores are used during deployment.
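A quick way to check whether hyper-threading is active on a node (purely a verification sketch, not part of this commit):

```bash
# "Thread(s) per core: 1" means hyper-threading is off.
lscpu | grep "Thread(s) per core"
```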
## Deployment

### Reminder on `OMP_NUM_THREADS`

The entrypoint of the image will try to set `OMP_NUM_THREADS` to the correct number by reading the cpuset configuration from the runtime. However, this only works correctly if the `core-binding` feature is enabled. If it is not, please set the environment variable `OMP_NUM_THREADS` manually in the yaml file.
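If you are unsure whether core-binding took effect, you can inspect the cpuset the worker container actually received; this is a diagnostic sketch only, and the pod name below is a placeholder for whatever the Deployment generates:

```bash
# Replace the pod name with the real one from `kubectl get pods`.
kubectl exec -it bigdl-fschat-a1234bd-worker-deployment-xxxxx -- cat /sys/fs/cgroup/cpuset.cpus
```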
### Controller

We use the following yaml file for controller deployment:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: bigdl-fschat-a1234bd-controller
  labels:
    fastchat-appid: a1234bd
    fastchat-app-type: controller
spec:
  dnsPolicy: "ClusterFirst"
  containers:
  - name: fastchat-controller # fixed
    image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
    imagePullPolicy: IfNotPresent
    env:
    - name: CONTROLLER_HOST # fixed
      value: "0.0.0.0"
    - name: CONTROLLER_PORT # fixed
      value: "21005"
    - name: API_HOST # fixed
      value: "0.0.0.0"
    - name: API_PORT # fixed
      value: "8000"
    ports:
    - containerPort: 21005
      name: con-port
    - containerPort: 8000
      name: api-port
    resources:
      requests:
        memory: 16Gi
        cpu: 4
      limits:
        memory: 16Gi
        cpu: 4
    args: ["-m", "controller"]
  restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
  name: bigdl-a1234bd-fschat-controller-service
spec:
  # You may also want to change this to use the cluster's feature
  type: NodePort
  selector:
    fastchat-appid: a1234bd
    fastchat-app-type: controller
  ports:
    - name: cont-port
      protocol: TCP
      port: 21005
      targetPort: 21005
    - name: api-port
      protocol: TCP
      port: 8000
      targetPort: 8000
```
### Worker

We use the following Deployment for the workers:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: bigdl-fschat-a1234bd-worker-deployment
spec:
  # Change this to the number you want
  replicas: 1
  selector:
    matchLabels:
      fastchat: worker
  template:
    metadata:
      labels:
        fastchat: worker
    spec:
      dnsPolicy: "ClusterFirst"
      containers:
      - name: fastchat-worker # fixed
        image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
        imagePullPolicy: IfNotPresent
        env:
        - name: CONTROLLER_HOST # fixed
          value: bigdl-a1234bd-fschat-controller-service
        - name: CONTROLLER_PORT # fixed
          value: "21005"
        - name: WORKER_HOST # fixed
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: WORKER_PORT # fixed
          value: "21841"
        - name: MODEL_PATH # Change this
          value: "/llm/models/vicuna-7b-v1.5-bigdl/"
        - name: OMP_NUM_THREADS
          value: "16"
        resources:
          requests:
            memory: 32Gi
            cpu: 16
          limits:
            memory: 32Gi
            cpu: 16
        args: ["-m", "worker"]
        volumeMounts:
          - name: llm-models
            mountPath: /llm/models/
      restartPolicy: "Always"
      volumes:
        - name: llm-models
          hostPath:
            path: /home/llm/models # change this in other envs
```

You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly. To deploy, apply the combined manifest as shown below.
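A minimal sketch of the deploy/inspect cycle, assuming the controller and worker manifests above are saved together as `deployment.yaml` (which is how this commit ships them under `docker/llm/serving/cpu/kubernetes/`):

```bash
kubectl apply -f deployment.yaml               # create the controller pod, its service, and the worker deployment
kubectl get pods -o wide                       # wait until the pods are Running
kubectl logs bigdl-fschat-a1234bd-controller   # controller / OpenAI API server logs
# Tear everything down again (same as clean.sh):
kubectl delete -f deployment.yaml
```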
### Testing

#### Using openai-python

First, install openai-python:

```bash
pip install --upgrade openai
```

Then, interact with the model `vicuna-7b-v1.5-bigdl`:

```python
import openai
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"

model = "vicuna-7b-v1.5-bigdl"
prompt = "Once upon a time"

# create a completion
completion = openai.Completion.create(model=model, prompt=prompt, max_tokens=64)
# print the completion
print(prompt + completion.choices[0].text)

# create a chat completion
completion = openai.ChatCompletion.create(
    model=model,
    messages=[{"role": "user", "content": "Hello! What is your name?"}]
)
# print the completion
print(completion.choices[0].message.content)
```

#### cURL

cURL is another good tool for observing the output of the API.

For the following examples, you may also need to change the service address to wherever the OpenAI API server is exposed.
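If the NodePort service is not reachable from where you run these commands, one workaround (a sketch only; the pod name is whatever `kubectl get pods` reports) is to port-forward the controller's API port to your local machine first:

```bash
# Forward local port 8000 to the controller pod's OpenAI API port.
kubectl port-forward pod/bigdl-fschat-a1234bd-controller 8000:8000
```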
List Models:

```bash
curl http://localhost:8000/v1/models
```

If you have `jq` installed, you can use it to format the output like this:

```bash
curl http://localhost:8000/v1/models | jq
```

Chat Completions:

```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "YOUR_MODEL",
    "messages": [{"role": "user", "content": "Hello! What is your name?"}]
  }'
```

Text Completions:

```bash
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "YOUR_MODEL",
    "prompt": "Once upon a time",
    "max_tokens": 41,
    "temperature": 0.5
  }'
```

Embeddings:

```bash
curl http://localhost:8000/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "YOUR_MODEL",
    "input": "Hello world!"
  }'
```
1  docker/llm/serving/cpu/kubernetes/clean.sh (new file)

@@ -0,0 +1 @@
kubectl delete -f deployment.yaml
109  docker/llm/serving/cpu/kubernetes/deployment.yaml (new file)

@@ -0,0 +1,109 @@
apiVersion: v1
kind: Pod
metadata:
  name: bigdl-fschat-a1234bd-controller
  labels:
    fastchat-appid: a1234bd
    fastchat-app-type: controller
spec:
  dnsPolicy: "ClusterFirst"
  containers:
  - name: fastchat-controller # fixed
    image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
    imagePullPolicy: IfNotPresent
    env:
    - name: CONTROLLER_HOST # fixed
      value: "0.0.0.0"
    - name: CONTROLLER_PORT # fixed
      value: "21005"
    - name: API_HOST # fixed
      value: "0.0.0.0"
    - name: API_PORT # fixed
      value: "8000"
    ports:
    - containerPort: 21005
      name: con-port
    - containerPort: 8000
      name: api-port
    resources:
      requests:
        memory: 16Gi
        cpu: 4
      limits:
        memory: 16Gi
        cpu: 4
    args: ["-m", "controller"]
  restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
  name: bigdl-a1234bd-fschat-controller-service
spec:
  # You may also want to change this to use the cluster's feature
  type: NodePort
  selector:
    fastchat-appid: a1234bd
    fastchat-app-type: controller
  ports:
    - name: cont-port
      protocol: TCP
      port: 21005
      targetPort: 21005
    - name: api-port
      protocol: TCP
      port: 8000
      targetPort: 8000
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: bigdl-fschat-a1234bd-worker-deployment
spec:
  # Change this to the number you want
  replicas: 1
  selector:
    matchLabels:
      fastchat: worker
  template:
    metadata:
      labels:
        fastchat: worker
    spec:
      dnsPolicy: "ClusterFirst"
      containers:
      - name: fastchat-worker # fixed
        image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
        imagePullPolicy: IfNotPresent
        env:
        - name: CONTROLLER_HOST # fixed
          value: bigdl-a1234bd-fschat-controller-service
        - name: CONTROLLER_PORT # fixed
          value: "21005"
        - name: WORKER_HOST # fixed
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: WORKER_PORT # fixed
          value: "21841"
        - name: MODEL_PATH # Change this
          value: "/llm/models/vicuna-7b-v1.5-bigdl/"
        - name: OMP_NUM_THREADS
          value: "16"
        resources:
          requests:
            memory: 32Gi
            cpu: 16
          limits:
            memory: 32Gi
            cpu: 16
        args: ["-m", "worker"]
        volumeMounts:
          - name: llm-models
            mountPath: /llm/models/
      restartPolicy: "Always"
      volumes:
        - name: llm-models
          hostPath:
            path: /home/llm/models # change this in other envs