From b773d67dd460876bdae2835fa8646b37f486dc19 Mon Sep 17 00:00:00 2001
From: ZehuaCao <47251317+Romanticoseu@users.noreply.github.com>
Date: Sat, 7 Oct 2023 09:37:48 +0800
Subject: [PATCH] Add Kubernetes support for BigDL-LLM-serving CPU. (#9071)

---
 .github/workflows/manually_build.yml        |  27 ++
 docker/llm/serving/cpu/docker/Dockerfile    |   9 +-
 docker/llm/serving/cpu/docker/entrypoint.sh | 200 +++++++++++++++
 docker/llm/serving/cpu/kubernetes/README.md | 235 ++++++++++++++++++
 docker/llm/serving/cpu/kubernetes/clean.sh  |   1 +
 .../serving/cpu/kubernetes/deployment.yaml  | 109 ++++++++
 6 files changed, 580 insertions(+), 1 deletion(-)
 create mode 100644 docker/llm/serving/cpu/docker/entrypoint.sh
 create mode 100644 docker/llm/serving/cpu/kubernetes/README.md
 create mode 100644 docker/llm/serving/cpu/kubernetes/clean.sh
 create mode 100644 docker/llm/serving/cpu/kubernetes/deployment.yaml

diff --git a/.github/workflows/manually_build.yml b/.github/workflows/manually_build.yml
index e5fe603d..5b2cc7af 100644
--- a/.github/workflows/manually_build.yml
+++ b/.github/workflows/manually_build.yml
@@ -12,6 +12,7 @@ on:
         - all
         - bigdl-llm-xpu
         - bigdl-llm-cpu
+        - bigdl-llm-serving-cpu
         - bigdl-ppml-gramine-base
         - bigdl-ppml-trusted-bigdl-llm-gramine-base
         - bigdl-ppml-trusted-bigdl-llm-gramine-ref
@@ -114,6 +115,32 @@ jobs:
         sudo docker push 10.239.45.10/arda/${image}:${TAG}
         sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
 
+  bigdl-llm-serving-cpu:
+    if: ${{ github.event.inputs.artifact == 'bigdl-llm-serving-cpu' || github.event.inputs.artifact == 'all' }}
+    runs-on: [self-hosted, Shire]
+    steps:
+    - uses: actions/checkout@v3
+    - name: docker login
+      run: |
+        docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
+    - name: bigdl-llm-serving-cpu
+      run: |
+        echo "##############################################################"
+        echo "####### bigdl-llm-serving-cpu ########"
+        echo "##############################################################"
+        export image=intelanalytics/bigdl-llm-serving-cpu
+        cd docker/llm/serving/cpu/docker
+        sudo docker build \
+          --no-cache=true \
+          --build-arg http_proxy=${HTTP_PROXY} \
+          --build-arg https_proxy=${HTTPS_PROXY} \
+          --build-arg no_proxy=${NO_PROXY} \
+          -t ${image}:${TAG} -f ./Dockerfile .
+        sudo docker push ${image}:${TAG}
+        sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
+        sudo docker push 10.239.45.10/arda/${image}:${TAG}
+        sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
+
   bigdl-ppml-gramine-base:
     if: ${{ github.event.inputs.artifact == 'bigdl-ppml-gramine-base' || github.event.inputs.artifact == 'all' }}
     runs-on: [self-hosted, Shire]
diff --git a/docker/llm/serving/cpu/docker/Dockerfile b/docker/llm/serving/cpu/docker/Dockerfile
index ede2b733..e058bed5 100644
--- a/docker/llm/serving/cpu/docker/Dockerfile
+++ b/docker/llm/serving/cpu/docker/Dockerfile
@@ -2,10 +2,13 @@ FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT
 
 ARG http_proxy
 ARG https_proxy
+ARG TINI_VERSION=v0.18.0
 
 # Disable pip's cache behavior
 ARG PIP_NO_CACHE_DIR=false
 
+ADD ./entrypoint.sh /opt/entrypoint.sh
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
 # Install Serving Dependencies
 RUN mkdir /llm && \
     cd /llm && \
@@ -13,7 +16,11 @@ RUN mkdir /llm && \
     cd FastChat && \
     git checkout dev-2023-09-22 && \
     pip3 install -e ".[model_worker,webui]" && \
-    cd /llm
+    cd /llm && \
+    chmod +x /opt/entrypoint.sh && \
+    chmod +x /sbin/tini && \
+    cp /sbin/tini /usr/bin/tini
 
 WORKDIR /llm/
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
\ No newline at end of file
diff --git a/docker/llm/serving/cpu/docker/entrypoint.sh b/docker/llm/serving/cpu/docker/entrypoint.sh
new file mode 100644
index 00000000..99370654
--- /dev/null
+++ b/docker/llm/serving/cpu/docker/entrypoint.sh
@@ -0,0 +1,200 @@
+#!/bin/bash
+
+usage() {
+  echo "Usage: $0 [-m|--mode <controller|worker>] [-h|--help]"
+  echo "-h: Print help message."
+  echo "Controller mode reads the following env:"
+  echo "CONTROLLER_HOST (default: localhost)."
+  echo "CONTROLLER_PORT (default: 21001)."
+  echo "API_HOST (default: localhost)."
+  echo "API_PORT (default: 8000)."
+  echo "Worker mode reads the following env:"
+  echo "CONTROLLER_HOST (default: localhost)."
+  echo "CONTROLLER_PORT (default: 21001)."
+  echo "WORKER_HOST (default: localhost)."
+  echo "WORKER_PORT (default: 21002)."
+  echo "MODEL_PATH (default: empty)."
+  exit 1
+}
+
+# Acquire the correct number of cores when cpuset-cpus is used; return -1 if the cpuset file does not exist
+calculate_total_cores() {
+  local cpuset_file="/sys/fs/cgroup/cpuset/cpuset.cpus"
+
+  if [[ -f "$cpuset_file" ]]; then
+    local cpuset_cpus=$(cat "$cpuset_file")
+    cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')
+
+    local total_cores=0
+    IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
+    for cpu in "${cpu_list[@]}"; do
+      if [[ $cpu =~ - ]]; then
+        # Range of CPUs
+        local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
+        local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
+        local range_cores=$((end_cpu - start_cpu + 1))
+        total_cores=$((total_cores + range_cores))
+      else
+        # Single CPU
+        total_cores=$((total_cores + 1))
+      fi
+    done
+
+    echo $total_cores
+    return
+  fi
+  # Kubernetes core-binding will use this file
+  cpuset_file="/sys/fs/cgroup/cpuset.cpus"
+  if [[ -f "$cpuset_file" ]]; then
+    local cpuset_cpus=$(cat "$cpuset_file")
+    cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')
+
+    local total_cores=0
+    IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
+    for cpu in "${cpu_list[@]}"; do
+      if [[ $cpu =~ - ]]; then
+        # Range of CPUs
+        local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
+        local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
+        local range_cores=$((end_cpu - start_cpu + 1))
+        total_cores=$((total_cores + range_cores))
+      else
+        # Single CPU
+        total_cores=$((total_cores + 1))
+      fi
+    done
+
+    echo $total_cores
+    return
+  else
+    echo -1
+    return
+  fi
+}
+
+# Default values
+controller_host="localhost"
+controller_port="21001"
+api_host="localhost"
+api_port="8000"
+worker_host="localhost"
+worker_port="21002"
+model_path=""
+mode=""
+omp_num_threads=""
+dispatch_method="shortest_queue" # shortest_queue or lottery
+
+# Update rootCA config if needed
+update-ca-certificates
+
+# Remember the value of `OMP_NUM_THREADS`:
+if [[ -n "${OMP_NUM_THREADS}" ]]; then
+  omp_num_threads="${OMP_NUM_THREADS}"
+fi
+
+# If no command is passed in, just run bash
+if [ "$#" == 0 ]; then
+  echo "[INFO] no command is passed in"
+  echo "[INFO] enter pass-through mode"
+  exec /usr/bin/tini -s -- "bash"
+else
+  # Parse command-line options
+  options=$(getopt -o "m:h" --long "mode:,help" -n "$0" -- "$@")
+  if [ $? != 0 ]; then
+    usage
+  fi
+  eval set -- "$options"
+
+  while true; do
+    case "$1" in
+      -m|--mode)
+        mode="$2"
+        [[ $mode == "controller" || $mode == "worker" ]] || usage
+        shift 2
+        ;;
+      -h|--help)
+        usage
+        ;;
+      --)
+        shift
+        break
+        ;;
+      *)
+        usage
+        ;;
+    esac
+  done
+
+  if [[ -n $CONTROLLER_HOST ]]; then
+    controller_host=$CONTROLLER_HOST
+  fi
+
+  if [[ -n $CONTROLLER_PORT ]]; then
+    controller_port=$CONTROLLER_PORT
+  fi
+
+  if [[ -n $API_HOST ]]; then
+    api_host=$API_HOST
+  fi
+
+  if [[ -n $API_PORT ]]; then
+    api_port=$API_PORT
+  fi
+
+  if [[ -n $WORKER_HOST ]]; then
+    worker_host=$WORKER_HOST
+  fi
+
+  if [[ -n $WORKER_PORT ]]; then
+    worker_port=$WORKER_PORT
+  fi
+
+  if [[ -n $MODEL_PATH ]]; then
+    model_path=$MODEL_PATH
+  fi
+
+  if [[ -n $DISPATCH_METHOD ]]; then
+    dispatch_method=$DISPATCH_METHOD
+  fi
+
+  controller_address="http://$controller_host:$controller_port"
+  # Execute logic based on options
+  if [[ $mode == "controller" ]]; then
+    # Logic for controller mode
+    # Boot the controller
+    api_address="http://$api_host:$api_port"
+    echo "Controller address: $controller_address"
+    echo "OpenAI API address: $api_address"
+    python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
+    # Boot the OpenAI API server
+    python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address
+  else
+    # Logic for worker mode
+    worker_address="http://$worker_host:$worker_port"
+    # Apply optimizations from bigdl-nano
+    source bigdl-nano-init -t
+    # First check whether the user has set OMP_NUM_THREADS themselves
+    if [[ -n "${omp_num_threads}" ]]; then
+      echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads"
+      export OMP_NUM_THREADS=$omp_num_threads
+    else
+      # Use calculate_total_cores to acquire the cpuset settings
+      # and set OMP_NUM_THREADS to the correct number
+      cores=$(calculate_total_cores)
+      if [[ $cores == -1 || $cores == 0 ]]; then
+        echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS"
+      else
+        echo "Setting OMP_NUM_THREADS to $cores"
+        export OMP_NUM_THREADS=$cores
+      fi
+    fi
+    if [[ -z "${model_path}" ]]; then
+      echo "Please set the env MODEL_PATH for the worker"
+      usage
+    fi
+    echo "Worker address: $worker_address"
+    echo "Controller address: $controller_address"
+    python3 -m fastchat.serve.model_worker --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
+  fi
+fi
+
diff --git a/docker/llm/serving/cpu/kubernetes/README.md b/docker/llm/serving/cpu/kubernetes/README.md
new file mode 100644
index 00000000..b0027f12
--- /dev/null
+++ b/docker/llm/serving/cpu/kubernetes/README.md
@@ -0,0 +1,235 @@
+## Deploy the BigDL-LLM serving service in a K8s environment
+
+
+## Image
+
+To deploy BigDL-LLM serving on CPU in a Kubernetes environment, please use this image: `intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT`
+
+## Before deployment
+
+### Models
+
+In this document, we will use `vicuna-7b-v1.5` as the deployment model.
+
+After downloading the model, please rename the directory from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-bigdl` so that `bigdl-llm` is used as the backend. The `bigdl-llm` backend is selected whenever the model path contains `bigdl`; otherwise, the original transformers backend is used.
+
+You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5).
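+
+For example, assuming `git` and `git-lfs` are available on the node, one possible way to fetch and rename the model is sketched below (the target directory `/home/llm/models` is only an illustration and should match the host volume path used in the worker deployment later in this document):
+
+```bash
+# Fetch vicuna-7b-v1.5 from Hugging Face (git-lfs is needed for the weight files)
+git lfs install
+git clone https://huggingface.co/lmsys/vicuna-7b-v1.5 /home/llm/models/vicuna-7b-v1.5
+# Rename the directory so that the serving image selects the bigdl-llm backend
+mv /home/llm/models/vicuna-7b-v1.5 /home/llm/models/vicuna-7b-v1.5-bigdl
+```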
+
+### Kubernetes config
+
+We recommend setting up your Kubernetes cluster before deployment. Most importantly, please set the `cpu-management-policy` to `static` by following this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). We also recommend setting the `topology management policy` to `single-numa-node`.
+
+### Machine config
+
+Turn hyper-threading off to ensure that only physical cores are used during deployment.
+
+## Deployment
+
+### Reminder on `OMP_NUM_THREADS`
+
+The entrypoint of the image will try to set `OMP_NUM_THREADS` to the correct number by reading the cpuset configuration from the container runtime. However, this only works correctly if the core-binding feature is enabled. If it is not, please set the environment variable `OMP_NUM_THREADS` manually in the yaml file.
+
+
+### Controller
+
+We use the following yaml file for the controller deployment:
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: bigdl-fschat-a1234bd-controller
+  labels:
+    fastchat-appid: a1234bd
+    fastchat-app-type: controller
+spec:
+  dnsPolicy: "ClusterFirst"
+  containers:
+  - name: fastchat-controller # fixed
+    image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
+    imagePullPolicy: IfNotPresent
+    env:
+    - name: CONTROLLER_HOST # fixed
+      value: "0.0.0.0"
+    - name: CONTROLLER_PORT # fixed
+      value: "21005"
+    - name: API_HOST # fixed
+      value: "0.0.0.0"
+    - name: API_PORT # fixed
+      value: "8000"
+    ports:
+      - containerPort: 21005
+        name: con-port
+      - containerPort: 8000
+        name: api-port
+    resources:
+      requests:
+        memory: 16Gi
+        cpu: 4
+      limits:
+        memory: 16Gi
+        cpu: 4
+    args: ["-m", "controller"]
+  restartPolicy: "Never"
+---
+# Service for the controller
+apiVersion: v1
+kind: Service
+metadata:
+  name: bigdl-a1234bd-fschat-controller-service
+spec:
+  # You may want to change this to a service type suited to your cluster
+  type: NodePort
+  selector:
+    fastchat-appid: a1234bd
+    fastchat-app-type: controller
+  ports:
+    - name: cont-port
+      protocol: TCP
+      port: 21005
+      targetPort: 21005
+    - name: api-port
+      protocol: TCP
+      port: 8000
+      targetPort: 8000
+```
+
+### Worker
+
+We use the following Deployment for the workers:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: bigdl-fschat-a1234bd-worker-deployment
+spec:
+  # Change this to the number you want
+  replicas: 1
+  selector:
+    matchLabels:
+      fastchat: worker
+  template:
+    metadata:
+      labels:
+        fastchat: worker
+    spec:
+      dnsPolicy: "ClusterFirst"
+      containers:
+      - name: fastchat-worker # fixed
+        image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
+        imagePullPolicy: IfNotPresent
+        env:
+        - name: CONTROLLER_HOST # fixed
+          value: bigdl-a1234bd-fschat-controller-service
+        - name: CONTROLLER_PORT # fixed
+          value: "21005"
+        - name: WORKER_HOST # fixed
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP
+        - name: WORKER_PORT # fixed
+          value: "21841"
+        - name: MODEL_PATH # Change this
+          value: "/llm/models/vicuna-7b-v1.5-bigdl/"
+        - name: OMP_NUM_THREADS
+          value: "16"
+        resources:
+          requests:
+            memory: 32Gi
+            cpu: 16
+          limits:
+            memory: 32Gi
+            cpu: 16
+        args: ["-m", "worker"]
+        volumeMounts:
+          - name: llm-models
+            mountPath: /llm/models/
+      restartPolicy: "Always"
+      volumes:
+        - name: llm-models
+          hostPath:
+            path: /home/llm/models # change this in other envs
+```
+
+You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly.
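+
+Once the yaml is ready, a minimal deployment flow could look like the sketch below. This assumes `kubectl` is configured for the target cluster and uses the combined `deployment.yaml` shipped in this folder; the service name matches the controller manifest above:
+
+```bash
+# Create the controller pod, the controller service, and the worker deployment
+kubectl apply -f deployment.yaml
+# Watch until the controller and worker pods reach the Running state
+kubectl get pods -w
+# Forward the OpenAI-compatible API port to the local machine for testing
+kubectl port-forward service/bigdl-a1234bd-fschat-controller-service 8000:8000
+```
+
+With the port-forward in place, the `http://localhost:8000` endpoints used in the testing section below become reachable from the local machine. Alternatively, since the service is of type `NodePort`, the API can also be reached through a node's address and the assigned node port.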
+
+
+### Testing
+
+#### Using openai-python
+
+First, install openai-python:
+```bash
+pip install --upgrade openai
+```
+
+Then, interact with the model `vicuna-7b-v1.5-bigdl`:
+```python
+import openai
+openai.api_key = "EMPTY"
+openai.api_base = "http://localhost:8000/v1"
+
+model = "vicuna-7b-v1.5-bigdl"
+prompt = "Once upon a time"
+
+# create a completion
+completion = openai.Completion.create(model=model, prompt=prompt, max_tokens=64)
+# print the completion
+print(prompt + completion.choices[0].text)
+
+# create a chat completion
+completion = openai.ChatCompletion.create(
+  model=model,
+  messages=[{"role": "user", "content": "Hello! What is your name?"}]
+)
+# print the completion
+print(completion.choices[0].message.content)
+```
+
+#### cURL
+
+cURL is another good tool for observing the output of the API.
+
+For the following examples, you may need to change the service address to match your deployment.
+
+List Models:
+```bash
+curl http://localhost:8000/v1/models
+```
+
+If you have `jq` installed, you can use it to format the output like this:
+```bash
+curl http://localhost:8000/v1/models | jq
+```
+
+Chat Completions:
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "YOUR_MODEL",
+    "messages": [{"role": "user", "content": "Hello! What is your name?"}]
+  }'
+```
+
+Text Completions:
+```bash
+curl http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "YOUR_MODEL",
+    "prompt": "Once upon a time",
+    "max_tokens": 41,
+    "temperature": 0.5
+  }'
+```
+
+Embeddings:
+```bash
+curl http://localhost:8000/v1/embeddings \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "YOUR_MODEL",
+    "input": "Hello world!"
+  }'
+```
\ No newline at end of file
diff --git a/docker/llm/serving/cpu/kubernetes/clean.sh b/docker/llm/serving/cpu/kubernetes/clean.sh
new file mode 100644
index 00000000..d5d1729d
--- /dev/null
+++ b/docker/llm/serving/cpu/kubernetes/clean.sh
@@ -0,0 +1 @@
+kubectl delete -f deployment.yaml
\ No newline at end of file
diff --git a/docker/llm/serving/cpu/kubernetes/deployment.yaml b/docker/llm/serving/cpu/kubernetes/deployment.yaml
new file mode 100644
index 00000000..bd659fd4
--- /dev/null
+++ b/docker/llm/serving/cpu/kubernetes/deployment.yaml
@@ -0,0 +1,109 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: bigdl-fschat-a1234bd-controller
+  labels:
+    fastchat-appid: a1234bd
+    fastchat-app-type: controller
+spec:
+  dnsPolicy: "ClusterFirst"
+  containers:
+  - name: fastchat-controller # fixed
+    image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
+    imagePullPolicy: IfNotPresent
+    env:
+    - name: CONTROLLER_HOST # fixed
+      value: "0.0.0.0"
+    - name: CONTROLLER_PORT # fixed
+      value: "21005"
+    - name: API_HOST # fixed
+      value: "0.0.0.0"
+    - name: API_PORT # fixed
+      value: "8000"
+    ports:
+      - containerPort: 21005
+        name: con-port
+      - containerPort: 8000
+        name: api-port
+    resources:
+      requests:
+        memory: 16Gi
+        cpu: 4
+      limits:
+        memory: 16Gi
+        cpu: 4
+    args: ["-m", "controller"]
+  restartPolicy: "Never"
+---
+# Service for the controller
+apiVersion: v1
+kind: Service
+metadata:
+  name: bigdl-a1234bd-fschat-controller-service
+spec:
+  # You may want to change this to a service type suited to your cluster
+  type: NodePort
+  selector:
+    fastchat-appid: a1234bd
+    fastchat-app-type: controller
+  ports:
+    - name: cont-port
+      protocol: TCP
+      port: 21005
+      targetPort: 21005
+    - name: api-port
+      protocol: TCP
+      port: 8000
+      targetPort: 8000
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: bigdl-fschat-a1234bd-worker-deployment
+spec:
+  # Change this to the number you want
+  replicas: 1
+  selector:
+    matchLabels:
+      fastchat: worker
+  template:
+    metadata:
+      labels:
+        fastchat: worker
+    spec:
+      dnsPolicy: "ClusterFirst"
+      containers:
+      - name: fastchat-worker # fixed
+        image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
+        imagePullPolicy: IfNotPresent
+        env:
+        - name: CONTROLLER_HOST # fixed
+          value: bigdl-a1234bd-fschat-controller-service
+        - name: CONTROLLER_PORT # fixed
+          value: "21005"
+        - name: WORKER_HOST # fixed
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP
+        - name: WORKER_PORT # fixed
+          value: "21841"
+        - name: MODEL_PATH # Change this
+          value: "/llm/models/vicuna-7b-v1.5-bigdl/"
+        - name: OMP_NUM_THREADS
+          value: "16"
+        resources:
+          requests:
+            memory: 32Gi
+            cpu: 16
+          limits:
+            memory: 32Gi
+            cpu: 16
+        args: ["-m", "worker"]
+        volumeMounts:
+          - name: llm-models
+            mountPath: /llm/models/
+      restartPolicy: "Always"
+      volumes:
+        - name: llm-models
+          hostPath:
+            path: /home/llm/models # change this in other envs
\ No newline at end of file