Add Kubernetes support for BigDL-LLM-serving CPU. (#9071)

parent 36dd4afd61
commit b773d67dd4

6 changed files with 580 additions and 1 deletion

.github/workflows/manually_build.yml (27 lines changed, vendored)
@@ -12,6 +12,7 @@ on:

```yaml
        - all
        - bigdl-llm-xpu
        - bigdl-llm-cpu
        - bigdl-llm-serving-cpu
        - bigdl-ppml-gramine-base
        - bigdl-ppml-trusted-bigdl-llm-gramine-base
        - bigdl-ppml-trusted-bigdl-llm-gramine-ref
```
@@ -114,6 +115,32 @@ jobs:

```yaml
        sudo docker push 10.239.45.10/arda/${image}:${TAG}
        sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}

  bigdl-llm-serving-cpu:
    if: ${{ github.event.inputs.artifact == 'bigdl-llm-serving-cpu' || github.event.inputs.artifact == 'all' }}
    runs-on: [self-hosted, Shire]
    steps:
    - uses: actions/checkout@v3
    - name: docker login
      run: |
        docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
    - name: bigdl-llm-serving-cpu
      run: |
        echo "##############################################################"
        echo "####### bigdl-llm-serving-cpu ########"
        echo "##############################################################"
        export image=intelanalytics/bigdl-llm-serving-cpu
        cd docker/llm/serving/cpu/docker
        sudo docker build \
          --no-cache=true \
          --build-arg http_proxy=${HTTP_PROXY} \
          --build-arg https_proxy=${HTTPS_PROXY} \
          --build-arg no_proxy=${NO_PROXY} \
          -t ${image}:${TAG} -f ./Dockerfile .
        sudo docker push ${image}:${TAG}
        sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
        sudo docker push 10.239.45.10/arda/${image}:${TAG}
        sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}

  bigdl-ppml-gramine-base:
    if: ${{ github.event.inputs.artifact == 'bigdl-ppml-gramine-base' || github.event.inputs.artifact == 'all' }}
    runs-on: [self-hosted, Shire]
```
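
With this job in place, the serving-CPU image can be built on demand by dispatching the manual workflow and selecting the new `bigdl-llm-serving-cpu` artifact. A minimal sketch using the GitHub CLI (assuming you have repository access and the CLI configured; the input name `artifact` comes from the workflow itself):

```bash
# Dispatch the manual build workflow for the serving-CPU image only
gh workflow run manually_build.yml -f artifact=bigdl-llm-serving-cpu

# Or rebuild every image defined in the workflow
gh workflow run manually_build.yml -f artifact=all
```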

docker/llm/serving/cpu/docker/Dockerfile (9 lines changed)

@@ -2,10 +2,13 @@ FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT

```dockerfile
ARG http_proxy
ARG https_proxy
ARG TINI_VERSION=v0.18.0

# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false

ADD ./entrypoint.sh /opt/entrypoint.sh
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
# Install Serving Dependencies
RUN mkdir /llm && \
    cd /llm && \
```

@@ -13,7 +16,11 @@ RUN mkdir /llm && \

```dockerfile
    cd FastChat && \
    git checkout dev-2023-09-22 && \
    pip3 install -e ".[model_worker,webui]" && \
    cd /llm
    cd /llm && \
    chmod +x /opt/entrypoint.sh && \
    chmod +x /sbin/tini && \
    cp /sbin/tini /usr/bin/tini


WORKDIR /llm/
ENTRYPOINT [ "/opt/entrypoint.sh" ]
```

docker/llm/serving/cpu/docker/entrypoint.sh (new file, 200 lines)
@@ -0,0 +1,200 @@

```bash
#!/bin/bash

usage() {
  echo "Usage: $0 [-m --mode <controller|worker>] [-h --help]"
  echo "-h: Print help message."
  echo "Controller mode reads the following env:"
  echo "CONTROLLER_HOST (default: localhost)."
  echo "CONTROLLER_PORT (default: 21001)."
  echo "API_HOST (default: localhost)."
  echo "API_PORT (default: 8000)."
  echo "Worker mode reads the following env:"
  echo "CONTROLLER_HOST (default: localhost)."
  echo "CONTROLLER_PORT (default: 21001)."
  echo "WORKER_HOST (default: localhost)."
  echo "WORKER_PORT (default: 21002)."
  echo "MODEL_PATH (default: empty)."
  exit 1
}

# Acquire the correct core count when cpuset-cpus is used; return -1 if the file does not exist
calculate_total_cores() {
  local cpuset_file="/sys/fs/cgroup/cpuset/cpuset.cpus"

  if [[ -f "$cpuset_file" ]]; then
    local cpuset_cpus=$(cat "$cpuset_file")
    cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')

    local total_cores=0
    IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
    for cpu in "${cpu_list[@]}"; do
      if [[ $cpu =~ - ]]; then
        # Range of CPUs
        local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
        local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
        local range_cores=$((end_cpu - start_cpu + 1))
        total_cores=$((total_cores + range_cores))
      else
        # Single CPU
        total_cores=$((total_cores + 1))
      fi
    done

    echo $total_cores
    return
  fi
  # Kubernetes core-binding will use this file
  cpuset_file="/sys/fs/cgroup/cpuset.cpus"
  if [[ -f "$cpuset_file" ]]; then
    local cpuset_cpus=$(cat "$cpuset_file")
    cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')

    local total_cores=0
    IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
    for cpu in "${cpu_list[@]}"; do
      if [[ $cpu =~ - ]]; then
        # Range of CPUs
        local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
        local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
        local range_cores=$((end_cpu - start_cpu + 1))
        total_cores=$((total_cores + range_cores))
      else
        # Single CPU
        total_cores=$((total_cores + 1))
      fi
    done

    echo $total_cores
    return
  else
    echo -1
    return
  fi
}

# Default values
controller_host="localhost"
controller_port="21001"
api_host="localhost"
api_port="8000"
worker_host="localhost"
worker_port="21002"
model_path=""
mode=""
omp_num_threads=""
dispatch_method="shortest_queue" # shortest_queue or lottery

# Update rootCA config if needed
update-ca-certificates

# Remember the value of `OMP_NUM_THREADS`:
if [[ -n "${OMP_NUM_THREADS}" ]]; then
  omp_num_threads="${OMP_NUM_THREADS}"
fi

# If no arguments are passed in, just run bash
if [ "$#" == 0 ]; then
  echo "[INFO] no command is passed in"
  echo "[INFO] enter pass-through mode"
  exec /usr/bin/tini -s -- "bash"
else
  # Parse command-line options
  options=$(getopt -o "m:h" --long "mode:,help" -n "$0" -- "$@")
  if [ $? != 0 ]; then
    usage
  fi
  eval set -- "$options"

  while true; do
    case "$1" in
      -m|--mode)
        mode="$2"
        [[ $mode == "controller" || $mode == "worker" ]] || usage
        shift 2
        ;;
      -h|--help)
        usage
        ;;
      --)
        shift
        break
        ;;
      *)
        usage
        ;;
    esac
  done

  if [[ -n $CONTROLLER_HOST ]]; then
    controller_host=$CONTROLLER_HOST
  fi

  if [[ -n $CONTROLLER_PORT ]]; then
    controller_port=$CONTROLLER_PORT
  fi

  if [[ -n $API_HOST ]]; then
    api_host=$API_HOST
  fi

  if [[ -n $API_PORT ]]; then
    api_port=$API_PORT
  fi

  if [[ -n $WORKER_HOST ]]; then
    worker_host=$WORKER_HOST
  fi

  if [[ -n $WORKER_PORT ]]; then
    worker_port=$WORKER_PORT
  fi

  if [[ -n $MODEL_PATH ]]; then
    model_path=$MODEL_PATH
  fi

  if [[ -n $DISPATCH_METHOD ]]; then
    dispatch_method=$DISPATCH_METHOD
  fi

  controller_address="http://$controller_host:$controller_port"
  # Execute logic based on options
  if [[ $mode == "controller" ]]; then
    # Logic for controller mode
    # Boot Controller
    api_address="http://$api_host:$api_port"
    echo "Controller address: $controller_address"
    echo "OpenAI API address: $api_address"
    python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
    # Boot openai api server
    python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address
  else
    # Logic for non-controller (worker) mode
    worker_address="http://$worker_host:$worker_port"
    # Apply optimizations from bigdl-nano
    source bigdl-nano-init -t
    # First check whether the user has set OMP_NUM_THREADS themselves
    if [[ -n "${omp_num_threads}" ]]; then
      echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads"
      export OMP_NUM_THREADS=$omp_num_threads
    else
      # Use calculate_total_cores to acquire cpuset settings
      # Set OMP_NUM_THREADS to correct numbers
      cores=$(calculate_total_cores)
      if [[ $cores == -1 || $cores == 0 ]]; then
        echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS"
      else
        echo "Setting OMP_NUM_THREADS to $cores"
        export OMP_NUM_THREADS=$cores
      fi
    fi
    if [[ -z "${model_path}" ]]; then
      echo "Please set env MODEL_PATH used for worker"
      usage
    fi
    echo "Worker address: $worker_address"
    echo "Controller address: $controller_address"
    python3 -m fastchat.serve.model_worker --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
  fi
fi
```
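
This entrypoint drives both the controller and the worker containers in the Kubernetes manifests below. As a quick local sanity check outside Kubernetes, it can also be exercised directly with Docker; the following is purely illustrative, and the ports, host model path, and controller address placeholder are assumptions:

```bash
# Controller plus OpenAI-compatible API server
sudo docker run --rm -d --name fschat-controller \
  -e CONTROLLER_HOST=0.0.0.0 -e CONTROLLER_PORT=21005 \
  -e API_HOST=0.0.0.0 -e API_PORT=8000 \
  -p 21005:21005 -p 8000:8000 \
  intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT -m controller

# Worker pointing at the controller above, serving a locally downloaded model
sudo docker run --rm -d --name fschat-worker \
  -e CONTROLLER_HOST=<controller-ip> -e CONTROLLER_PORT=21005 \
  -e MODEL_PATH=/llm/models/vicuna-7b-v1.5-bigdl \
  -v /home/llm/models:/llm/models \
  intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT -m worker
```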

docker/llm/serving/cpu/kubernetes/README.md (new file, 235 lines)
@@ -0,0 +1,235 @@

## Deploying the bigdl-llm serving service in a K8S environment

## Image

To deploy BigDL-LLM-serving CPU in a Kubernetes environment, please use this image: `intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT`
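
Since the pods below use `imagePullPolicy: IfNotPresent`, it can help to pre-pull the image on each node; a minimal sketch, to be adapted to your container runtime and registry mirror:

```bash
# Pre-pull the serving image so pod startup does not wait on the download
sudo docker pull intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
```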

## Before deployment

### Models

In this document, we will use `vicuna-7b-v1.5` as the deployment model.

After downloading the model, please rename it from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-bigdl` so that `bigdl-llm` is used as the backend. The `bigdl-llm` backend is used whenever the model path contains `bigdl`; otherwise, the original transformers backend is used.

You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5).
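
A minimal sketch of downloading and renaming the model (assuming `git` and `git-lfs` are available; any download method works as long as the directory name ends up containing `bigdl`):

```bash
# Clone the model weights from Hugging Face (git-lfs is needed for the large files)
git lfs install
git clone https://huggingface.co/lmsys/vicuna-7b-v1.5

# Rename the directory so the serving stack picks the bigdl-llm backend
mv vicuna-7b-v1.5 vicuna-7b-v1.5-bigdl
```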

### Kubernetes config

We recommend setting up your Kubernetes cluster before deployment. Most importantly, please set the `cpu-management-policy` to `static` by following this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). It is also advisable to set the `topology management policy` to `single-numa-node`.
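
As a rough sketch, on each worker node this means pointing the kubelet at a static CPU manager policy and a single-NUMA topology policy; the exact flags, file locations, and service names below are assumptions that vary by distribution, so follow the linked tutorial for your cluster:

```bash
# Example kubelet settings (often configured via the kubelet config file instead of flags):
#   --cpu-manager-policy=static
#   --topology-manager-policy=single-numa-node
#   plus reserved CPUs, e.g. --kube-reserved=cpu=1 --system-reserved=cpu=1

# When changing the CPU manager policy, the old state file must be removed first
sudo systemctl stop kubelet
sudo rm -f /var/lib/kubelet/cpu_manager_state
sudo systemctl start kubelet
```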

### Machine config

Turn hyper-threading off so that only physical cores are used during deployment.
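
A quick, purely illustrative way to check whether hyper-threading is active on a node:

```bash
# "Thread(s) per core: 1" means hyper-threading is off
lscpu | grep 'Thread(s) per core'
```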

## Deployment

### Reminder on `OMP_NUM_THREADS`

The entrypoint of the image tries to set `OMP_NUM_THREADS` to the correct number by reading the cpuset configuration from the runtime. However, this only works correctly if the `core-binding` feature is enabled. If it is not, please set the `OMP_NUM_THREADS` environment variable manually in the yaml file.
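
To see what the entrypoint will observe, you can inspect the cpuset of a running worker pod; the pod name below is a placeholder:

```bash
# cgroup v1 and cgroup v2 paths, respectively; the entrypoint checks both
kubectl exec <worker-pod-name> -- cat /sys/fs/cgroup/cpuset/cpuset.cpus
kubectl exec <worker-pod-name> -- cat /sys/fs/cgroup/cpuset.cpus
```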

### Controller

We use the following yaml file for the controller deployment:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: bigdl-fschat-a1234bd-controller
  labels:
    fastchat-appid: a1234bd
    fastchat-app-type: controller
spec:
  dnsPolicy: "ClusterFirst"
  containers:
  - name: fastchat-controller # fixed
    image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
    imagePullPolicy: IfNotPresent
    env:
    - name: CONTROLLER_HOST # fixed
      value: "0.0.0.0"
    - name: CONTROLLER_PORT # fixed
      value: "21005"
    - name: API_HOST # fixed
      value: "0.0.0.0"
    - name: API_PORT # fixed
      value: "8000"
    ports:
      - containerPort: 21005
        name: con-port
      - containerPort: 8000
        name: api-port
    resources:
      requests:
        memory: 16Gi
        cpu: 4
      limits:
        memory: 16Gi
        cpu: 4
    args: ["-m", "controller"]
  restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
  name: bigdl-a1234bd-fschat-controller-service
spec:
  # You may also want to change this to use the cluster's feature
  type: NodePort
  selector:
    fastchat-appid: a1234bd
    fastchat-app-type: controller
  ports:
    - name: cont-port
      protocol: TCP
      port: 21005
      targetPort: 21005
    - name: api-port
      protocol: TCP
      port: 8000
      targetPort: 8000
```

### Worker

We use the following yaml file for the worker deployment:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: bigdl-fschat-a1234bd-worker-deployment
spec:
  # Change this to the number you want
  replicas: 1
  selector:
    matchLabels:
      fastchat: worker
  template:
    metadata:
      labels:
        fastchat: worker
    spec:
      dnsPolicy: "ClusterFirst"
      containers:
      - name: fastchat-worker # fixed
        image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
        imagePullPolicy: IfNotPresent
        env:
        - name: CONTROLLER_HOST # fixed
          value: bigdl-a1234bd-fschat-controller-service
        - name: CONTROLLER_PORT # fixed
          value: "21005"
        - name: WORKER_HOST # fixed
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: WORKER_PORT # fixed
          value: "21841"
        - name: MODEL_PATH # Change this
          value: "/llm/models/vicuna-7b-v1.5-bigdl/"
        - name: OMP_NUM_THREADS
          value: "16"
        resources:
          requests:
            memory: 32Gi
            cpu: 16
          limits:
            memory: 32Gi
            cpu: 16
        args: ["-m", "worker"]
        volumeMounts:
          - name: llm-models
            mountPath: /llm/models/
      restartPolicy: "Always"
      volumes:
      - name: llm-models
        hostPath:
          path: /home/llm/models # change this in other envs
```

You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly.
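
Once the yaml matches your environment, applying it and checking that the controller and workers are up could look like this (a sketch; the manifest filename is whatever you saved the yaml as, for example the bundled `deployment.yaml`):

```bash
kubectl apply -f deployment.yaml

# Controller pod and its NodePort service
kubectl get pod bigdl-fschat-a1234bd-controller
kubectl get service bigdl-a1234bd-fschat-controller-service

# Worker replicas
kubectl get pods -l fastchat=worker
```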

### Testing

#### Using openai-python

First, install openai-python:
```bash
pip install --upgrade openai
```

Then, interact with the model `vicuna-7b-v1.5-bigdl`:
```python
import openai
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"

model = "vicuna-7b-v1.5-bigdl"
prompt = "Once upon a time"

# create a completion
completion = openai.Completion.create(model=model, prompt=prompt, max_tokens=64)
# print the completion
print(prompt + completion.choices[0].text)

# create a chat completion
completion = openai.ChatCompletion.create(
  model=model,
  messages=[{"role": "user", "content": "Hello! What is your name?"}]
)
# print the completion
print(completion.choices[0].message.content)
```

#### cURL
cURL is another good tool for observing the output of the API.

For the following examples, you may also need to change the service address to match your deployment.
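
If the NodePort service is not directly reachable from your machine, `kubectl port-forward` is a convenient stand-in for `localhost:8000` (illustrative only):

```bash
kubectl port-forward service/bigdl-a1234bd-fschat-controller-service 8000:8000
```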

List Models:
```bash
curl http://localhost:8000/v1/models
```

If you have `jq` installed, you can use it to format the output like this:
```bash
curl http://localhost:8000/v1/models | jq
```

Chat Completions:
```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "YOUR_MODEL",
    "messages": [{"role": "user", "content": "Hello! What is your name?"}]
  }'
```

Text Completions:
```bash
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "YOUR_MODEL",
    "prompt": "Once upon a time",
    "max_tokens": 41,
    "temperature": 0.5
  }'
```

Embeddings:
```bash
curl http://localhost:8000/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "YOUR_MODEL",
    "input": "Hello world!"
  }'
```

docker/llm/serving/cpu/kubernetes/clean.sh (new file, 1 line)
@@ -0,0 +1 @@

```bash
kubectl delete -f deployment.yaml
```

docker/llm/serving/cpu/kubernetes/deployment.yaml (new file, 109 lines)
@@ -0,0 +1,109 @@

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: bigdl-fschat-a1234bd-controller
  labels:
    fastchat-appid: a1234bd
    fastchat-app-type: controller
spec:
  dnsPolicy: "ClusterFirst"
  containers:
  - name: fastchat-controller # fixed
    image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
    imagePullPolicy: IfNotPresent
    env:
    - name: CONTROLLER_HOST # fixed
      value: "0.0.0.0"
    - name: CONTROLLER_PORT # fixed
      value: "21005"
    - name: API_HOST # fixed
      value: "0.0.0.0"
    - name: API_PORT # fixed
      value: "8000"
    ports:
      - containerPort: 21005
        name: con-port
      - containerPort: 8000
        name: api-port
    resources:
      requests:
        memory: 16Gi
        cpu: 4
      limits:
        memory: 16Gi
        cpu: 4
    args: ["-m", "controller"]
  restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
  name: bigdl-a1234bd-fschat-controller-service
spec:
  # You may also want to change this to use the cluster's feature
  type: NodePort
  selector:
    fastchat-appid: a1234bd
    fastchat-app-type: controller
  ports:
    - name: cont-port
      protocol: TCP
      port: 21005
      targetPort: 21005
    - name: api-port
      protocol: TCP
      port: 8000
      targetPort: 8000
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: bigdl-fschat-a1234bd-worker-deployment
spec:
  # Change this to the number you want
  replicas: 1
  selector:
    matchLabels:
      fastchat: worker
  template:
    metadata:
      labels:
        fastchat: worker
    spec:
      dnsPolicy: "ClusterFirst"
      containers:
      - name: fastchat-worker # fixed
        image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
        imagePullPolicy: IfNotPresent
        env:
        - name: CONTROLLER_HOST # fixed
          value: bigdl-a1234bd-fschat-controller-service
        - name: CONTROLLER_PORT # fixed
          value: "21005"
        - name: WORKER_HOST # fixed
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: WORKER_PORT # fixed
          value: "21841"
        - name: MODEL_PATH # Change this
          value: "/llm/models/vicuna-7b-v1.5-bigdl/"
        - name: OMP_NUM_THREADS
          value: "16"
        resources:
          requests:
            memory: 32Gi
            cpu: 16
          limits:
            memory: 32Gi
            cpu: 16
        args: ["-m", "worker"]
        volumeMounts:
          - name: llm-models
            mountPath: /llm/models/
      restartPolicy: "Always"
      volumes:
      - name: llm-models
        hostPath:
          path: /home/llm/models # change this in other envs
```