ipex-llm/docker/llm/serving/cpu/docker/entrypoint.sh

#!/bin/bash

usage() {
  echo "Usage: $0 [-m --mode <controller|worker>] [-h --help] [-w --worker <model_worker|vllm_worker>]"
  echo "-h: Print help message."
  echo "Controller mode reads the following env:"
  echo "CONTROLLER_HOST (default: localhost)."
  echo "CONTROLLER_PORT (default: 21001)."
  echo "API_HOST (default: localhost)."
  echo "API_PORT (default: 8000)."
  echo "Worker mode reads the following env:"
  echo "CONTROLLER_HOST (default: localhost)."
  echo "CONTROLLER_PORT (default: 21001)."
  echo "WORKER_HOST (default: localhost)."
  echo "WORKER_PORT (default: 21002)."
  echo "MODEL_PATH (default: empty)."
  echo "STREAM_INTERVAL (default: 1)."
  exit 1
}

# Acquire correct core_nums if using cpuset-cpus, return -1 if file not exist
calculate_total_cores() {
  local cpuset_file="/sys/fs/cgroup/cpuset/cpuset.cpus"

  if [[ -f "$cpuset_file" ]]; then
    local cpuset_cpus=$(cat "$cpuset_file")
    cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')

    local total_cores=0
    IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
    for cpu in "${cpu_list[@]}"; do
      if [[ $cpu =~ - ]]; then
        # Range of CPUs
        local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
        local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
        local range_cores=$((end_cpu - start_cpu + 1))
        total_cores=$((total_cores + range_cores))
      else
        # Single CPU
        total_cores=$((total_cores + 1))
      fi
    done

    echo $total_cores
    return
  fi
  # Kubernetes core-binding will use this file
  cpuset_file="/sys/fs/cgroup/cpuset.cpus"
  if [[ -f "$cpuset_file" ]]; then
    local cpuset_cpus=$(cat "$cpuset_file")
    cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')

    local total_cores=0
    IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
    for cpu in "${cpu_list[@]}"; do
      if [[ $cpu =~ - ]]; then
        # Range of CPUs
        local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
        local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
        local range_cores=$((end_cpu - start_cpu + 1))
        total_cores=$((total_cores + range_cores))
      else
        # Single CPU
        total_cores=$((total_cores + 1))
      fi
    done

    echo $total_cores
    return
  else
    echo -1
    return
  fi
}

# Default values
controller_host="localhost"
controller_port="21001"
api_host="localhost"
api_port="8000"
worker_host="localhost"
worker_port="21002"
model_path=""
mode=""
omp_num_threads=""
dispatch_method="shortest_queue" # shortest_queue or lottery
stream_interval=1
worker_type="model_worker"

# Update rootCA config if needed
update-ca-certificates

# Remember the value of `OMP_NUM_THREADS`:
if [[ -n "${OMP_NUM_THREADS}" ]]; then
  omp_num_threads="${OMP_NUM_THREADS}"
fi

# We do not have any arguments, just run bash
if [ "$#" == 0 ]; then
  echo "[INFO] no command is passed in"
  echo "[INFO] enter pass-through mode"
  exec /usr/bin/tini -s -- "bash"
else
  # Parse command-line options
  options=$(getopt -o "m:hw:" --long "mode:,help,worker:" -n "$0" -- "$@")
  if [ $? != 0 ]; then
    usage
  fi
  eval set -- "$options"

  while true; do
    case "$1" in
      -m|--mode)
        mode="$2"
        [[ $mode == "controller" || $mode == "worker" ]] || usage
        shift 2
        ;;
      -w|--worker)
        worker_type="$2"
        [[ $worker_type == "model_worker" || $worker_type == "vllm_worker" ]] || usage
        shift 2
        ;;
      -h|--help)
        usage
        ;;
      --)
        shift
        break
        ;;
      *)
        usage
        ;;
    esac
  done

  if [ "$worker_type" == "model_worker" ]; then
      worker_type="bigdl.llm.serving.model_worker"
  elif [ "$worker_type" == "vllm_worker" ]; then
      worker_type="bigdl.llm.serving.vllm_worker"
  fi

  if [[ -n $CONTROLLER_HOST ]]; then
    controller_host=$CONTROLLER_HOST
  fi

  if [[ -n $CONTROLLER_PORT ]]; then
    controller_port=$CONTROLLER_PORT
  fi

  if [[ -n $API_HOST ]]; then
    api_host=$API_HOST
  fi

  if [[ -n $API_PORT ]]; then
    api_port=$API_PORT
  fi

  if [[ -n $WORKER_HOST ]]; then
    worker_host=$WORKER_HOST
  fi

  if [[ -n $WORKER_PORT ]]; then
    worker_port=$WORKER_PORT
  fi

  if [[ -n $MODEL_PATH ]]; then
    model_path=$MODEL_PATH
  fi

  if [[ -n $DISPATCH_METHOD ]]; then
    dispatch_method=$DISPATCH_METHOD
  fi

  if [[ -n $STREAM_INTERVAL ]]; then
    stream_interval=$STREAM_INTERVAL
  fi

  controller_address="http://$controller_host:$controller_port"
  # Execute logic based on options
  if [[ $mode == "controller" ]]; then
    # Logic for controller mode
    # Boot Controller
    api_address="http://$api_host:$api_port"
    echo "Controller address: $controller_address"
    echo "OpenAI API address: $api_address"
    python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
    # Boot openai api server
    python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address
  else
    # Logic for non-controller(worker) mode
    worker_address="http://$worker_host:$worker_port"
    # Apply optimizations from bigdl-llm
    source bigdl-llm-init -t
    # First check if user have set OMP_NUM_THREADS by themselves
    if [[ -n "${omp_num_threads}" ]]; then
      echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads"
      export OMP_NUM_THREADS=$omp_num_threads
    else
      # Use calculate_total_cores to acquire cpuset settings
      # Set OMP_NUM_THREADS to correct numbers
      cores=$(calculate_total_cores)
      if [[ $cores == -1 || $cores == 0 ]]; then
        echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS"
      else
        echo "Setting OMP_NUM_THREADS to $cores"
        export OMP_NUM_THREADS=$cores
      fi
    fi
    if [[ -z "${model_path}" ]]; then
          echo "Please set env MODEL_PATH used for worker"
          usage
    fi
    echo "Worker type: $worker_type"
    echo "Worker address: $worker_address"
    echo "Controller address: $controller_address"
    if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then
      python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval
    elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then
      python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
    fi
  fi
fi

exec /usr/bin/bash -s -- "bash"