Add Kubernetes support for BigDL-LLM-serving CPU. (#9071)
parent 36dd4afd61
commit b773d67dd4

6 changed files with 580 additions and 1 deletion
27  .github/workflows/manually_build.yml (vendored)

@@ -12,6 +12,7 @@ on:
         - all
         - bigdl-llm-xpu
         - bigdl-llm-cpu
+        - bigdl-llm-serving-cpu
         - bigdl-ppml-gramine-base
         - bigdl-ppml-trusted-bigdl-llm-gramine-base
         - bigdl-ppml-trusted-bigdl-llm-gramine-ref
@@ -114,6 +115,32 @@ jobs:
         sudo docker push 10.239.45.10/arda/${image}:${TAG}
         sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
 
+  bigdl-llm-serving-cpu:
+    if: ${{ github.event.inputs.artifact == 'bigdl-llm-serving-cpu' || github.event.inputs.artifact == 'all' }}
+    runs-on: [self-hosted, Shire]
+    steps:
+    - uses: actions/checkout@v3
+    - name: docker login
+      run: |
+        docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
+    - name: bigdl-llm-serving-cpu
+      run: |
+        echo "##############################################################"
+        echo "####### bigdl-llm-serving-cpu ########"
+        echo "##############################################################"
+        export image=intelanalytics/bigdl-llm-serving-cpu
+        cd docker/llm/serving/cpu/docker
+        sudo docker build \
+          --no-cache=true \
+          --build-arg http_proxy=${HTTP_PROXY} \
+          --build-arg https_proxy=${HTTPS_PROXY} \
+          --build-arg no_proxy=${NO_PROXY} \
+          -t ${image}:${TAG} -f ./Dockerfile .
+        sudo docker push ${image}:${TAG}
+        sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
+        sudo docker push 10.239.45.10/arda/${image}:${TAG}
+        sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
+
   bigdl-ppml-gramine-base:
     if: ${{ github.event.inputs.artifact == 'bigdl-ppml-gramine-base' || github.event.inputs.artifact == 'all' }}
     runs-on: [self-hosted, Shire]
docker/llm/serving/cpu/docker/Dockerfile

@@ -2,10 +2,13 @@ FROM intelanalytics/bigdl-llm-cpu:2.4.0-SNAPSHOT
 
 ARG http_proxy
 ARG https_proxy
+ARG TINI_VERSION=v0.18.0
 
 # Disable pip's cache behavior
 ARG PIP_NO_CACHE_DIR=false
 
+ADD ./entrypoint.sh /opt/entrypoint.sh
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
 # Install Serving Dependencies
 RUN mkdir /llm && \
     cd /llm && \
@@ -13,7 +16,11 @@ RUN mkdir /llm && \
     cd FastChat && \
     git checkout dev-2023-09-22 && \
     pip3 install -e ".[model_worker,webui]" && \
-    cd /llm
+    cd /llm && \
+    chmod +x /opt/entrypoint.sh && \
+    chmod +x /sbin/tini && \
+    cp /sbin/tini /usr/bin/tini
 
 
 WORKDIR /llm/
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
200  docker/llm/serving/cpu/docker/entrypoint.sh (new file)

@@ -0,0 +1,200 @@
#!/bin/bash

usage() {
  echo "Usage: $0 [-m --mode <controller|worker>] [-h --help]"
  echo "-h: Print help message."
  echo "Controller mode reads the following env:"
  echo "CONTROLLER_HOST (default: localhost)."
  echo "CONTROLLER_PORT (default: 21001)."
  echo "API_HOST (default: localhost)."
  echo "API_PORT (default: 8000)."
  echo "Worker mode reads the following env:"
  echo "CONTROLLER_HOST (default: localhost)."
  echo "CONTROLLER_PORT (default: 21001)."
  echo "WORKER_HOST (default: localhost)."
  echo "WORKER_PORT (default: 21002)."
  echo "MODEL_PATH (default: empty)."
  exit 1
}

# Acquire the correct core count when --cpuset-cpus is used; echo -1 if neither cgroup file exists
calculate_total_cores() {
  local cpuset_file="/sys/fs/cgroup/cpuset/cpuset.cpus"

  if [[ -f "$cpuset_file" ]]; then
    local cpuset_cpus=$(cat "$cpuset_file")
    cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')

    local total_cores=0
    IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
    for cpu in "${cpu_list[@]}"; do
      if [[ $cpu =~ - ]]; then
        # Range of CPUs
        local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
        local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
        local range_cores=$((end_cpu - start_cpu + 1))
        total_cores=$((total_cores + range_cores))
      else
        # Single CPU
        total_cores=$((total_cores + 1))
      fi
    done

    echo $total_cores
    return
  fi
  # Kubernetes core-binding will use this file
  cpuset_file="/sys/fs/cgroup/cpuset.cpus"
  if [[ -f "$cpuset_file" ]]; then
    local cpuset_cpus=$(cat "$cpuset_file")
    cpuset_cpus=$(echo "${cpuset_cpus}" | tr -d '\n')

    local total_cores=0
    IFS=',' read -ra cpu_list <<< "$cpuset_cpus"
    for cpu in "${cpu_list[@]}"; do
      if [[ $cpu =~ - ]]; then
        # Range of CPUs
        local start_cpu=$(echo "$cpu" | cut -d'-' -f1)
        local end_cpu=$(echo "$cpu" | cut -d'-' -f2)
        local range_cores=$((end_cpu - start_cpu + 1))
        total_cores=$((total_cores + range_cores))
      else
        # Single CPU
        total_cores=$((total_cores + 1))
      fi
    done

    echo $total_cores
    return
  else
    echo -1
    return
  fi
}

# Default values
controller_host="localhost"
controller_port="21001"
api_host="localhost"
api_port="8000"
worker_host="localhost"
worker_port="21002"
model_path=""
mode=""
omp_num_threads=""
dispatch_method="shortest_queue" # shortest_queue or lottery

# Update rootCA config if needed
update-ca-certificates

# Remember the value of `OMP_NUM_THREADS`:
if [[ -n "${OMP_NUM_THREADS}" ]]; then
  omp_num_threads="${OMP_NUM_THREADS}"
fi

# We do not have any arguments, just run bash
if [ "$#" == 0 ]; then
  echo "[INFO] no command is passed in"
  echo "[INFO] enter pass-through mode"
  exec /usr/bin/tini -s -- "bash"
else
  # Parse command-line options
  options=$(getopt -o "m:h" --long "mode:,help" -n "$0" -- "$@")
  if [ $? != 0 ]; then
    usage
  fi
  eval set -- "$options"

  while true; do
    case "$1" in
      -m|--mode)
        mode="$2"
        [[ $mode == "controller" || $mode == "worker" ]] || usage
        shift 2
        ;;
      -h|--help)
        usage
        ;;
      --)
        shift
        break
        ;;
      *)
        usage
        ;;
    esac
  done

  if [[ -n $CONTROLLER_HOST ]]; then
    controller_host=$CONTROLLER_HOST
  fi

  if [[ -n $CONTROLLER_PORT ]]; then
    controller_port=$CONTROLLER_PORT
  fi

  if [[ -n $API_HOST ]]; then
    api_host=$API_HOST
  fi

  if [[ -n $API_PORT ]]; then
    api_port=$API_PORT
  fi

  if [[ -n $WORKER_HOST ]]; then
    worker_host=$WORKER_HOST
  fi

  if [[ -n $WORKER_PORT ]]; then
    worker_port=$WORKER_PORT
  fi

  if [[ -n $MODEL_PATH ]]; then
    model_path=$MODEL_PATH
  fi

  if [[ -n $DISPATCH_METHOD ]]; then
    dispatch_method=$DISPATCH_METHOD
  fi

  controller_address="http://$controller_host:$controller_port"
  # Execute logic based on options
  if [[ $mode == "controller" ]]; then
    # Logic for controller mode
    # Boot Controller
    api_address="http://$api_host:$api_port"
    echo "Controller address: $controller_address"
    echo "OpenAI API address: $api_address"
    python3 -m fastchat.serve.controller --host $controller_host --port $controller_port --dispatch-method $dispatch_method &
    # Boot openai api server
    python3 -m fastchat.serve.openai_api_server --host $api_host --port $api_port --controller-address $controller_address
  else
    # Logic for non-controller (worker) mode
    worker_address="http://$worker_host:$worker_port"
    # Apply optimizations from bigdl-nano
    source bigdl-nano-init -t
    # First check if the user has set OMP_NUM_THREADS themselves
    if [[ -n "${omp_num_threads}" ]]; then
      echo "Setting OMP_NUM_THREADS to its original value: $omp_num_threads"
      export OMP_NUM_THREADS=$omp_num_threads
    else
      # Use calculate_total_cores to acquire cpuset settings
      # Set OMP_NUM_THREADS to the correct number
      cores=$(calculate_total_cores)
      if [[ $cores == -1 || $cores == 0 ]]; then
        echo "Failed to obtain the number of cores, will use the default settings OMP_NUM_THREADS=$OMP_NUM_THREADS"
      else
        echo "Setting OMP_NUM_THREADS to $cores"
        export OMP_NUM_THREADS=$cores
      fi
    fi
    if [[ -z "${model_path}" ]]; then
      echo "Please set env MODEL_PATH used for worker"
      usage
    fi
    echo "Worker address: $worker_address"
    echo "Controller address: $controller_address"
    python3 -m fastchat.serve.model_worker --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
  fi
fi
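The entrypoint above also lends itself to a quick local smoke test outside Kubernetes. The sketch below is illustrative only (the host paths, cpuset range, and use of host networking are assumptions, not part of this commit); it simply exercises the `-m` flag and the environment variables documented in `usage()`, plus `--cpuset-cpus`, which is the runtime setting the core-counting logic reads:

```bash
# Hypothetical local run; adjust paths and ports to your environment.
# Controller + OpenAI API server:
sudo docker run -itd --net=host \
  -e CONTROLLER_HOST=localhost -e CONTROLLER_PORT=21001 \
  -e API_HOST=localhost -e API_PORT=8000 \
  intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT -m controller

# Worker pinned to 16 cores; the entrypoint derives OMP_NUM_THREADS from the cpuset.
sudo docker run -itd --net=host --cpuset-cpus="0-15" \
  -e CONTROLLER_HOST=localhost -e CONTROLLER_PORT=21001 \
  -e WORKER_HOST=localhost -e WORKER_PORT=21002 \
  -e MODEL_PATH=/llm/models/vicuna-7b-v1.5-bigdl \
  -v /home/llm/models:/llm/models \
  intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT -m worker
```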
235  docker/llm/serving/cpu/kubernetes/README.md (new file)

@@ -0,0 +1,235 @@
## Deploying the bigdl-llm serving service in a K8s environment

## Image

To deploy BigDL-LLM serving on CPU in a Kubernetes environment, please use this image: `intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT`
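If you want the image present on the nodes before the pods are scheduled (the manifests below use `imagePullPolicy: IfNotPresent`), you can optionally pre-pull it; this is just a convenience, not a required step:

```bash
docker pull intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
```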
## Before deployment

### Models

In this document, we will use `vicuna-7b-v1.5` as the deployment model.

After downloading the model, please rename the model directory from `vicuna-7b-v1.5` to `vicuna-7b-v1.5-bigdl` so that `bigdl-llm` is used as the backend. The `bigdl-llm` backend is used whenever the model path contains `bigdl`; otherwise, the original transformers backend is used.

You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5).
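The rename is just a directory move. A minimal sketch, assuming the model was downloaded into the host directory that is later mounted into the worker pods (the local path is an assumption matching the `hostPath` used below):

```bash
cd /home/llm/models
# Path now contains "bigdl", so the bigdl-llm backend is selected.
mv vicuna-7b-v1.5 vicuna-7b-v1.5-bigdl
```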
### Kubernetes config

We recommend setting up your Kubernetes cluster before deployment. Most importantly, please set the `cpu-management-policy` to `static` by following this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). It is also best to set the `topology management policy` to `single-numa-node`.
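As a rough sketch only (the exact mechanism depends on how kubelet is provisioned in your cluster, and the reserved CPU range below is a placeholder), these are the kubelet settings the paragraph above refers to:

```bash
# Illustrative kubelet flags; a KubeletConfiguration file can express the same settings.
# The static CPU manager additionally requires some CPUs to be reserved for system daemons.
kubelet --cpu-manager-policy=static \
        --topology-manager-policy=single-numa-node \
        --reserved-cpus=0-1
```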
### Machine config

Turn hyper-threading off so that only physical cores are used during deployment.
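A quick way to check whether hyper-threading is active on a node (purely a verification sketch, not part of this commit):

```bash
# "Thread(s) per core: 1" means hyper-threading is off.
lscpu | grep "Thread(s) per core"
```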
## Deployment

### Reminder on `OMP_NUM_THREADS`

The entrypoint of the image will try to set `OMP_NUM_THREADS` to the correct number by reading the cpuset configuration from the runtime. However, this only works correctly if the `core-binding` feature is enabled. If it is not, please set the environment variable `OMP_NUM_THREADS` manually in the yaml file.
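If you are unsure whether core-binding took effect, you can inspect the cpuset the worker container actually received; this is a diagnostic sketch only, and the pod name below is a placeholder for whatever the Deployment generates:

```bash
# Replace the pod name with the real one from `kubectl get pods`.
kubectl exec -it bigdl-fschat-a1234bd-worker-deployment-xxxxx -- cat /sys/fs/cgroup/cpuset.cpus
```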
### Controller

We use the following yaml file for controller deployment:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: bigdl-fschat-a1234bd-controller
  labels:
    fastchat-appid: a1234bd
    fastchat-app-type: controller
spec:
  dnsPolicy: "ClusterFirst"
  containers:
  - name: fastchat-controller # fixed
    image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
    imagePullPolicy: IfNotPresent
    env:
    - name: CONTROLLER_HOST # fixed
      value: "0.0.0.0"
    - name: CONTROLLER_PORT # fixed
      value: "21005"
    - name: API_HOST # fixed
      value: "0.0.0.0"
    - name: API_PORT # fixed
      value: "8000"
    ports:
    - containerPort: 21005
      name: con-port
    - containerPort: 8000
      name: api-port
    resources:
      requests:
        memory: 16Gi
        cpu: 4
      limits:
        memory: 16Gi
        cpu: 4
    args: ["-m", "controller"]
  restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
  name: bigdl-a1234bd-fschat-controller-service
spec:
  # You may also want to change this to use the cluster's feature
  type: NodePort
  selector:
    fastchat-appid: a1234bd
    fastchat-app-type: controller
  ports:
    - name: cont-port
      protocol: TCP
      port: 21005
      targetPort: 21005
    - name: api-port
      protocol: TCP
      port: 8000
      targetPort: 8000
```
### Worker

We use the following Deployment for the workers:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: bigdl-fschat-a1234bd-worker-deployment
spec:
  # Change this to the number you want
  replicas: 1
  selector:
    matchLabels:
      fastchat: worker
  template:
    metadata:
      labels:
        fastchat: worker
    spec:
      dnsPolicy: "ClusterFirst"
      containers:
      - name: fastchat-worker # fixed
        image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
        imagePullPolicy: IfNotPresent
        env:
        - name: CONTROLLER_HOST # fixed
          value: bigdl-a1234bd-fschat-controller-service
        - name: CONTROLLER_PORT # fixed
          value: "21005"
        - name: WORKER_HOST # fixed
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: WORKER_PORT # fixed
          value: "21841"
        - name: MODEL_PATH # Change this
          value: "/llm/models/vicuna-7b-v1.5-bigdl/"
        - name: OMP_NUM_THREADS
          value: "16"
        resources:
          requests:
            memory: 32Gi
            cpu: 16
          limits:
            memory: 32Gi
            cpu: 16
        args: ["-m", "worker"]
        volumeMounts:
          - name: llm-models
            mountPath: /llm/models/
      restartPolicy: "Always"
      volumes:
        - name: llm-models
          hostPath:
            path: /home/llm/models # change this in other envs
```

You may want to change the `MODEL_PATH` variable in the yaml. Also, please remember to change the volume path accordingly. To deploy, apply the combined manifest as shown below.
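A minimal sketch of the deploy/inspect cycle, assuming the controller and worker manifests above are saved together as `deployment.yaml` (which is how this commit ships them under `docker/llm/serving/cpu/kubernetes/`):

```bash
kubectl apply -f deployment.yaml               # create the controller pod, its service, and the worker deployment
kubectl get pods -o wide                       # wait until the pods are Running
kubectl logs bigdl-fschat-a1234bd-controller   # controller / OpenAI API server logs
# Tear everything down again (same as clean.sh):
kubectl delete -f deployment.yaml
```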
### Testing

#### Using openai-python

First, install openai-python:

```bash
pip install --upgrade openai
```

Then, interact with the model `vicuna-7b-v1.5-bigdl`:

```python
import openai
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"

model = "vicuna-7b-v1.5-bigdl"
prompt = "Once upon a time"

# create a completion
completion = openai.Completion.create(model=model, prompt=prompt, max_tokens=64)
# print the completion
print(prompt + completion.choices[0].text)

# create a chat completion
completion = openai.ChatCompletion.create(
    model=model,
    messages=[{"role": "user", "content": "Hello! What is your name?"}]
)
# print the completion
print(completion.choices[0].message.content)
```

#### cURL

cURL is another good tool for observing the output of the API.

For the following examples, you may also need to change the service address to wherever the OpenAI API server is exposed.
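If the NodePort service is not reachable from where you run these commands, one workaround (a sketch only; the pod name is whatever `kubectl get pods` reports) is to port-forward the controller's API port to your local machine first:

```bash
# Forward local port 8000 to the controller pod's OpenAI API port.
kubectl port-forward pod/bigdl-fschat-a1234bd-controller 8000:8000
```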
List Models:

```bash
curl http://localhost:8000/v1/models
```

If you have `jq` installed, you can use it to format the output like this:

```bash
curl http://localhost:8000/v1/models | jq
```

Chat Completions:

```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "YOUR_MODEL",
    "messages": [{"role": "user", "content": "Hello! What is your name?"}]
  }'
```

Text Completions:

```bash
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "YOUR_MODEL",
    "prompt": "Once upon a time",
    "max_tokens": 41,
    "temperature": 0.5
  }'
```

Embeddings:

```bash
curl http://localhost:8000/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "YOUR_MODEL",
    "input": "Hello world!"
  }'
```
1  docker/llm/serving/cpu/kubernetes/clean.sh (new file)

@@ -0,0 +1 @@
kubectl delete -f deployment.yaml
109  docker/llm/serving/cpu/kubernetes/deployment.yaml (new file)

@@ -0,0 +1,109 @@
apiVersion: v1
kind: Pod
metadata:
  name: bigdl-fschat-a1234bd-controller
  labels:
    fastchat-appid: a1234bd
    fastchat-app-type: controller
spec:
  dnsPolicy: "ClusterFirst"
  containers:
  - name: fastchat-controller # fixed
    image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
    imagePullPolicy: IfNotPresent
    env:
    - name: CONTROLLER_HOST # fixed
      value: "0.0.0.0"
    - name: CONTROLLER_PORT # fixed
      value: "21005"
    - name: API_HOST # fixed
      value: "0.0.0.0"
    - name: API_PORT # fixed
      value: "8000"
    ports:
    - containerPort: 21005
      name: con-port
    - containerPort: 8000
      name: api-port
    resources:
      requests:
        memory: 16Gi
        cpu: 4
      limits:
        memory: 16Gi
        cpu: 4
    args: ["-m", "controller"]
  restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
  name: bigdl-a1234bd-fschat-controller-service
spec:
  # You may also want to change this to use the cluster's feature
  type: NodePort
  selector:
    fastchat-appid: a1234bd
    fastchat-app-type: controller
  ports:
    - name: cont-port
      protocol: TCP
      port: 21005
      targetPort: 21005
    - name: api-port
      protocol: TCP
      port: 8000
      targetPort: 8000
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: bigdl-fschat-a1234bd-worker-deployment
spec:
  # Change this to the number you want
  replicas: 1
  selector:
    matchLabels:
      fastchat: worker
  template:
    metadata:
      labels:
        fastchat: worker
    spec:
      dnsPolicy: "ClusterFirst"
      containers:
      - name: fastchat-worker # fixed
        image: intelanalytics/bigdl-llm-serving-cpu:2.4.0-SNAPSHOT
        imagePullPolicy: IfNotPresent
        env:
        - name: CONTROLLER_HOST # fixed
          value: bigdl-a1234bd-fschat-controller-service
        - name: CONTROLLER_PORT # fixed
          value: "21005"
        - name: WORKER_HOST # fixed
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: WORKER_PORT # fixed
          value: "21841"
        - name: MODEL_PATH # Change this
          value: "/llm/models/vicuna-7b-v1.5-bigdl/"
        - name: OMP_NUM_THREADS
          value: "16"
        resources:
          requests:
            memory: 32Gi
            cpu: 16
          limits:
            memory: 32Gi
            cpu: 16
        args: ["-m", "worker"]
        volumeMounts:
          - name: llm-models
            mountPath: /llm/models/
      restartPolicy: "Always"
      volumes:
        - name: llm-models
          hostPath:
            path: /home/llm/models # change this in other envs