# Source file: ipex-llm/docker/llm/serving/cpu/kubernetes/deployment.yaml
# (Viewer metadata at capture time: YAML, 126 lines, 3 KiB.)

# Claim for the shared volume that holds the model files.
# The worker Deployment in this file mounts this claim by name (models-pvc).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: models-pvc
spec:
  accessModes:
    # ReadWriteMany lets Pods on different nodes mount the volume at once;
    # the "models" StorageClass must be backed by a provisioner that
    # supports this mode.
    - ReadWriteMany
  storageClassName: models
  resources:
    requests:
      storage: 10Gi
  # Optional: uncomment to bind only to PersistentVolumes carrying this label.
  # selector:
  #   matchLabels:
  #     app: kubecon-models
---
# Standalone Pod that runs the serving image with args "-m controller".
# Entries marked "# fixed" are flagged by the original author as values that
# should not be edited (presumably the image's entrypoint relies on them --
# confirm before changing).
apiVersion: v1
kind: Pod
metadata:
  name: ipex-llm-fschat-a1234bd-controller
  labels:
    # The controller Service in this file selects on exactly these two labels.
    fastchat-appid: a1234bd
    fastchat-app-type: controller
spec:
  dnsPolicy: "ClusterFirst"
  containers:
    - name: fastchat-controller  # fixed
      # NOTE(review): the tag is a mutable SNAPSHOT and the pull policy is
      # IfNotPresent, so a node that already cached an older build of this
      # tag keeps using it. Pin a digest or use "Always" if that matters.
      image: intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT
      imagePullPolicy: IfNotPresent
      env:
        - name: CONTROLLER_HOST  # fixed
          value: "0.0.0.0"
        - name: CONTROLLER_PORT  # fixed
          value: "21005"
        - name: API_HOST  # fixed
          value: "0.0.0.0"
        - name: API_PORT  # fixed
          value: "8000"
        # You can change this port.
        # NOTE(review): 8002 is neither declared under "ports" below nor
        # published by the Service -- confirm whether it must be reachable.
        - name: GRADIO_PORT
          value: "8002"
      ports:
        # Matches CONTROLLER_PORT; named like the corresponding Service port.
        - containerPort: 21005
          name: cont-port
        # Matches API_PORT.
        - containerPort: 8000
          name: api-port
      resources:
        # requests == limits for both CPU and memory.
        requests:
          memory: 16Gi
          cpu: 4
        limits:
          memory: 16Gi
          cpu: 4
      args: ["-m", "controller"]
  # With "Never", the kubelet does not restart the container after it exits,
  # and nothing recreates this bare Pod if it is deleted.
  restartPolicy: "Never"
---
# Service in front of the controller Pod. Workers reach the controller through
# this Service's DNS name (it is the CONTROLLER_HOST value in the worker
# Deployment), so renaming it requires updating that value as well.
apiVersion: v1
kind: Service
metadata:
  name: ipex-llm-a1234bd-fschat-controller-service
spec:
  # NodePort additionally publishes both ports on every node's IP.
  # Switch to ClusterIP (in-cluster access only) or LoadBalancer / an Ingress
  # if your cluster provides a better way to expose the API.
  type: NodePort
  selector:
    # Must match the labels on the controller Pod.
    fastchat-appid: a1234bd
    fastchat-app-type: controller
  ports:
    # Same number as CONTROLLER_PORT on the controller Pod.
    - name: cont-port
      protocol: TCP
      port: 21005
      targetPort: 21005
    # Same number as API_PORT on the controller Pod.
    - name: api-port
      protocol: TCP
      port: 8000
      targetPort: 8000
---
# Deployment that runs the serving image with args "-m worker".
# Entries marked "# fixed" are flagged by the original author as values that
# should not be edited (presumably the image's entrypoint relies on them --
# confirm before changing).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ipex-llm-fschat-a1234bd-worker-deployment
spec:
  # Change this to the number of worker Pods you want.
  replicas: 1
  selector:
    # NOTE(review): this selector is not scoped by app id, so a second
    # instance of this manifest in the same namespace would overlap with it.
    # It is left unchanged because a Deployment's selector is immutable once
    # the object exists.
    matchLabels:
      fastchat: worker
  template:
    metadata:
      labels:
        # Required: must satisfy spec.selector.matchLabels above.
        fastchat: worker
        # Same labelling scheme as the controller Pod, so all Pods of this
        # app can be listed with "-l fastchat-appid=a1234bd". The controller
        # Service also requires fastchat-app-type=controller, so it does not
        # select these Pods.
        fastchat-appid: a1234bd
        fastchat-app-type: worker
    spec:
      dnsPolicy: "ClusterFirst"
      containers:
        - name: fastchat-worker  # fixed
          image: intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT
          imagePullPolicy: IfNotPresent
          env:
            # DNS name of the controller Service.
            - name: CONTROLLER_HOST  # fixed
              value: ipex-llm-a1234bd-fschat-controller-service
            - name: CONTROLLER_PORT  # fixed
              value: "21005"
            # The Pod's own IP, injected through the downward API.
            - name: WORKER_HOST  # fixed
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: WORKER_PORT  # fixed
              value: "21841"
            # Change this to your model; the path must live under the
            # /llm/models/ volume mount below.
            - name: MODEL_PATH
              value: "/llm/models/vicuna-7b-v1.5-ipex-llm/"
            # Currently equal to the CPU request/limit below; presumably the
            # two should be changed together -- confirm.
            - name: OMP_NUM_THREADS
              value: "16"
          ports:
            # Declares the WORKER_PORT value for discoverability, the same
            # way the controller Pod declares its ports. Informational only.
            - containerPort: 21841
              name: worker-port
          resources:
            # requests == limits for both CPU and memory.
            requests:
              memory: 32Gi
              cpu: 16
            limits:
              memory: 32Gi
              cpu: 16
          args: ["-m", "worker"]
          volumeMounts:
            - name: llm-models
              mountPath: /llm/models/
      # "Always" is the only restart policy a Deployment's Pod template allows.
      restartPolicy: "Always"
      volumes:
        # Backed by the models-pvc PersistentVolumeClaim.
        - name: llm-models
          persistentVolumeClaim:
            claimName: models-pvc