---
# Shared model storage. The worker Deployment at the end of this file mounts
# this claim (claimName: models-pvc) at /llm/models/.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: models-pvc
spec:
  # ReadWriteMany allows the volume to be mounted read-write by many nodes,
  # which matters once the worker Deployment runs more than one replica.
  accessModes:
    - ReadWriteMany
  storageClassName: models
  resources:
    requests:
      storage: 10Gi
  # Optional: bind to a specific PersistentVolume by label.
  # selector:
  #   matchLabels:
  #     app: kubecon-models
---
# Controller Pod: runs the serving image with `-m controller` and declares
# the controller port (21005) and the API port (8000).
apiVersion: v1
kind: Pod
metadata:
  name: bigdl-fschat-a1234bd-controller
  labels:
    # Both labels are used by the controller Service's selector.
    fastchat-appid: a1234bd
    fastchat-app-type: controller
spec:
  dnsPolicy: "ClusterFirst"
  containers:
    - name: fastchat-controller  # fixed
      image: intelanalytics/bigdl-llm-serving-cpu:2.5.0-SNAPSHOT
      imagePullPolicy: IfNotPresent
      env:
        - name: CONTROLLER_HOST  # fixed
          value: "0.0.0.0"
        - name: CONTROLLER_PORT  # fixed
          value: "21005"
        - name: API_HOST  # fixed
          value: "0.0.0.0"
        - name: API_PORT  # fixed
          value: "8000"
        # You can change this port.
        - name: GRADIO_PORT
          value: "8002"
      ports:
        # Same numbers as CONTROLLER_PORT and API_PORT above.
        - containerPort: 21005
          name: con-port
        - containerPort: 8000
          name: api-port
      resources:
        requests:
          memory: 16Gi
          cpu: 4
        limits:
          memory: 16Gi
          cpu: 4
      args: ["-m", "controller"]
  restartPolicy: "Never"
---
# Service for the controller
apiVersion: v1
kind: Service
metadata:
  # The worker Deployment uses this name as its CONTROLLER_HOST.
  name: bigdl-a1234bd-fschat-controller-service
spec:
  # NodePort exposes both ports on every node. Change the type to whatever
  # your cluster provides (for example ClusterIP or LoadBalancer) if needed.
  type: NodePort
  selector:
    # Must match the labels on the controller Pod.
    fastchat-appid: a1234bd
    fastchat-app-type: controller
  ports:
    - name: cont-port
      protocol: TCP
      port: 21005
      targetPort: 21005
    - name: api-port
      protocol: TCP
      port: 8000
      targetPort: 8000
---
# Worker Deployment: runs the serving image with `-m worker`, points it at
# the controller Service, and mounts the models PVC at /llm/models/.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: bigdl-fschat-a1234bd-worker-deployment
spec:
  # Number of worker replicas; change this to the number you want.
  replicas: 1
  selector:
    matchLabels:
      fastchat: worker
  template:
    metadata:
      labels:
        fastchat: worker
    spec:
      dnsPolicy: "ClusterFirst"
      containers:
        - name: fastchat-worker  # fixed
          image: intelanalytics/bigdl-llm-serving-cpu:2.5.0-SNAPSHOT
          imagePullPolicy: IfNotPresent
          env:
            # Name of the controller Service defined in this file.
            - name: CONTROLLER_HOST  # fixed
              value: bigdl-a1234bd-fschat-controller-service
            # Same port the controller Service exposes as cont-port.
            - name: CONTROLLER_PORT  # fixed
              value: "21005"
            # Populated from the pod's own IP via the downward API.
            - name: WORKER_HOST  # fixed
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: WORKER_PORT  # fixed
              value: "21841"
            # Change this to your model. The path lives under the
            # /llm/models/ volume mount below.
            - name: MODEL_PATH
              value: "/llm/models/vicuna-7b-v1.5-bigdl/"
            # Set to the same number as the cpu request/limit below.
            - name: OMP_NUM_THREADS
              value: "16"
          resources:
            requests:
              memory: 32Gi
              cpu: 16
            limits:
              memory: 32Gi
              cpu: 16
          args: ["-m", "worker"]
          volumeMounts:
            - name: llm-models
              mountPath: /llm/models/
      restartPolicy: "Always"
      volumes:
        - name: llm-models
          persistentVolumeClaim:
            claimName: models-pvc