ipex-llm/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml
Xiangyu Tian 52878d3e5f [PPML] Enable TLS in Attestation API Serving for LLM finetuning (#8945)
Add enableTLS flag to enable TLS in Attestation API Serving for LLM finetuning.
2023-09-18 09:32:25 +08:00

168 lines
5.4 KiB
YAML

{{- if eq .Values.TEEMode "tdx" }}
# MPIJob for BigDL LoRA fine-tuning. This template is rendered only when
# .Values.TEEMode equals "tdx"; both replica types request the
# kata-qemu-tdx runtime class.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: bigdl-lora-finetuning-job
  namespace: bigdl-lora-finetuning
spec:
  slotsPerWorker: 1
  runPolicy:
    cleanPodPolicy: Running
  sshAuthMountPath: /home/mpiuser/.ssh
  mpiImplementation: Intel
  mpiReplicaSpecs:
    # Single launcher pod. It also starts /ppml/bigdl_aa.py in the background
    # before handing over to the fine-tuning entrypoint.
    Launcher:
      replicas: 1
      template:
        spec:
          volumes:
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
            # Host /dev is exposed to the container.
            # NOTE(review): presumably required for TDX device access - confirm.
            - name: dev
              hostPath:
                path: /dev
            # The TLS key material is only attached when enableTLS is true.
            {{- if eq .Values.enableTLS true }}
            - name: ssl-keys
              secret:
                secretName: ssl-keys
            {{- end }}
          runtimeClassName: kata-qemu-tdx
          containers:
            # Templated strings are quoted so that values which look like
            # numbers or booleans are still parsed as YAML strings.
            - image: "{{ .Values.imageName }}"
              name: bigdl-ppml-finetuning-launcher
              securityContext:
                runAsUser: 0
                privileged: true
              command: ["/bin/sh", "-c"]
              # Runs bigdl_aa.py detached (output goes to /ppml/bigdl_aa.log),
              # then runs the entrypoint script as user mpiuser, keeping the
              # environment (-E).
              args:
                - |
                  nohup python /ppml/bigdl_aa.py > /ppml/bigdl_aa.log 2>&1 &
                  sudo -E -u mpiuser bash /ppml/bigdl-lora-finetuing-entrypoint.sh
              env:
                - name: WORKER_ROLE
                  value: "launcher"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                # Points at worker replica 0 of this job.
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                - name: DATA_SUB_PATH
                  value: "{{ .Values.dataSubPath }}"
                - name: OMP_NUM_THREADS
                  value: "{{ .Values.ompNumThreads }}"
                # Pod name injected through the downward API.
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
                - name: HF_DATASETS_CACHE
                  value: "/ppml/output/cache"
                - name: ATTESTATION_API_SERVICE_PORT
                  value: "{{ .Values.attestionApiServicePort }}"
                - name: ENABLE_TLS
                  value: "{{ .Values.enableTLS }}"
              # Model, data and output are three sub-paths of the same NFS claim.
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
                - name: nfs-storage
                  subPath: "{{ .Values.outputSubPath }}"
                  mountPath: "/ppml/output"
                - name: dev
                  mountPath: /dev
                {{- if eq .Values.enableTLS true }}
                - name: ssl-keys
                  mountPath: /ppml/keys
                {{- end }}
    # Trainer pods; the replica count comes from .Values.trainerNum.
    Worker:
      replicas: {{ .Values.trainerNum }}
      template:
        spec:
          runtimeClassName: kata-qemu-tdx
          containers:
            - image: "{{ .Values.imageName }}"
              name: bigdl-ppml-finetuning-worker
              securityContext:
                runAsUser: 0
                privileged: true
              command: ["/bin/sh", "-c"]
              # The chown is started in the background; the entrypoint script
              # is then run as user mpiuser without waiting for it.
              args:
                - |
                  chown nobody /home/mpiuser/.ssh/id_rsa &
                  sudo -E -u mpiuser bash /ppml/bigdl-lora-finetuing-entrypoint.sh
              env:
                - name: WORKER_ROLE
                  value: "trainer"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
                - name: nfs-storage
                  subPath: "{{ .Values.outputSubPath }}"
                  mountPath: "/ppml/output"
                - name: dev
                  mountPath: /dev
              # CPU request and limit are set to the same value.
              resources:
                requests:
                  cpu: {{ .Values.cpuPerPod }}
                limits:
                  cpu: {{ .Values.cpuPerPod }}
          volumes:
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
            - name: dev
              hostPath:
                path: /dev
---
# ClusterIP Service that forwards the attestation API port to the launcher pod
# of the bigdl-lora-finetuning-job MPIJob.
apiVersion: v1
kind: Service
metadata:
  name: bigdl-lora-finetuning-launcher-attestation-api-service
  namespace: bigdl-lora-finetuning
spec:
  type: ClusterIP
  # A pod must carry all three labels to be selected.
  selector:
    training.kubeflow.org/job-role: launcher
    training.kubeflow.org/job-name: bigdl-lora-finetuning-job
    job-name: bigdl-lora-finetuning-job-launcher
  ports:
    # Service port and container port are the same value. It is left
    # unquoted because the port fields must render as integers.
    - name: launcher-attestation-api-service-port
      protocol: TCP
      targetPort: {{ .Values.attestionApiServicePort }}
      port: {{ .Values.attestionApiServicePort }}
{{- if eq .Values.enableTLS true }}
---
# Opaque Secret holding the TLS certificate and private key; rendered only
# when enableTLS is true. The document separator lives inside the conditional
# so that no empty YAML document is emitted when TLS is disabled.
apiVersion: v1
kind: Secret
metadata:
  name: ssl-keys
  namespace: bigdl-lora-finetuning
type: Opaque
# Values under "data" must already be base64-encoded; they are quoted so the
# YAML parser always reads them as strings.
data:
  server.crt: "{{ .Values.base64ServerCrt }}"
  server.key: "{{ .Values.base64ServerKey }}"
{{- end }}
{{- end }}