ipex-llm/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml
Xiangyu Tian ea6d4148e9 [PPML] Add attestation for LLM Finetuning (#8908)
Add TDX attestation for LLM Finetuning in TDX CoCo

---------

Co-authored-by: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com>
2023-09-08 10:24:04 +08:00

144 lines
No EOL
4.8 KiB
YAML

{{- if eq .Values.TEEMode "tdx" }}
# MPIJob running BigDL LoRA fine-tuning inside TDX CoCo pods (kata-qemu-tdx
# runtime). The launcher additionally starts bigdl_aa.py, the attestation API
# server, before handing off to the MPI entrypoint.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: bigdl-lora-finetuning-job
  namespace: bigdl-lora-finetuning
spec:
  slotsPerWorker: 1
  runPolicy:
    cleanPodPolicy: Running
  sshAuthMountPath: /home/mpiuser/.ssh
  mpiImplementation: Intel
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          volumes:
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
            - name: dev
              hostPath:
                path: /dev
          runtimeClassName: kata-qemu-tdx
          containers:
            - image: "{{ .Values.imageName }}"
              name: bigdl-ppml-finetuning-launcher
              securityContext:
                runAsUser: 0
                privileged: true
              command: ["/bin/sh", "-c"]
              # NOTE(review): 'bigdl-lora-finetuing-entrypoint.sh' is spelled
              # that way (sic) — it must match the script name baked into the
              # image; confirm against the Dockerfile before correcting it.
              args:
                - |
                  nohup python /ppml/bigdl_aa.py > /ppml/bigdl_aa.log 2>&1 &
                  sudo -E -u mpiuser bash /ppml/bigdl-lora-finetuing-entrypoint.sh
              env:
                - name: WORKER_ROLE
                  value: "launcher"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                - name: DATA_SUB_PATH
                  value: "{{ .Values.dataSubPath }}"
                - name: OMP_NUM_THREADS
                  value: "{{ .Values.ompNumThreads }}"
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
                - name: HF_DATASETS_CACHE
                  value: "/ppml/output/cache"
                # NOTE(review): values key is spelled 'attestionApiServicePort'
                # (sic); keep in sync with values.yaml if the spelling is fixed.
                - name: ATTESTATION_API_SERVICE_PORT
                  value: "{{ .Values.attestionApiServicePort }}"
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
                - name: nfs-storage
                  subPath: "{{ .Values.outputSubPath }}"
                  mountPath: "/ppml/output"
                - name: dev
                  mountPath: /dev
    Worker:
      replicas: {{ .Values.trainerNum }}
      template:
        spec:
          runtimeClassName: kata-qemu-tdx
          containers:
            - image: "{{ .Values.imageName }}"
              name: bigdl-ppml-finetuning-worker
              securityContext:
                runAsUser: 0
                privileged: true
              command: ["/bin/sh", "-c"]
              args:
                - |
                  chown nobody /home/mpiuser/.ssh/id_rsa &
                  sudo -E -u mpiuser bash /ppml/bigdl-lora-finetuing-entrypoint.sh
              env:
                - name: WORKER_ROLE
                  value: "trainer"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
                - name: nfs-storage
                  subPath: "{{ .Values.outputSubPath }}"
                  mountPath: "/ppml/output"
                - name: dev
                  mountPath: /dev
              resources:
                requests:
                  cpu: {{ .Values.cpuPerPod }}
                limits:
                  cpu: {{ .Values.cpuPerPod }}
          volumes:
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
            - name: dev
              hostPath:
                path: /dev
---
# ClusterIP Service exposing the launcher pod's attestation API on the
# configured port (selected via the kubeflow training-operator pod labels).
apiVersion: v1
kind: Service
metadata:
  name: bigdl-lora-finetuning-launcher-attestation-api-service
  namespace: bigdl-lora-finetuning
spec:
  selector:
    job-name: bigdl-lora-finetuning-job-launcher
    training.kubeflow.org/job-name: bigdl-lora-finetuning-job
    training.kubeflow.org/job-role: launcher
  ports:
    # Service port names must be valid IANA_SVC_NAME values (at most 15
    # lowercase alphanumeric/'-' characters); the original 37-character name
    # is rejected by the API server, hence the short name here.
    - name: attest-api
      protocol: TCP
      # NOTE(review): values key is spelled 'attestionApiServicePort' (sic);
      # keep in sync with values.yaml if the spelling is fixed.
      port: {{ .Values.attestionApiServicePort }}
      targetPort: {{ .Values.attestionApiServicePort }}
  type: ClusterIP
{{- end }}