{{- if eq .Values.TEEMode "tdx" }}
# Kubeflow MPIJob for BigDL LoRA fine-tuning, plus a ClusterIP Service that
# exposes the launcher's attestation API port. Both documents are rendered
# only when .Values.TEEMode is "tdx"; every pod uses the kata-qemu-tdx
# runtime class.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: bigdl-lora-finetuning-job
  namespace: bigdl-lora-finetuning
spec:
  slotsPerWorker: 1
  runPolicy:
    cleanPodPolicy: Running
  sshAuthMountPath: /home/mpiuser/.ssh
  mpiImplementation: Intel
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          volumes:
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
            - name: dev
              hostPath:
                path: /dev
          runtimeClassName: kata-qemu-tdx
          containers:
            # String-typed fields fed from values are quoted so that a
            # numeric-looking or special-character value still renders as a
            # YAML string.
            - image: "{{ .Values.imageName }}"
              name: bigdl-ppml-finetuning-launcher
              securityContext:
                runAsUser: 0
                privileged: true
              command: ["/bin/sh", "-c"]
              # Starts /ppml/bigdl_aa.py in the background (output goes to
              # /ppml/bigdl_aa.log), then runs the fine-tuning entrypoint as
              # mpiuser with the container environment preserved (sudo -E).
              # NOTE(review): bigdl_aa.py is presumably the attestation API
              # server behind the Service below - confirm.
              args:
                - |
                  nohup python /ppml/bigdl_aa.py > /ppml/bigdl_aa.log 2>&1 &
                  sudo -E -u mpiuser bash /ppml/bigdl-lora-finetuing-entrypoint.sh
              env:
                - name: WORKER_ROLE
                  value: "launcher"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                - name: DATA_SUB_PATH
                  value: "{{ .Values.dataSubPath }}"
                - name: OMP_NUM_THREADS
                  value: "{{ .Values.ompNumThreads }}"
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
                - name: HF_DATASETS_CACHE
                  value: "/ppml/output/cache"
                - name: ATTESTATION_API_SERVICE_PORT
                  value: "{{ .Values.attestionApiServicePort }}"
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
                - name: nfs-storage
                  subPath: "{{ .Values.outputSubPath }}"
                  mountPath: "/ppml/output"
                - name: dev
                  mountPath: /dev
    Worker:
      # One worker pod per trainer; WORLD_SIZE below is set from the same value.
      replicas: {{ .Values.trainerNum }}
      template:
        spec:
          runtimeClassName: kata-qemu-tdx
          containers:
            - image: "{{ .Values.imageName }}"
              name: bigdl-ppml-finetuning-worker
              securityContext:
                runAsUser: 0
                privileged: true
              command: ["/bin/sh", "-c"]
              # Changes the owner of the mounted SSH private key, then runs
              # the fine-tuning entrypoint as mpiuser.
              # NOTE(review): the chown is backgrounded with a trailing
              # ampersand, so the entrypoint may start before the ownership
              # change has completed - confirm this is intended.
              args:
                - |
                  chown nobody /home/mpiuser/.ssh/id_rsa &
                  sudo -E -u mpiuser bash /ppml/bigdl-lora-finetuing-entrypoint.sh
              env:
                - name: WORKER_ROLE
                  value: "trainer"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
                - name: nfs-storage
                  subPath: "{{ .Values.outputSubPath }}"
                  mountPath: "/ppml/output"
                - name: dev
                  mountPath: /dev
              # CPU request equals the limit; both come from cpuPerPod.
              resources:
                requests:
                  cpu: {{ .Values.cpuPerPod }}
                limits:
                  cpu: {{ .Values.cpuPerPod }}
          volumes:
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
            - name: dev
              hostPath:
                path: /dev
---
# ClusterIP Service that forwards the attestation API port to the pod
# selected by the launcher labels below.
apiVersion: v1
kind: Service
metadata:
  name: bigdl-lora-finetuning-launcher-attestation-api-service
  namespace: bigdl-lora-finetuning
spec:
  selector:
    job-name: bigdl-lora-finetuning-job-launcher
    training.kubeflow.org/job-name: bigdl-lora-finetuning-job
    training.kubeflow.org/job-role: launcher
  ports:
    - name: launcher-attestation-api-service-port
      protocol: TCP
      # Port numbers are integer fields, so they stay unquoted.
      port: {{ .Values.attestionApiServicePort }}
      targetPort: {{ .Values.attestionApiServicePort }}
  type: ClusterIP
{{- end }}