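{{- /*
Helm template for the BigDL LoRA fine-tuning job.
Rendered only when .Values.TEEMode is "tdx". It emits an MPIJob, a ClusterIP
Service for the launcher's attestation API and, when .Values.enableTLS is true,
the ssl-keys Secret.
*/ -}}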
{{- if eq .Values.TEEMode "tdx" }}
# MPIJob handled by the kubeflow.org/v2beta1 API.
# The job name below is also embedded in the MASTER_ADDR env values and in the
# selector of the attestation Service further down this file; keep them in sync.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: bigdl-lora-finetuning-job
  namespace: bigdl-lora-finetuning
spec:
  slotsPerWorker: 1
  runPolicy:
    cleanPodPolicy: Running
  # Must match the home directory of the user the entrypoint runs as (mpiuser).
  sshAuthMountPath: /home/mpiuser/.ssh
  mpiImplementation: Intel
  # One Launcher and trainerNum Worker replicas are defined underneath.
  mpiReplicaSpecs:
    Launcher:
      # A single launcher pod.
      replicas: 1
      template:
        spec:
          volumes:
            # Shared claim; mounted below under separate subPaths.
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
            # Host /dev made available to the container.
            # NOTE(review): presumably required to reach the TDX guest device
            # for attestation - confirm before tightening.
            - name: dev
              hostPath:
                path: /dev
            # The ssl-keys Secret is attached only when .Values.enableTLS is true.
            {{- if eq .Values.enableTLS true }}
            - name: ssl-keys
              secret:
                secretName: ssl-keys
            {{- end }}
          runtimeClassName: kata-qemu-tdx
          containers:
            - name: bigdl-ppml-finetuning-launcher
              # Quoted so an empty or odd-looking value still renders as a string.
              image: "{{ .Values.imageName }}"
              # Runs as root and privileged; the entrypoint itself is started
              # as mpiuser via sudo (see args).
              securityContext:
                runAsUser: 0
                privileged: true
              command: ["/bin/sh", "-c"]
              # Starts /ppml/bigdl_aa.py in the background (output goes to
              # /ppml/bigdl_aa.log), then runs the entrypoint script as mpiuser
              # with the environment preserved (sudo -E).
              # NOTE(review): bigdl_aa.py is presumably the attestation API
              # listening on ATTESTATION_API_SERVICE_PORT - verify.
              args:
                - |
                  nohup python /ppml/bigdl_aa.py > /ppml/bigdl_aa.log 2>&1 &
                  sudo -E -u mpiuser bash /ppml/bigdl-lora-finetuing-entrypoint.sh
              env:
                - name: WORKER_ROLE
                  value: "launcher"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                # <pod>.<service> host name of worker 0, derived from the job name.
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                - name: DATA_SUB_PATH
                  value: "{{ .Values.dataSubPath }}"
                - name: OMP_NUM_THREADS
                  value: "{{ .Values.ompNumThreads }}"
                # Filled from the pod's own metadata.name.
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
                # Lives under the /ppml/output mount defined below.
                - name: HF_DATASETS_CACHE
                  value: "/ppml/output/cache"
                # Same values key that the attestation Service uses for its port.
                - name: ATTESTATION_API_SERVICE_PORT
                  value: "{{ .Values.attestionApiServicePort }}"
                - name: ENABLE_TLS
                  value: "{{ .Values.enableTLS }}"
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
                - name: nfs-storage
                  subPath: "{{ .Values.outputSubPath }}"
                  mountPath: "/ppml/output"
                - name: dev
                  mountPath: /dev
                # Mounted only together with the conditional ssl-keys volume above.
                {{- if eq .Values.enableTLS true }}
                - name: ssl-keys
                  mountPath: /ppml/keys
                {{- end }}
    Worker:
      # Number of trainer pods; must render as an integer, so it stays unquoted.
      replicas: {{ .Values.trainerNum }}
      template:
        spec:
          runtimeClassName: kata-qemu-tdx
          containers:
            - name: bigdl-ppml-finetuning-worker
              # Quoted so an empty or odd-looking value still renders as a string.
              image: "{{ .Values.imageName }}"
              # Runs as root and privileged; the entrypoint itself is started
              # as mpiuser via sudo (see args).
              securityContext:
                runAsUser: 0
                privileged: true
              command: ["/bin/sh", "-c"]
              # Changes the owner of the mounted SSH key, then runs the
              # entrypoint script as mpiuser with the environment preserved.
              # NOTE(review): the chown is backgrounded with '&', so it is not
              # guaranteed to have finished when the entrypoint starts - confirm
              # this ordering is intentional.
              args:
                - |
                  chown nobody /home/mpiuser/.ssh/id_rsa &
                  sudo -E -u mpiuser bash /ppml/bigdl-lora-finetuing-entrypoint.sh
              env:
                - name: WORKER_ROLE
                  value: "trainer"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                # <pod>.<service> host name of worker 0, derived from the job name.
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                # Filled from the pod's own metadata.name.
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
                - name: nfs-storage
                  subPath: "{{ .Values.outputSubPath }}"
                  mountPath: "/ppml/output"
                - name: dev
                  mountPath: /dev
              # Request and limit use the same value, so each worker gets a
              # fixed CPU allocation.
              resources:
                requests:
                  cpu: "{{ .Values.cpuPerPod }}"
                limits:
                  cpu: "{{ .Values.cpuPerPod }}"
          volumes:
            # Shared claim; mounted above under separate subPaths.
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
            # Host /dev made available to the container.
            # NOTE(review): presumably required to reach the TDX guest device -
            # confirm before tightening.
            - name: dev
              hostPath:
                path: /dev
---
# ClusterIP Service in front of the launcher pod, forwarding the attestation
# API port to the same port number on the pod.
apiVersion: v1
kind: Service
metadata:
  name: bigdl-lora-finetuning-launcher-attestation-api-service
  namespace: bigdl-lora-finetuning
spec:
  type: ClusterIP
  # NOTE(review): these labels are presumably applied to the launcher pod by
  # the MPI operator; verify against the operator version in use.
  selector:
    job-name: bigdl-lora-finetuning-job-launcher
    training.kubeflow.org/job-name: bigdl-lora-finetuning-job
    training.kubeflow.org/job-role: launcher
  ports:
    - name: launcher-attestation-api-service-port
      protocol: TCP
      # Left unquoted on purpose: port must be an integer, and a quoted
      # targetPort would be read as a named port instead of a number.
      port: {{ .Values.attestionApiServicePort }}
      targetPort: {{ .Values.attestionApiServicePort }}
---
{{- if eq .Values.enableTLS true }}
# Secret backing the ssl-keys volume that the launcher mounts at /ppml/keys.
# Created only when .Values.enableTLS is true.
apiVersion: v1
kind: Secret
metadata:
  name: ssl-keys
  namespace: bigdl-lora-finetuning
type: Opaque
data:
  # Values are expected to be base64-encoded already (see the values key names).
  # Quoted so an empty value renders as an empty string rather than null.
  server.crt: "{{ .Values.base64ServerCrt }}"
  server.key: "{{ .Values.base64ServerKey }}"
{{- end }}
{{- end }}