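{{- /*
Helm template for the BigDL LoRA fine-tuning job: an MPIJob, a ClusterIP
Service for the launcher attestation API, and an optional TLS Secret.
*/ -}}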
{{- if eq .Values.TEEMode "tdx" }}
# Everything below is rendered only when .Values.TEEMode is "tdx".
#
# MPIJob with one Launcher pod and .Values.trainerNum Worker pods. Both pod
# templates use the kata-qemu-tdx runtime class, run privileged as UID 0 and
# mount the host's /dev.
# NOTE(review): privileged + hostPath /dev is presumably required for TDX
# device access from inside the container - confirm before tightening.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: bigdl-lora-finetuning-job
  namespace: bigdl-lora-finetuning
spec:
  slotsPerWorker: 1
  runPolicy:
    cleanPodPolicy: Running
  sshAuthMountPath: /home/mpiuser/.ssh
  mpiImplementation: Intel
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          volumes:
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
            - name: dev
              hostPath:
                path: /dev
            # The ssl-keys volume refers to the Secret defined at the end of
            # this template; both are emitted only when enableTLS is true.
            {{- if eq .Values.enableTLS true }}
            - name: ssl-keys
              secret:
                secretName: ssl-keys
            {{- end }}
          runtimeClassName: kata-qemu-tdx
          containers:
            - image: "{{ .Values.imageName }}"
              name: bigdl-ppml-finetuning-launcher
              securityContext:
                runAsUser: 0
                privileged: true
              command: ["/bin/sh", "-c"]
              # Starts /ppml/bigdl_aa.py in the background (output goes to
              # /ppml/bigdl_aa.log), then runs the entrypoint script as
              # mpiuser. The "finetuing" spelling is the script path used at
              # runtime and is intentionally left unchanged.
              args:
                - |
                  nohup python /ppml/bigdl_aa.py > /ppml/bigdl_aa.log 2>&1 &
                  sudo -E -u mpiuser bash /ppml/bigdl-lora-finetuing-entrypoint.sh
              env:
                - name: WORKER_ROLE
                  value: "launcher"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                # NOTE(review): presumably the DNS name of the first worker
                # pod of this job - verify against the MPI operator naming.
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                - name: DATA_SUB_PATH
                  value: "{{ .Values.dataSubPath }}"
                - name: OMP_NUM_THREADS
                  value: "{{ .Values.ompNumThreads }}"
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
                - name: HF_DATASETS_CACHE
                  value: "/ppml/output/cache"
                # "attestionApiServicePort" (sic) is the values key this
                # template reads; renaming it requires changing the chart
                # values as well.
                - name: ATTESTATION_API_SERVICE_PORT
                  value: "{{ .Values.attestionApiServicePort }}"
                - name: ENABLE_TLS
                  value: "{{ .Values.enableTLS }}"
              volumeMounts:
                # subPath values are quoted so that empty or number-looking
                # chart values are still parsed as strings.
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
                - name: dev
                  mountPath: /dev
                {{- if eq .Values.enableTLS true }}
                - name: ssl-keys
                  mountPath: /ppml/keys
                {{- end }}
    Worker:
      replicas: {{ .Values.trainerNum }}
      template:
        spec:
          runtimeClassName: kata-qemu-tdx
          containers:
            - image: "{{ .Values.imageName }}"
              name: bigdl-ppml-finetuning-worker
              securityContext:
                runAsUser: 0
                privileged: true
              command: ["/bin/sh", "-c"]
              # NOTE(review): the chown is backgrounded with "&", so the
              # entrypoint script may start before it has finished (and a
              # chown failure is not observed) - confirm this is intended.
              args:
                - |
                  chown nobody /home/mpiuser/.ssh/id_rsa &
                  sudo -E -u mpiuser bash /ppml/bigdl-lora-finetuing-entrypoint.sh
              env:
                - name: WORKER_ROLE
                  value: "trainer"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
                - name: dev
                  mountPath: /dev
              # CPU request and limit are both set from the same chart value.
              # Quoted so the quantity is always passed as a string.
              resources:
                requests:
                  cpu: "{{ .Values.cpuPerPod }}"
                limits:
                  cpu: "{{ .Values.cpuPerPod }}"
          volumes:
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
            - name: dev
              hostPath:
                path: /dev
---
# ClusterIP Service that forwards .Values.attestionApiServicePort to the same
# port on pods carrying the launcher labels listed in the selector.
apiVersion: v1
kind: Service
metadata:
  name: bigdl-lora-finetuning-launcher-attestation-api-service
  namespace: bigdl-lora-finetuning
spec:
  selector:
    job-name: bigdl-lora-finetuning-job-launcher
    training.kubeflow.org/job-name: bigdl-lora-finetuning-job
    training.kubeflow.org/job-role: launcher
  ports:
    - name: launcher-attestation-api-service-port
      protocol: TCP
      port: {{ .Values.attestionApiServicePort }}
      targetPort: {{ .Values.attestionApiServicePort }}
  type: ClusterIP
{{- if eq .Values.enableTLS true }}
---
# TLS material mounted by the launcher at /ppml/keys. The document separator
# sits inside the conditional so no empty document is emitted when TLS is off.
# NOTE(review): values are written into `data` as-is, so they are presumably
# already base64-encoded (as the key names suggest) - confirm in the values.
apiVersion: v1
kind: Secret
metadata:
  name: ssl-keys
  namespace: bigdl-lora-finetuning
type: Opaque
data:
  server.crt: "{{ .Values.base64ServerCrt }}"
  server.key: "{{ .Values.base64ServerKey }}"
{{- end }}
{{- end }}