ipex-llm/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml
Heyang Sun 4b843d1dbf change lora-model output behavior on k8s (#9038)
Co-authored-by: leonardozcm <leonardo1997zcm@gmail.com>
2023-09-25 09:28:44 +08:00

93 lines
3.1 KiB
YAML

{{- if eq .Values.TEEMode "native" }}
# Kubeflow MPIJob for BigDL LoRA fine-tuning: one launcher pod plus
# `trainerNum` worker pods. Rendered only when TEEMode is "native".
#
# Chart values read by this template:
#   imageName      - container image used by both launcher and workers
#   trainerNum     - worker replica count, also exported as WORLD_SIZE
#   microBatchSize - exported as MICRO_BATCH_SIZE
#   dataSubPath    - sub-path of the NFS claim mounted under /ppml/data/
#   modelSubPath   - sub-path of the NFS claim mounted at /ppml/model
#   ompNumThreads  - exported as OMP_NUM_THREADS (launcher only)
#   cpuPerPod      - CPU request of each worker pod
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: bigdl-lora-finetuning-job
  namespace: bigdl-lora-finetuning
spec:
  slotsPerWorker: 1
  runPolicy:
    cleanPodPolicy: Running
  sshAuthMountPath: /home/mpiuser/.ssh
  mpiImplementation: Intel
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          volumes:
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
          containers:
            # Quoted so an empty or special-character value cannot change the
            # YAML type of the rendered field.
            - image: "{{ .Values.imageName }}"
              name: bigdl-ppml-finetuning-launcher
              securityContext:
                runAsUser: 1000
              # NOTE(review): "finetuing" is kept exactly as in the original;
              # presumably it matches the script name inside the image -
              # verify before correcting the spelling.
              command: ["sh", "-c", "bash /ppml/bigdl-lora-finetuing-entrypoint.sh"]
              env:
                - name: WORKER_ROLE
                  value: "launcher"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                # NOTE(review): looks like the DNS name of worker 0 derived
                # from metadata.name above - keep the two in sync.
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                # DATA_SUB_PATH and OMP_NUM_THREADS are set on the launcher
                # only; the worker container below does not define them.
                - name: DATA_SUB_PATH
                  value: "{{ .Values.dataSubPath }}"
                - name: OMP_NUM_THREADS
                  value: "{{ .Values.ompNumThreads }}"
                # Downward API: exposes the pod's own name to the entrypoint.
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
    Worker:
      # Left unquoted on purpose: replicas must render as an integer.
      replicas: {{ .Values.trainerNum }}
      template:
        spec:
          containers:
            - image: "{{ .Values.imageName }}"
              name: bigdl-ppml-finetuning-worker
              securityContext:
                runAsUser: 1000
              # Same entrypoint script as the launcher; WORKER_ROLE below is
              # the value that differs between the two containers.
              command: ["sh", "-c", "bash /ppml/bigdl-lora-finetuing-entrypoint.sh"]
              env:
                - name: WORKER_ROLE
                  value: "trainer"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
              resources:
                requests:
                  cpu: {{ .Values.cpuPerPod }}
          volumes:
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
{{- end }}