ipex-llm/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml
Heyang Sun b1ac8dc1bc BF16 Lora Finetuning on K8S with OneCCL and Intel MPI (#8775)
* BF16 Lora Finetuning on K8S with OneCCL and Intel MPI

* Update README.md

* format

* refine

* Update README.md

* refine

* Update README.md

* increase nfs volume size to improve IO performance

* fix bugs

* Update README.md

* Update README.md

* fix permission

* move output destination

* Update README.md

* fix wrong base model name in doc

* fix output path in entrypoint

* add a permission-precreated output dir

* format

* move output logs to a persistent storage
2023-08-31 14:56:23 +08:00

97 lines
3.3 KiB
YAML

# Helm template for a Kubeflow MPIJob that runs one Launcher pod and
# `trainerNum` Worker pods. Every pod runs the same entrypoint script and
# mounts three sub-paths (model, data, output) of the `nfs-pvc` claim.
#
# Values read by this template: imageName, trainerNum, microBatchSize,
# dataSubPath, modelSubPath, outputSubPath, ompNumThreads, cpuPerPod.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: bigdl-lora-finetuning-job
  namespace: bigdl-lora-finetuning
spec:
  slotsPerWorker: 1
  runPolicy:
    cleanPodPolicy: Running
  # NOTE(review): presumably the home directory of the UID 1000 user that the
  # containers run as (see runAsUser below) - confirm against the image.
  sshAuthMountPath: /home/mpiuser/.ssh
  mpiImplementation: Intel
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          volumes:
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
          containers:
            # Templated strings are quoted so the rendered YAML keeps them as
            # strings even when a value looks numeric/boolean or is empty.
            - image: "{{ .Values.imageName }}"
              name: bigdl-ppml-finetuning-launcher
              securityContext:
                runAsUser: 1000
              # NOTE(review): "finetuing" is the spelling used in the script
              # path; presumably it matches the file name inside the image -
              # confirm before renaming.
              command:
                - 'sh'
                - '-c'
                - 'bash /ppml/bigdl-lora-finetuing-entrypoint.sh'
              env:
                - name: WORKER_ROLE
                  value: "launcher"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                # Host name of worker replica 0 of this job; the name is tied
                # to metadata.name above and must change together with it.
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                - name: DATA_SUB_PATH
                  value: "{{ .Values.dataSubPath }}"
                - name: OMP_NUM_THREADS
                  value: "{{ .Values.ompNumThreads }}"
                # Downward API: expose the pod's own name to the entrypoint.
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
                - name: nfs-storage
                  subPath: "{{ .Values.outputSubPath }}"
                  mountPath: "/ppml/output"
    Worker:
      # Left unquoted on purpose: replicas must render as an integer.
      replicas: {{ .Values.trainerNum }}
      template:
        spec:
          volumes:
            - name: nfs-storage
              persistentVolumeClaim:
                claimName: nfs-pvc
          containers:
            - image: "{{ .Values.imageName }}"
              name: bigdl-ppml-finetuning-worker
              securityContext:
                runAsUser: 1000
              # Same entrypoint as the Launcher; WORKER_ROLE below differs.
              command:
                - 'sh'
                - '-c'
                - 'bash /ppml/bigdl-lora-finetuing-entrypoint.sh'
              # NOTE(review): unlike the Launcher, no DATA_SUB_PATH or
              # OMP_NUM_THREADS is set here - presumably intentional; confirm
              # against the entrypoint script.
              env:
                - name: WORKER_ROLE
                  value: "trainer"
                - name: WORLD_SIZE
                  value: "{{ .Values.trainerNum }}"
                - name: MICRO_BATCH_SIZE
                  value: "{{ .Values.microBatchSize }}"
                - name: MASTER_PORT
                  value: "42679"
                - name: MASTER_ADDR
                  value: "bigdl-lora-finetuning-job-worker-0.bigdl-lora-finetuning-job-worker"
                - name: LOCAL_POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
              volumeMounts:
                - name: nfs-storage
                  subPath: "{{ .Values.modelSubPath }}"
                  mountPath: /ppml/model
                - name: nfs-storage
                  subPath: "{{ .Values.dataSubPath }}"
                  mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
                - name: nfs-storage
                  subPath: "{{ .Values.outputSubPath }}"
                  mountPath: "/ppml/output"
              # Only a CPU request is declared; no limits are set.
              resources:
                requests:
                  cpu: "{{ .Values.cpuPerPod }}"