From 4b843d1dbfb9a0b7db359711193296ef0c78a302 Mon Sep 17 00:00:00 2001
From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com>
Date: Mon, 25 Sep 2023 09:28:44 +0800
Subject: [PATCH] change lora-model output behavior on k8s (#9038)

Co-authored-by: leonardozcm
---
 docker/llm/finetune/lora/README.md | 8 +++++---
 .../lora/docker/bigdl-lora-finetuing-entrypoint.sh | 7 +++----
 .../kubernetes/templates/bigdl-lora-finetuning-job.yaml | 8 +-------
 .../templates/bigdl-lora-finetuning-tdx-job.yaml | 6 ------
 docker/llm/finetune/lora/kubernetes/values.yaml | 3 +--
 5 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/docker/llm/finetune/lora/README.md b/docker/llm/finetune/lora/README.md
index 81b61add..1492cf90 100644
--- a/docker/llm/finetune/lora/README.md
+++ b/docker/llm/finetune/lora/README.md
@@ -22,13 +22,13 @@ Follow [here](https://github.com/kubeflow/mpi-operator/tree/master#installation)
 
 Follow [here](https://github.com/intel-analytics/BigDL/tree/main/docker/llm/finetune/lora/docker#prepare-bigdl-image-for-lora-finetuning) to prepare the BigDL Lora Finetuning image in your cluster.
 
-As finetuning is from a base model, first download [Llama 7b hf model from the public download site of Hugging Face](https://huggingface.co/decapoda-research/llama-7b-hf/tree/main). Then, download [cleaned alpaca data](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json), which contains all kinds of general knowledge and has already been cleaned. Next, move the downloaded files to a shared directory on your NFS server. In addition, make an empty directory under the same destination to save the finetuned model output later.
+As finetuning starts from a base model, first download the [Llama 7b hf model from the public download site of Hugging Face](https://huggingface.co/decapoda-research/llama-7b-hf/tree/main). Then, download the [cleaned alpaca data](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json), which contains all kinds of general knowledge and has already been cleaned. Next, move the downloaded files to a shared directory on your NFS server.
 
 ### 3. Deploy through Helm Chart
 
 You are allowed to edit and experiment with different parameters in `./kubernetes/values.yaml` to improve finetuning performance and accuracy. For example, you can adjust `trainerNum` and `cpuPerPod` according to the number of nodes and CPU cores in your cluster to make full use of these resources, and different `microBatchSize` values result in different training speed and loss (note that `microBatchSize` × `trainerNum` should not exceed 128, as this product is the batch size).
 
-**Note: `dataSubPath`, `modelSubPath` and `outputPath` need to have the same names as files under the NFS directory in step 2.**
+**Note: `dataSubPath` and `modelSubPath` need to have the same names as the files under the NFS directory in step 2.**
 
 After preparing parameters in `./kubernetes/values.yaml`, submit the job as below:
 
@@ -52,7 +52,9 @@ kubectl exec -it <launcher_pod_name> bash -n bigdl-ppml-finetuning # enter launc
 cat launcher.log # display logs collected from other workers
 ```
 
-From the log, you can see whether finetuning process has been invoked successfully in all MPI worker pods, and a progress bar with finetuning speed and estimated time will be showed after some data preprocessing steps (this may take quiet a while). For the fine-tuned model, it is written by the worker 0 (who holds rank 0), so you can find the model output inside the pod or the `output` folder under the NFS path (because it has been mounted to worker 0 as output path).
+From the log, you can see whether the finetuning process has been invoked successfully in all MPI worker pods, and a progress bar with the finetuning speed and estimated time will be shown after some data preprocessing steps (this may take quite a while).
+
+The fine-tuned model is written by worker 0 (which holds rank 0), so you can find the model output inside that pod; it can be saved to the host with command-line tools like `kubectl cp` or `scp`.
 
 ## To run in TDX-CoCo and enable Remote Attestation API
 
diff --git a/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh b/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh
index 1f9873b0..f9008c8d 100644
--- a/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh
+++ b/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh
@@ -8,7 +8,6 @@ if [ "$WORKER_ROLE" = "launcher" ]
 then
   sed "s/:1/ /g" /etc/mpi/hostfile > /home/mpiuser/hostfile
   export DATA_PATH="/ppml/data/$DATA_SUB_PATH"
-  export SAVE_PATH="/ppml/output"
   sleep 10
   mpirun \
     -n $WORLD_SIZE \
@@ -22,13 +21,13 @@ then
     python /ppml/lora_finetune.py \
       --base_model '/ppml/model/' \
       --data_path "$DATA_PATH" \
-      --output_dir "$SAVE_PATH/finetuned_model" \
+      --output_dir "/home/mpiuser/finetuned_model" \
       --micro_batch_size $MICRO_BATCH_SIZE \
-      --bf16 > $SAVE_PATH/launcher.log 2>&1
+      --bf16 > /home/mpiuser/launcher.log 2>&1
   exit_status=$?
   if [ $exit_status -ne 0 ];
   then
-    cat $SAVE_PATH/launcher.log
+    cat /home/mpiuser/launcher.log
     exit $exit_status
   else
     while true
diff --git a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml
index 63d50461..4c22b068 100644
--- a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml
+++ b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml
@@ -51,9 +51,6 @@ spec:
           - name: nfs-storage
             subPath: {{ .Values.dataSubPath }}
             mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
-          - name: nfs-storage
-            subPath: {{ .Values.outputSubPath }}
-            mountPath: "/ppml/output"
     Worker:
       replicas: {{ .Values.trainerNum }}
      template:
@@ -86,9 +83,6 @@ spec:
           - name: nfs-storage
             subPath: {{ .Values.dataSubPath }}
             mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
-          - name: nfs-storage
-            subPath: {{ .Values.outputSubPath }}
-            mountPath: "/ppml/output"
          resources:
            requests:
              cpu: {{ .Values.cpuPerPod }}
@@ -96,4 +90,4 @@
      - name: nfs-storage
        persistentVolumeClaim:
          claimName: nfs-pvc
-{{- end }}
\ No newline at end of file
+{{- end }}
diff --git a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml
index cd4d260b..ed00ea45 100644
--- a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml
+++ b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml
@@ -71,9 +71,6 @@ spec:
           - name: nfs-storage
             subPath: {{ .Values.dataSubPath }}
             mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
-          - name: nfs-storage
-            subPath: {{ .Values.outputSubPath }}
-            mountPath: "/ppml/output"
           - name: dev
             mountPath: /dev
 {{- if eq .Values.enableTLS true }}
@@ -118,9 +115,6 @@ spec:
           - name: nfs-storage
             subPath: {{ .Values.dataSubPath }}
            mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
-          - name: nfs-storage
-            subPath: {{ .Values.outputSubPath }}
-            mountPath: "/ppml/output"
           - name: dev
             mountPath: /dev
          resources:
diff --git a/docker/llm/finetune/lora/kubernetes/values.yaml b/docker/llm/finetune/lora/kubernetes/values.yaml
index 70691935..92df0493 100644
--- a/docker/llm/finetune/lora/kubernetes/values.yaml
+++ b/docker/llm/finetune/lora/kubernetes/values.yaml
@@ -6,11 +6,10 @@
 nfsServerIp: your_nfs_server_ip
 nfsPath: a_nfs_shared_folder_path_on_the_server
 dataSubPath: alpaca_data_cleaned_archive.json # a subpath of the data file under nfs directory
 modelSubPath: llama-7b-hf # a subpath of the model file (dir) under nfs directory
-outputSubPath: output # a subpath of the empty directory under the nfs directory to save finetuned model, for example, if you make an empty dir named 'output' at the nfsPath, the value should be 'output'
 ompNumThreads: 14
 cpuPerPod: 42
 attestionApiServicePort: 9870
 enableTLS: false # true or false
 base64ServerCrt: "your_base64_format_server_crt"
-base64ServerKey: "your_base64_format_server_key"
\ No newline at end of file
+base64ServerKey: "your_base64_format_server_key"
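
This patch drops `outputSubPath` from `values.yaml`, but the README's batch-size rule still applies: `microBatchSize` × `trainerNum` must not exceed 128, as that product is the batch size. A quick pre-submission sanity check might look like the sketch below; the two values are placeholders to be replaced with the real ones from your own `./kubernetes/values.yaml`, not recommendations:

```bash
#!/bin/bash
# Placeholders: take the actual values from ./kubernetes/values.yaml
MICRO_BATCH_SIZE=8
TRAINER_NUM=8

# The product is the batch size; the README caps it at 128
if [ $((MICRO_BATCH_SIZE * TRAINER_NUM)) -gt 128 ]; then
  echo "microBatchSize x trainerNum exceeds 128; reduce one of them" >&2
  exit 1
fi
echo "batch size OK: $((MICRO_BATCH_SIZE * TRAINER_NUM))"
```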
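Because the fine-tuned model now stays inside the rank-0 worker pod at `/home/mpiuser/finetuned_model` (the `--output_dir` set in the entrypoint above) instead of on an NFS mount, it has to be copied out before the pod goes away. A minimal sketch with `kubectl cp`; the pod name below is an assumption, so check `kubectl get pods` for the name the MPI operator actually generated in your cluster:

```bash
# List the job's pods and locate worker 0 (rank 0 writes the model)
kubectl get pods -n bigdl-ppml-finetuning

# Copy the model directory out of the pod onto the host;
# "bigdl-lora-finetuning-job-worker-0" is a guessed name, substitute yours
kubectl cp \
  bigdl-ppml-finetuning/bigdl-lora-finetuning-job-worker-0:/home/mpiuser/finetuned_model \
  ./finetuned_model
```

Note that `kubectl cp` streams the files through the API server, so for multi-gigabyte checkpoints copying directly from the node (or over `scp`, as the README suggests) may be faster.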