From 4b843d1dbfb9a0b7db359711193296ef0c78a302 Mon Sep 17 00:00:00 2001
From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com>
Date: Mon, 25 Sep 2023 09:28:44 +0800
Subject: [PATCH] change lora-model output behavior on k8s (#9038)

Co-authored-by: leonardozcm
---
 docker/llm/finetune/lora/README.md | 8 +++++---
 .../lora/docker/bigdl-lora-finetuing-entrypoint.sh | 7 +++----
 .../kubernetes/templates/bigdl-lora-finetuning-job.yaml | 8 +-------
 .../templates/bigdl-lora-finetuning-tdx-job.yaml | 6 ------
 docker/llm/finetune/lora/kubernetes/values.yaml | 3 +--
 5 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/docker/llm/finetune/lora/README.md b/docker/llm/finetune/lora/README.md
index 81b61add..1492cf90 100644
--- a/docker/llm/finetune/lora/README.md
+++ b/docker/llm/finetune/lora/README.md
@@ -22,13 +22,13 @@ Follow [here](https://github.com/kubeflow/mpi-operator/tree/master#installation)
 
 Follow [here](https://github.com/intel-analytics/BigDL/tree/main/docker/llm/finetune/lora/docker#prepare-bigdl-image-for-lora-finetuning) to prepare the BigDL Lora Finetuning image in your cluster.
 
-As finetuning is from a base model, first download [Llama 7b hf model from the public download site of Hugging Face](https://huggingface.co/decapoda-research/llama-7b-hf/tree/main). Then, download [cleaned alpaca data](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json), which contains all kinds of general knowledge and has already been cleaned. Next, move the downloaded files to a shared directory on your NFS server. In addition, make an empty directory under the same destination to save the finetuned model output later.
+As finetuning starts from a base model, first download the [Llama 7b hf model from the public download site of Hugging Face](https://huggingface.co/decapoda-research/llama-7b-hf/tree/main). Then, download the [cleaned alpaca data](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json), which contains all kinds of general knowledge and has already been cleaned. Next, move the downloaded files to a shared directory on your NFS server.
 
 ### 3. Deploy through Helm Chart
 
 You are allowed to edit and experiment with different parameters in `./kubernetes/values.yaml` to improve finetuning performance and accuracy. For example, you can adjust `trainerNum` and `cpuPerPod` according to the number of nodes and CPU cores in your cluster to make full use of these resources, and different `microBatchSize` values result in different training speed and loss (note that `microBatchSize` × `trainerNum` should not exceed 128, as this product is the batch size).
 
-**Note: `dataSubPath`, `modelSubPath` and `outputPath` need to have the same names as files under the NFS directory in step 2.**
+**Note: `dataSubPath` and `modelSubPath` need to have the same names as the files under the NFS directory in step 2.**
 
 After preparing parameters in `./kubernetes/values.yaml`, submit the job as below:
 
@@ -52,7 +52,9 @@ kubectl exec -it <launcher_pod_name> bash -n bigdl-ppml-finetuning # enter launc
 cat launcher.log # display logs collected from other workers
 ```
 
-From the log, you can see whether finetuning process has been invoked successfully in all MPI worker pods, and a progress bar with finetuning speed and estimated time will be showed after some data preprocessing steps (this may take quiet a while). For the fine-tuned model, it is written by the worker 0 (who holds rank 0), so you can find the model output inside the pod or the `output` folder under the NFS path (because it has been mounted to worker 0 as output path).
+From the log, you can see whether the finetuning process has been invoked successfully in all MPI worker pods, and a progress bar with the finetuning speed and estimated time will be shown after some data preprocessing steps (this may take quite a while).
+
+The fine-tuned model is written by worker 0 (which holds rank 0), so you can find the model output inside that pod; it can be saved to the host with command-line tools like `kubectl cp` or `scp`.
 
 ## To run in TDX-CoCo and enable Remote Attestation API
 
diff --git a/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh b/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh
index 1f9873b0..f9008c8d 100644
--- a/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh
+++ b/docker/llm/finetune/lora/docker/bigdl-lora-finetuing-entrypoint.sh
@@ -8,7 +8,6 @@ if [ "$WORKER_ROLE" = "launcher" ]
 then
   sed "s/:1/ /g" /etc/mpi/hostfile > /home/mpiuser/hostfile
   export DATA_PATH="/ppml/data/$DATA_SUB_PATH"
-  export SAVE_PATH="/ppml/output"
   sleep 10
   mpirun \
     -n $WORLD_SIZE \
@@ -22,13 +21,13 @@ then
     python /ppml/lora_finetune.py \
       --base_model '/ppml/model/' \
       --data_path "$DATA_PATH" \
-      --output_dir "$SAVE_PATH/finetuned_model" \
+      --output_dir "/home/mpiuser/finetuned_model" \
       --micro_batch_size $MICRO_BATCH_SIZE \
-      --bf16 > $SAVE_PATH/launcher.log 2>&1
+      --bf16 > /home/mpiuser/launcher.log 2>&1
   exit_status=$?
   if [ $exit_status -ne 0 ];
   then
-    cat $SAVE_PATH/launcher.log
+    cat /home/mpiuser/launcher.log
     exit $exit_status
   else
     while true
diff --git a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml
index 63d50461..4c22b068 100644
--- a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml
+++ b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-job.yaml
@@ -51,9 +51,6 @@ spec:
           - name: nfs-storage
             subPath: {{ .Values.dataSubPath }}
             mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
-          - name: nfs-storage
-            subPath: {{ .Values.outputSubPath }}
-            mountPath: "/ppml/output"
     Worker:
       replicas: {{ .Values.trainerNum }}
      template:
@@ -86,9 +83,6 @@ spec:
           - name: nfs-storage
             subPath: {{ .Values.dataSubPath }}
             mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
-          - name: nfs-storage
-            subPath: {{ .Values.outputSubPath }}
-            mountPath: "/ppml/output"
          resources:
            requests:
              cpu: {{ .Values.cpuPerPod }}
@@ -96,4 +90,4 @@
      - name: nfs-storage
        persistentVolumeClaim:
          claimName: nfs-pvc
-{{- end }}
\ No newline at end of file
+{{- end }}
diff --git a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml
index cd4d260b..ed00ea45 100644
--- a/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml
+++ b/docker/llm/finetune/lora/kubernetes/templates/bigdl-lora-finetuning-tdx-job.yaml
@@ -71,9 +71,6 @@ spec:
           - name: nfs-storage
             subPath: {{ .Values.dataSubPath }}
             mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
-          - name: nfs-storage
-            subPath: {{ .Values.outputSubPath }}
-            mountPath: "/ppml/output"
           - name: dev
             mountPath: /dev
 {{- if eq .Values.enableTLS true }}
@@ -118,9 +115,6 @@ spec:
           - name: nfs-storage
             subPath: {{ .Values.dataSubPath }}
            mountPath: "/ppml/data/{{ .Values.dataSubPath }}"
-          - name: nfs-storage
-            subPath: {{ .Values.outputSubPath }}
-            mountPath: "/ppml/output"
           - name: dev
             mountPath: /dev
          resources:
diff --git a/docker/llm/finetune/lora/kubernetes/values.yaml b/docker/llm/finetune/lora/kubernetes/values.yaml
index 70691935..92df0493 100644
--- a/docker/llm/finetune/lora/kubernetes/values.yaml
+++ b/docker/llm/finetune/lora/kubernetes/values.yaml
@@ -6,11 +6,10 @@
 nfsServerIp: your_nfs_server_ip
 nfsPath: a_nfs_shared_folder_path_on_the_server
 dataSubPath: alpaca_data_cleaned_archive.json # a subpath of the data file under nfs directory
 modelSubPath: llama-7b-hf # a subpath of the model file (dir) under nfs directory
-outputSubPath: output # a subpath of the empty directory under the nfs directory to save finetuned model, for example, if you make an empty dir named 'output' at the nfsPath, the value should be 'output'
 ompNumThreads: 14
 cpuPerPod: 42
 attestionApiServicePort: 9870
 enableTLS: false # true or false
 base64ServerCrt: "your_base64_format_server_crt"
-base64ServerKey: "your_base64_format_server_key"
\ No newline at end of file
+base64ServerKey: "your_base64_format_server_key"
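
This patch drops `outputSubPath` from `values.yaml`, but the README's batch-size rule still applies: `microBatchSize` × `trainerNum` must not exceed 128, as that product is the batch size. A quick pre-submission sanity check might look like the sketch below; the two values are placeholders to be replaced with the real ones from your own `./kubernetes/values.yaml`, not recommendations:

```bash
#!/bin/bash
# Placeholders: take the actual values from ./kubernetes/values.yaml
MICRO_BATCH_SIZE=8
TRAINER_NUM=8

# The product is the batch size; the README caps it at 128
if [ $((MICRO_BATCH_SIZE * TRAINER_NUM)) -gt 128 ]; then
  echo "microBatchSize x trainerNum exceeds 128; reduce one of them" >&2
  exit 1
fi
echo "batch size OK: $((MICRO_BATCH_SIZE * TRAINER_NUM))"
```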
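Because the fine-tuned model now stays inside the rank-0 worker pod at `/home/mpiuser/finetuned_model` (the `--output_dir` set in the entrypoint above) instead of on an NFS mount, it has to be copied out before the pod goes away. A minimal sketch with `kubectl cp`; the pod name below is an assumption, so check `kubectl get pods` for the name the MPI operator actually generated in your cluster:

```bash
# List the job's pods and locate worker 0 (rank 0 writes the model)
kubectl get pods -n bigdl-ppml-finetuning

# Copy the model directory out of the pod onto the host;
# "bigdl-lora-finetuning-job-worker-0" is a guessed name, substitute yours
kubectl cp \
  bigdl-ppml-finetuning/bigdl-lora-finetuning-job-worker-0:/home/mpiuser/finetuned_model \
  ./finetuned_model
```

Note that `kubectl cp` streams the files through the API server, so for multi-gigabyte checkpoints copying directly from the node (or over `scp`, as the README suggests) may be faster.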