Add stand-alone mode on cpu for finetuning (#9127)

* Added steps for finetuning on CPU in stand-alone mode

* Add stand-alone mode to bigdl-lora-finetuing-entrypoint.sh

* delete redundant docker commands

* Update README.md

Switch to intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT and append example outputs so that users can check the run

* Update bigdl-lora-finetuing-entrypoint.sh

Add some tunable parameters

* Add parameters --cpus and -e WORKER_COUNT_DOCKER

* Modified the CPU number range parameters

* Set -ppn to CCL_WORKER_COUNT

* Add related configuration suggestions in README.md
Ziteng Zhang 2023-10-11 15:01:21 +08:00 committed by GitHub
parent 995b0f119f
commit 4a0a3c376a
2 changed files with 150 additions and 42 deletions

README.md

## Fine-tune LLM with One CPU
### 1. Prepare BigDL image for Lora Finetuning
You can download directly from Dockerhub like:
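For example (the exact tag here is an assumption, matching the build target shown below):
```
docker pull intelanalytics/bigdl-llm-finetune-lora-cpu:2.4.0-SNAPSHOT
```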
Or build the image from the Dockerfile yourself:
```
docker build \
  -t intelanalytics/bigdl-llm-finetune-lora-cpu:2.4.0-SNAPSHOT \
  -f ./Dockerfile .
```
### 2. Prepare Base Model, Data and Container
Here, we fine-tune [Llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b) with the [cleaned Alpaca data](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json), which covers a wide range of general knowledge and has already been cleaned. Please download both, then start a docker container with the files mounted like below:
```
docker run -itd \
--name=bigdl-llm-finetune-lora-cpu \
--cpuset-cpus="your_expected_range_of_cpu_numbers" \
-e STANDALONE_DOCKER=TRUE \
-e WORKER_COUNT_DOCKER=your_worker_count \
-v your_downloaded_base_model_path:/ppml/model \
-v your_downloaded_data_path:/ppml/data/alpaca_data_cleaned_archive.json \
intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT \
bash
```
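For reference, the cleaned dataset is a single JSON file that can be fetched straight from the link above (the Llama2-7b weights come from the Hugging Face page, which requires requesting access first):
```
wget https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json
```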
You can adjust the configuration to fit your own environment. Based on our testing, we recommend setting the worker count to 1 and allocating 80G of memory to Docker, as in the example below.
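For instance, a filled-in version of the command above under these recommendations might look like this (the CPU range and host paths are placeholders to adapt, and `--memory` enforces the suggested 80G cap):
```
docker run -itd \
  --name=bigdl-llm-finetune-lora-cpu \
  --cpuset-cpus="0-47" \
  --memory="80g" \
  -e STANDALONE_DOCKER=TRUE \
  -e WORKER_COUNT_DOCKER=1 \
  -v /path/to/Llama-2-7b:/ppml/model \
  -v /path/to/alpaca_data_cleaned_archive.json:/ppml/data/alpaca_data_cleaned_archive.json \
  intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT \
  bash
```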
### 3. Start Finetuning
Enter the running container:
```
docker exec -it bigdl-llm-finetune-lora-cpu bash
```
Then, run the script to start finetuning:
```
bash /ppml/bigdl-lora-finetuing-entrypoint.sh
```
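Since CPU finetuning can run for a long time, one option (an addition here, not part of the original entrypoint) is to detach it from the shell and follow the log instead:
```
nohup bash /ppml/bigdl-lora-finetuing-entrypoint.sh > /tmp/finetune.log 2>&1 &
tail -f /tmp/finetune.log
```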
After a few minutes, you should get results like:
```
Training Alpaca-LoRA model with params:
base_model: /ppml/model/
data_path: /ppml/data/alpaca_data_cleaned_archive.json
output_dir: /home/mpiuser/finetuned_model
batch_size: 128
micro_batch_size: 8
num_epochs: 3
learning_rate: 0.0003
cutoff_len: 256
val_set_size: 2000
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: ['q_proj', 'v_proj']
train_on_inputs: True
group_by_length: False
wandb_project:
wandb_run_name:
wandb_watch:
wandb_log_model:
resume_from_checkpoint: None
use_ipex: False
bf16: False
world_size: 2!!
PMI_RANK(local_rank): 1
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.28s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00, 2.62s/it]
trainable params: 4194304 || all params: 6742609920 || trainable%: 0.06220594176090199
[INFO] spliting and shuffling dataset...
[INFO] shuffling and tokenizing train data...
Map: 2%|▏ | 1095/49759 [00:00<00:30, 1599.00 examples/s]trainable params: 4194304 || all params: 6742609920 || trainable%: 0.06220594176090199
[INFO] spliting and shuffling dataset...
[INFO] shuffling and tokenizing train data...
Map: 100%|██████████| 49759/49759 [00:29<00:00, 1678.89 examples/s]
[INFO] shuffling and tokenizing test data...
Map: 100%|██████████| 49759/49759 [00:29<00:00, 1685.42 examples/s]
[INFO] shuffling and tokenizing test data...
Map: 100%|██████████| 2000/2000 [00:01<00:00, 1573.61 examples/s]
Map: 100%|██████████| 2000/2000 [00:01<00:00, 1578.71 examples/s]
2023:10:11-01:12:20:( 670) |CCL_WARN| no membind support for NUMA node 0, skip thread membind
2023:10:11-01:12:20:( 671) |CCL_WARN| no membind support for NUMA node 1, skip thread membind
2023:10:11-01:12:20:( 672) |CCL_WARN| no membind support for NUMA node 0, skip thread membind
2023:10:11-01:12:20:( 673) |CCL_WARN| no membind support for NUMA node 1, skip thread membind
[INFO] begining the training of transformers...
[INFO] Process rank: 0, device: cpudistributed training: True
0%| | 1/1164 [02:42<52:28:24, 162.43s/it]
```
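When training completes, the adapter weights are saved to the `output_dir` shown above (`/home/mpiuser/finetuned_model`); one way to copy them to the host, assuming the container name used earlier:
```
docker cp bigdl-llm-finetune-lora-cpu:/home/mpiuser/finetuned_model ./finetuned_model
```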

bigdl-lora-finetuing-entrypoint.sh

#!/bin/bash
set -x
if [ "$STANDALONE_DOCKER" = "TRUE" ]
then
  export CONTAINER_IP=$(hostname -i)
  export CPU_CORES=$(nproc)
  source /opt/intel/oneapi/setvars.sh
  export CCL_WORKER_COUNT=$WORKER_COUNT_DOCKER
  export CCL_WORKER_AFFINITY=auto
  export MASTER_ADDR=$CONTAINER_IP
  mpirun \
    -n $CCL_WORKER_COUNT \
    -ppn $CCL_WORKER_COUNT \
    -genv OMP_NUM_THREADS=$((CPU_CORES / CCL_WORKER_COUNT)) \
    -genv KMP_AFFINITY="granularity=fine,none" \
    -genv KMP_BLOCKTIME=1 \
    -genv TF_ENABLE_ONEDNN_OPTS=1 \
    python /ppml/lora_finetune.py \
      --base_model '/ppml/model/' \
      --data_path "/ppml/data/alpaca_data_cleaned_archive.json" \
      --output_dir "/home/mpiuser/finetuned_model" \
      --micro_batch_size 8 \
      --bf16
else
  source /opt/intel/oneapi/setvars.sh
  export CCL_WORKER_COUNT=$WORLD_SIZE
  export CCL_WORKER_AFFINITY=auto
  if [ "$WORKER_ROLE" = "launcher" ]
  then
    sed "s/:1/ /g" /etc/mpi/hostfile > /home/mpiuser/hostfile
    export DATA_PATH="/ppml/data/$DATA_SUB_PATH"
    sleep 10
    mpirun \
      -n $WORLD_SIZE \
      -ppn 1 \
      -f /home/mpiuser/hostfile \
      -iface eth0 \
      -genv OMP_NUM_THREADS=$OMP_NUM_THREADS \
      -genv KMP_AFFINITY="granularity=fine,none" \
      -genv KMP_BLOCKTIME=1 \
      -genv TF_ENABLE_ONEDNN_OPTS=1 \
      python /ppml/lora_finetune.py \
        --base_model '/ppml/model/' \
        --data_path "$DATA_PATH" \
        --output_dir "/home/mpiuser/finetuned_model" \
        --micro_batch_size $MICRO_BATCH_SIZE \
        --bf16 > /home/mpiuser/launcher.log 2>&1
    exit_status=$?
    if [ $exit_status -ne 0 ];
    then
      cat /home/mpiuser/launcher.log
      exit $exit_status
    else
      while true
      do
        echo "[INFO] Successfully finished training"
        sleep 900
      done
    fi
  elif [ "$WORKER_ROLE" = "trainer" ]
  then
    export LOCAL_RANK=$(cut -d "-" -f6 <<< "$LOCAL_POD_NAME")
    export PMI_SIZE=$WORLD_SIZE
    export PMI_RANK=$LOCAL_RANK
    /usr/sbin/sshd -De -f /home/mpiuser/.sshd_config
  fi
fi
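As a quick sanity check of the thread arithmetic in the stand-alone branch, assume (example numbers, not defaults) a container pinned to 48 cores with `WORKER_COUNT_DOCKER=2`: mpirun then launches 2 ranks and gives each half of the cores as OpenMP threads.
```
CPU_CORES=48          # what $(nproc) would report with 48 pinned cores
CCL_WORKER_COUNT=2    # copied from WORKER_COUNT_DOCKER
echo $((CPU_CORES / CCL_WORKER_COUNT))   # -> 24, the OMP_NUM_THREADS given to each rank
```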