From 4a0a3c376a2e40f596c16b6923ad1cd444e9ff38 Mon Sep 17 00:00:00 2001
From: Ziteng Zhang <87107332+Jasonzzt@users.noreply.github.com>
Date: Wed, 11 Oct 2023 15:01:21 +0800
Subject: [PATCH] Add stand-alone mode on cpu for finetuning (#9127)

* Added steps for finetuning on CPU in stand-alone mode
* Add stand-alone mode to bigdl-lora-finetuing-entrypoint.sh
* Delete redundant docker commands
* Update README.md
  Switch to intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT and append example outputs so that users can check the run
* Update bigdl-lora-finetuing-entrypoint.sh
  Add some tunable parameters
* Add parameters --cpus and -e WORKER_COUNT_DOCKER
* Modified the CPU number range parameters
* Set -ppn to CCL_WORKER_COUNT
* Add related configuration suggestions in README.md
---
 docker/llm/finetune/lora/cpu/docker/README.md |  88 ++++++++++++++-
 .../docker/bigdl-lora-finetuing-entrypoint.sh | 104 +++++++++++-------
 2 files changed, 150 insertions(+), 42 deletions(-)

diff --git a/docker/llm/finetune/lora/cpu/docker/README.md b/docker/llm/finetune/lora/cpu/docker/README.md
index be86f2b2..3be1c760 100644
--- a/docker/llm/finetune/lora/cpu/docker/README.md
+++ b/docker/llm/finetune/lora/cpu/docker/README.md
@@ -1,4 +1,6 @@
-## Prepare BigDL image for Lora Finetuning
+## Fine-tune LLM with One CPU
+
+### 1. Prepare BigDL image for Lora Finetuning
 
 You can download directly from Dockerhub like:
 
@@ -18,3 +20,87 @@ docker build \
   -t intelanalytics/bigdl-llm-finetune-lora-cpu:2.4.0-SNAPSHOT \
   -f ./Dockerfile .
 ```
+
+### 2. Prepare Base Model, Data and Container
+
+Here, we finetune [Llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b) with the [Cleaned alpaca data](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json), which covers a wide range of general knowledge and has already been cleaned. Please download both, then start a docker container with the files mounted as below:
+
+```
+docker run -itd \
+ --name=bigdl-llm-fintune-lora-cpu \
+ --cpuset-cpus="your_expected_range_of_cpu_numbers" \
+ -e STANDALONE_DOCKER=TRUE \
+ -e WORKER_COUNT_DOCKER=your_worker_count \
+ -v your_downloaded_base_model_path:/ppml/model \
+ -v your_downloaded_data_path:/ppml/data/alpaca_data_cleaned_archive.json \
+ intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT \
+ bash
+```
+
+You can adjust the configuration according to your own environment. Based on our testing, we recommend setting worker_count=1 and allocating 80G of memory to Docker.
+
+### 3. Start Finetuning
+
+Enter the running container:
+
+```
+docker exec -it bigdl-llm-fintune-lora-cpu bash
+```
+
+Then, run the script to start finetuning:
+
+```
+bash /ppml/bigdl-lora-finetuing-entrypoint.sh
+```
+
+After a few minutes, you should see output like the following:
+
+```
+Training Alpaca-LoRA model with params:
+base_model: /ppml/model/
+data_path: /ppml/data/alpaca_data_cleaned_archive.json
+output_dir: /home/mpiuser/finetuned_model
+batch_size: 128
+micro_batch_size: 8
+num_epochs: 3
+learning_rate: 0.0003
+cutoff_len: 256
+val_set_size: 2000
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules: ['q_proj', 'v_proj']
+train_on_inputs: True
+group_by_length: False
+wandb_project: 
+wandb_run_name: 
+wandb_watch: 
+wandb_log_model: 
+resume_from_checkpoint: None
+use_ipex: False
+bf16: False
+
+world_size: 2!!
+PMI_RANK(local_rank): 1
+Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.28s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00, 2.62s/it]
+trainable params: 4194304 || all params: 6742609920 || trainable%: 0.06220594176090199
+[INFO] spliting and shuffling dataset...
+[INFO] shuffling and tokenizing train data...
+Map: 2%|▏ | 1095/49759 [00:00<00:30, 1599.00 examples/s]trainable params: 4194304 || all params: 6742609920 || trainable%: 0.06220594176090199
+[INFO] spliting and shuffling dataset...
+[INFO] shuffling and tokenizing train data...
+Map: 100%|██████████| 49759/49759 [00:29<00:00, 1678.89 examples/s]
+[INFO] shuffling and tokenizing test data...
+Map: 100%|██████████| 49759/49759 [00:29<00:00, 1685.42 examples/s]
+[INFO] shuffling and tokenizing test data...
+Map: 100%|██████████| 2000/2000 [00:01<00:00, 1573.61 examples/s]
+Map: 100%|██████████| 2000/2000 [00:01<00:00, 1578.71 examples/s]
+2023:10:11-01:12:20:( 670) |CCL_WARN| no membind support for NUMA node 0, skip thread membind
+2023:10:11-01:12:20:( 671) |CCL_WARN| no membind support for NUMA node 1, skip thread membind
+2023:10:11-01:12:20:( 672) |CCL_WARN| no membind support for NUMA node 0, skip thread membind
+2023:10:11-01:12:20:( 673) |CCL_WARN| no membind support for NUMA node 1, skip thread membind
+[INFO] begining the training of transformers...
+[INFO] Process rank: 0, device: cpudistributed training: True
+ 0%| | 1/1164 [02:42<52:28:24, 162.43s/it]
+```
diff --git a/docker/llm/finetune/lora/cpu/docker/bigdl-lora-finetuing-entrypoint.sh b/docker/llm/finetune/lora/cpu/docker/bigdl-lora-finetuing-entrypoint.sh
index f9008c8d..340f35dc 100644
--- a/docker/llm/finetune/lora/cpu/docker/bigdl-lora-finetuing-entrypoint.sh
+++ b/docker/llm/finetune/lora/cpu/docker/bigdl-lora-finetuing-entrypoint.sh
@@ -1,46 +1,68 @@
 #!/bin/bash
 set -x
-source /opt/intel/oneapi/setvars.sh
-export CCL_WORKER_COUNT=$WORLD_SIZE
-export CCL_WORKER_AFFINITY=auto
-
-if [ "$WORKER_ROLE" = "launcher" ]
+if [ "$STANDALONE_DOCKER" = "TRUE" ]
 then
-  sed "s/:1/ /g" /etc/mpi/hostfile > /home/mpiuser/hostfile
-  export DATA_PATH="/ppml/data/$DATA_SUB_PATH"
-  sleep 10
+  export CONTAINER_IP=$(hostname -i)
+  export CPU_CORES=$(nproc)
+  source /opt/intel/oneapi/setvars.sh
+  export CCL_WORKER_COUNT=$WORKER_COUNT_DOCKER
+  export CCL_WORKER_AFFINITY=auto
+  export MASTER_ADDR=$CONTAINER_IP
   mpirun \
-    -n $WORLD_SIZE \
-    -ppn 1 \
-    -f /home/mpiuser/hostfile \
-    -iface eth0 \
-    -genv OMP_NUM_THREADS=$OMP_NUM_THREADS \
-    -genv KMP_AFFINITY="granularity=fine,none" \
-    -genv KMP_BLOCKTIME=1 \
-    -genv TF_ENABLE_ONEDNN_OPTS=1 \
-    python /ppml/lora_finetune.py \
-      --base_model '/ppml/model/' \
-      --data_path "$DATA_PATH" \
-      --output_dir "/home/mpiuser/finetuned_model" \
-      --micro_batch_size $MICRO_BATCH_SIZE \
-      --bf16 > /home/mpiuser/launcher.log 2>&1
-  exit_status=$?
-  if [ $exit_status -ne 0 ];
-  then
-    cat /home/mpiuser/launcher.log
-    exit $exit_status
-  else
-    while true
-    do
-      echo "[INFO] Successfully finished training"
-      sleep 900
-    done
-  fi
-elif [ "$WORKER_ROLE" = "trainer" ]
-then
-  export LOCAL_RANK=$(cut -d "-" -f6 <<< "$LOCAL_POD_NAME")
-  export PMI_SIZE=$WORLD_SIZE
-  export PMI_RANK=$LOCAL_RANK
-  /usr/sbin/sshd -De -f /home/mpiuser/.sshd_config
-fi
+    -n $CCL_WORKER_COUNT \
+    -ppn $CCL_WORKER_COUNT \
+    -genv OMP_NUM_THREADS=$((CPU_CORES / CCL_WORKER_COUNT)) \
+    -genv KMP_AFFINITY="granularity=fine,none" \
+    -genv KMP_BLOCKTIME=1 \
+    -genv TF_ENABLE_ONEDNN_OPTS=1 \
+    python /ppml/lora_finetune.py \
+      --base_model '/ppml/model/' \
+      --data_path "/ppml/data/alpaca_data_cleaned_archive.json" \
+      --output_dir "/home/mpiuser/finetuned_model" \
+      --micro_batch_size 8 \
+      --bf16
+else
+  source /opt/intel/oneapi/setvars.sh
+  export CCL_WORKER_COUNT=$WORLD_SIZE
+  export CCL_WORKER_AFFINITY=auto
+  if [ "$WORKER_ROLE" = "launcher" ]
+  then
+    sed "s/:1/ /g" /etc/mpi/hostfile > /home/mpiuser/hostfile
+    export DATA_PATH="/ppml/data/$DATA_SUB_PATH"
+    sleep 10
+    mpirun \
+      -n $WORLD_SIZE \
+      -ppn 1 \
+      -f /home/mpiuser/hostfile \
+      -iface eth0 \
+      -genv OMP_NUM_THREADS=$OMP_NUM_THREADS \
+      -genv KMP_AFFINITY="granularity=fine,none" \
+      -genv KMP_BLOCKTIME=1 \
+      -genv TF_ENABLE_ONEDNN_OPTS=1 \
+      python /ppml/lora_finetune.py \
+        --base_model '/ppml/model/' \
+        --data_path "$DATA_PATH" \
+        --output_dir "/home/mpiuser/finetuned_model" \
+        --micro_batch_size $MICRO_BATCH_SIZE \
+        --bf16 > /home/mpiuser/launcher.log 2>&1
+    exit_status=$?
+    if [ $exit_status -ne 0 ];
+    then
+      cat /home/mpiuser/launcher.log
+      exit $exit_status
+    else
+      while true
+      do
+        echo "[INFO] Successfully finished training"
+        sleep 900
+      done
+    fi
+  elif [ "$WORKER_ROLE" = "trainer" ]
+  then
+    export LOCAL_RANK=$(cut -d "-" -f6 <<< "$LOCAL_POD_NAME")
+    export PMI_SIZE=$WORLD_SIZE
+    export PMI_RANK=$LOCAL_RANK
+    /usr/sbin/sshd -De -f /home/mpiuser/.sshd_config
+  fi
+fi
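For reference, the following is a minimal stand-alone launch sketch that applies the configuration recommended in the README above (stand-alone mode with WORKER_COUNT_DOCKER=1 and about 80G of memory for the container). The `-m 80g` memory limit, the CPU range `0-47`, and the host paths are illustrative placeholders, not part of the patch; adapt them to your own machine:

```
# Stand-alone mode: one container, one worker, with the memory and CPU
# limits suggested in the README (replace the placeholder values).
docker run -itd \
 --name=bigdl-llm-fintune-lora-cpu \
 --cpuset-cpus="0-47" \
 -m 80g \
 -e STANDALONE_DOCKER=TRUE \
 -e WORKER_COUNT_DOCKER=1 \
 -v /path/to/llama-2-7b:/ppml/model \
 -v /path/to/alpaca_data_cleaned_archive.json:/ppml/data/alpaca_data_cleaned_archive.json \
 intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT \
 bash

# Then kick off finetuning inside the container.
docker exec -it bigdl-llm-fintune-lora-cpu \
 bash /ppml/bigdl-lora-finetuing-entrypoint.sh
```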