68 lines
2 KiB
Bash
68 lines
2 KiB
Bash
#!/bin/bash
|
|
set -x
|
|
if [ "$STANDALONE_DOCKER" = "TRUE" ]
|
|
then
|
|
export CONTAINER_IP=$(hostname -i)
|
|
export CPU_CORES=$(nproc)
|
|
source /opt/intel/oneapi/setvars.sh
|
|
export CCL_WORKER_COUNT=$WORKER_COUNT_DOCKER
|
|
export CCL_WORKER_AFFINITY=auto
|
|
export MASTER_ADDR=$CONTAINER_IP
|
|
mpirun \
|
|
-n $CCL_WORKER_COUNT \
|
|
-ppn $CCL_WORKER_COUNT \
|
|
-genv OMP_NUM_THREADS=$((CPU_CORES / CCL_WORKER_COUNT)) \
|
|
-genv KMP_AFFINITY="granularity=fine,none" \
|
|
-genv KMP_BLOCKTIME=1 \
|
|
-genv TF_ENABLE_ONEDNN_OPTS=1 \
|
|
python /bigdl/lora_finetune.py \
|
|
--base_model '/bigdl/model/' \
|
|
--data_path "/bigdl/data/alpaca_data_cleaned_archive.json" \
|
|
--output_dir "/home/mpiuser/finetuned_model" \
|
|
--micro_batch_size 8 \
|
|
--bf16
|
|
|
|
else
|
|
source /opt/intel/oneapi/setvars.sh
|
|
export CCL_WORKER_COUNT=$WORLD_SIZE
|
|
export CCL_WORKER_AFFINITY=auto
|
|
if [ "$WORKER_ROLE" = "launcher" ]
|
|
then
|
|
sed "s/:1/ /g" /etc/mpi/hostfile > /home/mpiuser/hostfile
|
|
export DATA_PATH="/bigdl/data/$DATA_SUB_PATH"
|
|
sleep 10
|
|
mpirun \
|
|
-n $WORLD_SIZE \
|
|
-ppn 1 \
|
|
-f /home/mpiuser/hostfile \
|
|
-iface eth0 \
|
|
-genv OMP_NUM_THREADS=$OMP_NUM_THREADS \
|
|
-genv KMP_AFFINITY="granularity=fine,none" \
|
|
-genv KMP_BLOCKTIME=1 \
|
|
-genv TF_ENABLE_ONEDNN_OPTS=1 \
|
|
python /bigdl/lora_finetune.py \
|
|
--base_model '/bigdl/model/' \
|
|
--data_path "$DATA_PATH" \
|
|
--output_dir "/home/mpiuser/finetuned_model" \
|
|
--micro_batch_size $MICRO_BATCH_SIZE \
|
|
--bf16 > /home/mpiuser/launcher.log 2>&1
|
|
exit_status=$?
|
|
if [ $exit_status -ne 0 ];
|
|
then
|
|
cat /home/mpiuser/launcher.log
|
|
exit $exit_status
|
|
else
|
|
while true
|
|
do
|
|
echo "[INFO] Successfully finished training"
|
|
sleep 900
|
|
done
|
|
fi
|
|
elif [ "$WORKER_ROLE" = "trainer" ]
|
|
then
|
|
export LOCAL_RANK=$(cut -d "-" -f6 <<< "$LOCAL_POD_NAME")
|
|
export PMI_SIZE=$WORLD_SIZE
|
|
export PMI_RANK=$LOCAL_RANK
|
|
/usr/sbin/sshd -De -f /home/mpiuser/.sshd_config
|
|
fi
|
|
fi
|