# Bash script, 47 lines, 1.2 KiB
#!/bin/bash
#
# Container entrypoint that either launches /ppml/lora_finetune.py under
# mpirun ("launcher" role) or runs sshd in the foreground ("trainer" role).
#
# Environment read by this script:
#   WORKER_ROLE       "launcher" or "trainer"; any other value is a no-op.
#   WORLD_SIZE        process count for mpirun -n; also exported as
#                     CCL_WORKER_COUNT and (trainer) PMI_SIZE.
#   OMP_NUM_THREADS   forwarded to every rank via mpirun -genv (launcher).
#   DATA_SUB_PATH     appended to /ppml/data/ to form DATA_PATH (launcher).
#   MICRO_BATCH_SIZE  passed as --micro_batch_size (launcher).
#   LOCAL_POD_NAME    dash-separated name; field 6 becomes LOCAL_RANK (trainer).

# Trace every command to stderr.
set -x

source /opt/intel/oneapi/setvars.sh

export CCL_WORKER_COUNT="$WORLD_SIZE"
export CCL_WORKER_AFFINITY=auto

#######################################
# Run the fine-tuning job through mpirun and report the outcome.
# Globals:   WORLD_SIZE, OMP_NUM_THREADS, DATA_SUB_PATH, MICRO_BATCH_SIZE (read)
#            DATA_PATH, SAVE_PATH (exported)
# Outputs:   mpirun stdout+stderr go to $SAVE_PATH/launcher.log; on failure the
#            log is printed to stdout.
# Returns:   never returns. Exits with mpirun's status on failure; on success
#            loops forever printing a success message every 900 seconds.
#######################################
run_launcher() {
  # Fail fast: an empty value here would silently yield a malformed command
  # line (e.g. "-n -ppn 1") instead of a clear error.
  : "${WORLD_SIZE:?WORLD_SIZE must be set for the launcher}"
  : "${MICRO_BATCH_SIZE:?MICRO_BATCH_SIZE must be set for the launcher}"

  local log_file
  local exit_status

  # Copy the MPI hostfile, replacing every ":1" with a space.
  sed "s/:1/ /g" /etc/mpi/hostfile > /home/mpiuser/hostfile

  export DATA_PATH="/ppml/data/$DATA_SUB_PATH"
  export SAVE_PATH="/ppml/output"
  log_file="$SAVE_PATH/launcher.log"

  # NOTE(review): presumably gives the trainer pods time to bring up sshd
  # before mpirun connects -- confirm.
  sleep 10

  mpirun \
    -n "$WORLD_SIZE" \
    -ppn 1 \
    -f /home/mpiuser/hostfile \
    -iface eth0 \
    -genv OMP_NUM_THREADS="$OMP_NUM_THREADS" \
    -genv KMP_AFFINITY="granularity=fine,none" \
    -genv KMP_BLOCKTIME=1 \
    -genv TF_ENABLE_ONEDNN_OPTS=1 \
    python /ppml/lora_finetune.py \
    --base_model '/ppml/model/' \
    --data_path "$DATA_PATH" \
    --output_dir "$SAVE_PATH/finetuned_model" \
    --micro_batch_size "$MICRO_BATCH_SIZE" \
    --bf16 > "$log_file" 2>&1
  exit_status=$?

  if [ "$exit_status" -ne 0 ]; then
    # Surface the captured log on stdout before propagating the failure.
    cat "$log_file"
    exit "$exit_status"
  fi

  # NOTE(review): the script intentionally never exits after a successful run;
  # presumably this keeps the container alive -- confirm.
  while true; do
    echo "[INFO] Successfully finished training"
    sleep 900
  done
}

#######################################
# Export the PMI rank/size variables and run sshd in the foreground.
# Globals:   LOCAL_POD_NAME, WORLD_SIZE (read)
#            LOCAL_RANK, PMI_SIZE, PMI_RANK (exported)
# Returns:   the exit status of sshd.
#######################################
run_trainer() {
  # Assign before exporting so the exit status of cut is not masked by export.
  LOCAL_RANK=$(cut -d "-" -f6 <<< "$LOCAL_POD_NAME")
  export LOCAL_RANK
  export PMI_SIZE="$WORLD_SIZE"
  export PMI_RANK="$LOCAL_RANK"

  # -D: do not detach; -e: log to stderr instead of syslog.
  /usr/sbin/sshd -De -f /home/mpiuser/.sshd_config
}

# Dispatch on the role; an unrecognized or unset role does nothing.
case "$WORKER_ROLE" in
  launcher)
    run_launcher
    ;;
  trainer)
    run_trainer
    ;;
esac