#!/bin/bash
# Run Alpaca QLoRA fine-tuning on Kubernetes (launcher and trainer roles).
set -x
source /opt/intel/oneapi/setvars.sh
export CCL_WORKER_COUNT=$WORLD_SIZE
source bigdl-llm-init -t
cd /bigdl/alpaca-qlora

if [ "$WORKER_ROLE" = "launcher" ]
then
  # Strip the ":1" slot suffix that the MPI Operator appends to each hostfile entry.
  sed "s/:1/ /g" /etc/mpi/hostfile > /home/mpiuser/hostfile
  sleep 10 # wait for worker pods to be ready
  export ACCELERATE_USE_CPU=True
  if [ "$ENABLE_GRADIENT_CHECKPOINT" = "true" ]
  then
    GRADIENT_CHECKPOINT_PARAM="--gradient_checkpointing"
  fi
  # Launch one training process per node; all output goes to launcher.log.
  mpirun \
    -n $WORLD_SIZE \
    -ppn 1 \
    -f /home/mpiuser/hostfile \
    -iface eth0 \
    --bind-to socket \
    -genv OMP_NUM_THREADS=48 \
    -genv KMP_AFFINITY="granularity=fine,none" \
    -genv KMP_BLOCKTIME=1 \
    -genv TF_ENABLE_ONEDNN_OPTS=1 \
    python /bigdl/alpaca-qlora/alpaca_qlora_finetuning_cpu.py \
      --base_model '/bigdl/model' \
      --data_path "/bigdl/data" \
      --output_dir "/home/mpiuser/finetuned_model" \
      --batch_size 128 \
      --micro_batch_size $MICRO_BATCH_SIZE \
      $GRADIENT_CHECKPOINT_PARAM > /home/mpiuser/launcher.log 2>&1
  exit_status=$?
  if [ $exit_status -ne 0 ]; then
    cat /home/mpiuser/launcher.log
    exit $exit_status
  else
    # Keep the launcher pod alive so logs and the finetuned model stay accessible.
    while true
    do
      echo "[INFO] Successfully finished fine-tuning"
      sleep 900
    done
  fi
elif [ "$WORKER_ROLE" = "trainer" ]
then
  # Derive this trainer's rank from the ordinal in its pod name.
  export LOCAL_RANK=$(cut -d "-" -f6 <<< "$LOCAL_POD_NAME")
  export PMI_SIZE=$WORLD_SIZE
  export PMI_RANK=$LOCAL_RANK
  # Run sshd in the foreground so mpirun on the launcher can reach this pod.
  /usr/sbin/sshd -De -f /home/mpiuser/.sshd_config
fi
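
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the job itself): the LOCAL_RANK extraction
# above assumes an MPIJob-style naming convention in which the worker replica
# ordinal is the sixth hyphen-separated field of the pod name. The pod name
# below is a hypothetical example of such a convention:
#
#   LOCAL_POD_NAME="bigdl-qlora-alpaca-finetuning-worker-3"
#   cut -d "-" -f6 <<< "$LOCAL_POD_NAME"   # prints "3" -> PMI_RANK=3
#
# If the job or image name contains a different number of hyphens, the field
# index passed to `cut` has to be adjusted to match.
# ---------------------------------------------------------------------------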