# Launch deepspeed_autotp.py as NUM_GPUS local worker processes via torchrun,
# splitting the available CPU threads evenly between the workers.
#
# Environment:
#   OMP_NUM_THREADS  (optional) total thread budget to divide across workers;
#                    when unset or not a plain decimal integer, the CPU count
#                    reported by `nproc` is used instead.

# Pull in the BigDL-LLM environment setup.
# NOTE(review): the meaning of -t / -g is defined by bigdl-llm-init, not here.
source bigdl-llm-init -t -g
export MASTER_ADDR=127.0.0.1
# NOTE(review): presumably selects the oneCCL IPC exchange mechanism — confirm
# against the oneCCL environment-variable documentation.
export CCL_ZE_IPC_EXCHANGE=sockets
NUM_GPUS=4

#######################################
# Compute the per-worker thread count.
# Arguments: $1 - total thread budget (decimal integer)
#            $2 - number of workers (decimal integer, > 0)
# Outputs:   the per-worker share on stdout, never less than 1
#######################################
_threads_per_rank() {
    # 10# forces base 10 so a value such as "08" is not parsed as octal.
    local share=$(( 10#$1 / 10#$2 ))
    # Integer division yields 0 when there are fewer threads than workers;
    # clamp so each worker is still given one thread.
    if (( share < 1 )); then
        share=1
    fi
    printf '%s\n' "$share"
}

# Only trust an inherited OMP_NUM_THREADS if it is a plain decimal number;
# anything else would be misread or rejected by shell arithmetic.
if [[ "${OMP_NUM_THREADS:-}" =~ ^[0-9]+$ ]]; then
    total_threads=$OMP_NUM_THREADS
else
    total_threads=$(nproc)
fi
OMP_NUM_THREADS=$(_threads_per_rank "$total_threads" "$NUM_GPUS")
export OMP_NUM_THREADS

# Single-node launch: one worker process per GPU.
torchrun --standalone \
         --nnodes=1 \
         --nproc-per-node "$NUM_GPUS" \
         deepspeed_autotp.py --repo-id-or-model-path "meta-llama/Llama-2-7b-hf"