* [LLM] Multi-process and distributed QLoRA on CPU platform * Update README.md * Update README.md * Update README.md * Update README.md * enable llm-init and bind to socket * refine * Update Dockerfile * add all files of qlora cpu example to /bigdl * fix * fix k8s * Update bigdl-qlora-finetuing-entrypoint.sh * Update bigdl-qlora-finetuing-entrypoint.sh * Update bigdl-qlora-finetuning-job.yaml * fix train sync and performance issues * add node affinity * disable user to tune cpu per pod * Update bigdl-qlora-finetuning-job.yaml
38 lines
1 KiB
Bash
38 lines
1 KiB
Bash
#!/bin/bash
# Entrypoint setup for BigDL QLoRA fine-tuning on CPU.
# Expects the working tree (training script, optional ./model and
# ./data/english_quotes directories) to be mounted under /bigdl.

# Trace every command — useful when debugging container/k8s runs.
set -x

# Fail fast if the expected image layout is missing; without this the
# script would keep running from the wrong directory.
cd /bigdl || exit 1

# Intel oneAPI / SYCL tuning knobs used by BigDL-LLM.
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1

# Load the oneAPI toolchain, then the BigDL-LLM environment.
# NOTE(review): -t presumably selects the tuned-threading profile of
# bigdl-llm-init — confirm against the BigDL-LLM docs.
source /opt/intel/oneapi/setvars.sh
source bigdl-llm-init -t
# Prefer a locally mounted model directory when one exists; when it does
# not, MODEL_PARAM stays unset and the training script falls back to
# downloading the default model from the Hugging Face Hub.
if [[ -d ./model ]]; then
  MODEL_PARAM="--repo-id-or-model-path ./model"
fi
# Prefer a locally mounted copy of the dataset when one exists; when it
# does not, DATA_PARAM stays unset and the training script falls back to
# downloading the dataset from the Hugging Face Hub.
if [[ -d ./data/english_quotes ]]; then
  DATA_PARAM="--dataset ./data/english_quotes"
fi
# Launch the fine-tune.
# STANDALONE_DOCKER=TRUE -> multi-process training inside a single
# container via Intel MPI + oneCCL; anything else -> plain single-process run.
if [ "${STANDALONE_DOCKER:-}" = "TRUE" ]; then
  # WORKER_COUNT_DOCKER is used as a divisor below; abort with a clear
  # message instead of a bash arithmetic error if it is missing.
  : "${WORKER_COUNT_DOCKER:?WORKER_COUNT_DOCKER must be set when STANDALONE_DOCKER=TRUE}"

  # Split declaration from command substitution so a failing command is
  # not masked by 'export' (ShellCheck SC2155).
  CONTAINER_IP=$(hostname -i)
  export CONTAINER_IP
  CPU_CORES=$(nproc)
  export CPU_CORES

  # oneCCL worker configuration. setvars.sh is already sourced at script
  # startup, so it is intentionally not re-sourced here.
  export CCL_WORKER_COUNT=$WORKER_COUNT_DOCKER
  export CCL_WORKER_AFFINITY=auto
  # The container's own IP is the rendezvous address for all local ranks.
  export MASTER_ADDR=$CONTAINER_IP

  # $MODEL_PARAM / $DATA_PARAM are intentionally unquoted: each expands to
  # either "--flag value" (two words) or nothing at all.
  # shellcheck disable=SC2086
  mpirun \
    -n "$CCL_WORKER_COUNT" \
    -ppn "$CCL_WORKER_COUNT" \
    -genv OMP_NUM_THREADS=$((CPU_CORES / CCL_WORKER_COUNT)) \
    -genv KMP_AFFINITY="granularity=fine,none" \
    -genv KMP_BLOCKTIME=1 \
    -genv TF_ENABLE_ONEDNN_OPTS=1 \
    python qlora_finetuning_cpu.py $MODEL_PARAM $DATA_PARAM
else
  # Single-process fallback (e.g. when run outside the standalone-docker
  # deployment); same intentional word-splitting of the optional params.
  # shellcheck disable=SC2086
  python qlora_finetuning_cpu.py $MODEL_PARAM $DATA_PARAM
fi