ipex-llm/docker/llm/finetune/qlora/cpu/docker/start-qlora-finetuning-on-cpu.sh
Heyang Sun 74fd7077a2 [LLM] Multi-process and distributed QLoRA on CPU platform (#9491)
* [LLM] Multi-process and distributed QLoRA on CPU platform

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* enable llm-init and bind to socket

* refine

* Update Dockerfile

* add all files of qlora cpu example to /bigdl

* fix

* fix k8s

* Update bigdl-qlora-finetuing-entrypoint.sh

* Update bigdl-qlora-finetuing-entrypoint.sh

* Update bigdl-qlora-finetuning-job.yaml

* fix train sync and performance issues

* add node affinity

* disable user to tune cpu per pod

* Update bigdl-qlora-finetuning-job.yaml
2023-12-01 13:47:19 +08:00

38 lines
1 KiB
Bash

#!/bin/bash
# Entry point for QLoRA finetuning on CPU inside the Docker image.
#
# Environment variables:
#   STANDALONE_DOCKER    - "TRUE" to run multi-process finetuning via mpirun
#                          inside a single container; any other value runs a
#                          single python process.
#   WORKER_COUNT_DOCKER  - number of CCL workers / MPI ranks; required when
#                          STANDALONE_DOCKER=TRUE.
#
# Optional local overrides (paths relative to /bigdl):
#   ./model                - local model dir; otherwise the python script
#                            downloads the model from the HF repo.
#   ./data/english_quotes  - local dataset; otherwise downloaded from the
#                            HF dataset hub.
set -x
# Fail fast if the working directory is missing instead of running elsewhere.
cd /bigdl || exit 1

# NOTE(review): these look like GPU (XeTLA / Level Zero) switches; presumably
# inert on a CPU-only run — confirm whether they are still needed here.
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1

source /opt/intel/oneapi/setvars.sh
source bigdl-llm-init -t

# Build optional CLI arguments as arrays: an empty array expands to nothing,
# and array expansion keeps paths intact without relying on word-splitting.
MODEL_PARAM=()
if [ -d "./model" ]; then
  MODEL_PARAM=(--repo-id-or-model-path ./model) # otherwise, default to download from HF repo
fi
DATA_PARAM=()
if [ -d "./data/english_quotes" ]; then
  DATA_PARAM=(--dataset ./data/english_quotes) # otherwise, default to download from HF dataset
fi

if [ "$STANDALONE_DOCKER" = "TRUE" ]
then
  # Guard the division below: an unset/empty worker count would make the
  # OMP_NUM_THREADS arithmetic fail (or divide by zero).
  : "${WORKER_COUNT_DOCKER:?WORKER_COUNT_DOCKER must be set when STANDALONE_DOCKER=TRUE}"
  # Split declaration from command substitution so a failing command
  # is not masked by the export's own exit status.
  CONTAINER_IP=$(hostname -i)
  export CONTAINER_IP
  CPU_CORES=$(nproc)
  export CPU_CORES
  # oneAPI environment was already sourced above; no need to source it twice.
  export CCL_WORKER_COUNT=$WORKER_COUNT_DOCKER
  export CCL_WORKER_AFFINITY=auto
  export MASTER_ADDR=$CONTAINER_IP
  # Pin each rank to CPU_CORES/ranks OpenMP threads so ranks do not oversubscribe.
  mpirun \
    -n "$CCL_WORKER_COUNT" \
    -ppn "$CCL_WORKER_COUNT" \
    -genv OMP_NUM_THREADS=$((CPU_CORES / CCL_WORKER_COUNT)) \
    -genv KMP_AFFINITY="granularity=fine,none" \
    -genv KMP_BLOCKTIME=1 \
    -genv TF_ENABLE_ONEDNN_OPTS=1 \
    python qlora_finetuning_cpu.py "${MODEL_PARAM[@]}" "${DATA_PARAM[@]}"
else
  python qlora_finetuning_cpu.py "${MODEL_PARAM[@]}" "${DATA_PARAM[@]}"
fi