[LLM] Multi-process and distributed QLoRA on CPU platform (#9491)

* [LLM] Multi-process and distributed QLoRA on CPU platform

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* enable llm-init and bind to socket

* refine

* Update Dockerfile

* add all files of qlora cpu example to /bigdl

* fix

* fix k8s

* Update bigdl-qlora-finetuing-entrypoint.sh

* Update bigdl-qlora-finetuing-entrypoint.sh

* Update bigdl-qlora-finetuning-job.yaml

* fix train sync and performance issues

* add node affinity

* disable user to tune cpu per pod

* Update bigdl-qlora-finetuning-job.yaml
Heyang Sun 2023-12-01 13:47:19 +08:00 committed by GitHub
parent ed0dc57c6e
commit 74fd7077a2
11 changed files with 368 additions and 28 deletions

View file

@ -1,19 +1,43 @@
FROM intel/oneapi-basekit:2023.2.1-devel-ubuntu22.04
FROM ubuntu:20.04 as key-getter
ARG http_proxy
ARG https_proxy
RUN apt-get update && \
apt-get install -y curl gpg && \
curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB | gpg --dearmor | tee /root/intel-oneapi-archive-keyring.gpg
FROM mpioperator/intel as builder
ARG http_proxy
ARG https_proxy
ENV TZ=Asia/Shanghai
ARG PIP_NO_CACHE_DIR=false
ENV TRANSFORMERS_COMMIT_ID=95fe0f5
# retrieve oneapi repo public key
RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " > /etc/apt/sources.list.d/oneAPI.list
# add public key
COPY --from=key-getter /root/intel-oneapi-archive-keyring.gpg /usr/share/keyrings/intel-oneapi-archive-keyring.gpg
RUN echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " > /etc/apt/sources.list.d/oneAPI.list
# update dependencies
RUN apt-get update && \
# install basic dependencies
RUN mkdir -p /bigdl/data && mkdir -p /bigdl/model && \
# install pytorch 2.1.0
apt-get update && \
apt-get install -y python3-pip python3.9-dev python3-wheel git software-properties-common && \
pip3 install --upgrade pip && \
export PIP_DEFAULT_TIMEOUT=100 && \
pip install --upgrade torch==2.1.0 --index-url https://download.pytorch.org/whl/cpu && \
# install CPU bigdl-llm
pip3 install --pre --upgrade bigdl-llm[all] -i https://pypi.tuna.tsinghua.edu.cn/simple/ && \
# install ipex and oneccl
pip install intel_extension_for_pytorch==2.0.100 && \
pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable && \
# install huggingface dependencies
pip install datasets transformers==4.34.0 && \
pip install fire peft==0.5.0 && \
pip install accelerate==0.23.0 && \
# install basic dependencies
apt-get install -y curl wget git gnupg gpg-agent software-properties-common libunwind8-dev vim less && \
# install python 3.9
# install python 3.9
ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
env DEBIAN_FRONTEND=noninteractive apt-get update && \
add-apt-repository ppa:deadsnakes/ppa -y && \
@ -22,18 +46,29 @@ RUN apt-get update && \
ln -s /usr/bin/python3.9 /usr/bin/python3 && \
ln -s /usr/bin/python3 /usr/bin/python && \
apt-get install -y python3-pip python3.9-dev python3-wheel python3.9-distutils && \
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
# install OpenSSH for MPI to communicate between containers
apt-get install -y --no-install-recommends openssh-client openssh-server && \
mkdir -p /var/run/sshd && \
# allow OpenSSH to talk to containers without asking for confirmation
# by disabling StrictHostKeyChecking.
# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need
# to disable UserKnownHostsFile to avoid write permissions.
# disabling StrictModes avoids directory and files read permission checks.
sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
# add bigdl-llm qlora cpu example
cd /bigdl && \
git clone https://github.com/intel-analytics/BigDL.git && \
mv BigDL/python/llm/example/CPU/QLoRA-FineTuning/* . && \
rm -r BigDL
# for docker directly run example
COPY ./start-qlora-finetuning-on-cpu.sh /bigdl/start-qlora-finetuning-on-cpu.sh
# for k8s
COPY ./bigdl-qlora-finetuing-entrypoint.sh /bigdl/bigdl-qlora-finetuing-entrypoint.sh
# install torch and oneccl to reduce bigdl-llm size
RUN pip3 install --upgrade pip && \
export PIP_DEFAULT_TIMEOUT=100 && \
pip install --upgrade torch==2.0.1 --index-url https://download.pytorch.org/whl/cpu && \
pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable && \
# install CPU bigdl-llm
pip install --pre --upgrade bigdl-llm[all] -i https://pypi.tuna.tsinghua.edu.cn/simple/ && \
# install huggingface dependencies
pip install transformers==4.34.0 && \
pip install peft==0.5.0 datasets
ADD ./qlora_finetuning_cpu.py /qlora_finetuning_cpu.py
ADD ./start-qlora-finetuning-on-cpu.sh /start-qlora-finetuning-on-cpu.sh
RUN chown -R mpiuser /bigdl
USER mpiuser
ENTRYPOINT ["/bin/bash"]

View file

@ -38,8 +38,8 @@ docker run -itd \
--name=bigdl-llm-fintune-qlora-cpu \
-e http_proxy=${HTTP_PROXY} \
-e https_proxy=${HTTPS_PROXY} \
-v $BASE_MODE_PATH:/model \
-v $DATA_PATH:/data/english_quotes \
-v $BASE_MODE_PATH:/bigdl/model \
-v $DATA_PATH:/bigdl/data/english_quotes \
intelanalytics/bigdl-llm-finetune-qlora-cpu:2.4.0-SNAPSHOT
```
@ -59,7 +59,7 @@ docker run -itd \
However, we do recommend handling them manually, because the automatic download can be blocked by network restrictions or Hugging Face authentication depending on your environment, and the manual method also lets you fine-tune in a custom way (with a different base model and dataset).
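For example, a manual preparation could look like the sketch below (the exact Hugging Face repositories are assumptions based on this example's defaults; adjust them to your base model and dataset):
```bash
# base model (Llama 2 requires approved access on Hugging Face); needs git-lfs
git lfs install
git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf

# fine-tuning data used by this example
git clone https://huggingface.co/datasets/Abirate/english_quotes

# then point BASE_MODE_PATH and DATA_PATH in the docker run command above to these folders
```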
### 3. Start Fine-Tuning
### 3. Start Fine-Tuning (Local Mode)
Enter the running container:
@ -70,9 +70,7 @@ docker exec -it bigdl-llm-fintune-qlora-cpu bash
Then, start QLoRA fine-tuning:
If the machine does not have enough memory, you can try setting `use_gradient_checkpointing=True`.
Also remember to source `bigdl-llm-init` before you start fine-tuning, which can accelerate the job.
```bash
source bigdl-llm-init -t
bash start-qlora-finetuning-on-cpu.sh
```
@ -127,3 +125,32 @@ Inference time: xxx s
-------------------- Output --------------------
“QLoRA fine-tuning using BigDL-LLM 4bit optimizations on Intel CPU is Efficient and convenient” ->: ['bigdl'] ['deep-learning'] ['distributed-computing'] ['intel'] ['optimization'] ['training'] ['training-speed']
```
### 4. Start Multi-Process Fine-Tuning in One Docker Container
<img src="https://github.com/Uxito-Ada/BigDL/assets/60865256/f25c43b3-2b24-4476-a0fe-804c0ef3c36c" height="240px"><br>
Multi-process parallelism enables higher performance for QLoRA fine-tuning; for example, a Xeon server with a multi-socket architecture is well suited to running one QLoRA instance on each socket. This can be done by simply invoking two or more OneCCL instances inside the BigDL QLoRA docker container:
```bash
docker run -itd \
--name=bigdl-llm-fintune-qlora-cpu \
--cpuset-cpus="your_expected_range_of_cpu_numbers" \
-e STANDALONE_DOCKER=TRUE \
-e WORKER_COUNT_DOCKER=your_worker_count \
-v your_downloaded_base_model_path:/bigdl/model \
-v your_downloaded_data_path:/bigdl/data/alpaca_data_cleaned_archive.json \
intelanalytics/bigdl-llm-finetune-qlora-cpu:2.5.0-SNAPSHOT
```
Note that `STANDALONE_DOCKER` is set to **TRUE** here.
Then, follow the same steps as above to enter the docker container and start fine-tuning:
```bash
bash start-qlora-finetuning-on-cpu.sh
```
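To confirm that multiple ranks are actually running, one simple check (a sketch; the container name follows the `docker run` example above) is to list the training processes from the host:
```bash
# expect WORKER_COUNT_DOCKER python training processes plus the mpirun launcher
docker exec bigdl-llm-fintune-qlora-cpu bash -c "ps -ef | grep qlora_finetuning_cpu.py | grep -v grep"
```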
### 5. Start Distributed Fine-Tuning on Kubernetes
Besides multi-process mode, you can also run QLoRA on a Kubernetes cluster. Please refer to [here](https://github.com/intel-analytics/BigDL/blob/main/docker/llm/finetune/qlora/cpu/kubernetes/README.md).

View file

@ -0,0 +1,46 @@
#!/bin/bash
# this is to run alpaca qlora on k8s
set -x
source /opt/intel/oneapi/setvars.sh
export CCL_WORKER_COUNT=$WORLD_SIZE
source bigdl-llm-init -t
if [ "$WORKER_ROLE" = "launcher" ]
then
sed "s/:1/ /g" /etc/mpi/hostfile > /home/mpiuser/hostfile
sleep 10 # wait for worker pods to be ready
export ACCELERATE_USE_CPU=True
mpirun \
-n $WORLD_SIZE \
-ppn 1 \
-f /home/mpiuser/hostfile \
-iface eth0 \
--bind-to socket \
-genv OMP_NUM_THREADS=48 \
-genv KMP_AFFINITY="granularity=fine,none" \
-genv KMP_BLOCKTIME=1 \
-genv TF_ENABLE_ONEDNN_OPTS=1 \
python /bigdl/alpaca-qlora/alpaca_qlora_finetuning_cpu.py \
--base_model '/bigdl/model' \
--data_path "/bigdl/data" \
--output_dir "/home/mpiuser/finetuned_model" \
--batch_size 128 \
--micro_batch_size $MICRO_BATCH_SIZE > /home/mpiuser/launcher.log 2>&1
exit_status=$?
if [ $exit_status -ne 0 ];
then
cat /home/mpiuser/launcher.log
exit $exit_status
else
while true
do
echo "[INFO] Successfully finished fine-tuning"
sleep 900
done
fi
elif [ "$WORKER_ROLE" = "trainer" ]
then
export LOCAL_RANK=$(cut -d "-" -f6 <<< "$LOCAL_POD_NAME")
export PMI_SIZE=$WORLD_SIZE
export PMI_RANK=$LOCAL_RANK
/usr/sbin/sshd -De -f /home/mpiuser/.sshd_config
fi

View file

@ -1,8 +1,10 @@
#!/bin/bash
set -x
cd /bigdl
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
source /opt/intel/oneapi/setvars.sh
source bigdl-llm-init -t
if [ -d "./model" ];
then
@ -14,5 +16,23 @@ then
DATA_PARAM="--dataset ./data/english_quotes" # otherwise, default to download from HF dataset
fi
python qlora_finetuning_cpu.py $MODEL_PARAM $DATA_PARAM
if [ "$STANDALONE_DOCKER" = "TRUE" ]
then
export CONTAINER_IP=$(hostname -i)
export CPU_CORES=$(nproc)
source /opt/intel/oneapi/setvars.sh
export CCL_WORKER_COUNT=$WORKER_COUNT_DOCKER
export CCL_WORKER_AFFINITY=auto
export MASTER_ADDR=$CONTAINER_IP
mpirun \
-n $CCL_WORKER_COUNT \
-ppn $CCL_WORKER_COUNT \
-genv OMP_NUM_THREADS=$((CPU_CORES / CCL_WORKER_COUNT)) \
-genv KMP_AFFINITY="granularity=fine,none" \
-genv KMP_BLOCKTIME=1 \
-genv TF_ENABLE_ONEDNN_OPTS=1 \
python qlora_finetuning_cpu.py $MODEL_PARAM $DATA_PARAM
else
python qlora_finetuning_cpu.py $MODEL_PARAM $DATA_PARAM
fi

View file

@ -0,0 +1,6 @@
apiVersion: v2
name: bigdl-fintune-service
description: A Helm chart for BigDL Finetune Service on Kubernetes
type: application
version: 1.1.27
appVersion: "1.16.0"

View file

@ -0,0 +1,55 @@
## Run NF4&BF16-quantized QLoRA Finetuning on Kubernetes with OneCCL
![image](https://github.com/intel-analytics/BigDL/assets/60865256/825f47d9-c864-4f39-a331-adb1e3cb528e)
BigDL here provides a CPU optimization to accelerate QLoRA fine-tuning of Llama2-7b through mixed-precision and distributed training. Specifically, [Intel OneCCL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/oneccl.html), an available Hugging Face backend, speeds up PyTorch computation with the BF16 data type on CPUs, while [Intel MPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) enables parallel processing on Kubernetes. Moreover, the advanced quantization of BigDL-LLM is applied to improve memory utilization, which makes large-scale fine-tuning on CPU possible, with NF4 model storage at runtime and BF16 as the compute type.
The overall architecture is illustrated in the figure above.
As shown above, BigDL implements its MPI training with the [Kubeflow MPI operator](https://github.com/kubeflow/mpi-operator/tree/master), which encapsulates the deployment as an MPIJob CRD and helps users construct an MPI worker cluster on Kubernetes, including public key distribution, SSH connection setup, and log collection.
Now, let's deploy a QLoRA fine-tuning job to create a new LLM from Llama2-7b.
**Note: Please make sure you already have an available Kubernetes cluster and NFS shared storage, and have installed the [Helm CLI](https://helm.sh/docs/helm/helm_install/) for Kubernetes job submission.**
### 1. Install Kubeflow MPI Operator
Follow [here](https://github.com/kubeflow/mpi-operator/tree/master#installation) to install the Kubeflow MPI operator in your Kubernetes cluster, which will listen for and receive the following MPIJob requests at the backend.
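For reference, a typical installation and sanity check might look like the following (a sketch; confirm the exact manifest URL and version against the linked instructions):
```bash
# install the MPIJob CRD and the operator controller
kubectl apply --server-side -f https://raw.githubusercontent.com/kubeflow/mpi-operator/master/deploy/v2beta1/mpi-operator.yaml

# verify the CRD is registered and the operator pod is running
kubectl get crd mpijobs.kubeflow.org
kubectl get pods -n mpi-operator
```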
### 2. Download Image, Base Model and Finetuning Data
Follow [here](https://github.com/intel-analytics/BigDL/tree/main/docker/llm/finetune/qlora/cpu/docker#1-prepare-docker-image) to prepare the BigDL QLoRA fine-tuning image in your cluster.
As fine-tuning starts from a base model, first download the [Llama2-7b model from Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). Then, download the [cleaned alpaca data](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json), which covers a wide range of general knowledge and has already been cleaned. Next, move the downloaded files to a shared directory on your NFS server.
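As a concrete sketch (the NFS mount point `/mnt/nfs/bigdl-qlora` is an assumption; use your own shared directory, and keep the file names consistent with `dataSubPath` and `modelSubPath` in `values.yaml`):
```bash
# cleaned alpaca data (URL from above)
wget https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json

# copy the data file and the downloaded base model folder to the NFS shared directory
cp alpaca_data_cleaned_archive.json /mnt/nfs/bigdl-qlora/
cp -r Llama-2-7b-chat-hf /mnt/nfs/bigdl-qlora/
```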
### 3. Deploy through Helm Chart
You can edit and experiment with different parameters in `./kubernetes/values.yaml` to improve fine-tuning performance and accuracy. For example, you can adjust `trainerNum` and `cpuPerPod` according to the number of nodes and CPU cores in your cluster to make full use of these resources, and different `microBatchSize` values result in different training speeds and losses (note that `microBatchSize` × `trainerNum` should not exceed 128, as 128 is the global batch size); a worked example is sketched below.
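As a quick illustration of that constraint (assuming the usual alpaca-lora accumulation scheme, with the global `batch_size` fixed at 128 as in the entrypoint script above):
```bash
# illustrative values only
trainerNum=2
microBatchSize=8
# gradient accumulation steps per trainer = 128 / (microBatchSize * trainerNum)
echo $(( 128 / (microBatchSize * trainerNum) ))   # prints 8
```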
**Note: `dataSubPath` and `modelSubPath` need to have the same names as files under the NFS directory in step 2.**
After preparing parameters in `./kubernetes/values.yaml`, submit the job as below:
```bash
cd ./kubernetes
helm install bigdl-qlora-finetuning .
```
### 4. Check Deployment
```bash
kubectl get all -n bigdl-qlora-finetuning # you will see launcher and worker pods running
```
### 5. Check Finetuning Process
After a successful deployment, you can find the launcher pod, enter it, and check the logs collected from all workers.
```bash
kubectl get all -n bigdl-qlora-finetuning # you will see a launcher pod
kubectl exec -it <launcher_pod_name> bash -n bigdl-qlora-finetuning # enter launcher pod
cat launcher.log # display logs collected from other workers
```
From the log, you can see whether the fine-tuning process has been invoked successfully in all MPI worker pods; a progress bar with fine-tuning speed and estimated time will be shown after the data preprocessing steps (which may take quite a while).
The fine-tuned model is written by worker 0 (which holds rank 0), so you can find the model output inside that pod and copy it to the host with tools like `kubectl cp` or `scp`.
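For example (the worker pod name is derived from the MPIJob naming convention shown in the YAML above; list your pods to confirm it):
```bash
# copy the fine-tuned model from the rank-0 worker pod to the local machine
kubectl cp bigdl-qlora-finetuning/bigdl-qlora-finetuning-job-worker-0:/home/mpiuser/finetuned_model ./finetuned_model
```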

View file

@ -0,0 +1,111 @@
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
name: bigdl-qlora-finetuning-job
namespace: bigdl-qlora-finetuning
spec:
slotsPerWorker: 1
runPolicy:
cleanPodPolicy: Running
sshAuthMountPath: /home/mpiuser/.ssh
mpiImplementation: Intel
mpiReplicaSpecs:
Launcher:
replicas: 1
template:
spec:
volumes:
- name: nfs-storage
persistentVolumeClaim:
claimName: nfs-pvc
containers:
- image: {{ .Values.imageName }}
name: bigdl-qlora-finetuning-launcher
securityContext:
runAsUser: 1000
command: ['sh' , '-c', 'bash /bigdl/bigdl-qlora-finetuing-entrypoint.sh']
env:
- name: WORKER_ROLE
value: "launcher"
- name: WORLD_SIZE
value: "{{ .Values.trainerNum }}"
- name: MICRO_BATCH_SIZE
value: "{{ .Values.microBatchSize }}"
- name: MASTER_PORT
value: "42679"
- name: MASTER_ADDR
value: "bigdl-qlora-finetuning-job-worker-0.bigdl-qlora-finetuning-job-worker"
- name: DATA_SUB_PATH
value: "{{ .Values.dataSubPath }}"
- name: http_proxy
value: "{{ .Values.httpProxy }}"
- name: https_proxy
value: "{{ .Values.httpsProxy }}"
- name: LOCAL_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
volumeMounts:
- name: nfs-storage
subPath: {{ .Values.modelSubPath }}
mountPath: /bigdl/model
- name: nfs-storage
subPath: {{ .Values.dataSubPath }}
mountPath: "/bigdl/data/{{ .Values.dataSubPath }}"
Worker:
replicas: {{ .Values.trainerNum }}
template:
spec:
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: training.kubeflow.org/job-role
operator: In
values:
- worker
topologyKey: kubernetes.io/hostname
containers:
- image: {{ .Values.imageName }}
name: bigdl-qlora-finetuning-worker
securityContext:
runAsUser: 1000
command: ['sh' , '-c', 'bash /bigdl/bigdl-qlora-finetuing-entrypoint.sh']
env:
- name: WORKER_ROLE
value: "trainer"
- name: WORLD_SIZE
value: "{{ .Values.trainerNum }}"
- name: MICRO_BATCH_SIZE
value: "{{ .Values.microBatchSize }}"
- name: MASTER_PORT
value: "42679"
- name: MASTER_ADDR
value: "bigdl-qlora-finetuning-job-worker-0.bigdl-qlora-finetuning-job-worker"
- name: http_proxy
value: "{{ .Values.httpProxy }}"
- name: https_proxy
value: "{{ .Values.httpsProxy }}"
- name: LOCAL_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
volumeMounts:
- name: nfs-storage
subPath: {{ .Values.modelSubPath }}
mountPath: /bigdl/model
- name: nfs-storage
subPath: {{ .Values.dataSubPath }}
mountPath: "/bigdl/data/{{ .Values.dataSubPath }}"
resources:
requests:
cpu: 48
limits:
cpu: 48
volumes:
- name: nfs-storage
persistentVolumeClaim:
claimName: nfs-pvc

View file

@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: bigdl-qlora-finetuning

View file

@ -0,0 +1,15 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: nfs-pv-bigdl-qlora-finetuning
namespace: bigdl-qlora-finetuning
spec:
capacity:
storage: 15Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
storageClassName: nfs
nfs:
path: {{ .Values.nfsPath }}
server: {{ .Values.nfsServerIp }}

View file

@ -0,0 +1,12 @@
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: nfs-pvc
namespace: bigdl-qlora-finetuning
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
storageClassName: nfs

View file

@ -0,0 +1,9 @@
imageName: intelanalytics/bigdl-llm-finetune-qlora-cpu:2.5.0-SNAPSHOT
trainerNum: 2
microBatchSize: 8
nfsServerIp: your_nfs_server_ip
nfsPath: a_nfs_shared_folder_path_on_the_server
dataSubPath: alpaca_data_cleaned_archive.json # a subpath of the data file under nfs directory
modelSubPath: Llama-2-7b-chat-hf # a subpath of the model file (dir) under nfs directory
httpProxy: "your_http_proxy_like_http://xxx:xxxx_if_needed_else_empty"
httpsProxy: "your_https_proxy_like_http://xxx:xxxx_if_needed_else_empty"