From 4e70e339347b7f663dce3c33bfc748e1dfc00ae0 Mon Sep 17 00:00:00 2001
From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com>
Date: Wed, 6 Dec 2023 09:23:17 +0800
Subject: [PATCH] [LLM] code and document for distributed qlora (#9585)

* [LLM] code and document for distributed qlora
* doc
* refine for gradient checkpoint
* refine
* Update alpaca_qlora_finetuning_cpu.py
* Update alpaca_qlora_finetuning_cpu.py
* Update alpaca_qlora_finetuning_cpu.py
* add link in doc
---
 .../llm/finetune/qlora/cpu/docker/Dockerfile  |  2 +-
 .../bigdl-qlora-finetuing-entrypoint.sh       |  7 +++-
 .../finetune/qlora/cpu/kubernetes/README.md   |  2 +-
 .../templates/bigdl-qlora-finetuning-job.yaml |  4 +++
 .../finetune/qlora/cpu/kubernetes/values.yaml |  1 +
 .../example/CPU/QLoRA-FineTuning/README.md    |  2 +-
 .../QLoRA-FineTuning/alpaca-qlora/README.md   |  9 ++++-
 .../alpaca_qlora_finetuning_cpu.py            | 33 +++++++++++--------
 8 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/docker/llm/finetune/qlora/cpu/docker/Dockerfile b/docker/llm/finetune/qlora/cpu/docker/Dockerfile
index 9d907ac0..b5647d9a 100644
--- a/docker/llm/finetune/qlora/cpu/docker/Dockerfile
+++ b/docker/llm/finetune/qlora/cpu/docker/Dockerfile
@@ -32,7 +32,7 @@ RUN mkdir -p /bigdl/data && mkdir -p /bigdl/model && \
     pip install intel_extension_for_pytorch==2.0.100 && \
     pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable && \
     # install huggingface dependencies
-    pip install datasets transformers==4.34.0 && \
+    pip install datasets https://files.pythonhosted.org/packages/9a/06/e4ec2a321e57c03b7e9345d709d554a52c33760e5015fdff0919d9459af0/transformers-4.35.0-py3-none-any.whl && \
     pip install fire peft==0.5.0 && \
     pip install accelerate==0.23.0 && \
     # install basic dependencies
diff --git a/docker/llm/finetune/qlora/cpu/docker/bigdl-qlora-finetuing-entrypoint.sh b/docker/llm/finetune/qlora/cpu/docker/bigdl-qlora-finetuing-entrypoint.sh
index aea90695..d44119e1 100644
--- a/docker/llm/finetune/qlora/cpu/docker/bigdl-qlora-finetuing-entrypoint.sh
+++ b/docker/llm/finetune/qlora/cpu/docker/bigdl-qlora-finetuing-entrypoint.sh
@@ -9,6 +9,10 @@ then
   sed "s/:1/ /g" /etc/mpi/hostfile > /home/mpiuser/hostfile
   sleep 10 # wait for worker pods to be ready
   export ACCELERATE_USE_CPU=True
+  if [ "$ENABLE_GRADIENT_CHECKPOINT" = "true" ]
+  then
+    GRADIENT_CHECKPOINT_PARAM="--gradient_checkpointing"
+  fi
   mpirun \
     -n $WORLD_SIZE \
     -ppn 1 \
@@ -24,7 +28,8 @@ then
     --data_path "/bigdl/data" \
     --output_dir "/home/mpiuser/finetuned_model" \
     --batch_size 128 \
-    --micro_batch_size $MICRO_BATCH_SIZE > /home/mpiuser/launcher.log 2>&1
+    --micro_batch_size $MICRO_BATCH_SIZE \
+    $GRADIENT_CHECKPOINT_PARAM > /home/mpiuser/launcher.log 2>&1
   exit_status=$?
   if [ $exit_status -ne 0 ]; then
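The entrypoint only defines `$GRADIENT_CHECKPOINT_PARAM` when `ENABLE_GRADIENT_CHECKPOINT` is `"true"`, so the launched script sees either a bare `--gradient_checkpointing` flag or nothing at all. A minimal sketch of how such a bare flag becomes the boolean `train()` parameter via python-fire (hypothetical file `demo.py`; standard fire flag parsing assumed, not code from this patch):

```python
# demo.py -- a minimal sketch, not part of this patch.
import fire

def train(base_model: str = "meta-llama/Llama-2-7b-hf",
          gradient_checkpointing: bool = False):
    # fire maps a bare `--gradient_checkpointing` flag to True; when the
    # entrypoint leaves $GRADIENT_CHECKPOINT_PARAM empty, the default
    # False is used instead.
    print(f"gradient_checkpointing: {gradient_checkpointing}")

if __name__ == "__main__":
    fire.Fire(train)  # e.g. `python demo.py --gradient_checkpointing`
```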
diff --git a/docker/llm/finetune/qlora/cpu/kubernetes/README.md b/docker/llm/finetune/qlora/cpu/kubernetes/README.md
index 89c76411..73fb6491 100644
--- a/docker/llm/finetune/qlora/cpu/kubernetes/README.md
+++ b/docker/llm/finetune/qlora/cpu/kubernetes/README.md
@@ -1,4 +1,4 @@
-## Run NF4&BF16-quantized QLoRA Finetuning on Kubernetes with OneCCL
+## Run Distributed QLoRA Fine-Tuning on Kubernetes with OneCCL
 
 ![image](https://github.com/intel-analytics/BigDL/assets/60865256/825f47d9-c864-4f39-a331-adb1e3cb528e)
 
diff --git a/docker/llm/finetune/qlora/cpu/kubernetes/templates/bigdl-qlora-finetuning-job.yaml b/docker/llm/finetune/qlora/cpu/kubernetes/templates/bigdl-qlora-finetuning-job.yaml
index 3aa8b721..2ca952e6 100644
--- a/docker/llm/finetune/qlora/cpu/kubernetes/templates/bigdl-qlora-finetuning-job.yaml
+++ b/docker/llm/finetune/qlora/cpu/kubernetes/templates/bigdl-qlora-finetuning-job.yaml
@@ -37,6 +37,8 @@ spec:
                 value: "bigdl-qlora-finetuning-job-worker-0.bigdl-qlora-finetuning-job-worker"
               - name: DATA_SUB_PATH
                 value: "{{ .Values.dataSubPath }}"
+              - name: ENABLE_GRADIENT_CHECKPOINT
+                value: "{{ .Values.enableGradientCheckpoint }}"
               - name: http_proxy
                 value: "{{ .Values.httpProxy }}"
               - name: https_proxy
@@ -85,6 +87,8 @@ spec:
                 value: "42679"
               - name: MASTER_ADDR
                 value: "bigdl-qlora-finetuning-job-worker-0.bigdl-qlora-finetuning-job-worker"
+              - name: ENABLE_GRADIENT_CHECKPOINT
+                value: "{{ .Values.enableGradientCheckpoint }}"
              - name: http_proxy
                value: "{{ .Values.httpProxy }}"
              - name: https_proxy
diff --git a/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml b/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml
index 3ecbe1f4..c6066c22 100644
--- a/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml
+++ b/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml
@@ -1,6 +1,7 @@
 imageName: intelanalytics/bigdl-llm-finetune-qlora-cpu:2.5.0-SNAPSHOT
 trainerNum: 2
 microBatchSize: 8
+enableGradientCheckpoint: false # true saves memory but increases latency
 nfsServerIp: your_nfs_server_ip
 nfsPath: a_nfs_shared_folder_path_on_the_server
 dataSubPath: alpaca_data_cleaned_archive.json # a subpath of the data file under nfs directory
diff --git a/python/llm/example/CPU/QLoRA-FineTuning/README.md b/python/llm/example/CPU/QLoRA-FineTuning/README.md
index 02b482c9..37c34b14 100644
--- a/python/llm/example/CPU/QLoRA-FineTuning/README.md
+++ b/python/llm/example/CPU/QLoRA-FineTuning/README.md
@@ -7,7 +7,7 @@ This example demonstrates how to finetune a llama2-7b model using Big-LLM 4bit o
 
 1. Single node with single socket: [simple example](https://github.com/intel-analytics/BigDL/tree/main/python/llm/example/CPU/QLoRA-FineTuning#example-finetune-llama2-7b-using-qlora) or [alpaca example](https://github.com/intel-analytics/BigDL/tree/main/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora)
 2. [Single node with multiple sockets](https://github.com/intel-analytics/BigDL/tree/main/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora#guide-to-finetuning-qlora-on-one-node-with-multiple-sockets)
-3. multiple nodes with multiple sockets
+3. [Multiple nodes with multiple sockets](https://github.com/intel-analytics/BigDL/blob/main/docker/llm/finetune/qlora/cpu/kubernetes/README.md)
 
 ## Example: Finetune llama2-7b using QLoRA
 
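The new `enableGradientCheckpoint` value trades compute for memory: activations are recomputed during the backward pass instead of being kept alive, which is why the comment warns about latency. A generic PyTorch sketch of the mechanism (illustrative only, not taken from the patched files):

```python
import torch
from torch.utils.checkpoint import checkpoint

layer = torch.nn.Sequential(torch.nn.Linear(1024, 1024), torch.nn.ReLU())
x = torch.randn(8, 1024, requires_grad=True)

# Checkpointed forward: activations inside `layer` are dropped after use and
# recomputed on backward, lowering peak memory at the cost of extra compute.
y = checkpoint(layer, x, use_reentrant=False)
y.sum().backward()
```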
diff --git a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/README.md b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/README.md
index 42e5a959..0d19f36f 100644
--- a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/README.md
+++ b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/README.md
@@ -126,4 +126,11 @@ need to modify the [tokenization_baichuan.py](https://huggingface.co/baichuan-in
 from transformers import AutoTokenizer  # noqa: F402
 tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
 base_model = AutoModelForCausalLM.from_pretrained(base_model,trust_remote_code=True)
-```
\ No newline at end of file
+```
+
+
+### 4. Finetuning in Docker and on multiple nodes (k8s)
+
+If you want to run multi-process fine-tuning, or prefer not to install the above dependencies manually, we provide a Docker solution that quickly starts fine-tuning in a single container. Please refer to [here](https://github.com/intel-analytics/BigDL/tree/main/docker/llm/finetune/qlora/cpu/docker#fine-tune-llm-with-bigdl-llm-container).
+
+Moreover, for users with multiple CPU servers, e.g. Xeon series such as SPR and ICX, we provide a k8s distributed solution that lets machines and processor sockets collaborate with one click. Please refer to [here](https://github.com/intel-analytics/BigDL/blob/main/docker/llm/finetune/qlora/cpu/kubernetes/README.md) for how to run QLoRA on k8s.
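The script diff below moves rank and world-size resolution inside `train()` and branches on `LOCAL_POD_NAME` to detect the k8s/MPI launcher. A distilled, standalone sketch of that selection logic (reads the same environment variables as the patch and mirrors its behavior; a sketch, not a drop-in replacement):

```python
import os

def resolve_world_size() -> int:
    """Mirrors the patched logic in alpaca_qlora_finetuning_cpu.py."""
    if os.environ.get("LOCAL_POD_NAME", "") != "":  # k8s distributed mode
        pmi_world_size = int(os.environ.get("PMI_SIZE", -1))
        if pmi_world_size > 0:
            # Propagate the MPI-provided size to torch/transformers.
            os.environ["WORLD_SIZE"] = str(pmi_world_size)
        return 1 if pmi_world_size == 0 else pmi_world_size
    # Standalone mode: consult torchrun/MPI style variables directly.
    return int(os.environ.get("WORLD_SIZE", os.environ.get("PMI_SIZE", "1")))

print(f"world_size: {resolve_world_size()}")
```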
diff --git a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py
index dc96c166..540abf78 100644
--- a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py
+++ b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py
@@ -61,14 +61,6 @@ def get_int_from_env(env_keys, default):
             return val
     return default
 
-local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0")
-world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1")
-port = get_int_from_env(["MASTER_PORT"], 29500)
-os.environ["LOCAL_RANK"] = str(local_rank)
-os.environ["WORLD_SIZE"] = str(world_size)
-os.environ["RANK"] = str(local_rank)
-os.environ["MASTER_PORT"] = str(port)
-
 def train(
     # model/data params
     base_model: str = "meta-llama/Llama-2-7b-hf",  # the only required argument, default to be "meta-llama/Llama-2-7b-hf"
@@ -134,6 +126,7 @@ def train(
         f"wandb_log_model: {wandb_log_model}\n"
         f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
         f"prompt template: {prompt_template_name}\n"
+        f"gradient_checkpointing: {gradient_checkpointing}\n"
     )
     assert (
         base_model
@@ -143,7 +136,21 @@ def train(
     prompter = Prompter(prompt_template_name)
 
     device_map = "auto"
-    world_size = int(os.environ.get("WORLD_SIZE", 1))
+    if os.environ.get("LOCAL_POD_NAME", "") != "":  # K8S dist
+        pmi_world_size = int(os.environ.get('PMI_SIZE', -1))
+        if pmi_world_size > 0:
+            os.environ['WORLD_SIZE'] = str(pmi_world_size)
+        world_size = 1 if pmi_world_size == 0 else pmi_world_size
+    else:  # Standalone (centralized or multi-process)
+        local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0")
+        world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1")
+        port = get_int_from_env(["MASTER_PORT"], 29500)
+        os.environ["LOCAL_RANK"] = str(local_rank)
+        os.environ["WORLD_SIZE"] = str(world_size)
+        os.environ["RANK"] = str(local_rank)
+        os.environ["MASTER_PORT"] = str(port)
+
+    print(f"world_size: {world_size}")
     ddp = world_size != 1
     if ddp:
         device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
@@ -176,7 +183,6 @@ def train(
         load_in_low_bit="sym_int4",  # not support "nf4"
         optimize_model=False,
         torch_dtype=torch.bfloat16,
-        # device_map=device_map,
         modules_to_not_convert=["lm_head"],
     )
     print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}")
@@ -190,7 +196,7 @@ def train(
         0  # unk. we want this to be different from the eos token
     )
     tokenizer.padding_side = "left"  # Allow batched inference
-
+    print(model)
 
     def tokenize(prompt, add_eos_token=True):
@@ -322,11 +328,9 @@ def train(
         report_to="wandb" if use_wandb else None,
         run_name=wandb_run_name if use_wandb else None,
         gradient_checkpointing=gradient_checkpointing,
+        gradient_checkpointing_kwargs={"use_reentrant": False} if gradient_checkpointing else None,
         ddp_backend="ccl" if ddp else None,
     )
-    if ddp:
-        from accelerate.state import PartialState
-        args.distributed_state = PartialState(cpu=True, backend=args.ddp_backend)
 
     trainer = transformers.Trainer(
         model=model,
@@ -351,3 +355,4 @@ def train(
 
 if __name__ == "__main__":
     fire.Fire(train)
+
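The Dockerfile's move from transformers 4.34.0 to a 4.35.0 wheel and the new `gradient_checkpointing_kwargs` argument likely go together: older releases of `TrainingArguments` do not accept that keyword. A minimal usage sketch (hypothetical `output_dir`; assumes transformers >= 4.35.0):

```python
import transformers

# Assumes transformers >= 4.35.0; earlier releases reject the
# `gradient_checkpointing_kwargs` keyword with a TypeError.
args = transformers.TrainingArguments(
    output_dir="./out",  # hypothetical path
    gradient_checkpointing=True,
    # Non-reentrant checkpointing avoids re-entrant autograd limitations
    # (e.g. under DDP) and matches what the patched script passes when
    # --gradient_checkpointing is set.
    gradient_checkpointing_kwargs={"use_reentrant": False},
)
print(args.gradient_checkpointing_kwargs)
```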