From 1a2129221d3bb549c6a2e9fe761c10c135e6a782 Mon Sep 17 00:00:00 2001
From: binbin Deng <108676127+plusbang@users.noreply.github.com>
Date: Wed, 22 Nov 2023 13:49:14 +0800
Subject: [PATCH] LLM: support resume from checkpoint in Alpaca QLoRA (#9502)

---
 .../QLoRA-FineTuning/alpaca-qlora/README.md   | 13 ++++++++++--
 .../alpaca-qlora/alpaca_qlora_finetuning.py   | 21 +------------------
 2 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/README.md b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/README.md
index 5e4e5cae..b8695f74 100644
--- a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/README.md
+++ b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/README.md
@@ -13,8 +13,8 @@ conda activate llm
 # below command will install intel_extension_for_pytorch==2.0.110+xpu as default
 # you can install specific ipex/torch version for your need
 pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
-pip install transformers==4.34.0
-pip install fire datasets peft==0.5.0
+pip install datasets transformers==4.34.0
+pip install fire peft==0.5.0
 pip install oneccl_bind_pt==2.0.100 -f https://developer.intel.com/ipex-whl-stable-xpu # necessary to run distributed finetuning
 pip install accelerate==0.23.0
 ```
@@ -76,6 +76,15 @@ bash finetune_llama2_7b_pvc_1550_1_card.sh
 bash finetune_llama2_7b_pvc_1550_4_card.sh
 ```
 
+**Important: If the finetuning process is interrupted before it completes, you can resume training from a previously saved checkpoint by setting `resume_from_checkpoint` to the local checkpoint folder, as follows:**
+```bash
+python ./alpaca_qlora_finetuning.py \
+    --base_model "meta-llama/Llama-2-7b-hf" \
+    --data_path "yahma/alpaca-cleaned" \
+    --output_dir "./bigdl-qlora-alpaca" \
+    --resume_from_checkpoint "./bigdl-qlora-alpaca/checkpoint-1100"
+```
+
 ### 4. Sample Output
 ```log
 {'loss': 1.9231, 'learning_rate': 2.9999945367033285e-05, 'epoch': 0.0}
diff --git a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning.py
index c08abbde..2b6be6eb 100644
--- a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning.py
+++ b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning.py
@@ -261,26 +261,6 @@ def train(
     else:
         data = load_dataset(data_path)
 
-    if resume_from_checkpoint:
-        # Check the available weights and load them
-        checkpoint_name = os.path.join(
-            resume_from_checkpoint, "pytorch_model.bin"
-        )  # Full checkpoint
-        if not os.path.exists(checkpoint_name):
-            checkpoint_name = os.path.join(
-                resume_from_checkpoint, "adapter_model.bin"
-            )  # only LoRA model - LoRA config above has to fit
-            resume_from_checkpoint = (
-                False  # So the trainer won't try loading its state
-            )
-        # The two files above have a different name depending on how they were saved, but are actually the same.
-        if os.path.exists(checkpoint_name):
-            print(f"Restarting from {checkpoint_name}")
-            adapters_weights = torch.load(checkpoint_name)
-            set_peft_model_state_dict(model, adapters_weights)
-        else:
-            print(f"Checkpoint {checkpoint_name} not found")
-
     model.print_trainable_parameters()  # Be more transparent about the % of trainable params.
 
     if val_set_size > 0:
@@ -336,6 +316,7 @@ def train(
             gradient_checkpointing=gradient_checkpointing,
             ddp_backend="ccl",
             deepspeed=deepspeed,
+            save_safetensors=False,
         ),
         data_collator=transformers.DataCollatorForSeq2Seq(
             tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
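
Note (not part of the patch): with the manual `pytorch_model.bin` / `adapter_model.bin` loading removed, the script relies on the Hugging Face `Trainer`'s built-in checkpoint restore, and `save_safetensors=False` presumably keeps checkpoints in the `.bin` format the rest of the example expects. Below is a minimal sketch of that resume path under those assumptions; the helper name `train_with_resume`, the argument values, and the paths are illustrative and not taken from the repository.

```python
# Sketch only: illustrates the Trainer-managed resume this patch switches to.
# `model` and `train_dataset` stand in for the objects built earlier in
# alpaca_qlora_finetuning.py; argument values and paths are examples.
from typing import Optional

import transformers


def train_with_resume(
    model,
    train_dataset,
    output_dir: str = "./bigdl-qlora-alpaca",
    resume_from_checkpoint: Optional[str] = None,
):
    args = transformers.TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,
        max_steps=1000,
        save_steps=100,
        save_safetensors=False,  # write .bin checkpoints, as the patch configures
    )
    trainer = transformers.Trainer(
        model=model, args=args, train_dataset=train_dataset
    )
    # When a checkpoint folder is passed, the Trainer itself restores the model
    # weights plus optimizer, scheduler, and step count, so no manual
    # set_peft_model_state_dict call is needed.
    # e.g. resume_from_checkpoint="./bigdl-qlora-alpaca/checkpoint-1100"
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    return trainer
```

Whether a given `checkpoint-*` folder holds full model weights or only adapter weights, recent `transformers` releases handle the load inside `trainer.train()`, which appears to be why the removed manual branch is no longer needed.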