From 1a2129221d3bb549c6a2e9fe761c10c135e6a782 Mon Sep 17 00:00:00 2001
From: binbin Deng <108676127+plusbang@users.noreply.github.com>
Date: Wed, 22 Nov 2023 13:49:14 +0800
Subject: [PATCH] LLM: support resume from checkpoint in Alpaca QLoRA (#9502)

---
 .../QLoRA-FineTuning/alpaca-qlora/README.md   | 13 ++++++++++--
 .../alpaca-qlora/alpaca_qlora_finetuning.py   | 21 +------------------
 2 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/README.md b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/README.md
index 5e4e5cae..b8695f74 100644
--- a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/README.md
+++ b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/README.md
@@ -13,8 +13,8 @@ conda activate llm
 # below command will install intel_extension_for_pytorch==2.0.110+xpu as default
 # you can install specific ipex/torch version for your need
 pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
-pip install transformers==4.34.0
-pip install fire datasets peft==0.5.0
+pip install datasets transformers==4.34.0
+pip install fire peft==0.5.0
 pip install oneccl_bind_pt==2.0.100 -f https://developer.intel.com/ipex-whl-stable-xpu # necessary to run distributed finetuning
 pip install accelerate==0.23.0
 ```
@@ -76,6 +76,15 @@ bash finetune_llama2_7b_pvc_1550_1_card.sh
 bash finetune_llama2_7b_pvc_1550_4_card.sh
 ```
 
+**Important: If the finetuning process is interrupted before it completes, you can resume training from a previously saved checkpoint by setting `resume_from_checkpoint` to the local checkpoint folder, as follows:**
+```bash
+python ./alpaca_qlora_finetuning.py \
+    --base_model "meta-llama/Llama-2-7b-hf" \
+    --data_path "yahma/alpaca-cleaned" \
+    --output_dir "./bigdl-qlora-alpaca" \
+    --resume_from_checkpoint "./bigdl-qlora-alpaca/checkpoint-1100"
+```
+
 ### 4. Sample Output
 ```log
 {'loss': 1.9231, 'learning_rate': 2.9999945367033285e-05, 'epoch': 0.0}
diff --git a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning.py
index c08abbde..2b6be6eb 100644
--- a/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning.py
+++ b/python/llm/example/GPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning.py
@@ -261,26 +261,6 @@ def train(
     else:
         data = load_dataset(data_path)
 
-    if resume_from_checkpoint:
-        # Check the available weights and load them
-        checkpoint_name = os.path.join(
-            resume_from_checkpoint, "pytorch_model.bin"
-        )  # Full checkpoint
-        if not os.path.exists(checkpoint_name):
-            checkpoint_name = os.path.join(
-                resume_from_checkpoint, "adapter_model.bin"
-            )  # only LoRA model - LoRA config above has to fit
-            resume_from_checkpoint = (
-                False  # So the trainer won't try loading its state
-            )
-        # The two files above have a different name depending on how they were saved, but are actually the same.
-        if os.path.exists(checkpoint_name):
-            print(f"Restarting from {checkpoint_name}")
-            adapters_weights = torch.load(checkpoint_name)
-            set_peft_model_state_dict(model, adapters_weights)
-        else:
-            print(f"Checkpoint {checkpoint_name} not found")
-
     model.print_trainable_parameters()  # Be more transparent about the % of trainable params.
 
     if val_set_size > 0:
@@ -336,6 +316,7 @@ def train(
             gradient_checkpointing=gradient_checkpointing,
             ddp_backend="ccl",
             deepspeed=deepspeed,
+            save_safetensors=False,
         ),
         data_collator=transformers.DataCollatorForSeq2Seq(
             tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
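
Note (not part of the patch): with the manual `pytorch_model.bin` / `adapter_model.bin` loading removed, the script relies on the Hugging Face `Trainer`'s built-in checkpoint restore, and `save_safetensors=False` presumably keeps checkpoints in the `.bin` format the rest of the example expects. Below is a minimal sketch of that resume path under those assumptions; the helper name `train_with_resume`, the argument values, and the paths are illustrative and not taken from the repository.

```python
# Sketch only: illustrates the Trainer-managed resume this patch switches to.
# `model` and `train_dataset` stand in for the objects built earlier in
# alpaca_qlora_finetuning.py; argument values and paths are examples.
from typing import Optional

import transformers


def train_with_resume(
    model,
    train_dataset,
    output_dir: str = "./bigdl-qlora-alpaca",
    resume_from_checkpoint: Optional[str] = None,
):
    args = transformers.TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,
        max_steps=1000,
        save_steps=100,
        save_safetensors=False,  # write .bin checkpoints, as the patch configures
    )
    trainer = transformers.Trainer(
        model=model, args=args, train_dataset=train_dataset
    )
    # When a checkpoint folder is passed, the Trainer itself restores the model
    # weights plus optimizer, scheduler, and step count, so no manual
    # set_peft_model_state_dict call is needed.
    # e.g. resume_from_checkpoint="./bigdl-qlora-alpaca/checkpoint-1100"
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    return trainer
```

Whether a given `checkpoint-*` folder holds full model weights or only adapter weights, recent `transformers` releases handle the load inside `trainer.train()`, which appears to be why the removed manual branch is no longer needed.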