LLM: support resume from checkpoint in Alpaca QLoRA (#9502)
Parent: 139e98aa18
Commit: 1a2129221d
2 changed files with 12 additions and 22 deletions

@@ -13,8 +13,8 @@ conda activate llm
 # below command will install intel_extension_for_pytorch==2.0.110+xpu as default
 # you can install specific ipex/torch version for your need
 pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
-pip install transformers==4.34.0
-pip install fire datasets peft==0.5.0
+pip install datasets transformers==4.34.0
+pip install fire peft==0.5.0
 pip install oneccl_bind_pt==2.0.100 -f https://developer.intel.com/ipex-whl-stable-xpu # necessary to run distributed finetuning
 pip install accelerate==0.23.0
 ```

@@ -76,6 +76,15 @@ bash finetune_llama2_7b_pvc_1550_1_card.sh
 bash finetune_llama2_7b_pvc_1550_4_card.sh
 ```
 
+**Important: If you fail to complete the whole finetuning process, it is suggested to resume training from a previously saved checkpoint by specifying `resume_from_checkpoint` to the local checkpoint folder as follows:**
+```bash
+python ./alpaca_qlora_finetuning.py \
+    --base_model "meta-llama/Llama-2-7b-hf" \
+    --data_path "yahma/alpaca-cleaned" \
+    --output_dir "./bigdl-qlora-alpaca" \
+    --resume_from_checkpoint "./bigdl-qlora-alpaca/checkpoint-1100"
+```
+
 ### 4. Sample Output
 ```log
 {'loss': 1.9231, 'learning_rate': 2.9999945367033285e-05, 'epoch': 0.0}

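A note on the README addition above: the `checkpoint-1100` folder in the example is one of the `checkpoint-<step>` directories that the Hugging Face Trainer writes under `--output_dir` while training. Below is a minimal, illustrative sketch (not part of this commit; the helper name is made up) for picking the most recent one to pass as `--resume_from_checkpoint`, assuming the standard `checkpoint-<step>` naming:

```python
# Hypothetical helper, not in the repo: return the newest checkpoint-<step>
# folder under the output directory used in the README example.
import os
import re

def latest_checkpoint(output_dir: str = "./bigdl-qlora-alpaca"):
    if not os.path.isdir(output_dir):
        return None
    ckpts = [d for d in os.listdir(output_dir) if re.fullmatch(r"checkpoint-\d+", d)]
    if not ckpts:
        return None
    # Folders are named by global step, so the largest number is the latest.
    newest = max(ckpts, key=lambda name: int(name.split("-")[1]))
    return os.path.join(output_dir, newest)

if __name__ == "__main__":
    print(latest_checkpoint())  # e.g. ./bigdl-qlora-alpaca/checkpoint-1100
```
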
@@ -261,26 +261,6 @@ def train(
     else:
         data = load_dataset(data_path)
 
-    if resume_from_checkpoint:
-        # Check the available weights and load them
-        checkpoint_name = os.path.join(
-            resume_from_checkpoint, "pytorch_model.bin"
-        )  # Full checkpoint
-        if not os.path.exists(checkpoint_name):
-            checkpoint_name = os.path.join(
-                resume_from_checkpoint, "adapter_model.bin"
-            )  # only LoRA model - LoRA config above has to fit
-            resume_from_checkpoint = (
-                False  # So the trainer won't try loading its state
-            )
-        # The two files above have a different name depending on how they were saved, but are actually the same.
-        if os.path.exists(checkpoint_name):
-            print(f"Restarting from {checkpoint_name}")
-            adapters_weights = torch.load(checkpoint_name)
-            set_peft_model_state_dict(model, adapters_weights)
-        else:
-            print(f"Checkpoint {checkpoint_name} not found")
-
     model.print_trainable_parameters()  # Be more transparent about the % of trainable params.
 
     if val_set_size > 0:

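The block removed above loaded `pytorch_model.bin` or `adapter_model.bin` by hand with `torch.load` and `set_peft_model_state_dict`, and in the adapter-only case reset `resume_from_checkpoint` to `False` so the Trainer would not restore its own state. Here is a minimal sketch of the alternative this removal points to, assuming (it is not shown in this diff) that the script now simply forwards the flag to the Hugging Face Trainer, which reloads the weights together with optimizer, scheduler and trainer state:

```python
# Illustrative only; the function and argument names are assumptions,
# not the exact code in alpaca_qlora_finetuning.py.
import transformers

def run_training(trainer: transformers.Trainer, resume_from_checkpoint=None):
    # resume_from_checkpoint may be None (fresh run) or a checkpoint folder such as
    # "./bigdl-qlora-alpaca/checkpoint-1100"; Trainer.train() then restores the saved
    # model weights along with optimizer, LR-scheduler and RNG state before continuing.
    return trainer.train(resume_from_checkpoint=resume_from_checkpoint)
```
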
@@ -336,6 +316,7 @@ def train(
             gradient_checkpointing=gradient_checkpointing,
             ddp_backend="ccl",
             deepspeed=deepspeed,
+            save_safetensors=False,
         ),
         data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
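
The single added line, `save_safetensors=False`, controls the on-disk checkpoint format. A likely reading (not stated in the commit): it keeps the Trainer writing classic `.bin` checkpoint files rather than `.safetensors`, so the `checkpoint-<step>` folders referenced by `--resume_from_checkpoint` stay in the format the rest of the example expects. A minimal sketch of the flag in isolation, with placeholder values for everything not visible in the diff:

```python
# Sketch only: output_dir is a placeholder matching the README example.
import transformers

args = transformers.TrainingArguments(
    output_dir="./bigdl-qlora-alpaca",
    save_safetensors=False,  # keep checkpoints as .bin rather than .safetensors
)
print(args.save_safetensors)  # -> False
```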