LLM: remove english_quotes dataset (#10370)

binbin Deng 2024-03-12 16:57:40 +08:00 committed by GitHub
parent df2b84f7de
commit df3bcc0e65
6 changed files with 77 additions and 37 deletions


@@ -237,7 +237,7 @@ jobs:
         shell: bash
         run: |
           echo "DATASET_DIR=${ORIGIN_DIR}/../datasets" >> "$GITHUB_ENV"
-          echo "ABIRATE_ENGLISH_QUOTES_PATH=${ORIGIN_DIR}/../datasets/abirate_english_quotes" >> "$GITHUB_ENV"
+          echo "YAHMA_ALPACA_CLEANED_PATH=${ORIGIN_DIR}/../datasets/yahma_alpaca_cleaned" >> "$GITHUB_ENV"
           echo "SPEECH_DATASET_PATH=${ORIGIN_DIR}/../datasets/librispeech_asr_dummy" >> "$GITHUB_ENV"
           echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
@@ -308,9 +308,9 @@ jobs:
           if [ ! -d $DATASET_DIR ]; then
             mkdir -p $DATASET_DIR
           fi
-          if [ ! -d $ABIRATE_ENGLISH_QUOTES_PATH ]; then
-            echo "Directory $ABIRATE_ENGLISH_QUOTES_PATH not found. Downloading from FTP server..."
-            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/datasets/abirate_english_quotes -P $DATASET_DIR
+          if [ ! -d $YAHMA_ALPACA_CLEANED_PATH ]; then
+            echo "Directory $YAHMA_ALPACA_CLEANED_PATH not found. Downloading from FTP server..."
+            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/datasets/yahma_alpaca_cleaned -P $DATASET_DIR
           fi
           if [ ! -d $SPEECH_DATASET_PATH ]; then
             echo "Directory $SPEECH_DATASET_PATH not found. Downloading from FTP server..."


@@ -8,7 +8,7 @@ To run this example with BigDL-LLM on Intel GPUs, we have some recommended requi
 ## Example: Finetune llama2-7b using qlora
-This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing). The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
+This example is based on [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing) and uses a subset of [yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned) for training. The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
 ### 1. Install
@@ -36,19 +36,19 @@ python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH
 #### Sample Output
 ```log
-{'loss': 1.6134, 'learning_rate': 0.0002, 'epoch': 0.03}
-{'loss': 1.3038, 'learning_rate': 0.00017777777777777779, 'epoch': 0.06}
-{'loss': 1.2634, 'learning_rate': 0.00015555555555555556, 'epoch': 0.1}
-{'loss': 1.2389, 'learning_rate': 0.00013333333333333334, 'epoch': 0.13}
-{'loss': 1.0399, 'learning_rate': 0.00011111111111111112, 'epoch': 0.16}
-{'loss': 1.0406, 'learning_rate': 8.888888888888889e-05, 'epoch': 0.19}
-{'loss': 1.3114, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.22}
-{'loss': 0.9876, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.26}
-{'loss': 1.1406, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.29}
-{'loss': 1.1728, 'learning_rate': 0.0, 'epoch': 0.32}
-{'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32}
-100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [03:45<00:00, 1.13s/it]
-TrainOutput(global_step=200, training_loss=1.211241865158081, metrics={'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32})
+{'loss': 1.7093, 'learning_rate': 2e-05, 'epoch': 0.02}
+{'loss': 1.6595, 'learning_rate': 1.7777777777777777e-05, 'epoch': 0.03}
+{'loss': 1.5172, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.05}
+{'loss': 1.3666, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.06}
+{'loss': 1.2738, 'learning_rate': 1.1111111111111113e-05, 'epoch': 0.08}
+{'loss': 1.2199, 'learning_rate': 8.888888888888888e-06, 'epoch': 0.09}
+{'loss': 1.1703, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.11}
+{'loss': 1.108, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.12}
+{'loss': 1.1199, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.14}
+{'loss': 1.0668, 'learning_rate': 0.0, 'epoch': 0.15}
+{'train_runtime': 279.3049, 'train_samples_per_second': 2.864, 'train_steps_per_second': 0.716, 'train_loss': 1.321143569946289, 'epoch': 0.15}
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [04:39<00:00, 1.40s/it]
+TrainOutput(global_step=200, training_loss=1.321143569946289, metrics={'train_runtime': 279.3049, 'train_samples_per_second': 2.864, 'train_steps_per_second': 0.716, 'train_loss': 1.321143569946289, 'epoch': 0.15})
 ```
### 4. Merge the adapter into the original model
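The merge step referenced by this heading lives in `export_merged_model.py`, which is not part of this diff. Assuming it follows the usual PEFT flow of the alpaca-lora script it is ported from, adapter merging generally looks like the sketch below; all paths are placeholders.

```python
# Generic LoRA-merge sketch (assumption: the real export_merged_model.py follows
# the standard PEFT merge flow; paths below are placeholders).
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer
from peft import PeftModel

base_model_path = "meta-llama/Llama-2-7b-hf"    # placeholder
adapter_path = "./outputs/checkpoint-200"       # placeholder
output_path = "./llama-2-7b-qlora-merged"       # placeholder

base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base, adapter_path)

# Fold the LoRA weights into the base weights, drop the adapter wrappers, and
# save a plain Hugging Face checkpoint that can be loaded without peft.
merged = model.merge_and_unload()
merged.save_pretrained(output_path)
LlamaTokenizer.from_pretrained(base_model_path).save_pretrained(output_path)
```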


@@ -26,22 +26,38 @@ from bigdl.llm.transformers import AutoModelForCausalLM
 from datasets import load_dataset
 import argparse
+current_dir = os.path.dirname(os.path.realpath(__file__))
+common_util_path = os.path.join(current_dir, '..', '..')
+import sys
+sys.path.append(common_util_path)
+from common.utils import Prompter, get_train_val_data
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Simple example of how to qlora finetune llama2 model using bigdl-llm')
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf",
                         help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
-    parser.add_argument('--dataset', type=str, default="Abirate/english_quotes")
+    parser.add_argument('--dataset', type=str, default="yahma/alpaca-cleaned")
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
     dataset_path = args.dataset
     tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    data = load_dataset(dataset_path)
-    data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
+    if dataset_path.endswith(".json") or dataset_path.endswith(".jsonl"):
+        data = load_dataset("json", data_files=dataset_path)
+    else:
+        data = load_dataset(dataset_path)
+    # For illustration purpose, only use part of data to train
+    data = data["train"].train_test_split(train_size=0.1, shuffle=False)
+    # Data processing
+    prompter = Prompter("alpaca")
+    train_data, _ = get_train_val_data(data, tokenizer, prompter, train_on_inputs=True,
+                                       add_eos_token=False, cutoff_len=256, val_set_size=0, seed=42)
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_use_double_quant=False,
@@ -76,7 +92,7 @@ if __name__ == "__main__":
     tokenizer.padding_side = "left"
     trainer = transformers.Trainer(
         model=model,
-        train_dataset=data["train"],
+        train_dataset=train_data,
         args=transformers.TrainingArguments(
             per_device_train_batch_size=4,
             gradient_accumulation_steps= 1,
@@ -90,7 +106,9 @@ if __name__ == "__main__":
             optim="adamw_hf", # paged_adamw_8bit is not supported yet
             # gradient_checkpointing=True, # can further reduce memory but slower
         ),
-        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
+        data_collator=transformers.DataCollatorForSeq2Seq(
+            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
+        ),
     )
     model.config.use_cache = False # silence the warnings. Please re-enable for inference!
     result = trainer.train()
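In the updated script, data preparation moves into the shared `Prompter`/`get_train_val_data` helpers under `common/utils`, which are not shown in this diff. As a rough, non-authoritative sketch of what Alpaca-style preprocessing typically involves (the template wording and helper names below are assumptions, not the repository's implementation), each record is rendered into one prompt string and tokenized up to `cutoff_len`:

```python
# Illustrative only: an Alpaca-style prompt builder and tokenization step that
# approximates what Prompter("alpaca") and get_train_val_data are used for here.
ALPACA_TEMPLATE_WITH_INPUT = (
    "Below is an instruction that describes a task, paired with an input that provides "
    "further context. Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n{output}"
)
ALPACA_TEMPLATE_NO_INPUT = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:\n{output}"
)

def build_prompt(example):
    # Render one instruction/input/output record into a single training prompt.
    template = ALPACA_TEMPLATE_WITH_INPUT if example.get("input") else ALPACA_TEMPLATE_NO_INPUT
    return template.format(**example)

def tokenize_example(example, tokenizer, cutoff_len=256, add_eos_token=False):
    # Tokenize the rendered prompt; with train_on_inputs=True the labels simply
    # mirror input_ids, so the model learns the whole prompt/response sequence.
    tokens = tokenizer(build_prompt(example), truncation=True, max_length=cutoff_len)
    if add_eos_token and tokens["input_ids"][-1] != tokenizer.eos_token_id:
        tokens["input_ids"].append(tokenizer.eos_token_id)
        tokens["attention_mask"].append(1)
    tokens["labels"] = list(tokens["input_ids"])
    return tokens

# Example wiring (mirrors the call in the script above):
# train_data = data["train"].map(lambda ex: tokenize_example(ex, tokenizer))
```

Because the processed examples are variable-length and already carry `labels`, the collator is switched from `DataCollatorForLanguageModeling` to `DataCollatorForSeq2Seq`, which pads `input_ids` and `labels` together at batch time.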


@@ -8,7 +8,7 @@ To run this example with BigDL-LLM on Intel GPUs, we have some recommended requi
 ## Example: Finetune llama2-7b using qlora
-The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
+This example uses a subset of [yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned) for training. The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
 ### 1. Install
@@ -36,14 +36,19 @@ python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH
 #### Sample Output
 ```log
-{'loss': 1.7386, 'learning_rate': 8.888888888888888e-06, 'epoch': 0.19}
-{'loss': 1.9242, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.22}
-{'loss': 1.6819, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.26}
-{'loss': 1.755, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.29}
-{'loss': 1.7455, 'learning_rate': 0.0, 'epoch': 0.32}
-{'train_runtime': 172.8523, 'train_samples_per_second': 4.628, 'train_steps_per_second': 1.157, 'train_loss': 1.9101631927490235, 'epoch': 0.32}
-100%|████████████████████████████████████████████| 200/200 [02:52<00:00, 1.16it/s]
-TrainOutput(global_step=200, training_loss=1.9101631927490235, metrics={'train_runtime': 172.8523, 'train_samples_per_second': 4.628, 'train_steps_per_second': 1.157, 'train_loss': 1.9101631927490235, 'epoch': 0.32})
+{'loss': 3.1898, 'learning_rate': 2e-05, 'epoch': 0.02}
+{'loss': 3.1854, 'learning_rate': 1.7777777777777777e-05, 'epoch': 0.03}
+{'loss': 3.0359, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.05}
+{'loss': 2.9661, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.06}
+{'loss': 2.7779, 'learning_rate': 1.1111111111111113e-05, 'epoch': 0.08}
+{'loss': 2.7795, 'learning_rate': 8.888888888888888e-06, 'epoch': 0.09}
+{'loss': 2.5149, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.11}
+{'loss': 2.5759, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.12}
+{'loss': 2.5976, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.14}
+{'loss': 2.5744, 'learning_rate': 0.0, 'epoch': 0.15}
+{'train_runtime': 116.1914, 'train_samples_per_second': 6.885, 'train_steps_per_second': 1.721, 'train_loss': 2.819730052947998, 'epoch': 0.15}
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:56<00:00, 1.72it/s]
+TrainOutput(global_step=200, training_loss=2.819730052947998, metrics={'train_runtime': 116.1914, 'train_samples_per_second': 6.885, 'train_steps_per_second': 1.721, 'train_loss': 2.819730052947998, 'epoch': 0.15})
 ```
### 4. Merge the adapter into the original model


@@ -27,20 +27,37 @@ from datasets import load_dataset
 from trl import SFTTrainer
 import argparse
+current_dir = os.path.dirname(os.path.realpath(__file__))
+common_util_path = os.path.join(current_dir, '..', '..')
+import sys
+sys.path.append(common_util_path)
+from common.utils import Prompter, get_train_val_data
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Simple example of how to qlora finetune llama2 model using bigdl-llm and TRL')
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf",
                         help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
-    parser.add_argument('--dataset', type=str, default="Abirate/english_quotes")
+    parser.add_argument('--dataset', type=str, default="yahma/alpaca-cleaned")
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
     dataset_path = args.dataset
     tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    data = load_dataset(dataset_path, split="train")
+    if dataset_path.endswith(".json") or dataset_path.endswith(".jsonl"):
+        data = load_dataset("json", data_files=dataset_path)
+    else:
+        data = load_dataset(dataset_path)
+    # For illustration purpose, only use part of data to train
+    data = data["train"].train_test_split(train_size=0.1, shuffle=False)
+    # Data processing
+    prompter = Prompter("alpaca")
+    train_data, _ = get_train_val_data(data, tokenizer, prompter, train_on_inputs=True,
+                                       add_eos_token=False, cutoff_len=256, val_set_size=0, seed=42)
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
@@ -73,7 +90,7 @@ if __name__ == "__main__":
     trainer = SFTTrainer(
         model=model,
-        train_dataset=data,
+        train_dataset=train_data,
         args=transformers.TrainingArguments(
             per_device_train_batch_size=4,
             gradient_accumulation_steps= 1,
@@ -87,7 +104,7 @@ if __name__ == "__main__":
             optim="adamw_hf", # paged_adamw_8bit is not supported yet
             gradient_checkpointing=True, # can further reduce memory but slower
         ),
-        dataset_text_field="quote",
+        dataset_text_field="instruction",
     )
     model.config.use_cache = False # silence the warnings. Please re-enable for inference!
     result = trainer.train()
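In the TRL variant, `dataset_text_field` now points at the Alpaca `instruction` column instead of `quote`, while the dataset handed to `SFTTrainer` is already processed by `get_train_val_data`. For comparison only (this is not what the script above does, and the prompt wording is an assumption), TRL also supports a `formatting_func` hook that renders the full instruction/input/output prompt itself:

```python
# Comparison sketch, not part of this commit: SFTTrainer can alternatively take a
# formatting_func that turns each raw record into one training string.
from datasets import load_dataset

data = load_dataset("yahma/alpaca-cleaned", split="train[:1%]")  # small slice for illustration

def formatting_func(batch):
    # Called with a batch (dict of lists); returns one rendered prompt per example.
    texts = []
    for instruction, inp, output in zip(batch["instruction"], batch["input"], batch["output"]):
        context = f"\n\n### Input:\n{inp}" if inp else ""
        texts.append(f"### Instruction:\n{instruction}{context}\n\n### Response:\n{output}")
    return texts

print(formatting_func(data[:2])[0])  # preview the first rendered prompt
```

Such a function would be passed to `SFTTrainer` as `formatting_func=...` in place of `dataset_text_field`.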


@@ -12,7 +12,7 @@ sed -i 's/max_steps=200/max_steps=2/; s/save_steps=100/save_steps=2/; s/logging_
 python ${ANALYTICS_ZOO_ROOT}/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/qlora_finetuning.py \
   --repo-id-or-model-path ${LLAMA2_7B_ORIGIN_PATH} \
-  --dataset ${ABIRATE_ENGLISH_QUOTES_PATH}
+  --dataset ${YAHMA_ALPACA_CLEANED_PATH}
 python ${ANALYTICS_ZOO_ROOT}/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/export_merged_model.py \
   --repo-id-or-model-path ${LLAMA2_7B_ORIGIN_PATH} \