LLM: remove english_quotes dataset (#10370)
This commit is contained in:
parent
df2b84f7de
commit
df3bcc0e65
6 changed files with 77 additions and 37 deletions
8
.github/workflows/llm_unit_tests.yml
vendored
8
.github/workflows/llm_unit_tests.yml
vendored
|
|
@ -237,7 +237,7 @@ jobs:
|
|||
shell: bash
|
||||
run: |
|
||||
echo "DATASET_DIR=${ORIGIN_DIR}/../datasets" >> "$GITHUB_ENV"
|
||||
echo "ABIRATE_ENGLISH_QUOTES_PATH=${ORIGIN_DIR}/../datasets/abirate_english_quotes" >> "$GITHUB_ENV"
|
||||
echo "YAHMA_ALPACA_CLEANED_PATH=${ORIGIN_DIR}/../datasets/yahma_alpaca_cleaned" >> "$GITHUB_ENV"
|
||||
echo "SPEECH_DATASET_PATH=${ORIGIN_DIR}/../datasets/librispeech_asr_dummy" >> "$GITHUB_ENV"
|
||||
|
||||
echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
|
||||
|
|
@ -308,9 +308,9 @@ jobs:
|
|||
if [ ! -d $DATASET_DIR ]; then
|
||||
mkdir -p $DATASET_DIR
|
||||
fi
|
||||
if [ ! -d $ABIRATE_ENGLISH_QUOTES_PATH ]; then
|
||||
echo "Directory $ABIRATE_ENGLISH_QUOTES_PATH not found. Downloading from FTP server..."
|
||||
wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/datasets/abirate_english_quotes -P $DATASET_DIR
|
||||
if [ ! -d $YAHMA_ALPACA_CLEANED_PATH ]; then
|
||||
echo "Directory $YAHMA_ALPACA_CLEANED_PATH not found. Downloading from FTP server..."
|
||||
wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/datasets/yahma_alpaca_cleaned -P $DATASET_DIR
|
||||
fi
|
||||
if [ ! -d $SPEECH_DATASET_PATH ]; then
|
||||
echo "Directory $SPEECH_DATASET_PATH not found. Downloading from FTP server..."
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ To run this example with BigDL-LLM on Intel GPUs, we have some recommended requi
|
|||
|
||||
## Example: Finetune llama2-7b using qlora
|
||||
|
||||
This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing). The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
|
||||
This example is based on [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing) and uses a subset of [yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned) for training. The `export_merged_model.py` script is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
|
||||
|
||||
### 1. Install
|
||||
|
||||
|
|
@ -36,19 +36,19 @@ python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH
|
|||
|
||||
#### Sample Output
|
||||
```log
|
||||
{'loss': 1.6134, 'learning_rate': 0.0002, 'epoch': 0.03}
|
||||
{'loss': 1.3038, 'learning_rate': 0.00017777777777777779, 'epoch': 0.06}
|
||||
{'loss': 1.2634, 'learning_rate': 0.00015555555555555556, 'epoch': 0.1}
|
||||
{'loss': 1.2389, 'learning_rate': 0.00013333333333333334, 'epoch': 0.13}
|
||||
{'loss': 1.0399, 'learning_rate': 0.00011111111111111112, 'epoch': 0.16}
|
||||
{'loss': 1.0406, 'learning_rate': 8.888888888888889e-05, 'epoch': 0.19}
|
||||
{'loss': 1.3114, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.22}
|
||||
{'loss': 0.9876, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.26}
|
||||
{'loss': 1.1406, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.29}
|
||||
{'loss': 1.1728, 'learning_rate': 0.0, 'epoch': 0.32}
|
||||
{'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32}
|
||||
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [03:45<00:00, 1.13s/it]
|
||||
TrainOutput(global_step=200, training_loss=1.211241865158081, metrics={'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32})
|
||||
{'loss': 1.7093, 'learning_rate': 2e-05, 'epoch': 0.02}
|
||||
{'loss': 1.6595, 'learning_rate': 1.7777777777777777e-05, 'epoch': 0.03}
|
||||
{'loss': 1.5172, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.05}
|
||||
{'loss': 1.3666, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.06}
|
||||
{'loss': 1.2738, 'learning_rate': 1.1111111111111113e-05, 'epoch': 0.08}
|
||||
{'loss': 1.2199, 'learning_rate': 8.888888888888888e-06, 'epoch': 0.09}
|
||||
{'loss': 1.1703, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.11}
|
||||
{'loss': 1.108, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.12}
|
||||
{'loss': 1.1199, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.14}
|
||||
{'loss': 1.0668, 'learning_rate': 0.0, 'epoch': 0.15}
|
||||
{'train_runtime': 279.3049, 'train_samples_per_second': 2.864, 'train_steps_per_second': 0.716, 'train_loss': 1.321143569946289, 'epoch': 0.15}
|
||||
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [04:39<00:00, 1.40s/it]
|
||||
TrainOutput(global_step=200, training_loss=1.321143569946289, metrics={'train_runtime': 279.3049, 'train_samples_per_second': 2.864, 'train_steps_per_second': 0.716, 'train_loss': 1.321143569946289, 'epoch': 0.15})
|
||||
```
|
||||
|
||||
### 4. Merge the adapter into the original model
|
||||
|
|
|
|||
|
|
@ -26,22 +26,38 @@ from bigdl.llm.transformers import AutoModelForCausalLM
|
|||
from datasets import load_dataset
|
||||
import argparse
|
||||
|
||||
current_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
common_util_path = os.path.join(current_dir, '..', '..')
|
||||
import sys
|
||||
sys.path.append(common_util_path)
|
||||
from common.utils import Prompter, get_train_val_data
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser(description='Simple example of how to qlora finetune llama2 model using bigdl-llm')
|
||||
parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf",
|
||||
help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
|
||||
', or the path to the huggingface checkpoint folder')
|
||||
parser.add_argument('--dataset', type=str, default="Abirate/english_quotes")
|
||||
parser.add_argument('--dataset', type=str, default="yahma/alpaca-cleaned")
|
||||
|
||||
args = parser.parse_args()
|
||||
model_path = args.repo_id_or_model_path
|
||||
dataset_path = args.dataset
|
||||
tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||
|
||||
data = load_dataset(dataset_path)
|
||||
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
|
||||
if dataset_path.endswith(".json") or dataset_path.endswith(".jsonl"):
|
||||
data = load_dataset("json", data_files=dataset_path)
|
||||
else:
|
||||
data = load_dataset(dataset_path)
|
||||
|
||||
# For illustration purposes, only use part of the data to train
|
||||
data = data["train"].train_test_split(train_size=0.1, shuffle=False)
|
||||
|
||||
# Data processing
|
||||
prompter = Prompter("alpaca")
|
||||
train_data, _ = get_train_val_data(data, tokenizer, prompter, train_on_inputs=True,
|
||||
add_eos_token=False, cutoff_len=256, val_set_size=0, seed=42)
|
||||
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_use_double_quant=False,
|
||||
|
|
@ -76,7 +92,7 @@ if __name__ == "__main__":
|
|||
tokenizer.padding_side = "left"
|
||||
trainer = transformers.Trainer(
|
||||
model=model,
|
||||
train_dataset=data["train"],
|
||||
train_dataset=train_data,
|
||||
args=transformers.TrainingArguments(
|
||||
per_device_train_batch_size=4,
|
||||
gradient_accumulation_steps= 1,
|
||||
|
|
@ -90,7 +106,9 @@ if __name__ == "__main__":
|
|||
optim="adamw_hf", # paged_adamw_8bit is not supported yet
|
||||
# gradient_checkpointing=True, # can further reduce memory but slower
|
||||
),
|
||||
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
|
||||
data_collator=transformers.DataCollatorForSeq2Seq(
|
||||
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
|
||||
),
|
||||
)
|
||||
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
|
||||
result = trainer.train()
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ To run this example with BigDL-LLM on Intel GPUs, we have some recommended requi
|
|||
|
||||
## Example: Finetune llama2-7b using qlora
|
||||
|
||||
The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
|
||||
This example uses a subset of [yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned) for training. The `export_merged_model.py` script is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
|
||||
|
||||
### 1. Install
|
||||
|
||||
|
|
@ -36,14 +36,19 @@ python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH
|
|||
|
||||
#### Sample Output
|
||||
```log
|
||||
{'loss': 1.7386, 'learning_rate': 8.888888888888888e-06, 'epoch': 0.19}
|
||||
{'loss': 1.9242, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.22}
|
||||
{'loss': 1.6819, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.26}
|
||||
{'loss': 1.755, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.29}
|
||||
{'loss': 1.7455, 'learning_rate': 0.0, 'epoch': 0.32}
|
||||
{'train_runtime': 172.8523, 'train_samples_per_second': 4.628, 'train_steps_per_second': 1.157, 'train_loss': 1.9101631927490235, 'epoch': 0.32}
|
||||
100%|████████████████████████████████████████████| 200/200 [02:52<00:00, 1.16it/s]
|
||||
TrainOutput(global_step=200, training_loss=1.9101631927490235, metrics={'train_runtime': 172.8523, 'train_samples_per_second': 4.628, 'train_steps_per_second': 1.157, 'train_loss': 1.9101631927490235, 'epoch': 0.32})
|
||||
{'loss': 3.1898, 'learning_rate': 2e-05, 'epoch': 0.02}
|
||||
{'loss': 3.1854, 'learning_rate': 1.7777777777777777e-05, 'epoch': 0.03}
|
||||
{'loss': 3.0359, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.05}
|
||||
{'loss': 2.9661, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.06}
|
||||
{'loss': 2.7779, 'learning_rate': 1.1111111111111113e-05, 'epoch': 0.08}
|
||||
{'loss': 2.7795, 'learning_rate': 8.888888888888888e-06, 'epoch': 0.09}
|
||||
{'loss': 2.5149, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.11}
|
||||
{'loss': 2.5759, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.12}
|
||||
{'loss': 2.5976, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.14}
|
||||
{'loss': 2.5744, 'learning_rate': 0.0, 'epoch': 0.15}
|
||||
{'train_runtime': 116.1914, 'train_samples_per_second': 6.885, 'train_steps_per_second': 1.721, 'train_loss': 2.819730052947998, 'epoch': 0.15}
|
||||
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:56<00:00, 1.72it/s]
|
||||
TrainOutput(global_step=200, training_loss=2.819730052947998, metrics={'train_runtime': 116.1914, 'train_samples_per_second': 6.885, 'train_steps_per_second': 1.721, 'train_loss': 2.819730052947998, 'epoch': 0.15})
|
||||
```
|
||||
|
||||
### 4. Merge the adapter into the original model
|
||||
|
|
|
|||
|
|
@ -27,20 +27,37 @@ from datasets import load_dataset
|
|||
from trl import SFTTrainer
|
||||
import argparse
|
||||
|
||||
current_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
common_util_path = os.path.join(current_dir, '..', '..')
|
||||
import sys
|
||||
sys.path.append(common_util_path)
|
||||
from common.utils import Prompter, get_train_val_data
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser(description='Simple example of how to qlora finetune llama2 model using bigdl-llm and TRL')
|
||||
parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf",
|
||||
help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
|
||||
', or the path to the huggingface checkpoint folder')
|
||||
parser.add_argument('--dataset', type=str, default="Abirate/english_quotes")
|
||||
parser.add_argument('--dataset', type=str, default="yahma/alpaca-cleaned")
|
||||
|
||||
args = parser.parse_args()
|
||||
model_path = args.repo_id_or_model_path
|
||||
dataset_path = args.dataset
|
||||
tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||
|
||||
data = load_dataset(dataset_path, split="train")
|
||||
if dataset_path.endswith(".json") or dataset_path.endswith(".jsonl"):
|
||||
data = load_dataset("json", data_files=dataset_path)
|
||||
else:
|
||||
data = load_dataset(dataset_path)
|
||||
|
||||
# For illustration purposes, only use part of the data to train
|
||||
data = data["train"].train_test_split(train_size=0.1, shuffle=False)
|
||||
|
||||
# Data processing
|
||||
prompter = Prompter("alpaca")
|
||||
train_data, _ = get_train_val_data(data, tokenizer, prompter, train_on_inputs=True,
|
||||
add_eos_token=False, cutoff_len=256, val_set_size=0, seed=42)
|
||||
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
|
|
@ -73,7 +90,7 @@ if __name__ == "__main__":
|
|||
|
||||
trainer = SFTTrainer(
|
||||
model=model,
|
||||
train_dataset=data,
|
||||
train_dataset=train_data,
|
||||
args=transformers.TrainingArguments(
|
||||
per_device_train_batch_size=4,
|
||||
gradient_accumulation_steps= 1,
|
||||
|
|
@ -87,7 +104,7 @@ if __name__ == "__main__":
|
|||
optim="adamw_hf", # paged_adamw_8bit is not supported yet
|
||||
gradient_checkpointing=True, # can further reduce memory but slower
|
||||
),
|
||||
dataset_text_field="quote",
|
||||
dataset_text_field="instruction",
|
||||
)
|
||||
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
|
||||
result = trainer.train()
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ sed -i 's/max_steps=200/max_steps=2/; s/save_steps=100/save_steps=2/; s/logging_
|
|||
|
||||
python ${ANALYTICS_ZOO_ROOT}/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/qlora_finetuning.py \
|
||||
--repo-id-or-model-path ${LLAMA2_7B_ORIGIN_PATH} \
|
||||
--dataset ${ABIRATE_ENGLISH_QUOTES_PATH}
|
||||
--dataset ${YAHMA_ALPACA_CLEANED_PATH}
|
||||
|
||||
python ${ANALYTICS_ZOO_ROOT}/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/export_merged_model.py \
|
||||
--repo-id-or-model-path ${LLAMA2_7B_ORIGIN_PATH} \
|
||||
|
|
|
|||
Loading…
Reference in a new issue