LLM: remove english_quotes dataset (#10370)

binbin Deng 2024-03-12 16:57:40 +08:00 committed by GitHub
parent df2b84f7de
commit df3bcc0e65
6 changed files with 77 additions and 37 deletions


@@ -237,7 +237,7 @@ jobs:
         shell: bash
         run: |
           echo "DATASET_DIR=${ORIGIN_DIR}/../datasets" >> "$GITHUB_ENV"
-          echo "ABIRATE_ENGLISH_QUOTES_PATH=${ORIGIN_DIR}/../datasets/abirate_english_quotes" >> "$GITHUB_ENV"
+          echo "YAHMA_ALPACA_CLEANED_PATH=${ORIGIN_DIR}/../datasets/yahma_alpaca_cleaned" >> "$GITHUB_ENV"
           echo "SPEECH_DATASET_PATH=${ORIGIN_DIR}/../datasets/librispeech_asr_dummy" >> "$GITHUB_ENV"
           echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
@@ -308,9 +308,9 @@ jobs:
           if [ ! -d $DATASET_DIR ]; then
             mkdir -p $DATASET_DIR
           fi
-          if [ ! -d $ABIRATE_ENGLISH_QUOTES_PATH ]; then
-            echo "Directory $ABIRATE_ENGLISH_QUOTES_PATH not found. Downloading from FTP server..."
-            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/datasets/abirate_english_quotes -P $DATASET_DIR
+          if [ ! -d $YAHMA_ALPACA_CLEANED_PATH ]; then
+            echo "Directory $YAHMA_ALPACA_CLEANED_PATH not found. Downloading from FTP server..."
+            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/datasets/yahma_alpaca_cleaned -P $DATASET_DIR
           fi
           if [ ! -d $SPEECH_DATASET_PATH ]; then
             echo "Directory $SPEECH_DATASET_PATH not found. Downloading from FTP server..."


@@ -8,7 +8,7 @@ To run this example with BigDL-LLM on Intel GPUs, we have some recommended requi
 ## Example: Finetune llama2-7b using qlora
-This example is ported from [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing). The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
+This example is based on [bnb-4bit-training](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing) and uses a subset of [yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned) for training. The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
 ### 1. Install
@@ -36,19 +36,19 @@ python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH
 #### Sample Output
 ```log
-{'loss': 1.6134, 'learning_rate': 0.0002, 'epoch': 0.03}
-{'loss': 1.3038, 'learning_rate': 0.00017777777777777779, 'epoch': 0.06}
-{'loss': 1.2634, 'learning_rate': 0.00015555555555555556, 'epoch': 0.1}
-{'loss': 1.2389, 'learning_rate': 0.00013333333333333334, 'epoch': 0.13}
-{'loss': 1.0399, 'learning_rate': 0.00011111111111111112, 'epoch': 0.16}
-{'loss': 1.0406, 'learning_rate': 8.888888888888889e-05, 'epoch': 0.19}
-{'loss': 1.3114, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.22}
-{'loss': 0.9876, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.26}
-{'loss': 1.1406, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.29}
-{'loss': 1.1728, 'learning_rate': 0.0, 'epoch': 0.32}
-{'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32}
-100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [03:45<00:00, 1.13s/it]
-TrainOutput(global_step=200, training_loss=1.211241865158081, metrics={'train_runtime': 225.8005, 'train_samples_per_second': 3.543, 'train_steps_per_second': 0.886, 'train_loss': 1.211241865158081, 'epoch': 0.32})
+{'loss': 1.7093, 'learning_rate': 2e-05, 'epoch': 0.02}
+{'loss': 1.6595, 'learning_rate': 1.7777777777777777e-05, 'epoch': 0.03}
+{'loss': 1.5172, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.05}
+{'loss': 1.3666, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.06}
+{'loss': 1.2738, 'learning_rate': 1.1111111111111113e-05, 'epoch': 0.08}
+{'loss': 1.2199, 'learning_rate': 8.888888888888888e-06, 'epoch': 0.09}
+{'loss': 1.1703, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.11}
+{'loss': 1.108, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.12}
+{'loss': 1.1199, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.14}
+{'loss': 1.0668, 'learning_rate': 0.0, 'epoch': 0.15}
+{'train_runtime': 279.3049, 'train_samples_per_second': 2.864, 'train_steps_per_second': 0.716, 'train_loss': 1.321143569946289, 'epoch': 0.15}
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [04:39<00:00, 1.40s/it]
+TrainOutput(global_step=200, training_loss=1.321143569946289, metrics={'train_runtime': 279.3049, 'train_samples_per_second': 2.864, 'train_steps_per_second': 0.716, 'train_loss': 1.321143569946289, 'epoch': 0.15})
 ```
### 4. Merge the adapter into the original model
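The merge step referenced by this heading lives in `export_merged_model.py`, which is not part of this diff. Assuming it follows the usual PEFT flow of the alpaca-lora script it is ported from, adapter merging generally looks like the sketch below; all paths are placeholders.

```python
# Generic LoRA-merge sketch (assumption: the real export_merged_model.py follows
# the standard PEFT merge flow; paths below are placeholders).
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer
from peft import PeftModel

base_model_path = "meta-llama/Llama-2-7b-hf"    # placeholder
adapter_path = "./outputs/checkpoint-200"       # placeholder
output_path = "./llama-2-7b-qlora-merged"       # placeholder

base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base, adapter_path)

# Fold the LoRA weights into the base weights, drop the adapter wrappers, and
# save a plain Hugging Face checkpoint that can be loaded without peft.
merged = model.merge_and_unload()
merged.save_pretrained(output_path)
LlamaTokenizer.from_pretrained(base_model_path).save_pretrained(output_path)
```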


@@ -26,22 +26,38 @@ from bigdl.llm.transformers import AutoModelForCausalLM
 from datasets import load_dataset
 import argparse
+current_dir = os.path.dirname(os.path.realpath(__file__))
+common_util_path = os.path.join(current_dir, '..', '..')
+import sys
+sys.path.append(common_util_path)
+from common.utils import Prompter, get_train_val_data
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Simple example of how to qlora finetune llama2 model using bigdl-llm')
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf",
                         help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
-    parser.add_argument('--dataset', type=str, default="Abirate/english_quotes")
+    parser.add_argument('--dataset', type=str, default="yahma/alpaca-cleaned")
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
     dataset_path = args.dataset
     tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    data = load_dataset(dataset_path)
-    data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
+    if dataset_path.endswith(".json") or dataset_path.endswith(".jsonl"):
+        data = load_dataset("json", data_files=dataset_path)
+    else:
+        data = load_dataset(dataset_path)
+    # For illustration purpose, only use part of data to train
+    data = data["train"].train_test_split(train_size=0.1, shuffle=False)
+    # Data processing
+    prompter = Prompter("alpaca")
+    train_data, _ = get_train_val_data(data, tokenizer, prompter, train_on_inputs=True,
+                                       add_eos_token=False, cutoff_len=256, val_set_size=0, seed=42)
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_use_double_quant=False,
@@ -76,7 +92,7 @@ if __name__ == "__main__":
     tokenizer.padding_side = "left"
     trainer = transformers.Trainer(
         model=model,
-        train_dataset=data["train"],
+        train_dataset=train_data,
         args=transformers.TrainingArguments(
             per_device_train_batch_size=4,
             gradient_accumulation_steps= 1,
@@ -90,7 +106,9 @@ if __name__ == "__main__":
             optim="adamw_hf", # paged_adamw_8bit is not supported yet
             # gradient_checkpointing=True, # can further reduce memory but slower
         ),
-        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
+        data_collator=transformers.DataCollatorForSeq2Seq(
+            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
+        ),
     )
     model.config.use_cache = False # silence the warnings. Please re-enable for inference!
     result = trainer.train()
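In the updated script, data preparation moves into the shared `Prompter`/`get_train_val_data` helpers under `common/utils`, which are not shown in this diff. As a rough, non-authoritative sketch of what Alpaca-style preprocessing typically involves (the template wording and helper names below are assumptions, not the repository's implementation), each record is rendered into one prompt string and tokenized up to `cutoff_len`:

```python
# Illustrative only: an Alpaca-style prompt builder and tokenization step that
# approximates what Prompter("alpaca") and get_train_val_data are used for here.
ALPACA_TEMPLATE_WITH_INPUT = (
    "Below is an instruction that describes a task, paired with an input that provides "
    "further context. Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n{output}"
)
ALPACA_TEMPLATE_NO_INPUT = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:\n{output}"
)

def build_prompt(example):
    # Render one instruction/input/output record into a single training prompt.
    template = ALPACA_TEMPLATE_WITH_INPUT if example.get("input") else ALPACA_TEMPLATE_NO_INPUT
    return template.format(**example)

def tokenize_example(example, tokenizer, cutoff_len=256, add_eos_token=False):
    # Tokenize the rendered prompt; with train_on_inputs=True the labels simply
    # mirror input_ids, so the model learns the whole prompt/response sequence.
    tokens = tokenizer(build_prompt(example), truncation=True, max_length=cutoff_len)
    if add_eos_token and tokens["input_ids"][-1] != tokenizer.eos_token_id:
        tokens["input_ids"].append(tokenizer.eos_token_id)
        tokens["attention_mask"].append(1)
    tokens["labels"] = list(tokens["input_ids"])
    return tokens

# Example wiring (mirrors the call in the script above):
# train_data = data["train"].map(lambda ex: tokenize_example(ex, tokenizer))
```

Because the processed examples are variable-length and already carry `labels`, the collator is switched from `DataCollatorForLanguageModeling` to `DataCollatorForSeq2Seq`, which pads `input_ids` and `labels` together at batch time.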


@@ -8,7 +8,7 @@ To run this example with BigDL-LLM on Intel GPUs, we have some recommended requi
 ## Example: Finetune llama2-7b using qlora
-The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
+This example uses a subset of [yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned) for training. The `export_merged_model.py` is ported from [alpaca-lora](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py).
 ### 1. Install
@@ -36,14 +36,19 @@ python ./qlora_finetuning.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH
 #### Sample Output
 ```log
-{'loss': 1.7386, 'learning_rate': 8.888888888888888e-06, 'epoch': 0.19}
-{'loss': 1.9242, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.22}
-{'loss': 1.6819, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.26}
-{'loss': 1.755, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.29}
-{'loss': 1.7455, 'learning_rate': 0.0, 'epoch': 0.32}
-{'train_runtime': 172.8523, 'train_samples_per_second': 4.628, 'train_steps_per_second': 1.157, 'train_loss': 1.9101631927490235, 'epoch': 0.32}
-100%|████████████████████████████████████████████| 200/200 [02:52<00:00, 1.16it/s]
-TrainOutput(global_step=200, training_loss=1.9101631927490235, metrics={'train_runtime': 172.8523, 'train_samples_per_second': 4.628, 'train_steps_per_second': 1.157, 'train_loss': 1.9101631927490235, 'epoch': 0.32})
+{'loss': 3.1898, 'learning_rate': 2e-05, 'epoch': 0.02}
+{'loss': 3.1854, 'learning_rate': 1.7777777777777777e-05, 'epoch': 0.03}
+{'loss': 3.0359, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.05}
+{'loss': 2.9661, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.06}
+{'loss': 2.7779, 'learning_rate': 1.1111111111111113e-05, 'epoch': 0.08}
+{'loss': 2.7795, 'learning_rate': 8.888888888888888e-06, 'epoch': 0.09}
+{'loss': 2.5149, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.11}
+{'loss': 2.5759, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.12}
+{'loss': 2.5976, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.14}
+{'loss': 2.5744, 'learning_rate': 0.0, 'epoch': 0.15}
+{'train_runtime': 116.1914, 'train_samples_per_second': 6.885, 'train_steps_per_second': 1.721, 'train_loss': 2.819730052947998, 'epoch': 0.15}
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:56<00:00, 1.72it/s]
+TrainOutput(global_step=200, training_loss=2.819730052947998, metrics={'train_runtime': 116.1914, 'train_samples_per_second': 6.885, 'train_steps_per_second': 1.721, 'train_loss': 2.819730052947998, 'epoch': 0.15})
 ```
### 4. Merge the adapter into the original model


@@ -27,20 +27,37 @@ from datasets import load_dataset
 from trl import SFTTrainer
 import argparse
+current_dir = os.path.dirname(os.path.realpath(__file__))
+common_util_path = os.path.join(current_dir, '..', '..')
+import sys
+sys.path.append(common_util_path)
+from common.utils import Prompter, get_train_val_data
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Simple example of how to qlora finetune llama2 model using bigdl-llm and TRL')
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf",
                         help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
-    parser.add_argument('--dataset', type=str, default="Abirate/english_quotes")
+    parser.add_argument('--dataset', type=str, default="yahma/alpaca-cleaned")
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
     dataset_path = args.dataset
     tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    data = load_dataset(dataset_path, split="train")
+    if dataset_path.endswith(".json") or dataset_path.endswith(".jsonl"):
+        data = load_dataset("json", data_files=dataset_path)
+    else:
+        data = load_dataset(dataset_path)
+    # For illustration purpose, only use part of data to train
+    data = data["train"].train_test_split(train_size=0.1, shuffle=False)
+    # Data processing
+    prompter = Prompter("alpaca")
+    train_data, _ = get_train_val_data(data, tokenizer, prompter, train_on_inputs=True,
+                                       add_eos_token=False, cutoff_len=256, val_set_size=0, seed=42)
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
@@ -73,7 +90,7 @@ if __name__ == "__main__":
     trainer = SFTTrainer(
         model=model,
-        train_dataset=data,
+        train_dataset=train_data,
         args=transformers.TrainingArguments(
             per_device_train_batch_size=4,
             gradient_accumulation_steps= 1,
@@ -87,7 +104,7 @@ if __name__ == "__main__":
             optim="adamw_hf", # paged_adamw_8bit is not supported yet
             gradient_checkpointing=True, # can further reduce memory but slower
         ),
-        dataset_text_field="quote",
+        dataset_text_field="instruction",
     )
     model.config.use_cache = False # silence the warnings. Please re-enable for inference!
     result = trainer.train()
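In the TRL variant, `dataset_text_field` now points at the Alpaca `instruction` column instead of `quote`, while the dataset handed to `SFTTrainer` is already processed by `get_train_val_data`. For comparison only (this is not what the script above does, and the prompt wording is an assumption), TRL also supports a `formatting_func` hook that renders the full instruction/input/output prompt itself:

```python
# Comparison sketch, not part of this commit: SFTTrainer can alternatively take a
# formatting_func that turns each raw record into one training string.
from datasets import load_dataset

data = load_dataset("yahma/alpaca-cleaned", split="train[:1%]")  # small slice for illustration

def formatting_func(batch):
    # Called with a batch (dict of lists); returns one rendered prompt per example.
    texts = []
    for instruction, inp, output in zip(batch["instruction"], batch["input"], batch["output"]):
        context = f"\n\n### Input:\n{inp}" if inp else ""
        texts.append(f"### Instruction:\n{instruction}{context}\n\n### Response:\n{output}")
    return texts

print(formatting_func(data[:2])[0])  # preview the first rendered prompt
```

Such a function would be passed to `SFTTrainer` as `formatting_func=...` in place of `dataset_text_field`.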


@@ -12,7 +12,7 @@ sed -i 's/max_steps=200/max_steps=2/; s/save_steps=100/save_steps=2/; s/logging_
 python ${ANALYTICS_ZOO_ROOT}/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/qlora_finetuning.py \
   --repo-id-or-model-path ${LLAMA2_7B_ORIGIN_PATH} \
-  --dataset ${ABIRATE_ENGLISH_QUOTES_PATH}
+  --dataset ${YAHMA_ALPACA_CLEANED_PATH}
 python ${ANALYTICS_ZOO_ROOT}/python/llm/example/GPU/LLM-Finetuning/QLoRA/simple-example/export_merged_model.py \
   --repo-id-or-model-path ${LLAMA2_7B_ORIGIN_PATH} \