From 581ebf610494f90259b8d3ff38581bb7473c3753 Mon Sep 17 00:00:00 2001
From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com>
Date: Thu, 18 Apr 2024 13:47:41 +0800
Subject: [PATCH] GaLore Finetuning Example (#10722)

* GaLore Finetuning Example

* Update README.md

* Update README.md

* change data to HuggingFaceH4/helpful_instructions

* Update README.md

* Update README.md

* shrink train size and delete cache before starting training to save memory

* Update README.md

* Update galore_finetuning.py

* change model to llama2 3b

* Update README.md
---
 .../GPU/LLM-Finetuning/GaLore/README.md       | 54 ++++++++++
 .../GaLore/galore_finetuning.py               | 99 +++++++++++++++++++
 2 files changed, 153 insertions(+)
 create mode 100644 python/llm/example/GPU/LLM-Finetuning/GaLore/README.md
 create mode 100644 python/llm/example/GPU/LLM-Finetuning/GaLore/galore_finetuning.py

diff --git a/python/llm/example/GPU/LLM-Finetuning/GaLore/README.md b/python/llm/example/GPU/LLM-Finetuning/GaLore/README.md
new file mode 100644
index 00000000..d39e2b92
--- /dev/null
+++ b/python/llm/example/GPU/LLM-Finetuning/GaLore/README.md
@@ -0,0 +1,54 @@
+# GaLore Finetuning with IPEX-LLM
+
+This is an example of IPEX-LLM GaLore fine-tuning on [Intel GPU](../../../README.md). It follows the [Hugging Face GaLore blog](https://huggingface.co/blog/galore), changing the model to [openlm-research/open_llama_3b_v2](https://huggingface.co/openlm-research/open_llama_3b_v2) and the dataset to [HuggingFaceH4/helpful_instructions](https://huggingface.co/datasets/HuggingFaceH4/helpful_instructions).
+
+### 0. Requirements
+To run this example with IPEX-LLM on Intel GPUs, there are some recommended requirements for your machine; please refer to [here](../../../README.md#requirements) for more information.
+
+### 1. Install
+
+```bash
+conda create -n llm python=3.11
+conda activate llm
+# the command below installs intel_extension_for_pytorch==2.1.10+xpu by default
+pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+pip install galore-torch
+pip install accelerate==0.28.0
+pip install bitsandbytes==0.43.0
+pip install datasets==2.18.0
+pip install transformers==4.39.1
+pip install trl==0.8.1
+```
+
+### 2. GaLore Finetune
+
+Currently, GaLore only supports local fine-tuning. Here is how to fine-tune `open_llama_3b_v2` on an Intel Max GPU server:
+
+```bash
+# configure OneAPI environment variables
+source /opt/intel/oneapi/setvars.sh
+python galore_finetuning.py # optional parameters as below
+```
+
+Optional parameters for `galore_finetuning.py`:
+
+**--repo-id-or-model-path** : defaults to `openlm-research/open_llama_3b_v2`; you can also specify a local model path.
+
+**--data-path** : defaults to `HuggingFaceH4/helpful_instructions`; you can also specify a local data path. Note that switching to another dataset will require some code changes on your side.
+
+**--output-dir** : defaults to `./ipex-llm-galore`, where the fine-tuned model is saved; change it if needed.
+
+### 3. Sample Output
+```log
+......
+{'loss': 2.0989, 'grad_norm': 0.0, 'learning_rate': 0.001, 'epoch': 0.0}
+{'loss': 1.9064, 'grad_norm': 0.0, 'learning_rate': 0.001, 'epoch': 0.0}
+{'loss': 1.7483, 'grad_norm': 0.0, 'learning_rate': 0.001, 'epoch': 0.01}
+{'loss': 1.9551, 'grad_norm': 0.0, 'learning_rate': 0.001, 'epoch': 0.01}
+{'loss': 1.783, 'grad_norm': 0.0, 'learning_rate': 0.001, 'epoch': 0.01}
+{'loss': 1.3328, 'grad_norm': 0.0, 'learning_rate': 0.001, 'epoch': 0.01}
+{'loss': 1.4622, 'grad_norm': 0.0, 'learning_rate': 0.001, 'epoch': 0.01}
+{'loss': 1.9094, 'grad_norm': 0.0, 'learning_rate': 0.001, 'epoch': 0.02}
+ 5%|████▏          | 70/1500 [xx:xx
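
---

The diff for `galore_finetuning.py` itself is truncated out of this patch, so its argument handling is not shown. As a rough illustration only, here is a hypothetical, stdlib-only sketch of how the three flags documented in the README (`--repo-id-or-model-path`, `--data-path`, `--output-dir`) and their stated defaults could be wired up with `argparse`; the function name `build_parser` and all help strings are assumptions, not the actual script:

```python
import argparse

def build_parser():
    # Hypothetical reconstruction of the CLI described in the README's
    # "Optional parameters" section; only flag names and defaults are
    # taken from the document, everything else is illustrative.
    parser = argparse.ArgumentParser(
        description="GaLore fine-tuning example (sketch)")
    parser.add_argument("--repo-id-or-model-path", type=str,
                        default="openlm-research/open_llama_3b_v2",
                        help="Hugging Face repo id or local model path")
    parser.add_argument("--data-path", type=str,
                        default="HuggingFaceH4/helpful_instructions",
                        help="Hugging Face dataset id or local data path")
    parser.add_argument("--output-dir", type=str,
                        default="./ipex-llm-galore",
                        help="directory where the fine-tuned model is saved")
    return parser

# Parsing an empty argv yields the documented defaults.
args = build_parser().parse_args([])
print(args.output_dir)  # → ./ipex-llm-galore
```

Note that `argparse` converts the dashes in `--repo-id-or-model-path` to underscores, so the value is read back as `args.repo_id_or_model_path`.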