From f2e923b3ca3744b762e11faaf7a141f3df4bd1ba Mon Sep 17 00:00:00 2001
From: Qiyuan Gong
Date: Wed, 17 Apr 2024 09:49:11 +0800
Subject: [PATCH] Axolotl v0.4.0 support (#10773)

* Add Axolotl 0.4.0, remove legacy 0.3.0 support.
* replace is_torch_bf16_gpu_available
* Add HF_HUB_OFFLINE=1
* Move transformers out of requirement
* Refine readme and qlora.yml
---
 .../GPU/LLM-Finetuning/axolotl/README.md      |  31 +-
 .../GPU/LLM-Finetuning/axolotl/finetune.py    | 272 ++----------------
 .../GPU/LLM-Finetuning/axolotl/qlora.yml      |  23 +-
 .../axolotl/requirements-xpu.txt              |  30 +-
 .../GPU/LLM-Finetuning/axolotl/train.py       |  83 ++++++
 python/llm/src/ipex_llm/llm_patching.py       |   2 +
 python/llm/src/ipex_llm/transformers/utils.py |   4 +
 7 files changed, 165 insertions(+), 280 deletions(-)
 create mode 100644 python/llm/example/GPU/LLM-Finetuning/axolotl/train.py

diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/README.md b/python/llm/example/GPU/LLM-Finetuning/axolotl/README.md
index ae6c7ca4..206e98ae 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/README.md
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/README.md
@@ -1,6 +1,6 @@
-# Finetune LLM on Intel GPU using axolotl v0.3.0 without writing code
+# Finetune LLM on Intel GPU using axolotl v0.4.0 without writing code
 
-This example demonstrates how to easily run LLM finetuning application using [axolotl v0.3.0](https://github.com/OpenAccess-AI-Collective/axolotl/tree/v0.3.0) and IPEX-LLM 4bit optimizations with [Intel GPUs](../../../README.md). By applying IPEX-LLM patch, you could use axolotl on Intel GPUs using IPEX-LLM optimization without writing code.
+This example demonstrates how to easily run an LLM finetuning application using [axolotl v0.4.0](https://github.com/OpenAccess-AI-Collective/axolotl/tree/v0.4.0) and IPEX-LLM 4-bit optimizations on [Intel GPUs](../../../README.md). By applying the IPEX-LLM patch, you can use axolotl on Intel GPUs with IPEX-LLM optimizations without writing any code.
 
 Note: this example only illustrates the related usage and does not guarantee training convergence.
 
@@ -15,23 +15,24 @@ conda create -n llm python=3.11
 conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-# install axolotl v0.3.0
+# install axolotl v0.4.0
 git clone https://github.com/OpenAccess-AI-Collective/axolotl
 cd axolotl
-git checkout v0.3.0
+git checkout v0.4.0
 cp ../requirements-xpu.txt requirements.txt
 pip install -e .
+pip install transformers==4.36.0
 ```
 
 ### 2. Configure OneAPI environment variables and accelerate
 
-Configures OneAPI environment variables
+#### 2.1 Configure OneAPI environment variables
 
 ```bash
 source /opt/intel/oneapi/setvars.sh
 ```
 
-Configures `accelerate` in command line interactively.
+#### 2.2 Configure `accelerate` interactively in the command line
 
 ```bash
 accelerate config
@@ -41,16 +42,30 @@ Please answer `NO` in option `Do you want to run your training on CPU only (even
 
 After finishing `accelerate config`, check that `use_cpu` is disabled (i.e., `use_cpu: false`) in the accelerate config file (`~/.cache/huggingface/accelerate/default_config.yaml`).
 
+#### 2.3 (Optional) Set `HF_HUB_OFFLINE=1` to avoid Hugging Face Hub sign-in
+
+```bash
+export HF_HUB_OFFLINE=1
+```
+
+For more details, please refer to [HF_HUB_OFFLINE](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhuboffline).
+
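+As a quick sanity check before launching finetuning, you can verify both settings from the shell (a minimal sketch, assuming the default `accelerate` config path shown above):
+
+```bash
+# accelerate must not force CPU-only training
+grep use_cpu ~/.cache/huggingface/accelerate/default_config.yaml   # expect: use_cpu: false
+# optional: confirm offline mode if you exported HF_HUB_OFFLINE above
+echo $HF_HUB_OFFLINE                                               # expect: 1
+```
+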
 ### 3. Finetune Llama-2-7B
 
-This example shows how to run [Alpaca QLoRA finetune on Llama-2](https://github.com/artidoro/qlora) directly on Intel GPU, based on [axolotl Llama-2 qlora example](https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/examples/llama-2/qlora.yml). Note that only Llama-2-7B QLora example is verified on Intel ARC 770 with 16GB memory.
+This example shows how to run [Alpaca QLoRA finetuning on Llama-2](https://github.com/artidoro/qlora) directly on Intel GPU, based on the [axolotl Llama-2 qlora example](https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/examples/llama-2/qlora.yml). Note that only the Llama-2-7B QLoRA example has been verified on an Intel Arc A770 with 16GB memory.
 
-Modify parameters in `qlora.yml` based on your requirements.
+Modify the parameters in `qlora.yml` based on your requirements. Then launch finetuning with the following command.
 
 ```
 accelerate launch finetune.py qlora.yml
 ```
 
+In v0.4.0, you can also launch finetuning with `train.py` instead of `-m axolotl.cli.train` or `finetune.py`.
+
+```
+accelerate launch train.py qlora.yml
+```
+
 Output in console
 
 ```
diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/finetune.py b/python/llm/example/GPU/LLM-Finetuning/axolotl/finetune.py
index 15d9e7fc..2cf98f7d 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/finetune.py
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/finetune.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 # This file is copied from
-# https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/scripts/finetune.py
+# https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/scripts/finetune.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,264 +33,40 @@ llm_patch(train=True)
 # The following is the original axolotl finetune code (without IPEX-LLM)
 """Prepare and train a model on a dataset.
Can also infer from a model or merge lora""" - -import importlib import logging -import os -import random -import sys from pathlib import Path -from typing import Any, Dict, List, Optional, Union import fire -import torch import transformers -import yaml -# add src to the pythonpath so we don't need to pip install this -from art import text2art -from transformers import GenerationConfig, TextStreamer +from axolotl.cli import ( + check_accelerate_default_config, + check_user_token, + do_inference, + do_merge_lora, + load_cfg, + load_datasets, + print_axolotl_text_art, +) +from axolotl.cli.shard import shard +from axolotl.common.cli import TrainerCliArgs +from axolotl.train import train -from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer -from axolotl.logging_config import configure_logging -from axolotl.train import TrainDatasetMeta, train -from axolotl.utils.config import normalize_config, validate_config -from axolotl.utils.data import prepare_dataset -from axolotl.utils.dict import DictDefault -from axolotl.utils.distributed import is_main_process -from axolotl.utils.models import load_tokenizer -from axolotl.utils.tokenization import check_dataset_labels -from axolotl.utils.wandb import setup_wandb_env_vars - -project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) -src_dir = os.path.join(project_root, "src") -sys.path.insert(0, src_dir) - -configure_logging() -LOG = logging.getLogger("axolotl.scripts") - -os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" - - -def print_axolotl_text_art(suffix=None): - font = "nancyj" - ascii_text = " axolotl" - if suffix: - ascii_text += f" x {suffix}" - ascii_art = text2art(" axolotl", font=font) - - if is_main_process(): - print(ascii_art) - - -def get_multi_line_input() -> Optional[str]: - print("Give me an instruction (Ctrl + D to finish): ") - instruction = "" - for line in sys.stdin: - instruction += line # pylint: disable=consider-using-join - # instruction = pathlib.Path("/proc/self/fd/0").read_text() - return instruction - - -def do_merge_lora( - *, - cfg: DictDefault, - cli_args: TrainerCliArgs, -): - model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args) - safe_serialization = cfg.save_safetensors is True - - LOG.info("running merge of LoRA with base model") - model = model.merge_and_unload() - model.to(dtype=torch.float16) - - if cfg.local_rank == 0: - LOG.info("saving merged model") - model.save_pretrained( - str(Path(cfg.output_dir) / "merged"), - safe_serialization=safe_serialization, - ) - tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged")) - - -def shard( - *, - cfg: DictDefault, - cli_args: TrainerCliArgs, -): - model, _ = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args) - safe_serialization = cfg.save_safetensors is True - LOG.debug("Re-saving model w/ sharding") - model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization) - - -def do_inference( - *, - cfg: DictDefault, - cli_args: TrainerCliArgs, -): - model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args) - prompter = cli_args.prompter - default_tokens = {"unk_token": "", "bos_token": "", "eos_token": ""} - - for token, symbol in default_tokens.items(): - # If the token isn't already specified in the config, add it - if not (cfg.special_tokens and token in cfg.special_tokens): - tokenizer.add_special_tokens({token: symbol}) - - prompter_module = None - if prompter: - prompter_module = getattr( - importlib.import_module("axolotl.prompters"), prompter - ) - - if 
cfg.landmark_attention: - from axolotl.monkeypatch.llama_landmark_attn import set_model_mem_id - - set_model_mem_id(model, tokenizer) - model.set_mem_cache_args( - max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None - ) - - model = model.to(cfg.device) - - while True: - print("=" * 80) - # support for multiline inputs - instruction = get_multi_line_input() - if not instruction: - return - if prompter_module: - prompt: str = next( - prompter_module().build_prompt(instruction=instruction.strip("\n")) - ) - else: - prompt = instruction.strip() - batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) - - print("=" * 40) - model.eval() - with torch.no_grad(): - generation_config = GenerationConfig( - repetition_penalty=1.1, - max_new_tokens=1024, - temperature=0.9, - top_p=0.95, - top_k=40, - bos_token_id=tokenizer.bos_token_id, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.pad_token_id, - do_sample=True, - use_cache=True, - return_dict_in_generate=True, - output_attentions=False, - output_hidden_states=False, - output_scores=False, - ) - streamer = TextStreamer(tokenizer) - generated = model.generate( - inputs=batch["input_ids"].to(cfg.device), - generation_config=generation_config, - streamer=streamer, - ) - print("=" * 40) - print(tokenizer.decode(generated["sequences"].cpu().tolist()[0])) - - -def choose_config(path: Path): - yaml_files = list(path.glob("*.yml")) - - if not yaml_files: - raise ValueError( - "No YAML config files found in the specified directory. Are you using a .yml extension?" - ) - - if len(yaml_files) == 1: - print(f"Using default YAML file '{yaml_files[0]}'") - return yaml_files[0] - - print("Choose a YAML file:") - for idx, file in enumerate(yaml_files): - print(f"{idx + 1}. {file}") - - chosen_file = None - while chosen_file is None: - try: - choice = int(input("Enter the number of your choice: ")) - if 1 <= choice <= len(yaml_files): - chosen_file = yaml_files[choice - 1] - else: - print("Invalid choice. Please choose a number from the list.") - except ValueError: - print("Invalid input. 
Please enter a number.") - - return chosen_file - - -def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> bool: - return not any(el in list2 for el in list1) - - -def load_cfg(config: Path = Path("examples/"), **kwargs): - if Path(config).is_dir(): - config = choose_config(config) - - # load the config from the yaml file - with open(config, encoding="utf-8") as file: - cfg: DictDefault = DictDefault(yaml.safe_load(file)) - # if there are any options passed in the cli, if it is something that seems valid from the yaml, - # then overwrite the value - cfg_keys = cfg.keys() - for k, _ in kwargs.items(): - # if not strict, allow writing to cfg even if it's not in the yml already - if k in cfg_keys or not cfg.strict: - # handle booleans - if isinstance(cfg[k], bool): - cfg[k] = bool(kwargs[k]) - else: - cfg[k] = kwargs[k] - - validate_config(cfg) - - normalize_config(cfg) - - setup_wandb_env_vars(cfg) - return cfg - - -def load_datasets( - *, - cfg: DictDefault, - cli_args: TrainerCliArgs, -) -> TrainDatasetMeta: - tokenizer = load_tokenizer(cfg) - - train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer) - - if cli_args.debug or cfg.debug: - LOG.info("check_dataset_labels...") - check_dataset_labels( - train_dataset.select( - [ - random.randrange(0, len(train_dataset) - 1) # nosec - for _ in range(cli_args.debug_num_examples) - ] - ), - tokenizer, - num_examples=cli_args.debug_num_examples, - text_only=cli_args.debug_text_only, - ) - - return TrainDatasetMeta( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - total_num_steps=total_num_steps, - ) +LOG = logging.getLogger("axolotl.scripts.finetune") def do_cli(config: Path = Path("examples/"), **kwargs): print_axolotl_text_art() + LOG.warning( + str( + PendingDeprecationWarning( + "scripts/finetune.py will be replaced with calling axolotl.cli.train" + ) + ) + ) parsed_cfg = load_cfg(config, **kwargs) + check_accelerate_default_config() + check_user_token() parser = transformers.HfArgumentParser((TrainerCliArgs)) parsed_cli_args, _ = parser.parse_args_into_dataclasses( return_remaining_strings=True @@ -303,8 +79,6 @@ def do_cli(config: Path = Path("examples/"), **kwargs): shard(cfg=parsed_cfg, cli_args=parsed_cli_args) else: dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args) - if parsed_cli_args.prepare_ds_only: - return train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta) diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml b/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml index 7e5c2fbe..c6f446f9 100644 --- a/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml +++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml @@ -1,6 +1,5 @@ -# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/examples/llama-2/qlora.yml -base_model: meta-llama/Llama-2-7b-hf -base_model_config: meta-llama/Llama-2-7b-hf +# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/examples/llama-2/qlora.yml +base_model: NousResearch/Llama-2-7b-hf model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer is_llama_derived_model: true @@ -12,8 +11,8 @@ strict: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca -dataset_prepared_path: last_run_prepared -val_set_size: 0.01 +dataset_prepared_path: +val_set_size: 0.05 output_dir: ./qlora-out adapter: qlora @@ -33,15 +32,12 @@ lora_fan_in_fan_out: wandb_project: wandb_entity: wandb_watch: -wandb_run_id: +wandb_name: 
wandb_log_model: -gradient_accumulation_steps: 2 +gradient_accumulation_steps: 4 micro_batch_size: 1 -num_epochs: 3 -# paged_adamw_32bit is not supported -# due to bitsandbytes issue https://github.com/TimDettmers/bitsandbytes/issues/1180 -# optimizer: paged_adamw_32bit +num_epochs: 4 optimizer: adamw_torch lr_scheduler: cosine learning_rate: 0.0002 @@ -61,8 +57,9 @@ xformers_attention: flash_attention: false warmup_steps: 10 -eval_steps: 20 -save_steps: +evals_per_epoch: 4 +eval_table_size: +saves_per_epoch: 1 debug: deepspeed: weight_decay: 0.0 diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/requirements-xpu.txt b/python/llm/example/GPU/LLM-Finetuning/axolotl/requirements-xpu.txt index 942a5ea0..e2862cec 100644 --- a/python/llm/example/GPU/LLM-Finetuning/axolotl/requirements-xpu.txt +++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/requirements-xpu.txt @@ -1,28 +1,26 @@ -# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/requirements.txt ---extra-index-url https://download.pytorch.org/whl/cu118 +# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/requirements.txt --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ -# torch==2.1.0 -# auto-gptq -packaging +packaging==23.2 peft==0.5.0 -transformers==4.34.0 +tokenizers bitsandbytes>=0.41.1 accelerate==0.23.0 +deepspeed>=0.13.1 addict -evaluate fire PyYAML>=6.0 datasets -flash-attn>=2.2.1 +#flash-attn==2.3.3 sentencepiece wandb einops -# xformers -optimum +#xformers==0.0.22 +optimum==1.13.2 hf_transfer colorama numba numpy>=1.24.4 +mlflow # qlora things bert-score==0.3.13 evaluate==0.4.0 @@ -31,3 +29,15 @@ scipy scikit-learn==1.2.2 pynvml art +fschat==0.2.34 +gradio==3.50.2 +tensorboard + +mamba-ssm==1.1.1 + +# remote filesystems +s3fs +gcsfs +# adlfs + +trl>=0.7.9 diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/train.py b/python/llm/example/GPU/LLM-Finetuning/axolotl/train.py new file mode 100644 index 00000000..9db65470 --- /dev/null +++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/train.py @@ -0,0 +1,83 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file is copied from +# https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/src/axolotl/cli/train.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
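+#
+# Note: as in finetune.py, llm_patch(train=True) below is applied before the
+# axolotl/transformers imports, so that IPEX-LLM's replacement model classes
+# (e.g. AutoModelForCausalLM) are already in place when axolotl loads the model.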
+ +from ipex_llm import llm_patch +llm_patch(train=True) +# The following is the original axolotl train code (without IPEX-LLM) + +""" +CLI to run training on a model +""" +import logging +from pathlib import Path +from typing import Tuple + +import fire +import transformers +from transformers import PreTrainedModel, PreTrainedTokenizer + +from axolotl.cli import ( + check_accelerate_default_config, + check_user_token, + load_cfg, + load_datasets, + load_rl_datasets, + print_axolotl_text_art, +) +from axolotl.common.cli import TrainerCliArgs +from axolotl.train import train + +LOG = logging.getLogger("axolotl.cli.train") + + +def do_cli(config: Path = Path("examples/"), **kwargs): + # pylint: disable=duplicate-code + parsed_cfg = load_cfg(config, **kwargs) + parser = transformers.HfArgumentParser((TrainerCliArgs)) + parsed_cli_args, _ = parser.parse_args_into_dataclasses( + return_remaining_strings=True + ) + return do_train(parsed_cfg, parsed_cli_args) + + +def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]: + print_axolotl_text_art() + check_accelerate_default_config() + check_user_token() + if cfg.rl: + dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args) + else: + dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + + return train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) + + +if __name__ == "__main__": + fire.Fire(do_cli) diff --git a/python/llm/src/ipex_llm/llm_patching.py b/python/llm/src/ipex_llm/llm_patching.py index 8c0a94e5..d68fac0c 100644 --- a/python/llm/src/ipex_llm/llm_patching.py +++ b/python/llm/src/ipex_llm/llm_patching.py @@ -47,6 +47,8 @@ def llm_patch(train=False): replace_attr(transformers, "AutoModelForCausalLM", AutoModelForCausalLM) replace_attr(transformers, "LlamaForCausalLM", AutoModelForCausalLM) replace_attr(transformers, "AutoModel", AutoModel) + from ipex_llm.transformers.utils import is_torch_bf16_gpu_available + replace_attr(transformers.utils, "is_torch_bf16_gpu_available", is_torch_bf16_gpu_available) import_peft_check = 'peft' in sys.modules or 'peft.utils' in sys.modules or \ 'peft.tuners' in sys.modules or 'peft.mapping' in sys.modules diff --git a/python/llm/src/ipex_llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py index b7873045..1e728473 100644 --- a/python/llm/src/ipex_llm/transformers/utils.py +++ b/python/llm/src/ipex_llm/transformers/utils.py @@ -335,3 +335,7 @@ def get_modelscope_hf_config(model_id_or_path: str, elif os.path.isfile(model_id_or_path): local_path = model_id_or_path return Config._file2dict(local_path) + +def is_torch_bf16_gpu_available(): + # always true for XPU and CPU + return True
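+
+# Rationale (assumed from the patch above, not stated in the original code):
+# transformers' stock is_torch_bf16_gpu_available() is a CUDA-oriented check
+# that reports False on Intel XPU, so llm_patch() swaps in this always-True
+# stub to keep bf16 training configs usable on XPU and CPU.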