Axolotl v0.4.0 support (#10773)

* Add Axolotl 0.4.0, remove legacy 0.3.0 support.
* Replace is_torch_bf16_gpu_available.
* Add HF_HUB_OFFLINE=1.
* Move transformers out of requirements.
* Refine README and qlora.yml.
Qiyuan Gong 2024-04-17 09:49:11 +08:00 committed by GitHub
parent 26cae0a39c
commit f2e923b3ca
7 changed files with 165 additions and 280 deletions

@@ -1,6 +1,6 @@
-# Finetune LLM on Intel GPU using axolotl v0.3.0 without writing code
+# Finetune LLM on Intel GPU using axolotl v0.4.0 without writing code
-This example demonstrates how to easily run LLM finetuning application using [axolotl v0.3.0](https://github.com/OpenAccess-AI-Collective/axolotl/tree/v0.3.0) and IPEX-LLM 4bit optimizations with [Intel GPUs](../../../README.md). By applying IPEX-LLM patch, you could use axolotl on Intel GPUs using IPEX-LLM optimization without writing code.
+This example demonstrates how to easily run LLM finetuning application using [axolotl v0.4.0](https://github.com/OpenAccess-AI-Collective/axolotl/tree/v0.4.0) and IPEX-LLM 4bit optimizations with [Intel GPUs](../../../README.md). By applying IPEX-LLM patch, you could use axolotl on Intel GPUs using IPEX-LLM optimization without writing code.
 Note, this example is just used for illustrating related usage and don't guarantee convergence of training.
@@ -15,23 +15,24 @@ conda create -n llm python=3.11
 conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-# install axolotl v0.3.0
+# install axolotl v0.4.0
 git clone https://github.com/OpenAccess-AI-Collective/axolotl
 cd axolotl
-git checkout v0.3.0
+git checkout v0.4.0
 cp ../requirements-xpu.txt requirements.txt
 pip install -e .
+pip install transformers==4.36.0
 ```
 ### 2. Configures OneAPI environment variables and accelerate
-Configures OneAPI environment variables
+#### 2.1 Configures OneAPI environment variables
 ```bash
 source /opt/intel/oneapi/setvars.sh
 ```
-Configures `accelerate` in command line interactively.
+#### 2.2 Configures `accelerate` in command line interactively.
 ```bash
 accelerate config
@@ -41,16 +42,30 @@ Please answer `NO` in option `Do you want to run your training on CPU only (even
 After finish accelerate config, check if `use_cpu` is disable (i.e., ` use_cpu: false`) in accelerate config file (`~/.cache/huggingface/accelerate/default_config.yaml`).
+#### 2.3 (Optional) Set `HF_HUB_OFFLINE=1` to avoid Hugging Face Hub sign-in.
+```bash
+export HF_HUB_OFFLINE=1
+```
+For more details, please refer to [HF_HUB_OFFLINE](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhuboffline).
 ### 3. Finetune Llama-2-7B
-This example shows how to run [Alpaca QLoRA finetune on Llama-2](https://github.com/artidoro/qlora) directly on Intel GPU, based on [axolotl Llama-2 qlora example](https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/examples/llama-2/qlora.yml). Note that only Llama-2-7B QLora example is verified on Intel ARC 770 with 16GB memory.
+This example shows how to run [Alpaca QLoRA finetune on Llama-2](https://github.com/artidoro/qlora) directly on Intel GPU, based on [axolotl Llama-2 qlora example](https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/examples/llama-2/qlora.yml). Note that only Llama-2-7B QLora example is verified on Intel ARC 770 with 16GB memory.
-Modify parameters in `qlora.yml` based on your requirements.
+Modify parameters in `qlora.yml` based on your requirements. Then, launch finetuning with the following command.
 ```
 accelerate launch finetune.py qlora.yml
 ```
+In v0.4.0, you can also use `train.py` instead of `-m axolotl.cli.train` or `finetune.py`.
+```
+accelerate launch train.py qlora.yml
+```
 Output in console
 ```
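Before launching, it can help to double-check the two pieces of state the README above relies on: the `accelerate` default config and the `HF_HUB_OFFLINE` variable. A small sanity-check sketch (our suggestion, not part of the commit; the config path and the `use_cpu` key are the ones named in the README):

```python
# Verify that `accelerate config` produced a GPU config and that HF_HUB_OFFLINE is set.
import os
from pathlib import Path

import yaml

config_path = Path.home() / ".cache/huggingface/accelerate/default_config.yaml"
accelerate_cfg = yaml.safe_load(config_path.read_text())

assert accelerate_cfg.get("use_cpu") is False, "accelerate is still set to CPU-only training"
print("HF_HUB_OFFLINE =", os.environ.get("HF_HUB_OFFLINE", "<unset>"))
```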

@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 # This file is copied from
-# https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/scripts/finetune.py
+# https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/scripts/finetune.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,264 +33,40 @@ llm_patch(train=True)
 # The following is the original axolotl finetune code (without IPEX-LLM)
 """Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
-import importlib
 import logging
-import os
-import random
-import sys
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
 import fire
-import torch
 import transformers
-import yaml
-# add src to the pythonpath so we don't need to pip install this
+from axolotl.cli import (
-from art import text2art
+    check_accelerate_default_config,
-from transformers import GenerationConfig, TextStreamer
+    check_user_token,
+    do_inference,
-from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
+    do_merge_lora,
-from axolotl.logging_config import configure_logging
+    load_cfg,
-from axolotl.train import TrainDatasetMeta, train
+    load_datasets,
-from axolotl.utils.config import normalize_config, validate_config
+    print_axolotl_text_art,
-from axolotl.utils.data import prepare_dataset
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import is_main_process
-from axolotl.utils.models import load_tokenizer
-from axolotl.utils.tokenization import check_dataset_labels
-from axolotl.utils.wandb import setup_wandb_env_vars
-project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
-src_dir = os.path.join(project_root, "src")
-sys.path.insert(0, src_dir)
-configure_logging()
-LOG = logging.getLogger("axolotl.scripts")
-os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-
-def print_axolotl_text_art(suffix=None):
-    font = "nancyj"
-    ascii_text = " axolotl"
-    if suffix:
-        ascii_text += f" x {suffix}"
-    ascii_art = text2art(" axolotl", font=font)
-    if is_main_process():
-        print(ascii_art)
-
-def get_multi_line_input() -> Optional[str]:
-    print("Give me an instruction (Ctrl + D to finish): ")
-    instruction = ""
-    for line in sys.stdin:
-        instruction += line  # pylint: disable=consider-using-join
-    # instruction = pathlib.Path("/proc/self/fd/0").read_text()
-    return instruction
-
-def do_merge_lora(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-):
-    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
-    safe_serialization = cfg.save_safetensors is True
-    LOG.info("running merge of LoRA with base model")
-    model = model.merge_and_unload()
-    model.to(dtype=torch.float16)
-    if cfg.local_rank == 0:
-        LOG.info("saving merged model")
-        model.save_pretrained(
-            str(Path(cfg.output_dir) / "merged"),
-            safe_serialization=safe_serialization,
-        )
+)
-        tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
+from axolotl.cli.shard import shard
+from axolotl.common.cli import TrainerCliArgs
+from axolotl.train import train
+LOG = logging.getLogger("axolotl.scripts.finetune")
-def shard(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-):
-    model, _ = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
-    safe_serialization = cfg.save_safetensors is True
-    LOG.debug("Re-saving model w/ sharding")
-    model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
-
-def do_inference(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-):
-    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
-    prompter = cli_args.prompter
-    default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
-    for token, symbol in default_tokens.items():
-        # If the token isn't already specified in the config, add it
-        if not (cfg.special_tokens and token in cfg.special_tokens):
-            tokenizer.add_special_tokens({token: symbol})
-    prompter_module = None
-    if prompter:
-        prompter_module = getattr(
-            importlib.import_module("axolotl.prompters"), prompter
-        )
-    if cfg.landmark_attention:
-        from axolotl.monkeypatch.llama_landmark_attn import set_model_mem_id
-        set_model_mem_id(model, tokenizer)
-        model.set_mem_cache_args(
-            max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
-        )
-    model = model.to(cfg.device)
-    while True:
-        print("=" * 80)
-        # support for multiline inputs
-        instruction = get_multi_line_input()
-        if not instruction:
-            return
-        if prompter_module:
-            prompt: str = next(
-                prompter_module().build_prompt(instruction=instruction.strip("\n"))
-            )
-        else:
-            prompt = instruction.strip()
-        batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
-        print("=" * 40)
-        model.eval()
-        with torch.no_grad():
-            generation_config = GenerationConfig(
-                repetition_penalty=1.1,
-                max_new_tokens=1024,
-                temperature=0.9,
-                top_p=0.95,
-                top_k=40,
-                bos_token_id=tokenizer.bos_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-                pad_token_id=tokenizer.pad_token_id,
-                do_sample=True,
-                use_cache=True,
-                return_dict_in_generate=True,
-                output_attentions=False,
-                output_hidden_states=False,
-                output_scores=False,
-            )
-            streamer = TextStreamer(tokenizer)
-            generated = model.generate(
-                inputs=batch["input_ids"].to(cfg.device),
-                generation_config=generation_config,
-                streamer=streamer,
-            )
-        print("=" * 40)
-        print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))
-
-def choose_config(path: Path):
-    yaml_files = list(path.glob("*.yml"))
-    if not yaml_files:
-        raise ValueError(
-            "No YAML config files found in the specified directory. Are you using a .yml extension?"
-        )
-    if len(yaml_files) == 1:
-        print(f"Using default YAML file '{yaml_files[0]}'")
-        return yaml_files[0]
-    print("Choose a YAML file:")
-    for idx, file in enumerate(yaml_files):
-        print(f"{idx + 1}. {file}")
-    chosen_file = None
-    while chosen_file is None:
-        try:
-            choice = int(input("Enter the number of your choice: "))
-            if 1 <= choice <= len(yaml_files):
-                chosen_file = yaml_files[choice - 1]
-            else:
-                print("Invalid choice. Please choose a number from the list.")
-        except ValueError:
-            print("Invalid input. Please enter a number.")
-    return chosen_file
-
-def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> bool:
-    return not any(el in list2 for el in list1)
-
-def load_cfg(config: Path = Path("examples/"), **kwargs):
-    if Path(config).is_dir():
-        config = choose_config(config)
-    # load the config from the yaml file
-    with open(config, encoding="utf-8") as file:
-        cfg: DictDefault = DictDefault(yaml.safe_load(file))
-    # if there are any options passed in the cli, if it is something that seems valid from the yaml,
-    # then overwrite the value
-    cfg_keys = cfg.keys()
-    for k, _ in kwargs.items():
-        # if not strict, allow writing to cfg even if it's not in the yml already
-        if k in cfg_keys or not cfg.strict:
-            # handle booleans
-            if isinstance(cfg[k], bool):
-                cfg[k] = bool(kwargs[k])
-            else:
-                cfg[k] = kwargs[k]
-    validate_config(cfg)
-    normalize_config(cfg)
-    setup_wandb_env_vars(cfg)
-    return cfg
-
-def load_datasets(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-) -> TrainDatasetMeta:
-    tokenizer = load_tokenizer(cfg)
-    train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer)
-    if cli_args.debug or cfg.debug:
-        LOG.info("check_dataset_labels...")
-        check_dataset_labels(
-            train_dataset.select(
-                [
-                    random.randrange(0, len(train_dataset) - 1)  # nosec
-                    for _ in range(cli_args.debug_num_examples)
-                ]
-            ),
-            tokenizer,
-            num_examples=cli_args.debug_num_examples,
-            text_only=cli_args.debug_text_only,
-        )
-    return TrainDatasetMeta(
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        total_num_steps=total_num_steps,
-    )
 def do_cli(config: Path = Path("examples/"), **kwargs):
     print_axolotl_text_art()
-    LOG.warning(
-        str(
-            PendingDeprecationWarning(
-                "scripts/finetune.py will be replaced with calling axolotl.cli.train"
-            )
-        )
-    )
     parsed_cfg = load_cfg(config, **kwargs)
+    check_accelerate_default_config()
+    check_user_token()
     parser = transformers.HfArgumentParser((TrainerCliArgs))
     parsed_cli_args, _ = parser.parse_args_into_dataclasses(
         return_remaining_strings=True
@@ -303,8 +79,6 @@ def do_cli(config: Path = Path("examples/"), **kwargs):
         shard(cfg=parsed_cfg, cli_args=parsed_cli_args)
     else:
         dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
-        if parsed_cli_args.prepare_ds_only:
-            return
         train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta)
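With axolotl v0.4.0 shipping `do_inference`, `do_merge_lora`, `load_cfg`, `load_datasets` and friends in `axolotl.cli`, the patched `finetune.py` shrinks to a thin wrapper, and the only IPEX-LLM-specific lines are the two at the top of the file. The ordering constraint they impose is worth spelling out: `llm_patch(train=True)` must run before any axolotl import, so the transformers classes axolotl binds at import time are already the patched ones. A minimal sketch of that ordering (module names are taken from this diff and from the `llm_patch` change later in this commit):

```python
# Patch first: this swaps transformers.AutoModelForCausalLM / LlamaForCausalLM / AutoModel
# and transformers.utils.is_torch_bf16_gpu_available for their IPEX-LLM counterparts.
from ipex_llm import llm_patch
llm_patch(train=True)

# Only then import axolotl, so it picks up the patched classes.
from axolotl.cli import load_cfg, load_datasets
from axolotl.train import train
```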

@@ -1,6 +1,5 @@
-# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/examples/llama-2/qlora.yml
+# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/examples/llama-2/qlora.yml
-base_model: meta-llama/Llama-2-7b-hf
+base_model: NousResearch/Llama-2-7b-hf
-base_model_config: meta-llama/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 is_llama_derived_model: true
@@ -12,8 +11,8 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
-val_set_size: 0.01
+val_set_size: 0.05
 output_dir: ./qlora-out
 adapter: qlora
@@ -33,15 +32,12 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
-gradient_accumulation_steps: 2
+gradient_accumulation_steps: 4
 micro_batch_size: 1
-num_epochs: 3
+num_epochs: 4
-# paged_adamw_32bit is not supported
-# due to bitsandbytes issue https://github.com/TimDettmers/bitsandbytes/issues/1180
-# optimizer: paged_adamw_32bit
 optimizer: adamw_torch
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -61,8 +57,9 @@ xformers_attention:
 flash_attention: false
 warmup_steps: 10
-eval_steps: 20
+evals_per_epoch: 4
-save_steps:
+eval_table_size:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
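The training-schedule changes are the ones most likely to affect memory and wall-clock time: `gradient_accumulation_steps` goes from 2 to 4 with `micro_batch_size` still 1, `num_epochs` from 3 to 4, and evaluation/checkpointing move from step counts (`eval_steps`, `save_steps`) to per-epoch settings (`evals_per_epoch`, `saves_per_epoch`). A quick effective-batch-size check (the single-GPU assumption is ours; the README only verifies Llama-2-7B on one Arc 770):

```python
# Effective batch size implied by the updated qlora.yml values above.
micro_batch_size = 1
gradient_accumulation_steps = 4   # was 2 before this commit
num_devices = 1                   # assumption: a single Intel Arc 770

effective_batch_size = micro_batch_size * gradient_accumulation_steps * num_devices
print(effective_batch_size)  # 4 samples per optimizer step
```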

@@ -1,28 +1,26 @@
-# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/requirements.txt
+# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/requirements.txt
---extra-index-url https://download.pytorch.org/whl/cu118
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
-# torch==2.1.0
+packaging==23.2
-# auto-gptq
-packaging
 peft==0.5.0
-transformers==4.34.0
+tokenizers
 bitsandbytes>=0.41.1
 accelerate==0.23.0
+deepspeed>=0.13.1
 addict
-evaluate
 fire
 PyYAML>=6.0
 datasets
-flash-attn>=2.2.1
+#flash-attn==2.3.3
 sentencepiece
 wandb
 einops
-# xformers
+#xformers==0.0.22
-optimum
+optimum==1.13.2
 hf_transfer
 colorama
 numba
 numpy>=1.24.4
+mlflow
 # qlora things
 bert-score==0.3.13
 evaluate==0.4.0
@@ -31,3 +29,15 @@ scipy
 scikit-learn==1.2.2
 pynvml
 art
+fschat==0.2.34
+gradio==3.50.2
+tensorboard
+mamba-ssm==1.1.1
+# remote filesystems
+s3fs
+gcsfs
+# adlfs
+trl>=0.7.9
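Since `transformers` is no longer pinned here (the README now installs `transformers==4.36.0` separately, after `pip install -e .`), it is worth confirming what actually got resolved into the environment. A small check along these lines (our suggestion, not part of the commit; the package names are the ones pinned or referenced in this change):

```python
# Print the installed versions of the packages this commit cares about.
import importlib.metadata as metadata

for package in ("transformers", "peft", "accelerate", "optimum", "bitsandbytes"):
    try:
        print(f"{package}=={metadata.version(package)}")
    except metadata.PackageNotFoundError:
        print(f"{package} is not installed")
```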

@@ -0,0 +1,83 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file is copied from
# https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/src/axolotl/cli/train.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ipex_llm import llm_patch
llm_patch(train=True)
# The following is the original axolotl train code (without IPEX-LLM)
"""
CLI to run training on a model
"""
import logging
from pathlib import Path
from typing import Tuple
import fire
import transformers
from transformers import PreTrainedModel, PreTrainedTokenizer
from axolotl.cli import (
    check_accelerate_default_config,
    check_user_token,
    load_cfg,
    load_datasets,
    load_rl_datasets,
    print_axolotl_text_art,
)
from axolotl.common.cli import TrainerCliArgs
from axolotl.train import train

LOG = logging.getLogger("axolotl.cli.train")


def do_cli(config: Path = Path("examples/"), **kwargs):
    # pylint: disable=duplicate-code
    parsed_cfg = load_cfg(config, **kwargs)
    parser = transformers.HfArgumentParser((TrainerCliArgs))
    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
        return_remaining_strings=True
    )
    return do_train(parsed_cfg, parsed_cli_args)


def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    print_axolotl_text_art()
    check_accelerate_default_config()
    check_user_token()
    if cfg.rl:
        dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
    else:
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

    return train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)


if __name__ == "__main__":
    fire.Fire(do_cli)
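Apart from the two `llm_patch` lines at the top, this file mirrors axolotl v0.4.0's `axolotl.cli.train`, so the README's `accelerate launch train.py qlora.yml` behaves like the upstream CLI, including YAML overrides from the command line: `fire.Fire(do_cli)` turns extra flags into `**kwargs`, and `load_cfg` overwrites matching keys from the YAML. A sketch of that override path (based on the `load_cfg` shown in the removed `finetune.py` above; we assume v0.4.0's `axolotl.cli.load_cfg` keeps the same override loop, and `--num_epochs=1` is just an illustrative flag):

```python
# Rough model of how `accelerate launch train.py qlora.yml --num_epochs=1`
# changes the config: fire passes num_epochs=1 into do_cli(**kwargs),
# and load_cfg overwrites the corresponding YAML value.
import yaml

def apply_cli_overrides(cfg: dict, strict: bool = False, **kwargs) -> dict:
    for key, value in kwargs.items():
        if key in cfg or not strict:
            # keep booleans boolean, mirroring load_cfg's handling
            cfg[key] = bool(value) if isinstance(cfg.get(key), bool) else value
    return cfg

with open("qlora.yml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

cfg = apply_cli_overrides(cfg, num_epochs=1)
print(cfg["num_epochs"])  # 1
```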

@@ -47,6 +47,8 @@ def llm_patch(train=False):
         replace_attr(transformers, "AutoModelForCausalLM", AutoModelForCausalLM)
         replace_attr(transformers, "LlamaForCausalLM", AutoModelForCausalLM)
         replace_attr(transformers, "AutoModel", AutoModel)
+        from ipex_llm.transformers.utils import is_torch_bf16_gpu_available
+        replace_attr(transformers.utils, "is_torch_bf16_gpu_available", is_torch_bf16_gpu_available)
 
         import_peft_check = 'peft' in sys.modules or 'peft.utils' in sys.modules or \
             'peft.tuners' in sys.modules or 'peft.mapping' in sys.modules
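The two added lines exist because the stock `transformers.utils.is_torch_bf16_gpu_available` only looks for a bf16-capable CUDA device: on an Intel GPU it returns `False`, so bf16 training settings would be rejected even though XPU supports bf16. After `llm_patch`, the attribute is rebound to the IPEX-LLM version added at the end of this commit, which simply returns `True`. A rough before/after check (assumes an XPU-only machine with `ipex-llm` installed; the printed values are what we expect, not logs from the commit):

```python
import transformers.utils as hf_utils

print(hf_utils.is_torch_bf16_gpu_available())  # stock check: False without a bf16-capable CUDA GPU

from ipex_llm import llm_patch
llm_patch(train=True)  # rebinds the attribute via replace_attr, as shown above

print(hf_utils.is_torch_bf16_gpu_available())  # now True: XPU/CPU are treated as bf16-capable
```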

@@ -335,3 +335,7 @@ def get_modelscope_hf_config(model_id_or_path: str,
     elif os.path.isfile(model_id_or_path):
         local_path = model_id_or_path
     return Config._file2dict(local_path)
+
+def is_torch_bf16_gpu_available():
+    # always true for XPU and CPU
+    return True