Axolotl v0.4.0 support (#10773)

* Add Axolotl 0.4.0, remove legacy 0.3.0 support.
* Replace is_torch_bf16_gpu_available.
* Add HF_HUB_OFFLINE=1.
* Move transformers out of requirements.
* Refine README and qlora.yml.
Qiyuan Gong 2024-04-17 09:49:11 +08:00 committed by GitHub
parent 26cae0a39c
commit f2e923b3ca
7 changed files with 165 additions and 280 deletions

@@ -1,6 +1,6 @@
-# Finetune LLM on Intel GPU using axolotl v0.3.0 without writing code
+# Finetune LLM on Intel GPU using axolotl v0.4.0 without writing code
-This example demonstrates how to easily run LLM finetuning application using [axolotl v0.3.0](https://github.com/OpenAccess-AI-Collective/axolotl/tree/v0.3.0) and IPEX-LLM 4bit optimizations with [Intel GPUs](../../../README.md). By applying IPEX-LLM patch, you could use axolotl on Intel GPUs using IPEX-LLM optimization without writing code.
+This example demonstrates how to easily run LLM finetuning application using [axolotl v0.4.0](https://github.com/OpenAccess-AI-Collective/axolotl/tree/v0.4.0) and IPEX-LLM 4bit optimizations with [Intel GPUs](../../../README.md). By applying IPEX-LLM patch, you could use axolotl on Intel GPUs using IPEX-LLM optimization without writing code.
 Note, this example is just used for illustrating related usage and don't guarantee convergence of training.
@@ -15,23 +15,24 @@ conda create -n llm python=3.11
 conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-# install axolotl v0.3.0
+# install axolotl v0.4.0
 git clone https://github.com/OpenAccess-AI-Collective/axolotl
 cd axolotl
-git checkout v0.3.0
+git checkout v0.4.0
 cp ../requirements-xpu.txt requirements.txt
 pip install -e .
+pip install transformers==4.36.0
 ```
 ### 2. Configures OneAPI environment variables and accelerate
-Configures OneAPI environment variables
+#### 2.1 Configures OneAPI environment variables
 ```bash
 source /opt/intel/oneapi/setvars.sh
 ```
-Configures `accelerate` in command line interactively.
+#### 2.2 Configures `accelerate` in command line interactively.
 ```bash
 accelerate config
@@ -41,16 +42,30 @@ Please answer `NO` in option `Do you want to run your training on CPU only (even
 After finish accelerate config, check if `use_cpu` is disable (i.e., ` use_cpu: false`) in accelerate config file (`~/.cache/huggingface/accelerate/default_config.yaml`).
+#### 2.3 (Optional) Set `HF_HUB_OFFLINE=1` to avoid Hugging Face Hub sign-in.
+```bash
+export HF_HUB_OFFLINE=1
+```
+For more details, please refer to [HF_HUB_OFFLINE](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhuboffline).
 ### 3. Finetune Llama-2-7B
-This example shows how to run [Alpaca QLoRA finetune on Llama-2](https://github.com/artidoro/qlora) directly on Intel GPU, based on [axolotl Llama-2 qlora example](https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/examples/llama-2/qlora.yml). Note that only Llama-2-7B QLora example is verified on Intel ARC 770 with 16GB memory.
+This example shows how to run [Alpaca QLoRA finetune on Llama-2](https://github.com/artidoro/qlora) directly on Intel GPU, based on [axolotl Llama-2 qlora example](https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/examples/llama-2/qlora.yml). Note that only Llama-2-7B QLora example is verified on Intel ARC 770 with 16GB memory.
-Modify parameters in `qlora.yml` based on your requirements.
+Modify parameters in `qlora.yml` based on your requirements. Then, launch finetuning with the following command.
 ```
 accelerate launch finetune.py qlora.yml
 ```
+In v0.4.0, you can also use `train.py` instead of `-m axolotl.cli.train` or `finetune.py`.
+```
+accelerate launch train.py qlora.yml
+```
 Output in console
 ```
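Before launching, it can help to double-check the two pieces of state the README above relies on: the `accelerate` default config and the `HF_HUB_OFFLINE` variable. A small sanity-check sketch (our suggestion, not part of the commit; the config path and the `use_cpu` key are the ones named in the README):

```python
# Verify that `accelerate config` produced a GPU config and that HF_HUB_OFFLINE is set.
import os
from pathlib import Path

import yaml

config_path = Path.home() / ".cache/huggingface/accelerate/default_config.yaml"
accelerate_cfg = yaml.safe_load(config_path.read_text())

assert accelerate_cfg.get("use_cpu") is False, "accelerate is still set to CPU-only training"
print("HF_HUB_OFFLINE =", os.environ.get("HF_HUB_OFFLINE", "<unset>"))
```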

@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 # This file is copied from
-# https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/scripts/finetune.py
+# https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/scripts/finetune.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,264 +33,40 @@ llm_patch(train=True)
 # The following is the original axolotl finetune code (without IPEX-LLM)
 """Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
-import importlib
 import logging
-import os
-import random
-import sys
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
 import fire
-import torch
 import transformers
-import yaml
-# add src to the pythonpath so we don't need to pip install this
+from axolotl.cli import (
-from art import text2art
+    check_accelerate_default_config,
-from transformers import GenerationConfig, TextStreamer
+    check_user_token,
+    do_inference,
-from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
+    do_merge_lora,
-from axolotl.logging_config import configure_logging
+    load_cfg,
-from axolotl.train import TrainDatasetMeta, train
+    load_datasets,
-from axolotl.utils.config import normalize_config, validate_config
+    print_axolotl_text_art,
-from axolotl.utils.data import prepare_dataset
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import is_main_process
-from axolotl.utils.models import load_tokenizer
-from axolotl.utils.tokenization import check_dataset_labels
-from axolotl.utils.wandb import setup_wandb_env_vars
-project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
-src_dir = os.path.join(project_root, "src")
-sys.path.insert(0, src_dir)
-configure_logging()
-LOG = logging.getLogger("axolotl.scripts")
-os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-
-def print_axolotl_text_art(suffix=None):
-    font = "nancyj"
-    ascii_text = " axolotl"
-    if suffix:
-        ascii_text += f" x {suffix}"
-    ascii_art = text2art(" axolotl", font=font)
-    if is_main_process():
-        print(ascii_art)
-
-def get_multi_line_input() -> Optional[str]:
-    print("Give me an instruction (Ctrl + D to finish): ")
-    instruction = ""
-    for line in sys.stdin:
-        instruction += line  # pylint: disable=consider-using-join
-    # instruction = pathlib.Path("/proc/self/fd/0").read_text()
-    return instruction
-
-def do_merge_lora(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-):
-    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
-    safe_serialization = cfg.save_safetensors is True
-    LOG.info("running merge of LoRA with base model")
-    model = model.merge_and_unload()
-    model.to(dtype=torch.float16)
-    if cfg.local_rank == 0:
-        LOG.info("saving merged model")
-        model.save_pretrained(
-            str(Path(cfg.output_dir) / "merged"),
-            safe_serialization=safe_serialization,
-        )
+)
-        tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
+from axolotl.cli.shard import shard
+from axolotl.common.cli import TrainerCliArgs
+from axolotl.train import train
+LOG = logging.getLogger("axolotl.scripts.finetune")
-def shard(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-):
-    model, _ = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
-    safe_serialization = cfg.save_safetensors is True
-    LOG.debug("Re-saving model w/ sharding")
-    model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
-
-def do_inference(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-):
-    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
-    prompter = cli_args.prompter
-    default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
-    for token, symbol in default_tokens.items():
-        # If the token isn't already specified in the config, add it
-        if not (cfg.special_tokens and token in cfg.special_tokens):
-            tokenizer.add_special_tokens({token: symbol})
-    prompter_module = None
-    if prompter:
-        prompter_module = getattr(
-            importlib.import_module("axolotl.prompters"), prompter
-        )
-    if cfg.landmark_attention:
-        from axolotl.monkeypatch.llama_landmark_attn import set_model_mem_id
-        set_model_mem_id(model, tokenizer)
-        model.set_mem_cache_args(
-            max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
-        )
-    model = model.to(cfg.device)
-    while True:
-        print("=" * 80)
-        # support for multiline inputs
-        instruction = get_multi_line_input()
-        if not instruction:
-            return
-        if prompter_module:
-            prompt: str = next(
-                prompter_module().build_prompt(instruction=instruction.strip("\n"))
-            )
-        else:
-            prompt = instruction.strip()
-        batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
-        print("=" * 40)
-        model.eval()
-        with torch.no_grad():
-            generation_config = GenerationConfig(
-                repetition_penalty=1.1,
-                max_new_tokens=1024,
-                temperature=0.9,
-                top_p=0.95,
-                top_k=40,
-                bos_token_id=tokenizer.bos_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-                pad_token_id=tokenizer.pad_token_id,
-                do_sample=True,
-                use_cache=True,
-                return_dict_in_generate=True,
-                output_attentions=False,
-                output_hidden_states=False,
-                output_scores=False,
-            )
-            streamer = TextStreamer(tokenizer)
-            generated = model.generate(
-                inputs=batch["input_ids"].to(cfg.device),
-                generation_config=generation_config,
-                streamer=streamer,
-            )
-        print("=" * 40)
-        print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))
-
-def choose_config(path: Path):
-    yaml_files = list(path.glob("*.yml"))
-    if not yaml_files:
-        raise ValueError(
-            "No YAML config files found in the specified directory. Are you using a .yml extension?"
-        )
-    if len(yaml_files) == 1:
-        print(f"Using default YAML file '{yaml_files[0]}'")
-        return yaml_files[0]
-    print("Choose a YAML file:")
-    for idx, file in enumerate(yaml_files):
-        print(f"{idx + 1}. {file}")
-    chosen_file = None
-    while chosen_file is None:
-        try:
-            choice = int(input("Enter the number of your choice: "))
-            if 1 <= choice <= len(yaml_files):
-                chosen_file = yaml_files[choice - 1]
-            else:
-                print("Invalid choice. Please choose a number from the list.")
-        except ValueError:
-            print("Invalid input. Please enter a number.")
-    return chosen_file
-
-def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> bool:
-    return not any(el in list2 for el in list1)
-
-def load_cfg(config: Path = Path("examples/"), **kwargs):
-    if Path(config).is_dir():
-        config = choose_config(config)
-    # load the config from the yaml file
-    with open(config, encoding="utf-8") as file:
-        cfg: DictDefault = DictDefault(yaml.safe_load(file))
-    # if there are any options passed in the cli, if it is something that seems valid from the yaml,
-    # then overwrite the value
-    cfg_keys = cfg.keys()
-    for k, _ in kwargs.items():
-        # if not strict, allow writing to cfg even if it's not in the yml already
-        if k in cfg_keys or not cfg.strict:
-            # handle booleans
-            if isinstance(cfg[k], bool):
-                cfg[k] = bool(kwargs[k])
-            else:
-                cfg[k] = kwargs[k]
-    validate_config(cfg)
-    normalize_config(cfg)
-    setup_wandb_env_vars(cfg)
-    return cfg
-
-def load_datasets(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-) -> TrainDatasetMeta:
-    tokenizer = load_tokenizer(cfg)
-    train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer)
-    if cli_args.debug or cfg.debug:
-        LOG.info("check_dataset_labels...")
-        check_dataset_labels(
-            train_dataset.select(
-                [
-                    random.randrange(0, len(train_dataset) - 1)  # nosec
-                    for _ in range(cli_args.debug_num_examples)
-                ]
-            ),
-            tokenizer,
-            num_examples=cli_args.debug_num_examples,
-            text_only=cli_args.debug_text_only,
-        )
-    return TrainDatasetMeta(
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        total_num_steps=total_num_steps,
-    )
 def do_cli(config: Path = Path("examples/"), **kwargs):
     print_axolotl_text_art()
-    LOG.warning(
-        str(
-            PendingDeprecationWarning(
-                "scripts/finetune.py will be replaced with calling axolotl.cli.train"
-            )
-        )
-    )
     parsed_cfg = load_cfg(config, **kwargs)
+    check_accelerate_default_config()
+    check_user_token()
     parser = transformers.HfArgumentParser((TrainerCliArgs))
     parsed_cli_args, _ = parser.parse_args_into_dataclasses(
         return_remaining_strings=True
@@ -303,8 +79,6 @@ def do_cli(config: Path = Path("examples/"), **kwargs):
         shard(cfg=parsed_cfg, cli_args=parsed_cli_args)
     else:
         dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
-        if parsed_cli_args.prepare_ds_only:
-            return
         train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta)
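With axolotl v0.4.0 shipping `do_inference`, `do_merge_lora`, `load_cfg`, `load_datasets` and friends in `axolotl.cli`, the patched `finetune.py` shrinks to a thin wrapper, and the only IPEX-LLM-specific lines are the two at the top of the file. The ordering constraint they impose is worth spelling out: `llm_patch(train=True)` must run before any axolotl import, so the transformers classes axolotl binds at import time are already the patched ones. A minimal sketch of that ordering (module names are taken from this diff and from the `llm_patch` change later in this commit):

```python
# Patch first: this swaps transformers.AutoModelForCausalLM / LlamaForCausalLM / AutoModel
# and transformers.utils.is_torch_bf16_gpu_available for their IPEX-LLM counterparts.
from ipex_llm import llm_patch
llm_patch(train=True)

# Only then import axolotl, so it picks up the patched classes.
from axolotl.cli import load_cfg, load_datasets
from axolotl.train import train
```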

@@ -1,6 +1,5 @@
-# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/examples/llama-2/qlora.yml
+# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/examples/llama-2/qlora.yml
-base_model: meta-llama/Llama-2-7b-hf
+base_model: NousResearch/Llama-2-7b-hf
-base_model_config: meta-llama/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 is_llama_derived_model: true
@@ -12,8 +11,8 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
-val_set_size: 0.01
+val_set_size: 0.05
 output_dir: ./qlora-out
 adapter: qlora
@@ -33,15 +32,12 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
-gradient_accumulation_steps: 2
+gradient_accumulation_steps: 4
 micro_batch_size: 1
-num_epochs: 3
+num_epochs: 4
-# paged_adamw_32bit is not supported
-# due to bitsandbytes issue https://github.com/TimDettmers/bitsandbytes/issues/1180
-# optimizer: paged_adamw_32bit
 optimizer: adamw_torch
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -61,8 +57,9 @@ xformers_attention:
 flash_attention: false
 warmup_steps: 10
-eval_steps: 20
+evals_per_epoch: 4
-save_steps:
+eval_table_size:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
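The training-schedule changes are the ones most likely to affect memory and wall-clock time: `gradient_accumulation_steps` goes from 2 to 4 with `micro_batch_size` still 1, `num_epochs` from 3 to 4, and evaluation/checkpointing move from step counts (`eval_steps`, `save_steps`) to per-epoch settings (`evals_per_epoch`, `saves_per_epoch`). A quick effective-batch-size check (the single-GPU assumption is ours; the README only verifies Llama-2-7B on one Arc 770):

```python
# Effective batch size implied by the updated qlora.yml values above.
micro_batch_size = 1
gradient_accumulation_steps = 4   # was 2 before this commit
num_devices = 1                   # assumption: a single Intel Arc 770

effective_batch_size = micro_batch_size * gradient_accumulation_steps * num_devices
print(effective_batch_size)  # 4 samples per optimizer step
```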

@@ -1,28 +1,26 @@
-# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/requirements.txt
+# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/requirements.txt
---extra-index-url https://download.pytorch.org/whl/cu118
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
-# torch==2.1.0
+packaging==23.2
-# auto-gptq
-packaging
 peft==0.5.0
-transformers==4.34.0
+tokenizers
 bitsandbytes>=0.41.1
 accelerate==0.23.0
+deepspeed>=0.13.1
 addict
-evaluate
 fire
 PyYAML>=6.0
 datasets
-flash-attn>=2.2.1
+#flash-attn==2.3.3
 sentencepiece
 wandb
 einops
-# xformers
+#xformers==0.0.22
-optimum
+optimum==1.13.2
 hf_transfer
 colorama
 numba
 numpy>=1.24.4
+mlflow
 # qlora things
 bert-score==0.3.13
 evaluate==0.4.0
@@ -31,3 +29,15 @@ scipy
 scikit-learn==1.2.2
 pynvml
 art
+fschat==0.2.34
+gradio==3.50.2
+tensorboard
+mamba-ssm==1.1.1
+# remote filesystems
+s3fs
+gcsfs
+# adlfs
+trl>=0.7.9
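Since `transformers` is no longer pinned here (the README now installs `transformers==4.36.0` separately, after `pip install -e .`), it is worth confirming what actually got resolved into the environment. A small check along these lines (our suggestion, not part of the commit; the package names are the ones pinned or referenced in this change):

```python
# Print the installed versions of the packages this commit cares about.
import importlib.metadata as metadata

for package in ("transformers", "peft", "accelerate", "optimum", "bitsandbytes"):
    try:
        print(f"{package}=={metadata.version(package)}")
    except metadata.PackageNotFoundError:
        print(f"{package} is not installed")
```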

@@ -0,0 +1,83 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file is copied from
# https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/src/axolotl/cli/train.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ipex_llm import llm_patch
llm_patch(train=True)
# The following is the original axolotl train code (without IPEX-LLM)
"""
CLI to run training on a model
"""
import logging
from pathlib import Path
from typing import Tuple
import fire
import transformers
from transformers import PreTrainedModel, PreTrainedTokenizer
from axolotl.cli import (
    check_accelerate_default_config,
    check_user_token,
    load_cfg,
    load_datasets,
    load_rl_datasets,
    print_axolotl_text_art,
)
from axolotl.common.cli import TrainerCliArgs
from axolotl.train import train

LOG = logging.getLogger("axolotl.cli.train")


def do_cli(config: Path = Path("examples/"), **kwargs):
    # pylint: disable=duplicate-code
    parsed_cfg = load_cfg(config, **kwargs)
    parser = transformers.HfArgumentParser((TrainerCliArgs))
    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
        return_remaining_strings=True
    )
    return do_train(parsed_cfg, parsed_cli_args)


def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    print_axolotl_text_art()
    check_accelerate_default_config()
    check_user_token()
    if cfg.rl:
        dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
    else:
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

    return train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)


if __name__ == "__main__":
    fire.Fire(do_cli)
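Apart from the two `llm_patch` lines at the top, this file mirrors axolotl v0.4.0's `axolotl.cli.train`, so the README's `accelerate launch train.py qlora.yml` behaves like the upstream CLI, including YAML overrides from the command line: `fire.Fire(do_cli)` turns extra flags into `**kwargs`, and `load_cfg` overwrites matching keys from the YAML. A sketch of that override path (based on the `load_cfg` shown in the removed `finetune.py` above; we assume v0.4.0's `axolotl.cli.load_cfg` keeps the same override loop, and `--num_epochs=1` is just an illustrative flag):

```python
# Rough model of how `accelerate launch train.py qlora.yml --num_epochs=1`
# changes the config: fire passes num_epochs=1 into do_cli(**kwargs),
# and load_cfg overwrites the corresponding YAML value.
import yaml

def apply_cli_overrides(cfg: dict, strict: bool = False, **kwargs) -> dict:
    for key, value in kwargs.items():
        if key in cfg or not strict:
            # keep booleans boolean, mirroring load_cfg's handling
            cfg[key] = bool(value) if isinstance(cfg.get(key), bool) else value
    return cfg

with open("qlora.yml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

cfg = apply_cli_overrides(cfg, num_epochs=1)
print(cfg["num_epochs"])  # 1
```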

@@ -47,6 +47,8 @@ def llm_patch(train=False):
         replace_attr(transformers, "AutoModelForCausalLM", AutoModelForCausalLM)
         replace_attr(transformers, "LlamaForCausalLM", AutoModelForCausalLM)
         replace_attr(transformers, "AutoModel", AutoModel)
+        from ipex_llm.transformers.utils import is_torch_bf16_gpu_available
+        replace_attr(transformers.utils, "is_torch_bf16_gpu_available", is_torch_bf16_gpu_available)
 
         import_peft_check = 'peft' in sys.modules or 'peft.utils' in sys.modules or \
             'peft.tuners' in sys.modules or 'peft.mapping' in sys.modules
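The two added lines exist because the stock `transformers.utils.is_torch_bf16_gpu_available` only looks for a bf16-capable CUDA device: on an Intel GPU it returns `False`, so bf16 training settings would be rejected even though XPU supports bf16. After `llm_patch`, the attribute is rebound to the IPEX-LLM version added at the end of this commit, which simply returns `True`. A rough before/after check (assumes an XPU-only machine with `ipex-llm` installed; the printed values are what we expect, not logs from the commit):

```python
import transformers.utils as hf_utils

print(hf_utils.is_torch_bf16_gpu_available())  # stock check: False without a bf16-capable CUDA GPU

from ipex_llm import llm_patch
llm_patch(train=True)  # rebinds the attribute via replace_attr, as shown above

print(hf_utils.is_torch_bf16_gpu_available())  # now True: XPU/CPU are treated as bf16-capable
```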

@@ -335,3 +335,7 @@ def get_modelscope_hf_config(model_id_or_path: str,
     elif os.path.isfile(model_id_or_path):
         local_path = model_id_or_path
     return Config._file2dict(local_path)
+
+def is_torch_bf16_gpu_available():
+    # always true for XPU and CPU
+    return True