From f2e923b3ca3744b762e11faaf7a141f3df4bd1ba Mon Sep 17 00:00:00 2001
From: Qiyuan Gong
Date: Wed, 17 Apr 2024 09:49:11 +0800
Subject: [PATCH] Axolotl v0.4.0 support (#10773)

* Add Axolotl 0.4.0, remove legacy 0.3.0 support.
* replace is_torch_bf16_gpu_available
* Add HF_HUB_OFFLINE=1
* Move transformers out of requirement
* Refine readme and qlora.yml
---
 .../GPU/LLM-Finetuning/axolotl/README.md      |  31 +-
 .../GPU/LLM-Finetuning/axolotl/finetune.py    | 272 ++----------------
 .../GPU/LLM-Finetuning/axolotl/qlora.yml      |  23 +-
 .../axolotl/requirements-xpu.txt              |  30 +-
 .../GPU/LLM-Finetuning/axolotl/train.py       |  83 ++++++
 python/llm/src/ipex_llm/llm_patching.py       |   2 +
 python/llm/src/ipex_llm/transformers/utils.py |   4 +
 7 files changed, 165 insertions(+), 280 deletions(-)
 create mode 100644 python/llm/example/GPU/LLM-Finetuning/axolotl/train.py

diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/README.md b/python/llm/example/GPU/LLM-Finetuning/axolotl/README.md
index ae6c7ca4..206e98ae 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/README.md
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/README.md
@@ -1,6 +1,6 @@
-# Finetune LLM on Intel GPU using axolotl v0.3.0 without writing code
+# Finetune LLM on Intel GPU using axolotl v0.4.0 without writing code
 
-This example demonstrates how to easily run LLM finetuning application using [axolotl v0.3.0](https://github.com/OpenAccess-AI-Collective/axolotl/tree/v0.3.0) and IPEX-LLM 4bit optimizations with [Intel GPUs](../../../README.md). By applying IPEX-LLM patch, you could use axolotl on Intel GPUs using IPEX-LLM optimization without writing code.
+This example demonstrates how to easily run an LLM finetuning application using [axolotl v0.4.0](https://github.com/OpenAccess-AI-Collective/axolotl/tree/v0.4.0) and IPEX-LLM 4-bit optimizations on [Intel GPUs](../../../README.md). By applying the IPEX-LLM patch, you can use axolotl on Intel GPUs with IPEX-LLM optimizations without writing any code.
 
 Note: this example only illustrates the related usage and does not guarantee training convergence.
 
@@ -15,23 +15,24 @@ conda create -n llm python=3.11
 conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-# install axolotl v0.3.0
+# install axolotl v0.4.0
 git clone https://github.com/OpenAccess-AI-Collective/axolotl
 cd axolotl
-git checkout v0.3.0
+git checkout v0.4.0
 cp ../requirements-xpu.txt requirements.txt
 pip install -e .
+pip install transformers==4.36.0
 ```
 
 ### 2. Configure OneAPI environment variables and accelerate
 
-Configures OneAPI environment variables
+#### 2.1 Configure OneAPI environment variables
 
 ```bash
 source /opt/intel/oneapi/setvars.sh
 ```
 
-Configures `accelerate` in command line interactively.
+#### 2.2 Configure `accelerate` interactively in the command line
 
 ```bash
 accelerate config
@@ -41,16 +42,30 @@ Please answer `NO` in option `Do you want to run your training on CPU only (even
 
 After finishing `accelerate config`, check that `use_cpu` is disabled (i.e., `use_cpu: false`) in the accelerate config file (`~/.cache/huggingface/accelerate/default_config.yaml`).
 
+#### 2.3 (Optional) Set `HF_HUB_OFFLINE=1` to avoid Hugging Face Hub sign-in
+
+```bash
+export HF_HUB_OFFLINE=1
+```
+
+For more details, please refer to [HF_HUB_OFFLINE](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhuboffline).
+
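+As a quick sanity check before launching finetuning, you can verify both settings from the shell (a minimal sketch, assuming the default `accelerate` config path shown above):
+
+```bash
+# accelerate must not force CPU-only training
+grep use_cpu ~/.cache/huggingface/accelerate/default_config.yaml   # expect: use_cpu: false
+# optional: confirm offline mode if you exported HF_HUB_OFFLINE above
+echo $HF_HUB_OFFLINE                                               # expect: 1
+```
+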
 ### 3. Finetune Llama-2-7B
 
-This example shows how to run [Alpaca QLoRA finetune on Llama-2](https://github.com/artidoro/qlora) directly on Intel GPU, based on [axolotl Llama-2 qlora example](https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/examples/llama-2/qlora.yml). Note that only Llama-2-7B QLora example is verified on Intel ARC 770 with 16GB memory.
+This example shows how to run [Alpaca QLoRA finetuning on Llama-2](https://github.com/artidoro/qlora) directly on Intel GPU, based on the [axolotl Llama-2 qlora example](https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/examples/llama-2/qlora.yml). Note that only the Llama-2-7B QLoRA example has been verified on an Intel Arc A770 with 16GB memory.
 
-Modify parameters in `qlora.yml` based on your requirements.
+Modify the parameters in `qlora.yml` based on your requirements. Then launch finetuning with the following command.
 
 ```
 accelerate launch finetune.py qlora.yml
 ```
 
+In v0.4.0, you can also launch finetuning with `train.py` instead of `-m axolotl.cli.train` or `finetune.py`.
+
+```
+accelerate launch train.py qlora.yml
+```
+
 Output in console
 
 ```
diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/finetune.py b/python/llm/example/GPU/LLM-Finetuning/axolotl/finetune.py
index 15d9e7fc..2cf98f7d 100644
--- a/python/llm/example/GPU/LLM-Finetuning/axolotl/finetune.py
+++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/finetune.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 # This file is copied from
-# https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/scripts/finetune.py
+# https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/scripts/finetune.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,264 +33,40 @@ llm_patch(train=True)
 # The following is the original axolotl finetune code (without IPEX-LLM)
 """Prepare and train a model on a dataset.
Can also infer from a model or merge lora""" - -import importlib import logging -import os -import random -import sys from pathlib import Path -from typing import Any, Dict, List, Optional, Union import fire -import torch import transformers -import yaml -# add src to the pythonpath so we don't need to pip install this -from art import text2art -from transformers import GenerationConfig, TextStreamer +from axolotl.cli import ( + check_accelerate_default_config, + check_user_token, + do_inference, + do_merge_lora, + load_cfg, + load_datasets, + print_axolotl_text_art, +) +from axolotl.cli.shard import shard +from axolotl.common.cli import TrainerCliArgs +from axolotl.train import train -from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer -from axolotl.logging_config import configure_logging -from axolotl.train import TrainDatasetMeta, train -from axolotl.utils.config import normalize_config, validate_config -from axolotl.utils.data import prepare_dataset -from axolotl.utils.dict import DictDefault -from axolotl.utils.distributed import is_main_process -from axolotl.utils.models import load_tokenizer -from axolotl.utils.tokenization import check_dataset_labels -from axolotl.utils.wandb import setup_wandb_env_vars - -project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) -src_dir = os.path.join(project_root, "src") -sys.path.insert(0, src_dir) - -configure_logging() -LOG = logging.getLogger("axolotl.scripts") - -os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" - - -def print_axolotl_text_art(suffix=None): - font = "nancyj" - ascii_text = " axolotl" - if suffix: - ascii_text += f" x {suffix}" - ascii_art = text2art(" axolotl", font=font) - - if is_main_process(): - print(ascii_art) - - -def get_multi_line_input() -> Optional[str]: - print("Give me an instruction (Ctrl + D to finish): ") - instruction = "" - for line in sys.stdin: - instruction += line # pylint: disable=consider-using-join - # instruction = pathlib.Path("/proc/self/fd/0").read_text() - return instruction - - -def do_merge_lora( - *, - cfg: DictDefault, - cli_args: TrainerCliArgs, -): - model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args) - safe_serialization = cfg.save_safetensors is True - - LOG.info("running merge of LoRA with base model") - model = model.merge_and_unload() - model.to(dtype=torch.float16) - - if cfg.local_rank == 0: - LOG.info("saving merged model") - model.save_pretrained( - str(Path(cfg.output_dir) / "merged"), - safe_serialization=safe_serialization, - ) - tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged")) - - -def shard( - *, - cfg: DictDefault, - cli_args: TrainerCliArgs, -): - model, _ = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args) - safe_serialization = cfg.save_safetensors is True - LOG.debug("Re-saving model w/ sharding") - model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization) - - -def do_inference( - *, - cfg: DictDefault, - cli_args: TrainerCliArgs, -): - model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args) - prompter = cli_args.prompter - default_tokens = {"unk_token": "", "bos_token": "", "eos_token": ""} - - for token, symbol in default_tokens.items(): - # If the token isn't already specified in the config, add it - if not (cfg.special_tokens and token in cfg.special_tokens): - tokenizer.add_special_tokens({token: symbol}) - - prompter_module = None - if prompter: - prompter_module = getattr( - importlib.import_module("axolotl.prompters"), prompter - ) - - if 
cfg.landmark_attention: - from axolotl.monkeypatch.llama_landmark_attn import set_model_mem_id - - set_model_mem_id(model, tokenizer) - model.set_mem_cache_args( - max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None - ) - - model = model.to(cfg.device) - - while True: - print("=" * 80) - # support for multiline inputs - instruction = get_multi_line_input() - if not instruction: - return - if prompter_module: - prompt: str = next( - prompter_module().build_prompt(instruction=instruction.strip("\n")) - ) - else: - prompt = instruction.strip() - batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) - - print("=" * 40) - model.eval() - with torch.no_grad(): - generation_config = GenerationConfig( - repetition_penalty=1.1, - max_new_tokens=1024, - temperature=0.9, - top_p=0.95, - top_k=40, - bos_token_id=tokenizer.bos_token_id, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.pad_token_id, - do_sample=True, - use_cache=True, - return_dict_in_generate=True, - output_attentions=False, - output_hidden_states=False, - output_scores=False, - ) - streamer = TextStreamer(tokenizer) - generated = model.generate( - inputs=batch["input_ids"].to(cfg.device), - generation_config=generation_config, - streamer=streamer, - ) - print("=" * 40) - print(tokenizer.decode(generated["sequences"].cpu().tolist()[0])) - - -def choose_config(path: Path): - yaml_files = list(path.glob("*.yml")) - - if not yaml_files: - raise ValueError( - "No YAML config files found in the specified directory. Are you using a .yml extension?" - ) - - if len(yaml_files) == 1: - print(f"Using default YAML file '{yaml_files[0]}'") - return yaml_files[0] - - print("Choose a YAML file:") - for idx, file in enumerate(yaml_files): - print(f"{idx + 1}. {file}") - - chosen_file = None - while chosen_file is None: - try: - choice = int(input("Enter the number of your choice: ")) - if 1 <= choice <= len(yaml_files): - chosen_file = yaml_files[choice - 1] - else: - print("Invalid choice. Please choose a number from the list.") - except ValueError: - print("Invalid input. 
Please enter a number.") - - return chosen_file - - -def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> bool: - return not any(el in list2 for el in list1) - - -def load_cfg(config: Path = Path("examples/"), **kwargs): - if Path(config).is_dir(): - config = choose_config(config) - - # load the config from the yaml file - with open(config, encoding="utf-8") as file: - cfg: DictDefault = DictDefault(yaml.safe_load(file)) - # if there are any options passed in the cli, if it is something that seems valid from the yaml, - # then overwrite the value - cfg_keys = cfg.keys() - for k, _ in kwargs.items(): - # if not strict, allow writing to cfg even if it's not in the yml already - if k in cfg_keys or not cfg.strict: - # handle booleans - if isinstance(cfg[k], bool): - cfg[k] = bool(kwargs[k]) - else: - cfg[k] = kwargs[k] - - validate_config(cfg) - - normalize_config(cfg) - - setup_wandb_env_vars(cfg) - return cfg - - -def load_datasets( - *, - cfg: DictDefault, - cli_args: TrainerCliArgs, -) -> TrainDatasetMeta: - tokenizer = load_tokenizer(cfg) - - train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer) - - if cli_args.debug or cfg.debug: - LOG.info("check_dataset_labels...") - check_dataset_labels( - train_dataset.select( - [ - random.randrange(0, len(train_dataset) - 1) # nosec - for _ in range(cli_args.debug_num_examples) - ] - ), - tokenizer, - num_examples=cli_args.debug_num_examples, - text_only=cli_args.debug_text_only, - ) - - return TrainDatasetMeta( - train_dataset=train_dataset, - eval_dataset=eval_dataset, - total_num_steps=total_num_steps, - ) +LOG = logging.getLogger("axolotl.scripts.finetune") def do_cli(config: Path = Path("examples/"), **kwargs): print_axolotl_text_art() + LOG.warning( + str( + PendingDeprecationWarning( + "scripts/finetune.py will be replaced with calling axolotl.cli.train" + ) + ) + ) parsed_cfg = load_cfg(config, **kwargs) + check_accelerate_default_config() + check_user_token() parser = transformers.HfArgumentParser((TrainerCliArgs)) parsed_cli_args, _ = parser.parse_args_into_dataclasses( return_remaining_strings=True @@ -303,8 +79,6 @@ def do_cli(config: Path = Path("examples/"), **kwargs): shard(cfg=parsed_cfg, cli_args=parsed_cli_args) else: dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args) - if parsed_cli_args.prepare_ds_only: - return train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta) diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml b/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml index 7e5c2fbe..c6f446f9 100644 --- a/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml +++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/qlora.yml @@ -1,6 +1,5 @@ -# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/examples/llama-2/qlora.yml -base_model: meta-llama/Llama-2-7b-hf -base_model_config: meta-llama/Llama-2-7b-hf +# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/examples/llama-2/qlora.yml +base_model: NousResearch/Llama-2-7b-hf model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer is_llama_derived_model: true @@ -12,8 +11,8 @@ strict: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca -dataset_prepared_path: last_run_prepared -val_set_size: 0.01 +dataset_prepared_path: +val_set_size: 0.05 output_dir: ./qlora-out adapter: qlora @@ -33,15 +32,12 @@ lora_fan_in_fan_out: wandb_project: wandb_entity: wandb_watch: -wandb_run_id: +wandb_name: 
wandb_log_model: -gradient_accumulation_steps: 2 +gradient_accumulation_steps: 4 micro_batch_size: 1 -num_epochs: 3 -# paged_adamw_32bit is not supported -# due to bitsandbytes issue https://github.com/TimDettmers/bitsandbytes/issues/1180 -# optimizer: paged_adamw_32bit +num_epochs: 4 optimizer: adamw_torch lr_scheduler: cosine learning_rate: 0.0002 @@ -61,8 +57,9 @@ xformers_attention: flash_attention: false warmup_steps: 10 -eval_steps: 20 -save_steps: +evals_per_epoch: 4 +eval_table_size: +saves_per_epoch: 1 debug: deepspeed: weight_decay: 0.0 diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/requirements-xpu.txt b/python/llm/example/GPU/LLM-Finetuning/axolotl/requirements-xpu.txt index 942a5ea0..e2862cec 100644 --- a/python/llm/example/GPU/LLM-Finetuning/axolotl/requirements-xpu.txt +++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/requirements-xpu.txt @@ -1,28 +1,26 @@ -# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.3.0/requirements.txt ---extra-index-url https://download.pytorch.org/whl/cu118 +# This file is copied from https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/requirements.txt --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ -# torch==2.1.0 -# auto-gptq -packaging +packaging==23.2 peft==0.5.0 -transformers==4.34.0 +tokenizers bitsandbytes>=0.41.1 accelerate==0.23.0 +deepspeed>=0.13.1 addict -evaluate fire PyYAML>=6.0 datasets -flash-attn>=2.2.1 +#flash-attn==2.3.3 sentencepiece wandb einops -# xformers -optimum +#xformers==0.0.22 +optimum==1.13.2 hf_transfer colorama numba numpy>=1.24.4 +mlflow # qlora things bert-score==0.3.13 evaluate==0.4.0 @@ -31,3 +29,15 @@ scipy scikit-learn==1.2.2 pynvml art +fschat==0.2.34 +gradio==3.50.2 +tensorboard + +mamba-ssm==1.1.1 + +# remote filesystems +s3fs +gcsfs +# adlfs + +trl>=0.7.9 diff --git a/python/llm/example/GPU/LLM-Finetuning/axolotl/train.py b/python/llm/example/GPU/LLM-Finetuning/axolotl/train.py new file mode 100644 index 00000000..9db65470 --- /dev/null +++ b/python/llm/example/GPU/LLM-Finetuning/axolotl/train.py @@ -0,0 +1,83 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file is copied from +# https://github.com/OpenAccess-AI-Collective/axolotl/blob/v0.4.0/src/axolotl/cli/train.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
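+#
+# Note: as in finetune.py, llm_patch(train=True) below is applied before the
+# axolotl/transformers imports, so that IPEX-LLM's replacement model classes
+# (e.g. AutoModelForCausalLM) are already in place when axolotl loads the model.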
+ +from ipex_llm import llm_patch +llm_patch(train=True) +# The following is the original axolotl train code (without IPEX-LLM) + +""" +CLI to run training on a model +""" +import logging +from pathlib import Path +from typing import Tuple + +import fire +import transformers +from transformers import PreTrainedModel, PreTrainedTokenizer + +from axolotl.cli import ( + check_accelerate_default_config, + check_user_token, + load_cfg, + load_datasets, + load_rl_datasets, + print_axolotl_text_art, +) +from axolotl.common.cli import TrainerCliArgs +from axolotl.train import train + +LOG = logging.getLogger("axolotl.cli.train") + + +def do_cli(config: Path = Path("examples/"), **kwargs): + # pylint: disable=duplicate-code + parsed_cfg = load_cfg(config, **kwargs) + parser = transformers.HfArgumentParser((TrainerCliArgs)) + parsed_cli_args, _ = parser.parse_args_into_dataclasses( + return_remaining_strings=True + ) + return do_train(parsed_cfg, parsed_cli_args) + + +def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]: + print_axolotl_text_art() + check_accelerate_default_config() + check_user_token() + if cfg.rl: + dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args) + else: + dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + + return train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) + + +if __name__ == "__main__": + fire.Fire(do_cli) diff --git a/python/llm/src/ipex_llm/llm_patching.py b/python/llm/src/ipex_llm/llm_patching.py index 8c0a94e5..d68fac0c 100644 --- a/python/llm/src/ipex_llm/llm_patching.py +++ b/python/llm/src/ipex_llm/llm_patching.py @@ -47,6 +47,8 @@ def llm_patch(train=False): replace_attr(transformers, "AutoModelForCausalLM", AutoModelForCausalLM) replace_attr(transformers, "LlamaForCausalLM", AutoModelForCausalLM) replace_attr(transformers, "AutoModel", AutoModel) + from ipex_llm.transformers.utils import is_torch_bf16_gpu_available + replace_attr(transformers.utils, "is_torch_bf16_gpu_available", is_torch_bf16_gpu_available) import_peft_check = 'peft' in sys.modules or 'peft.utils' in sys.modules or \ 'peft.tuners' in sys.modules or 'peft.mapping' in sys.modules diff --git a/python/llm/src/ipex_llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py index b7873045..1e728473 100644 --- a/python/llm/src/ipex_llm/transformers/utils.py +++ b/python/llm/src/ipex_llm/transformers/utils.py @@ -335,3 +335,7 @@ def get_modelscope_hf_config(model_id_or_path: str, elif os.path.isfile(model_id_or_path): local_path = model_id_or_path return Config._file2dict(local_path) + +def is_torch_bf16_gpu_available(): + # always true for XPU and CPU + return True
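+
+# Rationale (assumed from the patch above, not stated in the original code):
+# transformers' stock is_torch_bf16_gpu_available() is a CUDA-oriented check
+# that reports False on Intel XPU, so llm_patch() swaps in this always-True
+# stub to keep bf16 training configs usable on XPU and CPU.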