LLM: bigdl-llm lora support & lora example (#9740)

* lora support and single card example

* support multi-card, refactor code

* fix model id and style

* remove torch patch, add two new classes for bf16, update example

* fix style

* change to training_mode

* small fix

* add more info in help

* fix style, update readme

* fix ut

* fix ut

* Handling compatibility issues with default LoraConfig
Ruonan Wang 2023-12-22 11:05:39 +08:00 committed by GitHub
parent ba0b939579
commit 2f36769208
16 changed files with 481 additions and 186 deletions

View file

@ -1,6 +1,6 @@
# Alpaca QLoRA & QA-LoRA Finetuning (experimental support)
# Alpaca Finetuning with BigDL-LLM
This example ports [Alpaca-LoRA](https://github.com/tloen/alpaca-lora/tree/main) to BigDL-LLM (using either [QLoRA](https://arxiv.org/abs/2305.14314) or [QA-LoRA](https://arxiv.org/abs/2309.14717) algorithm) on [Intel GPU](../../README.md).
This example ports [Alpaca-LoRA](https://github.com/tloen/alpaca-lora/tree/main) to BigDL-LLM (using the [QLoRA](https://arxiv.org/abs/2305.14314), [QA-LoRA](https://arxiv.org/abs/2309.14717) or [LoRA](https://arxiv.org/abs/2106.09685) algorithm) on [Intel GPU](../../README.md).
### 0. Requirements
To run this example with BigDL-LLM on Intel GPUs, we have some recommended requirements for your machine; please refer to [here](../../README.md#requirements) for more information.
@ -26,6 +26,8 @@ source /opt/intel/oneapi/setvars.sh
### 3. Finetune
We now support three training modes ([QLoRA](https://arxiv.org/abs/2305.14314) / [QA-LoRA](https://arxiv.org/abs/2309.14717) / [LoRA](https://arxiv.org/abs/2106.09685)); to switch modes, simply change `training_mode` to `qlora` / `qalora` / `lora` in the scripts below.
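As a rough sketch of what the mode choice implies (paraphrasing the script changes shown further down in this diff), each mode selects a different low-bit format for the base model and a different learning-rate scheduler; the helper name below is just for illustration:

```python
# A simplified sketch of how alpaca_qlora_finetuning.py interprets training_mode:
# it picks the low-bit format used to load the base model and the LR scheduler.
def select_format_and_scheduler(training_mode: str):
    if training_mode == "qalora":
        low_bit_format = "sym_int4"   # default 4-bit format for QA-LoRA
    elif training_mode == "lora":
        low_bit_format = "bf16"       # plain LoRA trains on a bf16 base model
    else:                             # "qlora"
        low_bit_format = "nf4"        # NF4 tends to beat plain int4 (QLoRA paper)
    lr_scheduler_type = "constant" if training_mode == "qalora" else "cosine"
    return low_bit_format, lr_scheduler_type
```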
Here, we provide example usages on different hardware. Please refer to the appropriate script based on your device:
#### QLoRA
@ -97,6 +99,26 @@ bash qalora_finetune_llama2_7b_arc_2_card.sh
bash qalora_finetune_llama2_7b_pvc_1550_1_tile.sh
```
#### LoRA
##### Finetuning LLaMA2-7B on four Intel Data Center GPU Max 1100
```bash
bash lora_finetune_llama2_7b_pvc_1100_1_card.sh
```
##### Finetuning LLaMA2-7B on single Tile Intel Data Center GPU Max 1550
```bash
bash lora_finetune_llama2_7b_pvc_1550_1_tile.sh
```
##### Finetuning LLaMA2-7B on four Intel Data Center GPU Max 1550
```bash
bash lora_finetune_llama2_7b_pvc_1550_4_card.sh
```
### 4. (Optional) Resume Training
If you fail to complete the whole finetuning process, it is suggested to resume training from a previously saved checkpoint by setting `resume_from_checkpoint` to the local checkpoint folder, as follows:
```bash
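# Illustrative only: the model id, data path and checkpoint folder below are
# placeholders; point --resume_from_checkpoint at your own saved checkpoint.
python ./alpaca_qlora_finetuning.py \
    --base_model "meta-llama/Llama-2-7b-hf" \
    --data_path "yahma/alpaca-cleaned" \
    --output_dir "./bigdl-qlora-alpaca" \
    --resume_from_checkpoint "./bigdl-qlora-alpaca/checkpoint-1100"
```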

View file

@ -48,10 +48,11 @@ from utils.prompter import Prompter
import intel_extension_for_pytorch as ipex
from bigdl.llm.transformers import AutoModelForCausalLM
# import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model
from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\
cast_lora_weight, LoraConfig
LoraConfig
from bigdl.llm.utils.common import invalidInputError
def get_int_from_env(env_keys, default):
"""Returns the first positive env value found in the `env_keys` list or the default."""
@ -109,8 +110,10 @@ def train(
prompt_template_name: str = "alpaca", # The prompt template to use, will default to alpaca.
gradient_checkpointing: bool = False,
deepspeed: str = None,
qa_lora: bool = False, # if True, use qa-lora https://arxiv.org/abs/2309.14717
training_mode: str = "qlora",
):
invalidInputError(training_mode in ["qlora", "qalora", "lora"],
"Only qlora / qalora / lora are supported for training_mode now.")
if int(os.environ.get("LOCAL_RANK", 0)) == 0:
print(
f"Training Alpaca-LoRA model with params:\n"
@ -136,7 +139,7 @@ def train(
f"wandb_log_model: {wandb_log_model}\n"
f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
f"prompt template: {prompt_template_name}\n"
f"qa_lora: {qa_lora}\n"
f"training_mode: {training_mode}\n"
)
assert (
base_model
@ -175,7 +178,12 @@ def train(
else:
# According to the QLoRA paper, using "nf4" could yield better model quality than "int4"
# Default 4-bit format for qa-lora is sym_int4
low_bit_format = "sym_int4" if qa_lora else "nf4"
if training_mode == "qalora":
low_bit_format = "sym_int4"
elif training_mode == "lora":
low_bit_format = "bf16"
else:
low_bit_format = "nf4"
# Load the base model from a directory or the HF Hub to 4-bit format
model = AutoModelForCausalLM.from_pretrained(
base_model,
@ -196,7 +204,7 @@ def train(
0 # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left" # Allow batched inference
print(model)
def tokenize(prompt, add_eos_token=True):
@ -257,7 +265,7 @@ def train(
lora_dropout=lora_dropout,
bias="none",
task_type="CAUSAL_LM",
qa_lora=qa_lora,
training_mode=training_mode,
)
print(f"Lora Config: {config}")
model = get_peft_model(model, config)
@ -301,7 +309,7 @@ def train(
max_grad_norm=0.3,
num_train_epochs=num_epochs,
learning_rate=learning_rate,
lr_scheduler_type="constant" if qa_lora else "cosine",
lr_scheduler_type="constant" if training_mode=="qalora" else "cosine",
bf16=True,  # ensure more stable training
logging_steps=1,
optim="adamw_torch",

View file

@ -0,0 +1,31 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
export MASTER_ADDR=127.0.0.1
export OMP_NUM_THREADS=14
export FI_PROVIDER=tcp
export CCL_ATL_TRANSPORT=ofi
mpirun -n 4 \
python -u ./alpaca_qlora_finetuning.py \
--micro_batch_size 8 \
--batch_size 128 \
--base_model "meta-llama/Llama-2-7b-hf" \
--data_path "yahma/alpaca-cleaned" \
--output_dir "./bigdl-lora-alpaca" \
--gradient_checkpointing True \
--lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj']" \
--training_mode "lora"

View file

@ -0,0 +1,26 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You can also set `--base_model` to the local path of the Hugging Face model checkpoint folder and `--data_path` to the local path of the dataset JSON file
python ./alpaca_qlora_finetuning.py \
--micro_batch_size 8 \
--batch_size 128 \
--base_model "meta-llama/Llama-2-7b-hf" \
--data_path "yahma/alpaca-cleaned" \
--output_dir "./bigdl-lora-alpaca" \
--gradient_checkpointing True \
--lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj']" \
--training_mode "lora"

View file

@ -0,0 +1,31 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
export MASTER_ADDR=127.0.0.1
export OMP_NUM_THREADS=7
export FI_PROVIDER=tcp
export CCL_ATL_TRANSPORT=ofi
mpirun -n 8 \
python -u ./alpaca_qlora_finetuning.py \
--micro_batch_size 8 \
--batch_size 128 \
--base_model "meta-llama/Llama-2-7b-hf" \
--data_path "yahma/alpaca-cleaned" \
--output_dir "./bigdl-lora-alpaca" \
--gradient_checkpointing False \
--lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj']" \
--training_mode "lora"

View file

@ -26,4 +26,4 @@ python ./alpaca_qlora_finetuning.py \
--lora_alpha 16 \
--lora_dropout 0.05 \
--val_set_size 2000 \
--qa_lora True
--training_mode "qalora"

View file

@ -31,4 +31,4 @@ mpirun -n 2 \
--lora_alpha 16 \
--lora_dropout 0.05 \
--val_set_size 2000 \
--qa_lora True > training.log
--training_mode "qalora" > training.log

View file

@ -24,7 +24,7 @@ mpirun -n 2 \
--base_model "meta-llama/Llama-2-7b-hf" \
--data_path "yahma/alpaca-cleaned" \
--output_dir "./bigdl-qlora-alpaca" \
--qa_lora True \
--training_mode "qalora" \
--learning_rate 9e-5 \
--micro_batch_size 8 \
--batch_size 128 \

View file

@ -28,4 +28,4 @@ python ./alpaca_qlora_finetuning.py \
--lora_alpha 16 \
--lora_dropout 0.05 \
--val_set_size 2000 \
--qa_lora True
--training_mode "qalora"

View file

@ -57,7 +57,7 @@ if __name__ == "__main__":
target_modules=["q_proj", "k_proj", "v_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

View file

@ -38,7 +38,8 @@ ggml_tensor_qtype = {"sym_int4": 2, # q4_0 in ggml
"mixed_fp4": 17, # Mixture of Formats Quantization 4 bits
"mixed_fp8": 18, # Mixture of Formats Quantization 8 bits
"fp8_e5m2": 19, # fp8 in e5m2 format
"fp8": 15} # fp8 in e4m3 format
"fp8": 15, # fp8 in e4m3 format
"bf16": 20}
_llama_quantize_type = {"q4_0": 2,
"q4_1": 3,

View file

@ -172,7 +172,8 @@ def convert_gptq(module, awq=False):
def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
current_key_name=None, convert_shape_only=False,
cpu_embedding=False):
from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params, FP16Linear
from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \
FP16Linear, BF16Linear
from bigdl.llm.transformers.embedding import LLMEmbedding
has_been_replaced = False
@ -212,7 +213,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
if has_bias:
new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
.to(device_type)
elif qtype != ggml_tensor_qtype["fp16"]:
elif qtype not in [ggml_tensor_qtype["fp16"], ggml_tensor_qtype["bf16"]]:
new_linear = LowBitLinear(
in_features,
out_features,
@ -233,7 +234,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
if module.bias is not None:
new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
.to(device_type)
else:
elif qtype == ggml_tensor_qtype["fp16"]:
# only support two size now
# may generalize to other sizes
if module.in_features in [4096, 11008]:
@ -259,8 +260,20 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
if module.bias is not None:
new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
.to(device_type)
elif qtype == ggml_tensor_qtype["bf16"]:
new_linear = BF16Linear(
in_features,
out_features,
module.bias is not None,
mp_group=mp_group,
)
device_type = module.weight.data.device.type
# convert here
new_linear._parameters['weight'] = nn.Parameter(module.weight)
if module.bias is not None:
new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
.to(device_type)
# fp16 may generalize to other sizes later
if new_linear is not None:
if not module.training:
new_linear.eval()

View file

@ -589,3 +589,37 @@ class FP16Linear(nn.Linear):
result += self.bias
return result.to(x.dtype)
class BF16Linear(nn.Linear):
def __init__(self, input_features, output_features, bias=True,
mp_group=None, compute_dtype=None):
super().__init__(input_features, output_features, bias)
self.in_len = input_features
self.out_len = output_features
self.weight_shape = (self.out_len, self.in_len)
self.weight_length = self.out_len * self.in_len
self.mp_group = mp_group
self.compute_dtype = compute_dtype
def forward(self, x: torch.Tensor):
# only works on GPU (XPU) for now
invalidInputError(x.device.type == "xpu",
"bf16 only works for GPU now")
is_training = self.training and not torch.is_inference_mode_enabled()
if is_training:
# below logic is only for training
autocast_dtype = get_autocast_dtype(x)
if self.compute_dtype is not None and x.device.type == "xpu":
x = x.to(self.compute_dtype) # solve GC issue for unlora module
elif autocast_dtype is not None:
x = x.to(autocast_dtype)
if self.bias is not None and self.bias.dtype != x.dtype:
self.bias.data = self.bias.data.to(x.dtype)
result = F.linear(x, self.weight)
if self.bias is not None:
result += self.bias
return result.to(x.dtype)

View file

@ -106,7 +106,7 @@ class _BaseAutoModelClass:
if the model is GPTQ model.
Default to be False.
:param load_in_low_bit: str value, options are sym_int4, asym_int4, sym_int5, asym_int5
, sym_int8, nf3, nf4, fp4, fp8, fp8_e4m3, fp8_e5m2 or fp16.
, sym_int8, nf3, nf4, fp4, fp8, fp8_e4m3, fp8_e5m2, fp16 or bf16.
sym_int4 means symmetric int 4, asym_int4 means asymmetric int 4,
nf4 means 4-bit NormalFloat, etc. Relevant low bit optimizations
will be applied to the model.
@ -231,7 +231,7 @@ class _BaseAutoModelClass:
invalidInputError(q_k in ggml_tensor_qtype,
f"Unknown load_in_low_bit value: {q_k}, expected:"
f" sym_int4, asym_int4, sym_int5, asym_int5, sym_int8, nf3, nf4, "
"fp4, fp8, fp8_e4m3, fp8_e5m2, fp16, mixed_fp4 or mixed_fp8.")
"fp4, fp8, fp8_e4m3, fp8_e5m2, fp16, bf16, mixed_fp4 or mixed_fp8.")
qtype = ggml_tensor_qtype[q_k]
# In case it needs a second try,
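
For illustration only (not part of this diff), the new `bf16` option can then be requested through `load_in_low_bit` when loading a base model; the model id and the extra kwargs below are assumptions that mirror the Alpaca finetuning example in this PR:

```python
import torch
from bigdl.llm.transformers import AutoModelForCausalLM

# Minimal sketch: request the new "bf16" format via load_in_low_bit.
# The model id and the optimize_model / torch_dtype kwargs are illustrative.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    load_in_low_bit="bf16",
    optimize_model=False,
    torch_dtype=torch.bfloat16,
)
```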

View file

@ -32,7 +32,7 @@
# limitations under the License.
#
# Some parts of this file is adapted from
# https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/training_args.py
# https://github.com/huggingface/peft/blob/v0.5.0/src/peft/tuners/lora.py
#
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
@ -49,11 +49,12 @@
# limitations under the License.
import torch
from bigdl.llm.transformers.low_bit_linear import LowBitLinear, get_qk_size
from bigdl.llm.transformers.low_bit_linear import LowBitLinear, BF16Linear, get_qk_size
from peft.tuners.lora import LoraLayer
from bigdl.llm.utils.common import invalidInputError
from bigdl.llm.transformers.utils import get_autocast_dtype
import functools
from bigdl.llm.transformers import training_patch
class LoraLowBitLinear(LowBitLinear, LoraLayer):
@ -128,22 +129,98 @@ class LoraLowBitLinear(LowBitLinear, LoraLayer):
return result
class LoraBF16Linear(BF16Linear, LoraLayer):
# Lora implemented in a dense layer
def __init__(
self,
adapter_name,
in_features,
out_features,
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0.0,
**kwargs,
):
BF16Linear.__init__(
self,
in_features,
out_features,
bias=kwargs.get("bias", True),
compute_dtype=torch.bfloat16,
)
LoraLayer.__init__(self, in_features=in_features, out_features=out_features)
# Freezing the pre-trained weight matrix
self.weight.requires_grad = False
init_lora_weights = kwargs.pop("init_lora_weights", True)
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
self.active_adapter = adapter_name
def forward(self, x: torch.Tensor):
autocast_dtype = get_autocast_dtype(x)
if x.device.type == "xpu":
# force to use bf16 on gpu
x = x.to(torch.bfloat16)
elif autocast_dtype is not None:
x = x.to(autocast_dtype)
result = super().forward(x)
if self.disable_adapters or self.active_adapter not in self.lora_A.keys():
return result
elif self.r[self.active_adapter] > 0:
result = result.clone()
if autocast_dtype is None and x.device.type == "cpu":
expected_dtype = result.dtype
x = x.to(self.lora_A[self.active_adapter].weight.dtype)
output = (
self.lora_B[self.active_adapter](
self.lora_A[self.active_adapter](
self.lora_dropout[self.active_adapter](x))
).to(expected_dtype)
* self.scaling[self.active_adapter]
)
else:
output = (
self.lora_B[self.active_adapter](
self.lora_A[self.active_adapter](
self.lora_dropout[self.active_adapter](x))
)
* self.scaling[self.active_adapter]
)
result += output
return result
def _create_new_module(create_new_module_func, lora_config, adapter_name, target, **kwargs):
if isinstance(target, LowBitLinear):
if isinstance(target, LowBitLinear) or isinstance(target, BF16Linear):
low_bit_kwargs = kwargs.copy()
bias = low_bit_kwargs.pop("bias", False)
low_bit_kwargs.update(
{
"qtype": target.qtype,
"qa_lora": lora_config.qa_lora if hasattr(lora_config, "qa_lora") else False,
}
)
new_module = LoraLowBitLinear(adapter_name,
target.in_features,
target.out_features,
bias=bias,
**low_bit_kwargs)
if hasattr(lora_config, "training_mode") and lora_config.training_mode == "lora":
new_module = LoraBF16Linear(adapter_name,
target.in_features,
target.out_features,
bias=bias,
**low_bit_kwargs)
else:
if hasattr(lora_config, "training_mode"):
qa_lora = lora_config.training_mode == "qalora"
else:
qa_lora = False
low_bit_kwargs.update(
{
"qtype": target.qtype,
"qa_lora": qa_lora
}
)
new_module = LoraLowBitLinear(adapter_name,
target.in_features,
target.out_features,
bias=bias,
**low_bit_kwargs)
else:
new_module = create_new_module_func(lora_config, adapter_name, target, **kwargs)
@ -157,8 +234,7 @@ from dataclasses import dataclass, field
@dataclass
class LoraConfig(LoraConfigBase):
qa_lora: bool = field(default=False, metadata={"help": "enable qa-lora"})
training_mode: str = field(default="qlora", metadata={"help": "determine training mode"})
def get_peft_model(*args, **kwargs):
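
For context, a usage sketch (not part of this diff): the extended `LoraConfig` is built like peft's, with the extra `training_mode` field, and passed to BigDL-LLM's `get_peft_model`; the hyperparameters below are illustrative values taken from the examples above, and `model` is assumed to be a base model loaded via `bigdl.llm.transformers`.

```python
from bigdl.llm.transformers.qlora import get_peft_model, LoraConfig

# Illustrative hyperparameters; `model` is a previously loaded base model.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    training_mode="lora",   # or "qlora" / "qalora"
)
model = get_peft_model(model, config)
```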
@ -237,158 +313,10 @@ class PeftModel:
return model
def patch_prepare_ipex(self, *args):
return tuple(args)
from transformers.utils import (
requires_backends,
is_sagemaker_mp_enabled,
is_accelerate_available,
is_torch_xpu_available,
is_sagemaker_dp_enabled,
is_torch_tpu_available,
is_torch_npu_available)
from transformers.utils.generic import strtobool
from transformers.utils import cached_property
from transformers.training_args import logger, ParallelMode, DistributedType
import torch
import torch.distributed as dist
import os
import warnings
from datetime import timedelta
if is_accelerate_available():
from accelerate.state import AcceleratorState, PartialState
from accelerate.utils import DistributedType
if is_sagemaker_mp_enabled():
import smdistributed.modelparallel.torch as smp
smp.init()
@cached_property
def _setup_devices(self) -> "torch.device":
requires_backends(self, ["torch"])
logger.info("PyTorch: setting up devices")
if not is_sagemaker_mp_enabled():
if not is_accelerate_available(min_version="0.20.1"):
invalidInputError(
False,
"Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: "
"Please run `pip install transformers[torch]` or `pip install accelerate -U`"
)
AcceleratorState._reset_state(reset_partial_state=True)
self.distributed_state = None
if not self.use_ipex and "ACCELERATE_USE_IPEX" not in os.environ:
os.environ["ACCELERATE_USE_IPEX"] = "false"
if self.use_cpu or strtobool(os.environ.get("ACCELERATE_USE_CPU", "False")):
self.distributed_state = PartialState(cpu=True, backend=self.ddp_backend)
self._n_gpu = 0
elif is_sagemaker_mp_enabled():
local_rank = smp.local_rank()
device = torch.device("cuda", local_rank)
self._n_gpu = 1
torch.cuda.set_device(device)
elif is_torch_xpu_available() and "ACCELERATE_USE_XPU" not in os.environ:
os.environ["ACCELERATE_USE_XPU"] = "true"
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
# device = torch.device("xpu:0")
device = self.distributed_state.device
self._n_gpu = 1
elif is_sagemaker_dp_enabled():
self.distributed_state = PartialState(_use_sagemaker_dp=True)
self._n_gpu = 1
elif self.deepspeed:
# Need to do similar for Accelerator init
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
del os.environ["ACCELERATE_USE_DEEPSPEED"]
self._n_gpu = 1
else:
self.distributed_state = PartialState(
backend=self.ddp_backend, timeout=timedelta(seconds=self.ddp_timeout)
)
self._n_gpu = 1
if not is_sagemaker_mp_enabled():
device = self.distributed_state.device
self.local_rank = self.distributed_state.local_process_index
if dist.is_available() and dist.is_initialized() and \
self.parallel_mode != ParallelMode.DISTRIBUTED:
logger.warning(
"torch.distributed process group is initialized, "
"but parallel_mode != ParallelMode.DISTRIBUTED. "
"In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
)
if is_torch_tpu_available():
device = self.distributed_state.device
self._n_gpu = 0
elif is_sagemaker_dp_enabled() or is_sagemaker_mp_enabled():
# Already set _n_gpu
pass
elif self.distributed_state.distributed_type == DistributedType.MULTI_XPU:
if "ACCELERATE_USE_XPU" not in os.environ:
os.environ["ACCELERATE_USE_XPU"] = "true"
# self._n_gpu = torch.xpu.device_count()
# device = torch.device("xpu:0")
# torch.xpu.set_device(device)
elif self.distributed_state.distributed_type == DistributedType.NO:
if self.use_mps_device:
warnings.warn(
"`use_mps_device` is deprecated and will be removed in"
" version 5.0 of 🤗 Transformers."
"`mps` device will be used by default if available similar"
" to the way `cuda` device is used."
"Therefore, no action from user is required. "
)
if device.type != "mps":
invalidInputError(False,
("Either you do not have an MPS-enabled device"
" on this machine or MacOS"
" version is not 12.3+ "
"or current PyTorch install was not built with MPS enabled."))
if device.type == "mps":
self._n_gpu = 1
elif self.use_cpu:
device = torch.device("cpu")
self._n_gpu = 0
elif is_torch_xpu_available():
device = torch.device("xpu:0")
torch.xpu.set_device(device)
self._n_gpu = 1
elif is_torch_npu_available():
device = torch.device("npu:0")
torch.npu.set_device(device)
self._n_gpu = 1
else:
# if n_gpu is > 1 we'll use nn.DataParallel.
# If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
# Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
# trigger an error that a device index is missing. Index 0 takes into account the
# GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
# will use the first GPU in that env, i.e. GPU#1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Sometimes the line in the postinit has not been run before we end up here,
# so just checking we're not at
# the default value.
self._n_gpu = torch.cuda.device_count()
if device.type == "cuda":
torch.cuda.set_device(device)
return device
from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING
PEFT_TYPE_TO_CONFIG_MAPPING["lora"] = LoraConfig
# work around an IPEX bug that prevents resuming training in bf16
from accelerate import Accelerator
Accelerator._prepare_ipex = patch_prepare_ipex
# patch transformers for XPU DDP training
from transformers import TrainingArguments
TrainingArguments._setup_devices = _setup_devices
def cast_lora_weight(model, dtype=torch.bfloat16):
for name, module in model.named_modules():
@ -396,6 +324,9 @@ def cast_lora_weight(model, dtype=torch.bfloat16):
module.compute_dtype = dtype
if isinstance(module, LoraLayer):
module = module.to(dtype)
if isinstance(module, BF16Linear):
module = module.to(dtype)
module.compute_dtype = dtype
if 'norm' in name:
module = module.to(torch.float32)
if 'lm_head' in name or 'embed_tokens' in name:

View file

@ -0,0 +1,198 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://github.com/huggingface/peft/blob/v0.5.0/src/peft/tuners/lora.py
#
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/training_args.py
#
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def patch_prepare_ipex(self, *args):
return tuple(args)
from transformers.utils import (
requires_backends,
is_sagemaker_mp_enabled,
is_accelerate_available,
is_torch_xpu_available,
is_sagemaker_dp_enabled,
is_torch_tpu_available,
is_torch_npu_available)
from transformers.utils.generic import strtobool
from transformers.utils import cached_property
from transformers.training_args import logger, ParallelMode, DistributedType
import torch
import torch.distributed as dist
import os
import warnings
from datetime import timedelta
if is_accelerate_available():
from accelerate.state import AcceleratorState, PartialState
from accelerate.utils import DistributedType
if is_sagemaker_mp_enabled():
import smdistributed.modelparallel.torch as smp
smp.init()
@cached_property
def _setup_devices(self) -> "torch.device":
requires_backends(self, ["torch"])
logger.info("PyTorch: setting up devices")
if not is_sagemaker_mp_enabled():
if not is_accelerate_available(min_version="0.20.1"):
invalidInputError(
False,
"Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: "
"Please run `pip install transformers[torch]` or `pip install accelerate -U`"
)
AcceleratorState._reset_state(reset_partial_state=True)
self.distributed_state = None
if not self.use_ipex and "ACCELERATE_USE_IPEX" not in os.environ:
os.environ["ACCELERATE_USE_IPEX"] = "false"
if self.use_cpu or strtobool(os.environ.get("ACCELERATE_USE_CPU", "False")):
self.distributed_state = PartialState(cpu=True, backend=self.ddp_backend)
self._n_gpu = 0
elif is_sagemaker_mp_enabled():
local_rank = smp.local_rank()
device = torch.device("cuda", local_rank)
self._n_gpu = 1
torch.cuda.set_device(device)
elif is_torch_xpu_available() and "ACCELERATE_USE_XPU" not in os.environ:
os.environ["ACCELERATE_USE_XPU"] = "true"
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
# device = torch.device("xpu:0")
device = self.distributed_state.device
self._n_gpu = 1
elif is_sagemaker_dp_enabled():
self.distributed_state = PartialState(_use_sagemaker_dp=True)
self._n_gpu = 1
elif self.deepspeed:
# Need to do similar for Accelerator init
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
del os.environ["ACCELERATE_USE_DEEPSPEED"]
self._n_gpu = 1
else:
self.distributed_state = PartialState(
backend=self.ddp_backend, timeout=timedelta(seconds=self.ddp_timeout)
)
self._n_gpu = 1
if not is_sagemaker_mp_enabled():
device = self.distributed_state.device
self.local_rank = self.distributed_state.local_process_index
if dist.is_available() and dist.is_initialized() and \
self.parallel_mode != ParallelMode.DISTRIBUTED:
logger.warning(
"torch.distributed process group is initialized, "
"but parallel_mode != ParallelMode.DISTRIBUTED. "
"In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
)
if is_torch_tpu_available():
device = self.distributed_state.device
self._n_gpu = 0
elif is_sagemaker_dp_enabled() or is_sagemaker_mp_enabled():
# Already set _n_gpu
pass
elif self.distributed_state.distributed_type == DistributedType.MULTI_XPU:
if "ACCELERATE_USE_XPU" not in os.environ:
os.environ["ACCELERATE_USE_XPU"] = "true"
# self._n_gpu = torch.xpu.device_count()
# device = torch.device("xpu:0")
# torch.xpu.set_device(device)
elif self.distributed_state.distributed_type == DistributedType.NO:
if self.use_mps_device:
warnings.warn(
"`use_mps_device` is deprecated and will be removed in"
" version 5.0 of 🤗 Transformers."
"`mps` device will be used by default if available similar"
" to the way `cuda` device is used."
"Therefore, no action from user is required. "
)
if device.type != "mps":
invalidInputError(False,
("Either you do not have an MPS-enabled device"
" on this machine or MacOS"
" version is not 12.3+ "
"or current PyTorch install was not built with MPS enabled."))
if device.type == "mps":
self._n_gpu = 1
elif self.use_cpu:
device = torch.device("cpu")
self._n_gpu = 0
elif is_torch_xpu_available():
device = torch.device("xpu:0")
torch.xpu.set_device(device)
self._n_gpu = 1
elif is_torch_npu_available():
device = torch.device("npu:0")
torch.npu.set_device(device)
self._n_gpu = 1
else:
# if n_gpu is > 1 we'll use nn.DataParallel.
# If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
# Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
# trigger an error that a device index is missing. Index 0 takes into account the
# GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
# will use the first GPU in that env, i.e. GPU#1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Sometimes the line in the postinit has not been run before we end up here,
# so just checking we're not at
# the default value.
self._n_gpu = torch.cuda.device_count()
if device.type == "cuda":
torch.cuda.set_device(device)
return device
# remove ipex.optimize
from accelerate import Accelerator
Accelerator._prepare_ipex = patch_prepare_ipex
# patch transformers for XPU DDP training
from transformers import TrainingArguments
TrainingArguments._setup_devices = _setup_devices