LLM: Support bitsandbytes config on qlora finetune (#9715)
* test support bitsandbytesconfig
* update style
* update cpu example
* update example
* update readme
* update unit test
* use bfloat16
* update logic
* use int4
* set default bnb_4bit_use_double_quant
* update
* update example
* update model.py
* update
* support lora example
Parent: 9a14465560
Commit: 4ceefc9b18
10 changed files with 126 additions and 27 deletions
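At a glance, this commit lets BigDL-LLM's `from_pretrained` accept a Hugging Face `BitsAndBytesConfig` through the standard `quantization_config` argument, as an alternative to BigDL's own `load_in_low_bit` keyword. The snippet below is a minimal sketch of the usage pattern, assembled from the example diffs that follow; the model id is a placeholder, and `"nf4"` should be replaced with `"int4"` on CPU, where nf4 is not supported yet.

```python
import torch
from transformers import BitsAndBytesConfig
from bigdl.llm.transformers import AutoModelForCausalLM

# Describe 4-bit quantization with the standard bitsandbytes config object.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,   # double quant is not supported; True is ignored with a warning
    bnb_4bit_quant_type="nf4",         # use "int4" on CPU (mapped to BigDL's sym_int4)
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# BigDL-LLM translates the config into its own low-bit loading arguments.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",        # placeholder model id
    quantization_config=bnb_config,
)
```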
.github/workflows/llm_unit_tests.yml (vendored) — 1 addition
@@ -304,5 +304,6 @@ jobs:
         shell: bash
         run: |
           python -m pip install transformers==4.34.0 peft==0.5.0 accelerate==0.23.0
+          python -m pip install bitsandbytes scipy
           source /opt/intel/oneapi/setvars.sh
           bash python/llm/test/run-llm-example-tests-gpu.sh

@@ -23,6 +23,7 @@ pip install transformers==4.34.0
 pip install peft==0.5.0
 pip install datasets
 pip install accelerate==0.23.0
+pip install bitsandbytes scipy
 ```
 
 ### 2. Finetune model

@@ -11,6 +11,7 @@ pip install --pre --upgrade bigdl-llm[all]
 pip install datasets transformers==4.35.0
 pip install fire peft==0.5.0
 pip install accelerate==0.23.0
+pip install bitsandbytes scipy
 ```
 
 ### 2. Configures environment variables

@@ -46,6 +46,7 @@ from peft import (
 )
 from utils.prompter import Prompter
 
+from transformers import BitsAndBytesConfig
 from bigdl.llm.transformers import AutoModelForCausalLM
 
 # import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model

@@ -177,13 +178,22 @@ def train(
         )
     else:
         # Load the base model from a directory or the HF Hub to 4-bit NormalFloat format
-        model = AutoModelForCausalLM.from_pretrained(
-            base_model,
-            load_in_low_bit="sym_int4",  # "nf4" is not supported
-            optimize_model=False,
-            torch_dtype=torch.bfloat16,
-            modules_to_not_convert=["lm_head"],
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=False,
+            bnb_4bit_quant_type="int4",  # nf4 is not supported on CPU yet
+            bnb_4bit_compute_dtype=torch.bfloat16,
         )
+        model = AutoModelForCausalLM.from_pretrained(base_model,
+                                                     quantization_config=bnb_config)
+        # The following is also supported:
+        # model = AutoModelForCausalLM.from_pretrained(
+        #     base_model,
+        #     load_in_low_bit="sym_int4",  # nf4 is not supported on CPU yet
+        #     optimize_model=False,
+        #     torch_dtype=torch.bfloat16,
+        #     modules_to_not_convert=["lm_head"],
+        # )
         print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}")
         model = model.to("cpu")
         print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}")

@@ -20,6 +20,7 @@ import os
 import transformers
 from transformers import LlamaTokenizer
 
+from transformers import BitsAndBytesConfig
 from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, LoraConfig
 from bigdl.llm.transformers import AutoModelForCausalLM
 from datasets import load_dataset

@@ -45,11 +46,24 @@ if __name__ == "__main__":
     data['train'] = data['train'].map(merge)
     # use the max_length to reduce memory usage, should be adjusted by different datasets
     data = data.map(lambda samples: tokenizer(samples["prediction"], max_length=256), batched=True)
 
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_use_double_quant=False,
+        bnb_4bit_quant_type="int4",  # nf4 is not supported on CPU yet
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
     model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                 load_in_low_bit="sym_int4",
-                                                 optimize_model=False,
-                                                 torch_dtype=torch.float16,
-                                                 modules_to_not_convert=["lm_head"],)
+                                                 quantization_config=bnb_config)
+
+    # The following is also supported:
+    # model = AutoModelForCausalLM.from_pretrained(model_path,
+    #                                              # nf4 is not supported on CPU yet
+    #                                              load_in_low_bit="sym_int4",
+    #                                              optimize_model=False,
+    #                                              torch_dtype=torch.bfloat16,
+    #                                              modules_to_not_convert=["lm_head"],)
     model = model.to('cpu')
     model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)
     model.enable_input_require_grads()

@@ -20,6 +20,7 @@ pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
 pip install datasets transformers==4.34.0
 pip install peft==0.5.0
 pip install accelerate==0.23.0
+pip install bitsandbytes scipy
 ```
 
 ### 2. Configures OneAPI environment variables

@@ -17,6 +17,7 @@ pip install datasets transformers==4.34.0
 pip install fire peft==0.5.0
 pip install oneccl_bind_pt==2.0.100 -f https://developer.intel.com/ipex-whl-stable-xpu # necessary to run distributed finetuning
 pip install accelerate==0.23.0
+pip install bitsandbytes scipy
 ```
 
 ### 2. Configures OneAPI environment variables

@@ -47,6 +47,7 @@ from peft import (
 from utils.prompter import Prompter
 
 import intel_extension_for_pytorch as ipex
+from transformers import BitsAndBytesConfig
 from bigdl.llm.transformers import AutoModelForCausalLM
 # import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model
 from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\

@@ -197,20 +198,36 @@ def train(
     # According to the QLoRA paper, using "nf4" could yield better model quality than "int4"
     # Default 4-bit format for qa-lora is sym_int4
     if training_mode == "qalora":
-        low_bit_format = "sym_int4"
+        low_bit_format = "int4"
     elif training_mode == "lora":
         low_bit_format = "bf16"
     else:
         low_bit_format = "nf4"
-    # Load the base model from a directory or the HF Hub to 4-bit format
-    model = AutoModelForCausalLM.from_pretrained(
-        base_model,
-        load_in_low_bit=low_bit_format,
-        optimize_model=False,
-        torch_dtype=torch.bfloat16,
-        # device_map=device_map,
-        modules_to_not_convert=["lm_head"],
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_use_double_quant=False,
+        bnb_4bit_quant_type=low_bit_format,
+        bnb_4bit_compute_dtype=torch.bfloat16,
     )
+    model = AutoModelForCausalLM.from_pretrained(base_model,
+                                                 quantization_config=bnb_config)
+
+    # The following is also supported:
+    # Load the base model from a directory or the HF Hub to 4-bit format
+    # if training_mode == "qalora":
+    #     low_bit_format = "sym_int4"
+    # elif training_mode == "lora":
+    #     low_bit_format = "bf16"
+    # else:
+    #     low_bit_format = "nf4"
+    # model = AutoModelForCausalLM.from_pretrained(
+    #     base_model,
+    #     load_in_low_bit=low_bit_format,
+    #     optimize_model=False,
+    #     torch_dtype=torch.bfloat16,
+    #     # device_map=device_map,
+    #     modules_to_not_convert=["lm_head"],
+    # )
     print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}")
     model = model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}')
    print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}")

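For reference, the mode selection in the hunk above reduces to a small mapping. The helper below is an illustrative sketch (not part of the commit; the function name is invented) of how `training_mode` picks the `bnb_4bit_quant_type` passed to `BitsAndBytesConfig`:

```python
def quant_type_for(training_mode: str) -> str:
    # Mirrors the if/elif/else above: qa-lora uses "int4" (BigDL's sym_int4),
    # plain LoRA keeps bf16 weights, and the default (QLoRA) uses 4-bit NormalFloat.
    if training_mode == "qalora":
        return "int4"
    elif training_mode == "lora":
        return "bf16"
    return "nf4"
```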
@@ -21,6 +21,7 @@ import transformers
 from transformers import LlamaTokenizer
 import intel_extension_for_pytorch as ipex
 from peft import LoraConfig
+from transformers import BitsAndBytesConfig
 from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training
 from bigdl.llm.transformers import AutoModelForCausalLM
 from datasets import load_dataset

@@ -41,11 +42,22 @@ if __name__ == "__main__":
 
     data = load_dataset(dataset_path)
     data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
 
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_use_double_quant=False,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
     model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                 load_in_low_bit="nf4",
-                                                 optimize_model=False,
-                                                 torch_dtype=torch.float16,
-                                                 modules_to_not_convert=["lm_head"],)
+                                                 quantization_config=bnb_config)
+
+    # The following is also supported:
+    # model = AutoModelForCausalLM.from_pretrained(model_path,
+    #                                              load_in_low_bit="nf4",
+    #                                              optimize_model=False,
+    #                                              torch_dtype=torch.bfloat16,
+    #                                              modules_to_not_convert=["lm_head"],)
     model = model.to('xpu')
     # Enable gradient_checkpointing if your memory is not enough,
     # it will slow down the training speed

@@ -88,7 +88,6 @@ def save_low_bit(self, *args, **kwargs):
 
-
 class _BaseAutoModelClass:
 
     HF_MODEL = None
 
     @classmethod

@@ -136,6 +135,48 @@ class _BaseAutoModelClass:
         optimize_model = kwargs.pop("optimize_model", True)
         user_quantization_config = kwargs.pop("quantization_config", None)
 
+        if user_quantization_config is not None and \
+                "BitsAndBytesConfig" in str(user_quantization_config.__class__):
+            if user_quantization_config.bnb_4bit_quant_type is not None:
+                bnb_4bit_type = user_quantization_config.bnb_4bit_quant_type
+                if bnb_4bit_type == "nf4":
+                    load_in_low_bit = "nf4"
+                elif bnb_4bit_type == "fp4":
+                    warnings.warn(
+                        "BigDL LLM QLoRA does not support fp4 now, use default nf4", FutureWarning)
+                    load_in_low_bit = "nf4"
+                elif bnb_4bit_type == "int4":
+                    load_in_low_bit = "sym_int4"
+                elif bnb_4bit_type == "bf16":
+                    load_in_low_bit = "bf16"
+                else:
+                    invalidInputError(False,
+                                      "Only nf4 or int4 is supported for bnb_4bit_quant_type")
+            else:
+                warnings.warn(
+                    "bnb_4bit_quant_type is None, use default int4", FutureWarning)
+                load_in_low_bit = "sym_int4"
+            if user_quantization_config.bnb_4bit_use_double_quant is True:
+                warnings.warn(
+                    "BigDL LLM QLoRA does not support double quant now, set to False",
+                    FutureWarning)
+            if user_quantization_config.bnb_4bit_compute_dtype is not None:
+                bnb_dtype = user_quantization_config.bnb_4bit_compute_dtype
+                if bnb_dtype == torch.float32:
+                    kwargs["torch_dtype"] = bnb_dtype
+                elif bnb_dtype == torch.bfloat16:
+                    kwargs["torch_dtype"] = bnb_dtype
+                else:
+                    invalidInputError(False,
+                                      "Only float32 or bfloat16"
+                                      " is supported for bnb_4bit_compute_dtype")
+            else:
+                warnings.warn(
+                    "torch_dtype is None, use default float32", FutureWarning)
+                kwargs["torch_dtype"] = torch.float32
+            optimize_model = False
+            kwargs["modules_to_not_convert"] = ["lm_head"]
+
         if load_in_4bit or load_in_low_bit:
 
             if config_dict.get("quantization_config", None) is not None:

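The `BitsAndBytesConfig` handling added to `_BaseAutoModelClass.from_pretrained` boils down to a mapping from bitsandbytes settings onto BigDL-LLM's `load_in_low_bit` and `torch_dtype` arguments, plus forcing `optimize_model=False` and keeping `lm_head` unconverted. The function below is an illustrative summary only, not the committed code: the name is invented and it raises plain `ValueError` where the real code calls BigDL's `invalidInputError`.

```python
import warnings
import torch

def map_bnb_to_bigdl(quant_type, compute_dtype, use_double_quant):
    """Return (load_in_low_bit, torch_dtype) the way the new code interprets a BitsAndBytesConfig."""
    # 4-bit quant type -> BigDL low-bit format
    if quant_type is None:
        warnings.warn("bnb_4bit_quant_type is None, use default int4", FutureWarning)
        low_bit = "sym_int4"
    elif quant_type == "nf4":
        low_bit = "nf4"
    elif quant_type == "fp4":
        warnings.warn("fp4 is not supported yet, falling back to nf4", FutureWarning)
        low_bit = "nf4"
    elif quant_type == "int4":
        low_bit = "sym_int4"
    elif quant_type == "bf16":
        low_bit = "bf16"
    else:
        raise ValueError("Only nf4 or int4 is supported for bnb_4bit_quant_type")

    # Double quantization is not supported and is ignored with a warning.
    if use_double_quant:
        warnings.warn("double quant is not supported, set to False", FutureWarning)

    # Compute dtype -> torch_dtype; only float32 and bfloat16 are accepted.
    if compute_dtype is None:
        warnings.warn("torch_dtype is None, use default float32", FutureWarning)
        torch_dtype = torch.float32
    elif compute_dtype in (torch.float32, torch.bfloat16):
        torch_dtype = compute_dtype
    else:
        raise ValueError("Only float32 or bfloat16 is supported for bnb_4bit_compute_dtype")

    return low_bit, torch_dtype
```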