LLM: bigdl-llm lora support & lora example (#9740)

* lora support and single card example

* support multi-card, refactor code

* fix model id and style

* remove torch patch, add two new classes for bf16, update example

* fix style

* change to training_mode

* small fix

* add more info in help

* fix style, update readme

* fix ut

* fix ut

* Handling compatibility issues with default LoraConfig
Ruonan Wang 2023-12-22 11:05:39 +08:00 committed by GitHub
parent ba0b939579
commit 2f36769208
16 changed files with 481 additions and 186 deletions

View file

@ -1,6 +1,6 @@
# Alpaca QLoRA & QA-LoRA Finetuning (experimental support)
# Alpaca Finetuning with BigDL-LLM
This example ports [Alpaca-LoRA](https://github.com/tloen/alpaca-lora/tree/main) to BigDL-LLM (using either [QLoRA](https://arxiv.org/abs/2305.14314) or [QA-LoRA](https://arxiv.org/abs/2309.14717) algorithm) on [Intel GPU](../../README.md).
This example ports [Alpaca-LoRA](https://github.com/tloen/alpaca-lora/tree/main) to BigDL-LLM (using the [QLoRA](https://arxiv.org/abs/2305.14314), [QA-LoRA](https://arxiv.org/abs/2309.14717) or [LoRA](https://arxiv.org/abs/2106.09685) algorithm) on [Intel GPU](../../README.md).
### 0. Requirements
To run this example with BigDL-LLM on Intel GPUs, we have some recommended requirements for your machine; please refer to [here](../../README.md#requirements) for more information.
@ -26,6 +26,8 @@ source /opt/intel/oneapi/setvars.sh
### 3. Finetune
We now support three training modes ([QLoRA](https://arxiv.org/abs/2305.14314) / [QA-LoRA](https://arxiv.org/abs/2309.14717) / [LoRA](https://arxiv.org/abs/2106.09685)); to switch modes, simply change `training_mode` to `qlora` / `qalora` / `lora` in the scripts below.
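As a rough sketch of what the mode choice implies (paraphrasing the script changes shown further down in this diff), each mode selects a different low-bit format for the base model and a different learning-rate scheduler; the helper name below is just for illustration:

```python
# A simplified sketch of how alpaca_qlora_finetuning.py interprets training_mode:
# it picks the low-bit format used to load the base model and the LR scheduler.
def select_format_and_scheduler(training_mode: str):
    if training_mode == "qalora":
        low_bit_format = "sym_int4"   # default 4-bit format for QA-LoRA
    elif training_mode == "lora":
        low_bit_format = "bf16"       # plain LoRA trains on a bf16 base model
    else:                             # "qlora"
        low_bit_format = "nf4"        # NF4 tends to beat plain int4 (QLoRA paper)
    lr_scheduler_type = "constant" if training_mode == "qalora" else "cosine"
    return low_bit_format, lr_scheduler_type
```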
Here, we provide example usages on different hardware. Please refer to the appropriate script based on your device:
#### QLoRA
@ -97,6 +99,26 @@ bash qalora_finetune_llama2_7b_arc_2_card.sh
bash qalora_finetune_llama2_7b_pvc_1550_1_tile.sh
```
#### LoRA
##### Finetuning LLaMA2-7B on four Intel Data Center GPU Max 1100
```bash
bash lora_finetune_llama2_7b_pvc_1100_1_card.sh
```
##### Finetuning LLaMA2-7B on single Tile Intel Data Center GPU Max 1550
```bash
bash lora_finetune_llama2_7b_pvc_1550_1_tile.sh
```
##### Finetuning LLaMA2-7B on four Intel Data Center GPU Max 1550
```bash
bash lora_finetune_llama2_7b_pvc_1550_4_card.sh
```
### 4. (Optional) Resume Training
If you fail to complete the whole finetuning process, it is suggested to resume training from a previously saved checkpoint by setting `resume_from_checkpoint` to the local checkpoint folder, as follows:
```bash
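# Illustrative only: the model id, data path and checkpoint folder below are
# placeholders; point --resume_from_checkpoint at your own saved checkpoint.
python ./alpaca_qlora_finetuning.py \
    --base_model "meta-llama/Llama-2-7b-hf" \
    --data_path "yahma/alpaca-cleaned" \
    --output_dir "./bigdl-qlora-alpaca" \
    --resume_from_checkpoint "./bigdl-qlora-alpaca/checkpoint-1100"
```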

View file

@ -48,10 +48,11 @@ from utils.prompter import Prompter
import intel_extension_for_pytorch as ipex
from bigdl.llm.transformers import AutoModelForCausalLM
# import them from bigdl.llm.transformers.qlora to get a BigDL-LLM compatible Peft model
from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\
cast_lora_weight, LoraConfig
LoraConfig
from bigdl.llm.utils.common import invalidInputError
def get_int_from_env(env_keys, default):
"""Returns the first positive env value found in the `env_keys` list or the default."""
@ -109,8 +110,10 @@ def train(
prompt_template_name: str = "alpaca", # The prompt template to use, will default to alpaca.
gradient_checkpointing: bool = False,
deepspeed: str = None,
qa_lora: bool = False, # if True, use qa-lora https://arxiv.org/abs/2309.14717
training_mode: str = "qlora",
):
invalidInputError(training_mode in ["qlora", "qalora", "lora"],
"Only qlora / qalora / lora are supported for training_mode now.")
if int(os.environ.get("LOCAL_RANK", 0)) == 0:
print(
f"Training Alpaca-LoRA model with params:\n"
@ -136,7 +139,7 @@ def train(
f"wandb_log_model: {wandb_log_model}\n"
f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
f"prompt template: {prompt_template_name}\n"
f"qa_lora: {qa_lora}\n"
f"training_mode: {training_mode}\n"
)
assert (
base_model
@ -175,7 +178,12 @@ def train(
else:
# According to the QLoRA paper, using "nf4" could yield better model quality than "int4"
# Default 4-bit format for qa-lora is sym_int4
low_bit_format = "sym_int4" if qa_lora else "nf4"
if training_mode == "qalora":
low_bit_format = "sym_int4"
elif training_mode == "lora":
low_bit_format = "bf16"
else:
low_bit_format = "nf4"
# Load the base model from a directory or the HF Hub to 4-bit format
model = AutoModelForCausalLM.from_pretrained(
base_model,
@ -196,7 +204,7 @@ def train(
0 # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left" # Allow batched inference
print(model)
def tokenize(prompt, add_eos_token=True):
@ -257,7 +265,7 @@ def train(
lora_dropout=lora_dropout,
bias="none",
task_type="CAUSAL_LM",
qa_lora=qa_lora,
training_mode=training_mode,
)
print(f"Lora Config: {config}")
model = get_peft_model(model, config)
@ -301,7 +309,7 @@ def train(
max_grad_norm=0.3,
num_train_epochs=num_epochs,
learning_rate=learning_rate,
lr_scheduler_type="constant" if qa_lora else "cosine",
lr_scheduler_type="constant" if training_mode=="qalora" else "cosine",
bf16=True,  # ensure more stable training
logging_steps=1,
optim="adamw_torch",

View file

@ -0,0 +1,31 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
export MASTER_ADDR=127.0.0.1
export OMP_NUM_THREADS=14
export FI_PROVIDER=tcp
export CCL_ATL_TRANSPORT=ofi
mpirun -n 4 \
python -u ./alpaca_qlora_finetuning.py \
--micro_batch_size 8 \
--batch_size 128 \
--base_model "meta-llama/Llama-2-7b-hf" \
--data_path "yahma/alpaca-cleaned" \
--output_dir "./bigdl-lora-alpaca" \
--gradient_checkpointing True \
--lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj']" \
--training_mode "lora"

View file

@ -0,0 +1,26 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You can also set `--base_model` to the local path of the Hugging Face model checkpoint folder and `--data_path` to the local path of the dataset JSON file
python ./alpaca_qlora_finetuning.py \
--micro_batch_size 8 \
--batch_size 128 \
--base_model "meta-llama/Llama-2-7b-hf" \
--data_path "yahma/alpaca-cleaned" \
--output_dir "./bigdl-lora-alpaca" \
--gradient_checkpointing True \
--lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj']" \
--training_mode "lora"

View file

@ -0,0 +1,31 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
export MASTER_ADDR=127.0.0.1
export OMP_NUM_THREADS=7
export FI_PROVIDER=tcp
export CCL_ATL_TRANSPORT=ofi
mpirun -n 8 \
python -u ./alpaca_qlora_finetuning.py \
--micro_batch_size 8 \
--batch_size 128 \
--base_model "meta-llama/Llama-2-7b-hf" \
--data_path "yahma/alpaca-cleaned" \
--output_dir "./bigdl-lora-alpaca" \
--gradient_checkpointing False \
--lora_target_modules "['k_proj', 'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj']" \
--training_mode "lora"

View file

@ -26,4 +26,4 @@ python ./alpaca_qlora_finetuning.py \
--lora_alpha 16 \
--lora_dropout 0.05 \
--val_set_size 2000 \
--qa_lora True
--training_mode "qalora"

View file

@ -31,4 +31,4 @@ mpirun -n 2 \
--lora_alpha 16 \
--lora_dropout 0.05 \
--val_set_size 2000 \
--qa_lora True > training.log
--training_mode "qalora" > training.log

View file

@ -24,7 +24,7 @@ mpirun -n 2 \
--base_model "meta-llama/Llama-2-7b-hf" \
--data_path "yahma/alpaca-cleaned" \
--output_dir "./bigdl-qlora-alpaca" \
--qa_lora True \
--training_mode "qalora" \
--learning_rate 9e-5 \
--micro_batch_size 8 \
--batch_size 128 \

View file

@ -28,4 +28,4 @@ python ./alpaca_qlora_finetuning.py \
--lora_alpha 16 \
--lora_dropout 0.05 \
--val_set_size 2000 \
--qa_lora True
--training_mode "qalora"

View file

@ -57,7 +57,7 @@ if __name__ == "__main__":
target_modules=["q_proj", "k_proj", "v_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

View file

@ -38,7 +38,8 @@ ggml_tensor_qtype = {"sym_int4": 2, # q4_0 in ggml
"mixed_fp4": 17, # Mixture of Formats Quantization 4 bits
"mixed_fp8": 18, # Mixture of Formats Quantization 8 bits
"fp8_e5m2": 19, # fp8 in e5m2 format
"fp8": 15} # fp8 in e4m3 format
"fp8": 15, # fp8 in e4m3 format
"bf16": 20}
_llama_quantize_type = {"q4_0": 2,
"q4_1": 3,

View file

@ -172,7 +172,8 @@ def convert_gptq(module, awq=False):
def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
current_key_name=None, convert_shape_only=False,
cpu_embedding=False):
from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params, FP16Linear
from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \
FP16Linear, BF16Linear
from bigdl.llm.transformers.embedding import LLMEmbedding
has_been_replaced = False
@ -212,7 +213,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
if has_bias:
new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
.to(device_type)
elif qtype != ggml_tensor_qtype["fp16"]:
elif qtype not in [ggml_tensor_qtype["fp16"], ggml_tensor_qtype["bf16"]]:
new_linear = LowBitLinear(
in_features,
out_features,
@ -233,7 +234,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
if module.bias is not None:
new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
.to(device_type)
else:
elif qtype == ggml_tensor_qtype["fp16"]:
# only support two size now
# may generalize to other sizes
if module.in_features in [4096, 11008]:
@ -259,8 +260,20 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
if module.bias is not None:
new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
.to(device_type)
elif qtype == ggml_tensor_qtype["bf16"]:
new_linear = BF16Linear(
in_features,
out_features,
module.bias is not None,
mp_group=mp_group,
)
device_type = module.weight.data.device.type
# convert here
new_linear._parameters['weight'] = nn.Parameter(module.weight)
if module.bias is not None:
new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
.to(device_type)
# fp16 may generalize to other sizes later
if new_linear is not None:
if not module.training:
new_linear.eval()

View file

@ -589,3 +589,37 @@ class FP16Linear(nn.Linear):
result += self.bias
return result.to(x.dtype)
class BF16Linear(nn.Linear):
def __init__(self, input_features, output_features, bias=True,
mp_group=None, compute_dtype=None):
super().__init__(input_features, output_features, bias)
self.in_len = input_features
self.out_len = output_features
self.weight_shape = (self.out_len, self.in_len)
self.weight_length = self.out_len * self.in_len
self.mp_group = mp_group
self.compute_dtype = compute_dtype
def forward(self, x: torch.Tensor):
# only works on GPU (XPU) for now
invalidInputError(x.device.type == "xpu",
"bf16 only works for GPU now")
is_training = self.training and not torch.is_inference_mode_enabled()
if is_training:
# below logic is only for training
autocast_dtype = get_autocast_dtype(x)
if self.compute_dtype is not None and x.device.type == "xpu":
x = x.to(self.compute_dtype) # solve GC issue for unlora module
elif autocast_dtype is not None:
x = x.to(autocast_dtype)
if self.bias is not None and self.bias.dtype != x.dtype:
self.bias.data = self.bias.data.to(x.dtype)
result = F.linear(x, self.weight)
if self.bias is not None:
result += self.bias
return result.to(x.dtype)

View file

@ -106,7 +106,7 @@ class _BaseAutoModelClass:
if the model is GPTQ model.
Default to be False.
:param load_in_low_bit: str value, options are sym_int4, asym_int4, sym_int5, asym_int5
, sym_int8, nf3, nf4, fp4, fp8, fp8_e4m3, fp8_e5m2 or fp16.
, sym_int8, nf3, nf4, fp4, fp8, fp8_e4m3, fp8_e5m2, fp16 or bf16.
sym_int4 means symmetric int 4, asym_int4 means asymmetric int 4,
nf4 means 4-bit NormalFloat, etc. Relevant low bit optimizations
will be applied to the model.
@ -231,7 +231,7 @@ class _BaseAutoModelClass:
invalidInputError(q_k in ggml_tensor_qtype,
f"Unknown load_in_low_bit value: {q_k}, expected:"
f" sym_int4, asym_int4, sym_int5, asym_int5, sym_int8, nf3, nf4, "
"fp4, fp8, fp8_e4m3, fp8_e5m2, fp16, mixed_fp4 or mixed_fp8.")
"fp4, fp8, fp8_e4m3, fp8_e5m2, fp16, bf16, mixed_fp4 or mixed_fp8.")
qtype = ggml_tensor_qtype[q_k]
# In case it needs a second try,
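
For illustration only (not part of this diff), the new `bf16` option can then be requested through `load_in_low_bit` when loading a base model; the model id and the extra kwargs below are assumptions that mirror the Alpaca finetuning example in this PR:

```python
import torch
from bigdl.llm.transformers import AutoModelForCausalLM

# Minimal sketch: request the new "bf16" format via load_in_low_bit.
# The model id and the optimize_model / torch_dtype kwargs are illustrative.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    load_in_low_bit="bf16",
    optimize_model=False,
    torch_dtype=torch.bfloat16,
)
```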

View file

@ -32,7 +32,7 @@
# limitations under the License.
#
# Some parts of this file is adapted from
# https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/training_args.py
# https://github.com/huggingface/peft/blob/v0.5.0/src/peft/tuners/lora.py
#
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
@ -49,11 +49,12 @@
# limitations under the License.
import torch
from bigdl.llm.transformers.low_bit_linear import LowBitLinear, get_qk_size
from bigdl.llm.transformers.low_bit_linear import LowBitLinear, BF16Linear, get_qk_size
from peft.tuners.lora import LoraLayer
from bigdl.llm.utils.common import invalidInputError
from bigdl.llm.transformers.utils import get_autocast_dtype
import functools
from bigdl.llm.transformers import training_patch
class LoraLowBitLinear(LowBitLinear, LoraLayer):
@ -128,22 +129,98 @@ class LoraLowBitLinear(LowBitLinear, LoraLayer):
return result
class LoraBF16Linear(BF16Linear, LoraLayer):
# Lora implemented in a dense layer
def __init__(
self,
adapter_name,
in_features,
out_features,
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0.0,
**kwargs,
):
BF16Linear.__init__(
self,
in_features,
out_features,
bias=kwargs.get("bias", True),
compute_dtype=torch.bfloat16,
)
LoraLayer.__init__(self, in_features=in_features, out_features=out_features)
# Freezing the pre-trained weight matrix
self.weight.requires_grad = False
init_lora_weights = kwargs.pop("init_lora_weights", True)
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
self.active_adapter = adapter_name
def forward(self, x: torch.Tensor):
autocast_dtype = get_autocast_dtype(x)
if x.device.type == "xpu":
# force to use bf16 on gpu
x = x.to(torch.bfloat16)
elif autocast_dtype is not None:
x = x.to(autocast_dtype)
result = super().forward(x)
if self.disable_adapters or self.active_adapter not in self.lora_A.keys():
return result
elif self.r[self.active_adapter] > 0:
result = result.clone()
if autocast_dtype is None and x.device.type == "cpu":
expected_dtype = result.dtype
x = x.to(self.lora_A[self.active_adapter].weight.dtype)
output = (
self.lora_B[self.active_adapter](
self.lora_A[self.active_adapter](
self.lora_dropout[self.active_adapter](x))
).to(expected_dtype)
* self.scaling[self.active_adapter]
)
else:
output = (
self.lora_B[self.active_adapter](
self.lora_A[self.active_adapter](
self.lora_dropout[self.active_adapter](x))
)
* self.scaling[self.active_adapter]
)
result += output
return result
def _create_new_module(create_new_module_func, lora_config, adapter_name, target, **kwargs):
if isinstance(target, LowBitLinear):
if isinstance(target, LowBitLinear) or isinstance(target, BF16Linear):
low_bit_kwargs = kwargs.copy()
bias = low_bit_kwargs.pop("bias", False)
low_bit_kwargs.update(
{
"qtype": target.qtype,
"qa_lora": lora_config.qa_lora if hasattr(lora_config, "qa_lora") else False,
}
)
new_module = LoraLowBitLinear(adapter_name,
target.in_features,
target.out_features,
bias=bias,
**low_bit_kwargs)
if hasattr(lora_config, "training_mode") and lora_config.training_mode == "lora":
new_module = LoraBF16Linear(adapter_name,
target.in_features,
target.out_features,
bias=bias,
**low_bit_kwargs)
else:
if hasattr(lora_config, "training_mode"):
qa_lora = lora_config.training_mode == "qalora"
else:
qa_lora = False
low_bit_kwargs.update(
{
"qtype": target.qtype,
"qa_lora": qa_lora
}
)
new_module = LoraLowBitLinear(adapter_name,
target.in_features,
target.out_features,
bias=bias,
**low_bit_kwargs)
else:
new_module = create_new_module_func(lora_config, adapter_name, target, **kwargs)
@ -157,8 +234,7 @@ from dataclasses import dataclass, field
@dataclass
class LoraConfig(LoraConfigBase):
qa_lora: bool = field(default=False, metadata={"help": "enable qa-lora"})
training_mode: str = field(default="qlora", metadata={"help": "determine training mode"})
def get_peft_model(*args, **kwargs):
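
For context, a usage sketch (not part of this diff): the extended `LoraConfig` is built like peft's, with the extra `training_mode` field, and passed to BigDL-LLM's `get_peft_model`; the hyperparameters below are illustrative values taken from the examples above, and `model` is assumed to be a base model loaded via `bigdl.llm.transformers`.

```python
from bigdl.llm.transformers.qlora import get_peft_model, LoraConfig

# Illustrative hyperparameters; `model` is a previously loaded base model.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    training_mode="lora",   # or "qlora" / "qalora"
)
model = get_peft_model(model, config)
```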
@ -237,158 +313,10 @@ class PeftModel:
return model
def patch_prepare_ipex(self, *args):
return tuple(args)
from transformers.utils import (
requires_backends,
is_sagemaker_mp_enabled,
is_accelerate_available,
is_torch_xpu_available,
is_sagemaker_dp_enabled,
is_torch_tpu_available,
is_torch_npu_available)
from transformers.utils.generic import strtobool
from transformers.utils import cached_property
from transformers.training_args import logger, ParallelMode, DistributedType
import torch
import torch.distributed as dist
import os
import warnings
from datetime import timedelta
if is_accelerate_available():
from accelerate.state import AcceleratorState, PartialState
from accelerate.utils import DistributedType
if is_sagemaker_mp_enabled():
import smdistributed.modelparallel.torch as smp
smp.init()
@cached_property
def _setup_devices(self) -> "torch.device":
requires_backends(self, ["torch"])
logger.info("PyTorch: setting up devices")
if not is_sagemaker_mp_enabled():
if not is_accelerate_available(min_version="0.20.1"):
invalidInputError(
False,
"Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: "
"Please run `pip install transformers[torch]` or `pip install accelerate -U`"
)
AcceleratorState._reset_state(reset_partial_state=True)
self.distributed_state = None
if not self.use_ipex and "ACCELERATE_USE_IPEX" not in os.environ:
os.environ["ACCELERATE_USE_IPEX"] = "false"
if self.use_cpu or strtobool(os.environ.get("ACCELERATE_USE_CPU", "False")):
self.distributed_state = PartialState(cpu=True, backend=self.ddp_backend)
self._n_gpu = 0
elif is_sagemaker_mp_enabled():
local_rank = smp.local_rank()
device = torch.device("cuda", local_rank)
self._n_gpu = 1
torch.cuda.set_device(device)
elif is_torch_xpu_available() and "ACCELERATE_USE_XPU" not in os.environ:
os.environ["ACCELERATE_USE_XPU"] = "true"
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
# device = torch.device("xpu:0")
device = self.distributed_state.device
self._n_gpu = 1
elif is_sagemaker_dp_enabled():
self.distributed_state = PartialState(_use_sagemaker_dp=True)
self._n_gpu = 1
elif self.deepspeed:
# Need to do similar for Accelerator init
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
del os.environ["ACCELERATE_USE_DEEPSPEED"]
self._n_gpu = 1
else:
self.distributed_state = PartialState(
backend=self.ddp_backend, timeout=timedelta(seconds=self.ddp_timeout)
)
self._n_gpu = 1
if not is_sagemaker_mp_enabled():
device = self.distributed_state.device
self.local_rank = self.distributed_state.local_process_index
if dist.is_available() and dist.is_initialized() and \
self.parallel_mode != ParallelMode.DISTRIBUTED:
logger.warning(
"torch.distributed process group is initialized, "
"but parallel_mode != ParallelMode.DISTRIBUTED. "
"In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
)
if is_torch_tpu_available():
device = self.distributed_state.device
self._n_gpu = 0
elif is_sagemaker_dp_enabled() or is_sagemaker_mp_enabled():
# Already set _n_gpu
pass
elif self.distributed_state.distributed_type == DistributedType.MULTI_XPU:
if "ACCELERATE_USE_XPU" not in os.environ:
os.environ["ACCELERATE_USE_XPU"] = "true"
# self._n_gpu = torch.xpu.device_count()
# device = torch.device("xpu:0")
# torch.xpu.set_device(device)
elif self.distributed_state.distributed_type == DistributedType.NO:
if self.use_mps_device:
warnings.warn(
"`use_mps_device` is deprecated and will be removed in"
" version 5.0 of 🤗 Transformers."
"`mps` device will be used by default if available similar"
" to the way `cuda` device is used."
"Therefore, no action from user is required. "
)
if device.type != "mps":
invalidInputError(False,
("Either you do not have an MPS-enabled device"
" on this machine or MacOS"
" version is not 12.3+ "
"or current PyTorch install was not built with MPS enabled."))
if device.type == "mps":
self._n_gpu = 1
elif self.use_cpu:
device = torch.device("cpu")
self._n_gpu = 0
elif is_torch_xpu_available():
device = torch.device("xpu:0")
torch.xpu.set_device(device)
self._n_gpu = 1
elif is_torch_npu_available():
device = torch.device("npu:0")
torch.npu.set_device(device)
self._n_gpu = 1
else:
# if n_gpu is > 1 we'll use nn.DataParallel.
# If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
# Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
# trigger an error that a device index is missing. Index 0 takes into account the
# GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
# will use the first GPU in that env, i.e. GPU#1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Sometimes the line in the postinit has not been run before we end up here,
# so just checking we're not at
# the default value.
self._n_gpu = torch.cuda.device_count()
if device.type == "cuda":
torch.cuda.set_device(device)
return device
from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING
PEFT_TYPE_TO_CONFIG_MAPPING["lora"] = LoraConfig
# work around an IPEX bug that prevents resuming training in bf16
from accelerate import Accelerator
Accelerator._prepare_ipex = patch_prepare_ipex
# patch transformers for XPU DDP training
from transformers import TrainingArguments
TrainingArguments._setup_devices = _setup_devices
def cast_lora_weight(model, dtype=torch.bfloat16):
for name, module in model.named_modules():
@ -396,6 +324,9 @@ def cast_lora_weight(model, dtype=torch.bfloat16):
module.compute_dtype = dtype
if isinstance(module, LoraLayer):
module = module.to(dtype)
if isinstance(module, BF16Linear):
module = module.to(dtype)
module.compute_dtype = dtype
if 'norm' in name:
module = module.to(torch.float32)
if 'lm_head' in name or 'embed_tokens' in name:

View file

@ -0,0 +1,198 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://github.com/huggingface/peft/blob/v0.5.0/src/peft/tuners/lora.py
#
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/training_args.py
#
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def patch_prepare_ipex(self, *args):
return tuple(args)
from transformers.utils import (
requires_backends,
is_sagemaker_mp_enabled,
is_accelerate_available,
is_torch_xpu_available,
is_sagemaker_dp_enabled,
is_torch_tpu_available,
is_torch_npu_available)
from transformers.utils.generic import strtobool
from transformers.utils import cached_property
from transformers.training_args import logger, ParallelMode, DistributedType
import torch
import torch.distributed as dist
import os
import warnings
from datetime import timedelta
if is_accelerate_available():
from accelerate.state import AcceleratorState, PartialState
from accelerate.utils import DistributedType
if is_sagemaker_mp_enabled():
import smdistributed.modelparallel.torch as smp
smp.init()
@cached_property
def _setup_devices(self) -> "torch.device":
requires_backends(self, ["torch"])
logger.info("PyTorch: setting up devices")
if not is_sagemaker_mp_enabled():
if not is_accelerate_available(min_version="0.20.1"):
invalidInputError(
False,
"Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: "
"Please run `pip install transformers[torch]` or `pip install accelerate -U`"
)
AcceleratorState._reset_state(reset_partial_state=True)
self.distributed_state = None
if not self.use_ipex and "ACCELERATE_USE_IPEX" not in os.environ:
os.environ["ACCELERATE_USE_IPEX"] = "false"
if self.use_cpu or strtobool(os.environ.get("ACCELERATE_USE_CPU", "False")):
self.distributed_state = PartialState(cpu=True, backend=self.ddp_backend)
self._n_gpu = 0
elif is_sagemaker_mp_enabled():
local_rank = smp.local_rank()
device = torch.device("cuda", local_rank)
self._n_gpu = 1
torch.cuda.set_device(device)
elif is_torch_xpu_available() and "ACCELERATE_USE_XPU" not in os.environ:
os.environ["ACCELERATE_USE_XPU"] = "true"
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
# device = torch.device("xpu:0")
device = self.distributed_state.device
self._n_gpu = 1
elif is_sagemaker_dp_enabled():
self.distributed_state = PartialState(_use_sagemaker_dp=True)
self._n_gpu = 1
elif self.deepspeed:
# Need to do similar for Accelerator init
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
del os.environ["ACCELERATE_USE_DEEPSPEED"]
self._n_gpu = 1
else:
self.distributed_state = PartialState(
backend=self.ddp_backend, timeout=timedelta(seconds=self.ddp_timeout)
)
self._n_gpu = 1
if not is_sagemaker_mp_enabled():
device = self.distributed_state.device
self.local_rank = self.distributed_state.local_process_index
if dist.is_available() and dist.is_initialized() and \
self.parallel_mode != ParallelMode.DISTRIBUTED:
logger.warning(
"torch.distributed process group is initialized, "
"but parallel_mode != ParallelMode.DISTRIBUTED. "
"In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
)
if is_torch_tpu_available():
device = self.distributed_state.device
self._n_gpu = 0
elif is_sagemaker_dp_enabled() or is_sagemaker_mp_enabled():
# Already set _n_gpu
pass
elif self.distributed_state.distributed_type == DistributedType.MULTI_XPU:
if "ACCELERATE_USE_XPU" not in os.environ:
os.environ["ACCELERATE_USE_XPU"] = "true"
# self._n_gpu = torch.xpu.device_count()
# device = torch.device("xpu:0")
# torch.xpu.set_device(device)
elif self.distributed_state.distributed_type == DistributedType.NO:
if self.use_mps_device:
warnings.warn(
"`use_mps_device` is deprecated and will be removed in"
" version 5.0 of 🤗 Transformers."
"`mps` device will be used by default if available similar"
" to the way `cuda` device is used."
"Therefore, no action from user is required. "
)
if device.type != "mps":
invalidInputError(False,
("Either you do not have an MPS-enabled device"
" on this machine or MacOS"
" version is not 12.3+ "
"or current PyTorch install was not built with MPS enabled."))
if device.type == "mps":
self._n_gpu = 1
elif self.use_cpu:
device = torch.device("cpu")
self._n_gpu = 0
elif is_torch_xpu_available():
device = torch.device("xpu:0")
torch.xpu.set_device(device)
self._n_gpu = 1
elif is_torch_npu_available():
device = torch.device("npu:0")
torch.npu.set_device(device)
self._n_gpu = 1
else:
# if n_gpu is > 1 we'll use nn.DataParallel.
# If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
# Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
# trigger an error that a device index is missing. Index 0 takes into account the
# GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
# will use the first GPU in that env, i.e. GPU#1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Sometimes the line in the postinit has not been run before we end up here,
# so just checking we're not at
# the default value.
self._n_gpu = torch.cuda.device_count()
if device.type == "cuda":
torch.cuda.set_device(device)
return device
# remove ipex.optimize
from accelerate import Accelerator
Accelerator._prepare_ipex = patch_prepare_ipex
# patch transformers for XPU DDP training
from transformers import TrainingArguments
TrainingArguments._setup_devices = _setup_devices