From 95271f10e09d7e208b2b451e0e3d965fd04055a2 Mon Sep 17 00:00:00 2001 From: Zhao Changmin Date: Tue, 5 Sep 2023 13:21:12 +0800 Subject: [PATCH] LLM: Rename low bit layer (#8875) * rename lowbit --------- Co-authored-by: leonardozcm --- python/llm/src/bigdl/llm/optimize.py | 4 +-- .../src/bigdl/llm/transformers/__init__.py | 3 ++- .../llm/src/bigdl/llm/transformers/convert.py | 26 +++++++++---------- .../{linear_quant.py => low_bit_linear.py} | 6 ++--- .../llm/src/bigdl/llm/transformers/model.py | 8 +++--- 5 files changed, 24 insertions(+), 23 deletions(-) rename python/llm/src/bigdl/llm/transformers/{linear_quant.py => low_bit_linear.py} (98%) diff --git a/python/llm/src/bigdl/llm/optimize.py b/python/llm/src/bigdl/llm/optimize.py index 334ad0e9..efff4266 100644 --- a/python/llm/src/bigdl/llm/optimize.py +++ b/python/llm/src/bigdl/llm/optimize.py @@ -14,7 +14,7 @@ # limitations under the License. # -from .transformers import ggml_convert_quant +from .transformers import ggml_convert_low_bit from bigdl.llm.ggml.quantize import ggml_tensor_qtype from bigdl.llm.utils.common import invalidInputError @@ -34,4 +34,4 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True): f"Unknown load_in_low_bit value: {low_bit}, expected:" f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.") qtype = ggml_tensor_qtype[low_bit] - return ggml_convert_quant(model, qtype=qtype, optimize_model=optimize_llm) + return ggml_convert_low_bit(model, qtype=qtype, optimize_model=optimize_llm) diff --git a/python/llm/src/bigdl/llm/transformers/__init__.py b/python/llm/src/bigdl/llm/transformers/__init__.py index b446cfcc..2ad5d815 100644 --- a/python/llm/src/bigdl/llm/transformers/__init__.py +++ b/python/llm/src/bigdl/llm/transformers/__init__.py @@ -14,7 +14,8 @@ # limitations under the License. 
# -from .convert import ggml_convert_quant + +from .convert import ggml_convert_low_bit from .model import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM, \ AutoModelForSpeechSeq2Seq, AutoModelForQuestionAnswering, \ AutoModelForSequenceClassification, AutoModelForMaskedLM, \ diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/bigdl/llm/transformers/convert.py index c70b7440..ae758b99 100644 --- a/python/llm/src/bigdl/llm/transformers/convert.py +++ b/python/llm/src/bigdl/llm/transformers/convert.py @@ -43,9 +43,9 @@ import transformers import importlib -def _replace_with_quant_linear(model, qtype, modules_to_not_convert=None, - current_key_name=None): - from bigdl.llm.transformers.linear_quant import LinearQuant, FP4Params +def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, + current_key_name=None): + from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params has_been_replaced = False for name, module in model.named_children(): @@ -56,7 +56,7 @@ def _replace_with_quant_linear(model, qtype, modules_to_not_convert=None, # Check if the current key is not in the `modules_to_not_convert` if not any(key in ".".join(current_key_name) for key in modules_to_not_convert): with init_empty_weights(): - new_linear = LinearQuant( + new_linear = LowBitLinear( module.in_features, module.out_features, qtype, @@ -65,12 +65,12 @@ def _replace_with_quant_linear(model, qtype, modules_to_not_convert=None, device_type = module.weight.data.device.type # Copy the weights - paramsQuant = FP4Params(data=module.weight.data, - requires_grad=False, - quantized=False, - _shape=None, - qtype=qtype).to(device_type) - new_linear._parameters['weight'] = paramsQuant + paramsLowBit = FP4Params(data=module.weight.data, + requires_grad=False, + quantized=False, + _shape=None, + qtype=qtype).to(device_type) + new_linear._parameters['weight'] = paramsLowBit if module.bias is not None: new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\ @@ -85,7 +85,7 @@ def _replace_with_quant_linear(model, qtype, modules_to_not_convert=None, # Remove the last key for recursion if len(list(module.children())) > 0: - _, _flag = _replace_with_quant_linear( + _, _flag = _replace_with_low_bit_linear( module, qtype, modules_to_not_convert, @@ -95,9 +95,9 @@ def _replace_with_quant_linear(model, qtype, modules_to_not_convert=None, return model, has_been_replaced -def ggml_convert_quant(model, qtype, optimize_model=True, device="cpu"): +def ggml_convert_low_bit(model, qtype, optimize_model=True, device="cpu"): modules_to_not_convert = [] # ["lm_head"] - model, has_been_replaced = _replace_with_quant_linear( + model, has_been_replaced = _replace_with_low_bit_linear( model, qtype, modules_to_not_convert, None ) if not has_been_replaced: diff --git a/python/llm/src/bigdl/llm/transformers/linear_quant.py b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py similarity index 98% rename from python/llm/src/bigdl/llm/transformers/linear_quant.py rename to python/llm/src/bigdl/llm/transformers/low_bit_linear.py index 09e7b666..6e6d328c 100644 --- a/python/llm/src/bigdl/llm/transformers/linear_quant.py +++ b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py @@ -60,7 +60,7 @@ TORCH_LINEAR_THRESHOLD = 96 SYM_INT4 = ggml_tensor_qtype["sym_int4"] -def ggml_convert_quant(tensor: torch.Tensor, qtype: int, device=None): +def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, device=None): QK = ggml.ggml_qk_size(qtype) block_size_in_bytes = ggml.ggml_type_size(qtype) @@ 
-123,7 +123,7 @@ class FP4Params(torch.nn.Parameter): def quantize(self, device=None): if not self.quantized: w = self.data.contiguous().float() - w_quantized = ggml_convert_quant(w, self.qtype, + w_quantized = ggml_convert_qtype(w, self.qtype, device=device) self.data = w_quantized self.quantized = True @@ -212,7 +212,7 @@ def ggml_matmul_src1_x_src0_t(src0: torch.Tensor, return result_t -class LinearQuant(nn.Linear): +class LowBitLinear(nn.Linear): def __init__(self, input_features, output_features, qtype, bias=True): super().__init__(input_features, output_features, bias) self.weight = FP4Params(self.weight.data, diff --git a/python/llm/src/bigdl/llm/transformers/model.py b/python/llm/src/bigdl/llm/transformers/model.py index 75bf196e..180d2916 100644 --- a/python/llm/src/bigdl/llm/transformers/model.py +++ b/python/llm/src/bigdl/llm/transformers/model.py @@ -98,7 +98,7 @@ class _BaseAutoModelClass: @classmethod def load_convert(cls, q_k, optimize_model, *args, **kwargs): - from .convert import ggml_convert_quant + from .convert import ggml_convert_low_bit invalidInputError(q_k in ggml_tensor_qtype, f"Unknown load_in_low_bit value: {q_k}, expected:" f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.") @@ -117,7 +117,7 @@ class _BaseAutoModelClass: model = cls.HF_Model.from_pretrained(*_args, **_kwargs) model.config.update({"bigdl_lcmu_enabled": False}) model = model.to("cpu") - model = ggml_convert_quant(model, qtype, optimize_model) + model = ggml_convert_low_bit(model, qtype, optimize_model) model.config.update({"bigdl_transformers_low_bit": q_k}) # add save_low_bit to pretrained model dynamically @@ -139,7 +139,7 @@ class _BaseAutoModelClass: from transformers.generation.configuration_utils import GenerationConfig from transformers.models.auto.auto_factory import _get_model_class from accelerate.big_modeling import init_empty_weights - from .convert import ggml_convert_quant + from .convert import ggml_convert_low_bit import copy import os @@ -252,7 +252,7 @@ class _BaseAutoModelClass: # Loading args may differ based on their usage quant_device = "meta" if bigdl_lcmu_enabled else "cpu" - model = ggml_convert_quant(model, qtype, optimize_model, device=quant_device) + model = ggml_convert_low_bit(model, qtype, optimize_model, device=quant_device) if is_sharded: loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"]
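
Usage note (not part of the patch): a minimal sketch of how the renamed entry points might be exercised after this change. It assumes optimize_model is importable from bigdl.llm.optimize, matching the file path in the diff above; the Hugging Face model id is only a placeholder.

from transformers import AutoModelForCausalLM
from bigdl.llm.optimize import optimize_model

# Load a full-precision Hugging Face model first (placeholder model id).
model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_3b")

# optimize_model(low_bit=...) maps the string to a ggml qtype and now calls
# ggml_convert_low_bit, which replaces eligible nn.Linear modules with
# LowBitLinear. Accepted values per the error messages in this patch:
# sym_int4, asym_int4, sym_int5, asym_int5, sym_int8.
model = optimize_model(model, low_bit="sym_int4")

# The transformers-style wrappers exported from bigdl.llm.transformers
# (see __init__.py above) route through the same ggml_convert_low_bit path
# via load_convert(); check model.py for the exact from_pretrained keywords.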