LLM: Rename low bit layer (#8875)

* rename lowbit

---------

Co-authored-by: leonardozcm <leonardozcm@gmail.com>
2023-09-05 13:21:12 +08:00 · 2023-09-05 13:21:12 +08:00 · 95271f10e0
commit 95271f10e0
parent 74a2c2ddf5
5 changed files with 24 additions and 23 deletions
--- a/python/llm/src/bigdl/llm/optimize.py
+++ b/python/llm/src/bigdl/llm/optimize.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 #

-from .transformers import ggml_convert_quant
+from .transformers import ggml_convert_low_bit
 from bigdl.llm.ggml.quantize import ggml_tensor_qtype
 from bigdl.llm.utils.common import invalidInputError

@@ -34,4 +34,4 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True):
                      f"Unknown load_in_low_bit value: {low_bit}, expected:"
                      f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.")
    qtype = ggml_tensor_qtype[low_bit]
-    return ggml_convert_quant(model, qtype=qtype, optimize_model=optimize_llm)
+    return ggml_convert_low_bit(model, qtype=qtype, optimize_model=optimize_llm)
--- a/python/llm/src/bigdl/llm/transformers/__init__.py
+++ b/python/llm/src/bigdl/llm/transformers/__init__.py
@@ -14,7 +14,8 @@
 # limitations under the License.
 #

-from .convert import ggml_convert_quant
+
+from .convert import ggml_convert_low_bit
 from .model import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM, \
        AutoModelForSpeechSeq2Seq, AutoModelForQuestionAnswering, \
        AutoModelForSequenceClassification, AutoModelForMaskedLM, \
--- a/python/llm/src/bigdl/llm/transformers/convert.py
+++ b/python/llm/src/bigdl/llm/transformers/convert.py
@@ -43,9 +43,9 @@ import transformers
 import importlib


-def _replace_with_quant_linear(model, qtype, modules_to_not_convert=None,
-                               current_key_name=None):
-    from bigdl.llm.transformers.linear_quant import LinearQuant, FP4Params
+def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
+                                 current_key_name=None):
+    from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params
    has_been_replaced = False

    for name, module in model.named_children():
@@ -56,7 +56,7 @@ def _replace_with_quant_linear(model, qtype, modules_to_not_convert=None,
            # Check if the current key is not in the `modules_to_not_convert`
            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
                with init_empty_weights():
-                    new_linear = LinearQuant(
+                    new_linear = LowBitLinear(
                        module.in_features,
                        module.out_features,
                        qtype,
@@ -65,12 +65,12 @@ def _replace_with_quant_linear(model, qtype, modules_to_not_convert=None,

                    device_type = module.weight.data.device.type
                    # Copy the weights
-                    paramsQuant = FP4Params(data=module.weight.data,
-                                            requires_grad=False,
-                                            quantized=False,
-                                            _shape=None,
-                                            qtype=qtype).to(device_type)
-                    new_linear._parameters['weight'] = paramsQuant
+                    paramsLowBit = FP4Params(data=module.weight.data,
+                                             requires_grad=False,
+                                             quantized=False,
+                                             _shape=None,
+                                             qtype=qtype).to(device_type)
+                    new_linear._parameters['weight'] = paramsLowBit

                    if module.bias is not None:
                        new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
@@ -85,7 +85,7 @@ def _replace_with_quant_linear(model, qtype, modules_to_not_convert=None,

        # Remove the last key for recursion
        if len(list(module.children())) > 0:
-            _, _flag = _replace_with_quant_linear(
+            _, _flag = _replace_with_low_bit_linear(
                module,
                qtype,
                modules_to_not_convert,
@@ -95,9 +95,9 @@ def _replace_with_quant_linear(model, qtype, modules_to_not_convert=None,
    return model, has_been_replaced


-def ggml_convert_quant(model, qtype, optimize_model=True, device="cpu"):
+def ggml_convert_low_bit(model, qtype, optimize_model=True, device="cpu"):
    modules_to_not_convert = []  # ["lm_head"]
-    model, has_been_replaced = _replace_with_quant_linear(
+    model, has_been_replaced = _replace_with_low_bit_linear(
        model, qtype, modules_to_not_convert, None
    )
    if not has_been_replaced:
--- a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py
+++ b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py
@@ -60,7 +60,7 @@ TORCH_LINEAR_THRESHOLD = 96
 SYM_INT4 = ggml_tensor_qtype["sym_int4"]


-def ggml_convert_quant(tensor: torch.Tensor, qtype: int, device=None):
+def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, device=None):
    QK = ggml.ggml_qk_size(qtype)
    block_size_in_bytes = ggml.ggml_type_size(qtype)

@@ -123,7 +123,7 @@ class FP4Params(torch.nn.Parameter):
    def quantize(self, device=None):
        if not self.quantized:
            w = self.data.contiguous().float()
-            w_quantized = ggml_convert_quant(w, self.qtype,
+            w_quantized = ggml_convert_qtype(w, self.qtype,
                                             device=device)
            self.data = w_quantized
            self.quantized = True
@@ -212,7 +212,7 @@ def ggml_matmul_src1_x_src0_t(src0: torch.Tensor,
    return result_t


-class LinearQuant(nn.Linear):
+class LowBitLinear(nn.Linear):
    def __init__(self, input_features, output_features, qtype, bias=True):
        super().__init__(input_features, output_features, bias)
        self.weight = FP4Params(self.weight.data,
--- a/python/llm/src/bigdl/llm/transformers/model.py
+++ b/python/llm/src/bigdl/llm/transformers/model.py
@@ -98,7 +98,7 @@ class _BaseAutoModelClass:

    @classmethod
    def load_convert(cls, q_k, optimize_model, *args, **kwargs):
-        from .convert import ggml_convert_quant
+        from .convert import ggml_convert_low_bit
        invalidInputError(q_k in ggml_tensor_qtype,
                          f"Unknown load_in_low_bit value: {q_k}, expected:"
                          f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.")
@@ -117,7 +117,7 @@ class _BaseAutoModelClass:
            model = cls.HF_Model.from_pretrained(*_args, **_kwargs)
            model.config.update({"bigdl_lcmu_enabled": False})
        model = model.to("cpu")
-        model = ggml_convert_quant(model, qtype, optimize_model)
+        model = ggml_convert_low_bit(model, qtype, optimize_model)
        model.config.update({"bigdl_transformers_low_bit": q_k})

        # add save_low_bit to pretrained model dynamically
@@ -139,7 +139,7 @@ class _BaseAutoModelClass:
        from transformers.generation.configuration_utils import GenerationConfig
        from transformers.models.auto.auto_factory import _get_model_class
        from accelerate.big_modeling import init_empty_weights
-        from .convert import ggml_convert_quant
+        from .convert import ggml_convert_low_bit
        import copy
        import os

@@ -252,7 +252,7 @@ class _BaseAutoModelClass:

        # Loading args may differ based on their usage
        quant_device = "meta" if bigdl_lcmu_enabled else "cpu"
-        model = ggml_convert_quant(model, qtype, optimize_model, device=quant_device)
+        model = ggml_convert_low_bit(model, qtype, optimize_model, device=quant_device)

        if is_sharded:
            loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"]