rename low bit type name (#8512)
* change qx_0 to sym_intx
* update
* fix typo
* update
* fix type
* fix style
* add python doc
* meet code review
* fix style
parent 4f152b4e3a
commit 90e3d86bce

4 changed files with 27 additions and 14 deletions
````diff
@@ -102,9 +102,9 @@ You may run the models using `transformers`-style API in `bigdl-llm`.
 
 See the complete example [here](example/transformers/transformers_int4/transformers_int4_pipeline.py).
 
-Notice: For more quantized precision, you can use another parameter `load_in_low_bit`. `q4_0` and `q4_1` are INT4 quantization, `q5_0` and `q5_1` are INT5 quantization, `q8_0` is INT8 quantization. Like:
+Notice: For more quantized precision, you can use another parameter `load_in_low_bit`. Available types are `sym_int4`, `asym_int4`, `sym_int5`, `asym_int5` and `sym_int8`.
 ```python
-model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_low_bit="q5_0")
+model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_low_bit="sym_int5")
 ```
 
 - ##### Using native INT4 format
````
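For reference, the renames map one-to-one onto the old ggml names: `q4_0`→`sym_int4`, `q4_1`→`asym_int4`, `q5_0`→`sym_int5`, `q5_1`→`asym_int5`, `q8_0`→`sym_int8`. A minimal sketch of the renamed README example at a different precision; the `bigdl.llm.transformers` import path and the model path are assumptions, not part of this diff:

```python
# Hedged sketch: request INT8 weights with the new name "sym_int8"
# (formerly ggml's "q8_0"); '/path/to/model/' is a placeholder.
from bigdl.llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('/path/to/model/',
                                             load_in_low_bit="sym_int8")
```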
```diff
@@ -25,11 +25,11 @@ dirname, _ = os.path.split(os.path.abspath(__file__))
 libs_dirname = os.path.dirname(dirname)
 
 # ggml quantized tensor type, this is different from below file quantized type(_quantize_type)
-ggml_tensor_qtype = {"q4_0": 2,
-                     "q4_1": 3,
-                     "q5_0": 6,
-                     "q5_1": 7,
-                     "q8_0": 8}
+ggml_tensor_qtype = {"sym_int4": 2,   # q4_0 in ggml
+                     "asym_int4": 3,  # q4_1 in ggml
+                     "sym_int5": 6,   # q5_0 in ggml
+                     "asym_int5": 7,  # q5_1 in ggml
+                     "sym_int8": 8}   # q8_0 in ggml
 
 _llama_quantize_type = {"q4_0": 2,
                         "q4_1": 3,
```
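Only the keys of `ggml_tensor_qtype` change; the ggml tensor type IDs are untouched, so previously quantized weights keep working. A self-contained sanity check of that claim, with the names and IDs copied from the hunk above:

```python
# Old ggml-style names and their renamed equivalents resolve to the
# same ggml tensor type IDs (dicts preserve insertion order in 3.7+).
old = {"q4_0": 2, "q4_1": 3, "q5_0": 6, "q5_1": 7, "q8_0": 8}
new = {"sym_int4": 2, "asym_int4": 3, "sym_int5": 6,
       "asym_int5": 7, "sym_int8": 8}

rename = dict(zip(old, new))  # e.g. "q4_0" -> "sym_int4"
assert all(old[k] == new[rename[k]] for k in old)
```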
```diff
@@ -29,6 +29,18 @@ class _BaseAutoModelClass:
     def from_pretrained(cls,
                         *args,
                         **kwargs):
+        """
+        Load a model from a directory or the HF Hub. Use load_in_4bit or load_in_low_bit parameter
+        the weight of model's linears can be loaded to low-bit format, like int4, int5 and int8.
+
+        Two new arguments are added to extend Hugging Face's from_pretrained method as follows:
+        New Arguments:
+            load_in_4bit: boolean value, True means load linear's weight to symmetric int 4.
+            load_in_low_bit: str value, options are sym_int4, asym_int4, sym_int5, asym_int5 or
+                             sym_int8. The model's linear will be loaded into corresponding
+                             low-bit type. sym_int4 means symmetric int 4, asym_int4 means
+                             asymmetric int 4.
+        """
 
         # For huggingface transformers cls.HF_Model.from_pretrained could only restore the model
         # in the original format, which is not quantized,
```
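The docstring above is the user-facing contract for the rename. A minimal usage sketch of the two extended arguments; the import and model paths are placeholders, and per the fallback later in this diff, `load_in_4bit=True` is equivalent to `load_in_low_bit="sym_int4"`:

```python
from bigdl.llm.transformers import AutoModelForCausalLM

# Shorthand: quantize linear weights to symmetric INT4,
# the same as load_in_low_bit="sym_int4".
model_int4 = AutoModelForCausalLM.from_pretrained('/path/to/model/',
                                                  load_in_4bit=True)

# Explicit: pick any of the five renamed precisions.
model_int5 = AutoModelForCausalLM.from_pretrained('/path/to/model/',
                                                  load_in_low_bit="asym_int5")
```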
```diff
@@ -49,8 +61,9 @@ class _BaseAutoModelClass:
 
         if bigdl_transformers_low_bit:
             invalidInputError(bigdl_transformers_low_bit in ggml_tensor_qtype,
-                              f"Unknown load_in_low_bit value: {bigdl_transformers_low_bit},"
-                              f" excepted q4_0, q4_1, q5_0, q5_1, q8_0.")
+                              f"Unknown bigdl_transformers_low_bit value:"
+                              f" {bigdl_transformers_low_bit},"
+                              f" expected: sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.")
             qtype = ggml_tensor_qtype[bigdl_transformers_low_bit]
             # Note that the int4 linear layers cannot currently
             # be recorded in huggingface Pretrained Model or AutoConfig,
```
```diff
@@ -86,7 +99,7 @@ class _BaseAutoModelClass:
             del state_dict
 
         elif load_in_4bit or load_in_low_bit:
-            q_k = load_in_low_bit if load_in_low_bit else "q4_0"
+            q_k = load_in_low_bit if load_in_low_bit else "sym_int4"
             model = cls.convert_quant(model, q_k, *args, **kwargs)
 
         return model
```
```diff
@@ -95,8 +108,8 @@ class _BaseAutoModelClass:
     def convert_quant(cls, model, q_k, *args, **kwargs):
         from .convert import ggml_convert_quant
         invalidInputError(q_k in ggml_tensor_qtype,
-                          f"Unknown load_in_low_bit value: {q_k},"
-                          f" excepted q4_0, q4_1, q5_0, q5_1, q8_0.")
+                          f"Unknown load_in_low_bit value: {q_k}, expected:"
+                          f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.")
         qtype = ggml_tensor_qtype[q_k]
         model = cls.HF_Model.from_pretrained(*args, **kwargs)
         model = model.to("cpu")
```
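Note the rename is breaking for callers: old-style values such as `"q5_0"` now fail the membership check above. A standalone sketch of that validation, with a plain `ValueError` standing in for BigDL's `invalidInputError` helper:

```python
ggml_tensor_qtype = {"sym_int4": 2, "asym_int4": 3, "sym_int5": 6,
                     "asym_int5": 7, "sym_int8": 8}

def check_low_bit(q_k: str) -> int:
    # Stand-in for invalidInputError: only the renamed keys are accepted.
    if q_k not in ggml_tensor_qtype:
        raise ValueError(f"Unknown load_in_low_bit value: {q_k}, expected:"
                         f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.")
    return ggml_tensor_qtype[q_k]

assert check_low_bit("sym_int5") == 6
# check_low_bit("q5_0")  # raises ValueError after this commit
```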
```diff
@@ -73,11 +73,11 @@ class TestConvertModel(TestCase):
 
     def test_transformer_convert_llama_q5(self):
         model = AutoModelForCausalLM.from_pretrained(llama_model_path,
-                                                     load_in_low_bit="q5_0")
+                                                     load_in_low_bit="sym_int5")
 
     def test_transformer_convert_llama_q8(self):
         model = AutoModelForCausalLM.from_pretrained(llama_model_path,
-                                                     load_in_low_bit="q8_0")
+                                                     load_in_low_bit="sym_int8")
 
 
 if __name__ == '__main__':
```