rename low bit type name (#8512)

* change qx_0 to sym_intx

* update

* fix typo

* update

* fix type

* fix style

* add python doc

* meet code review

* fix style
Xin Qiu 2023-07-13 15:53:31 +08:00 committed by GitHub
parent 4f152b4e3a
commit 90e3d86bce
4 changed files with 27 additions and 14 deletions

@@ -102,9 +102,9 @@ You may run the models using `transformers`-style API in `bigdl-llm`.
See the complete example [here](example/transformers/transformers_int4/transformers_int4_pipeline.py).
-Notice: For more quantized precision, you can use another parameter `load_in_low_bit`. `q4_0` and `q4_1` are INT4 quantization, `q5_0` and `q5_1` are INT5 quantization, `q8_0` is INT8 quantization. Like:
+Notice: For more quantized precision, you can use another parameter `load_in_low_bit`. Available types are `sym_int4`, `asym_int4`, `sym_int5`, `asym_int5` and `sym_int8`.
```python
-model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_low_bit="q5_0")
+model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_low_bit="sym_int5")
```
- ##### Using native INT4 format
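
For reference, a minimal sketch that exercises each of the renamed precisions through the same API shown in the README hunk above. The `bigdl.llm.transformers` import path and the model path are assumptions, not part of this diff:

```python
# Sketch only: load the same checkpoint once per renamed low-bit type.
# Assumes bigdl-llm's transformers-style API shown above.
from bigdl.llm.transformers import AutoModelForCausalLM

for low_bit in ("sym_int4", "asym_int4", "sym_int5", "asym_int5", "sym_int8"):
    model = AutoModelForCausalLM.from_pretrained('/path/to/model/',
                                                 load_in_low_bit=low_bit)
```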

@@ -25,11 +25,11 @@ dirname, _ = os.path.split(os.path.abspath(__file__))
libs_dirname = os.path.dirname(dirname)
# ggml quantized tensor type; this is different from the file quantized type below (_quantize_type)
ggml_tensor_qtype = {"q4_0": 2,
"q4_1": 3,
"q5_0": 6,
"q5_1": 7,
"q8_0": 8}
ggml_tensor_qtype = {"sym_int4": 2, # q4_0 in ggml
"asym_int4": 3, # q4_1 in ggml
"sym_int5": 6, # q5_0 in ggml
"asym_int5": 7, # q5_1 in ggml
"sym_int8": 8} # q8_0 in ggml
_llama_quantize_type = {"q4_0": 2,
"q4_1": 3,

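Since the commit renames the public values without keeping the old `q*_0`/`q*_1` spellings, a small compatibility shim would be one way to accept both. This is a hypothetical sketch, not part of this commit:

```python
# Hypothetical compatibility layer (not in this commit): translate the old
# ggml-style names to the new sym_/asym_ names before the qtype lookup.
_LEGACY_NAMES = {
    "q4_0": "sym_int4",   # symmetric int4
    "q4_1": "asym_int4",  # asymmetric int4
    "q5_0": "sym_int5",   # symmetric int5
    "q5_1": "asym_int5",  # asymmetric int5
    "q8_0": "sym_int8",   # symmetric int8
}

def normalize_low_bit_name(name: str) -> str:
    """Map an old ggml-style name to its new equivalent, if needed."""
    return _LEGACY_NAMES.get(name, name)
```
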
@@ -29,6 +29,18 @@ class _BaseAutoModelClass:
    def from_pretrained(cls,
                        *args,
                        **kwargs):
"""
Load a model from a directory or the HF Hub. Use load_in_4bit or load_in_low_bit parameter
the weight of model's linears can be loaded to low-bit format, like int4, int5 and int8.
Two new arguments are added to extend Hugging Face's from_pretrained method as follows:
New Arguments:
load_in_4bit: boolean value, True means load linear's weight to symmetric int 4.
load_in_low_bit: str value, options are sym_int4, asym_int4, sym_int5, asym_int5 or
sym_int8. The model's linear will be loaded into corresponding
low-bit type. sym_int4 means symmetric int 4, asym_int4 means
asymmetric int 4.
"""
        # For huggingface transformers cls.HF_Model.from_pretrained could only restore the model
        # in the original format, which is not quantized,
@@ -49,8 +61,9 @@ class _BaseAutoModelClass:
        if bigdl_transformers_low_bit:
            invalidInputError(bigdl_transformers_low_bit in ggml_tensor_qtype,
-                             f"Unknown load_in_low_bit value: {bigdl_transformers_low_bit},"
-                             f" excepted q4_0, q4_1, q5_0, q5_1, q8_0.")
+                             f"Unknown bigdl_transformers_low_bit value:"
+                             f" {bigdl_transformers_low_bit},"
+                             f" expected: sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.")
            qtype = ggml_tensor_qtype[bigdl_transformers_low_bit]
            # Note that the int4 linear layers cannot currently
            # be recorded in huggingface Pretrained Model or AutoConfig,
@@ -86,7 +99,7 @@
            del state_dict
        elif load_in_4bit or load_in_low_bit:
-            q_k = load_in_low_bit if load_in_low_bit else "q4_0"
+            q_k = load_in_low_bit if load_in_low_bit else "sym_int4"
            model = cls.convert_quant(model, q_k, *args, **kwargs)
        return model
@@ -95,8 +108,8 @@
    def convert_quant(cls, model, q_k, *args, **kwargs):
        from .convert import ggml_convert_quant
        invalidInputError(q_k in ggml_tensor_qtype,
-                         f"Unknown load_in_low_bit value: {q_k},"
-                         f" excepted q4_0, q4_1, q5_0, q5_1, q8_0.")
+                         f"Unknown load_in_low_bit value: {q_k}, expected:"
+                         f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.")
        qtype = ggml_tensor_qtype[q_k]
        model = cls.HF_Model.from_pretrained(*args, **kwargs)
        model = model.to("cpu")

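Taken together, the hunks above implement a small validate-then-look-up pattern: default to `sym_int4`, reject unknown names, then resolve the ggml qtype. A self-contained sketch of that logic, with a plain `ValueError` standing in for bigdl's `invalidInputError`:

```python
# Self-contained sketch of the validation in from_pretrained / convert_quant;
# ValueError stands in for bigdl's invalidInputError.
ggml_tensor_qtype = {"sym_int4": 2, "asym_int4": 3,
                     "sym_int5": 6, "asym_int5": 7, "sym_int8": 8}

def resolve_qtype(load_in_low_bit=None, load_in_4bit=False):
    if not (load_in_low_bit or load_in_4bit):
        return None  # no quantization requested
    q_k = load_in_low_bit if load_in_low_bit else "sym_int4"
    if q_k not in ggml_tensor_qtype:
        raise ValueError(f"Unknown load_in_low_bit value: {q_k}, expected:"
                         f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.")
    return ggml_tensor_qtype[q_k]

assert resolve_qtype(load_in_4bit=True) == 2        # defaults to sym_int4
assert resolve_qtype(load_in_low_bit="sym_int5") == 6
```
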
@@ -73,11 +73,11 @@ class TestConvertModel(TestCase):
    def test_transformer_convert_llama_q5(self):
        model = AutoModelForCausalLM.from_pretrained(llama_model_path,
-                                                     load_in_low_bit="q5_0")
+                                                     load_in_low_bit="sym_int5")

    def test_transformer_convert_llama_q8(self):
        model = AutoModelForCausalLM.from_pretrained(llama_model_path,
-                                                     load_in_low_bit="q8_0")
+                                                     load_in_low_bit="sym_int8")

if __name__ == '__main__':
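
The updated tests only cover the symmetric types. Hypothetical companion tests for the asymmetric names (not part of this commit, reusing `TestCase`, `AutoModelForCausalLM` and `llama_model_path` from the test module above) would follow the same pattern:

```python
# Hypothetical extra tests (not in this commit) exercising the asymmetric
# low-bit names with the same load pattern as the q5/q8 tests above.
class TestConvertModelAsym(TestCase):
    def test_transformer_convert_llama_asym_int4(self):
        model = AutoModelForCausalLM.from_pretrained(llama_model_path,
                                                     load_in_low_bit="asym_int4")

    def test_transformer_convert_llama_asym_int5(self):
        model = AutoModelForCausalLM.from_pretrained(llama_model_path,
                                                     load_in_low_bit="asym_int5")
```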