diff --git a/python/llm/README.md b/python/llm/README.md
index 363ea637..5453d7b9 100644
--- a/python/llm/README.md
+++ b/python/llm/README.md
@@ -102,9 +102,9 @@ You may run the models using `transformers`-style API in `bigdl-llm`.
 
   See the complete example [here](example/transformers/transformers_int4/transformers_int4_pipeline.py).
 
-  Notice: For more quantized precision, you can use another parameter `load_in_low_bit`. `q4_0` and `q4_1` are INT4 quantization, `q5_0` and `q5_1` are INT5 quantization, `q8_0` is INT8 quantization. Like:
+  Notice: For other quantization precisions, you may use the `load_in_low_bit` parameter instead. Available options are `sym_int4`, `asym_int4`, `sym_int5`, `asym_int5` and `sym_int8`, for example:
   ```python
-  model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_low_bit="q5_0")
+  model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_low_bit="sym_int5")
   ```
 
 - ##### Using native INT4 format
diff --git a/python/llm/src/bigdl/llm/ggml/quantize.py b/python/llm/src/bigdl/llm/ggml/quantize.py
index c34ef522..c61a05fe 100644
--- a/python/llm/src/bigdl/llm/ggml/quantize.py
+++ b/python/llm/src/bigdl/llm/ggml/quantize.py
@@ -25,11 +25,11 @@ dirname, _ = os.path.split(os.path.abspath(__file__))
 libs_dirname = os.path.dirname(dirname)
 
 # ggml quantized tensor type, this is different from below file quantized type(_quantize_type)
-ggml_tensor_qtype = {"q4_0": 2,
-                     "q4_1": 3,
-                     "q5_0": 6,
-                     "q5_1": 7,
-                     "q8_0": 8}
+ggml_tensor_qtype = {"sym_int4": 2,   # q4_0 in ggml
+                     "asym_int4": 3,  # q4_1 in ggml
+                     "sym_int5": 6,   # q5_0 in ggml
+                     "asym_int5": 7,  # q5_1 in ggml
+                     "sym_int8": 8}   # q8_0 in ggml
 
 _llama_quantize_type = {"q4_0": 2,
                         "q4_1": 3,
diff --git a/python/llm/src/bigdl/llm/transformers/model.py b/python/llm/src/bigdl/llm/transformers/model.py
index 518f30f2..c4293de3 100644
--- a/python/llm/src/bigdl/llm/transformers/model.py
+++ b/python/llm/src/bigdl/llm/transformers/model.py
@@ -29,6 +29,18 @@ class _BaseAutoModelClass:
 
     def from_pretrained(cls, *args, **kwargs):
+        """
+        Load a model from a directory or the HF Hub. With the load_in_4bit or load_in_low_bit
+        parameter, the weights of the model's linear layers can be loaded in a low-bit
+        format such as int4, int5 or int8.
+
+        Two new arguments are added to extend Hugging Face's from_pretrained method as follows:
+        New Arguments:
+            load_in_4bit: boolean value, True means loading the linear weights as symmetric int4.
+            load_in_low_bit: str value, options are sym_int4, asym_int4, sym_int5, asym_int5
+                             or sym_int8; the linear weights will be loaded in the corresponding
+                             low-bit type (sym_int4 means symmetric int4, asym_int4 asymmetric).
+        """
         # For huggingface transformers cls.HF_Model.from_pretrained could only restore the model
         # in the original format, which is not quantized,
@@ -49,8 +61,9 @@ class _BaseAutoModelClass:
 
         if bigdl_transformers_low_bit:
             invalidInputError(bigdl_transformers_low_bit in ggml_tensor_qtype,
-                              f"Unknown load_in_low_bit value: {bigdl_transformers_low_bit},"
-                              f" excepted q4_0, q4_1, q5_0, q5_1, q8_0.")
+                              f"Unknown bigdl_transformers_low_bit value:"
+                              f" {bigdl_transformers_low_bit},"
+                              f" expected: sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.")
             qtype = ggml_tensor_qtype[bigdl_transformers_low_bit]
             # Note that the int4 linear layers cannot currently
             # be recorded in huggingface Pretrained Model or AutoConfig,
@@ -86,7 +99,7 @@ class _BaseAutoModelClass:
             del state_dict
         elif load_in_4bit or load_in_low_bit:
-            q_k = load_in_low_bit if load_in_low_bit else "q4_0"
+            q_k = load_in_low_bit if load_in_low_bit else "sym_int4"
             model = cls.convert_quant(model, q_k, *args, **kwargs)
 
         return model
@@ -95,8 +108,8 @@ def convert_quant(cls, model, q_k, *args, **kwargs):
         from .convert import ggml_convert_quant
         invalidInputError(q_k in ggml_tensor_qtype,
-                          f"Unknown load_in_low_bit value: {q_k},"
-                          f" excepted q4_0, q4_1, q5_0, q5_1, q8_0.")
+                          f"Unknown load_in_low_bit value: {q_k}, expected:"
+                          f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.")
         qtype = ggml_tensor_qtype[q_k]
         model = cls.HF_Model.from_pretrained(*args, **kwargs)
         model = model.to("cpu")
diff --git a/python/llm/test/convert/test_convert_model.py b/python/llm/test/convert/test_convert_model.py
index 3bb96b00..d2a7d794 100644
--- a/python/llm/test/convert/test_convert_model.py
+++ b/python/llm/test/convert/test_convert_model.py
@@ -73,11 +73,11 @@ class TestConvertModel(TestCase):
 
     def test_transformer_convert_llama_q5(self):
         model = AutoModelForCausalLM.from_pretrained(llama_model_path,
-                                                     load_in_low_bit="q5_0")
+                                                     load_in_low_bit="sym_int5")
 
     def test_transformer_convert_llama_q8(self):
         model = AutoModelForCausalLM.from_pretrained(llama_model_path,
-                                                     load_in_low_bit="q8_0")
+                                                     load_in_low_bit="sym_int8")
 
 
 if __name__ == '__main__':