diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/bigdl/llm/transformers/convert.py
index 642304ed..96a24a30 100644
--- a/python/llm/src/bigdl/llm/transformers/convert.py
+++ b/python/llm/src/bigdl/llm/transformers/convert.py
@@ -17,6 +17,7 @@
 # Some parts of this file is adapted from
 # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/utils/bitsandbytes.py
+# and https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py
 # which is licensed under Apache License 2.0:
 #
 # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
@@ -44,6 +45,23 @@ import warnings
 def _replace_with_quant_linear(model, qtype, modules_to_not_convert=None,
                                current_key_name=None, convert_shape_only=False):
     has_been_replaced = False
+
+    # Through our method, certain layers that were initialized on the device "meta"
+    # (associated with the lazy initialization strategy of low_cpu_mem_usage) are not
+    # being correctly moved back to the CPU device for some reason. Therefore, we are
+    # moving these layers back to the CPU here in order to prevent the occurrence
+    # of NotImplementedError. For details, refer to:
+    # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L3110
+    model_state_dict = model.state_dict()
+    for name, param in model.named_parameters():
+        if param.data.device == torch.device('meta'):
+            from accelerate.utils.modeling import set_module_tensor_to_device
+            param = model_state_dict[name]
+            set_module_tensor_to_device(model,
+                                        name,
+                                        "cpu",
+                                        torch.empty(*param.size(), dtype=torch.float32))
+
     for name, module in model.named_children():
         if current_key_name is None:
             current_key_name = []
@@ -86,6 +104,7 @@ def _replace_with_quant_linear(model, qtype, modules_to_not_convert=None,
                 qtype,
                 modules_to_not_convert,
                 current_key_name,
+                convert_shape_only,
             )
     return model, has_been_replaced
diff --git a/python/llm/src/bigdl/llm/transformers/model.py b/python/llm/src/bigdl/llm/transformers/model.py
index 6ebd4828..89c288dc 100644
--- a/python/llm/src/bigdl/llm/transformers/model.py
+++ b/python/llm/src/bigdl/llm/transformers/model.py
@@ -29,50 +29,54 @@ class _BaseAutoModelClass:
     def from_pretrained(cls, *args, **kwargs):
-        load_in_4bit = kwargs.pop("load_in_4bit", False)
-        qtype = 0
-        if load_in_4bit:
-            kwargs["low_cpu_mem_usage"] = True
-            qtype = ggml_tensor_qtype['q4_0']
-        load_in_low_bit = kwargs.pop("load_in_low_bit", "").lower()
-        if load_in_low_bit:
-            kwargs["low_cpu_mem_usage"] = True
-            invalidInputError(qtype in ggml_tensor_qtype,
-                              f"Unknown load_in_low_bit value: {qtype},"
-                              f" excepted q4_0, q4_1, q5_0, q5_1, q8_0.")
-            qtype = ggml_tensor_qtype[load_in_low_bit]
-
-        subfolder = kwargs.get("subfolder", "")
-        variant = kwargs.get("variant", None)
-        pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None) \
-            if len(args) == 0 else args[0]
         # For huggingface transformers cls.HF_Model.from_pretrained could only restore the model
         # in the original format, which is not quantized,
         # we can convert the model to quantized later.
         model = None
+        load_in_4bit = kwargs.pop("load_in_4bit", False)
+        load_in_low_bit = kwargs.pop("load_in_low_bit", None)
-        # Read bigdl_transformers_int4 from config.json
+        # Read bigdl_transformers_low_bit from config.json
+        pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None) \
+            if len(args) == 0 else args[0]
         config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path)
+        bigdl_transformers_low_bit = config_dict.pop("bigdl_transformers_low_bit", False)
+
+        if load_in_4bit or load_in_low_bit or bigdl_transformers_low_bit:
+            # Speed up when loading model
+            kwargs["low_cpu_mem_usage"] = True
+
+        if bigdl_transformers_low_bit:
+            invalidInputError(bigdl_transformers_low_bit in ggml_tensor_qtype,
+                              f"Unknown load_in_low_bit value: {bigdl_transformers_low_bit},"
+                              f" expected q4_0, q4_1, q5_0, q5_1, q8_0.")
+            qtype = ggml_tensor_qtype[bigdl_transformers_low_bit]
+            # Note that the int4 linear layers cannot currently
+            # be recorded in huggingface Pretrained Model or AutoConfig,
+            # and huggingface transformers cls.HF_Model.from_pretrained
+            # could only restore the model in the original format,
+            # which is not quantized. We can initialize the original model first,
+            # convert the model to quantized int4 format later, and then load the quantized model.
-        bigdl_transformers_int4 = config_dict.pop("bigdl_transformers_int4", False)
-        if bigdl_transformers_int4:
             # Avoid KeyError
             kwargs["ignore_mismatched_sizes"] = True
+            # Avoid reading from local file at the first initialization
+            kwargs["state_dict"] = {}
-            model = cls.HF_Model.from_pretrained(*args, **kwargs)
-            print("Note: If there are warnings about mismatched during the loading process, "
-                  "please ignore them as it is part of the normal flow. "
-                  "The model will be reconverted to the format of BigDL after loading.")
+            # May be needed when calling extract_local_archive_file
+            subfolder = kwargs.get("subfolder", "")
+            variant = kwargs.get("variant", None)
-        # Note that the ggml_matmul_src1_x_src0_t operation cannot currently
-        # be recorded in AutoConfig,
-        # and this operation is not included in the core Hugging Face infrastructure.
-        if bigdl_transformers_int4:
             from .convert import ggml_convert_quant
+            model = cls.HF_Model.from_pretrained(*args, **kwargs)
+            print("Note: If there are warnings during the model loading process, "
+                  "they can be safely ignored; "
+                  "the model will be loaded with INT4 optimizations applied.")
+
             # We forcefully modify the model's definition
             # and the tensor shape of int4 weights without quantization.
-            model = ggml_convert_quant(model, convert_shape_only=True)
+            model = ggml_convert_quant(model, qtype, convert_shape_only=True)
             # Load the quantized model at last.
             archive_file = extract_local_archive_file(pretrained_model_name_or_path,
                                                       subfolder,
@@ -80,12 +84,24 @@ class _BaseAutoModelClass:
             state_dict = load_state_dict(archive_file)
             load(model, state_dict)
             del state_dict
-        elif qtype:
-            from .convert import ggml_convert_quant
-            model = model.to("cpu")
-            model = ggml_convert_quant(model, qtype)
-            model.config.update({"bigdl_transformers_int4": True})
+        elif load_in_4bit or load_in_low_bit:
+            q_k = load_in_low_bit if load_in_low_bit else "q4_0"
+            model = cls.convert_quant(model, q_k, *args, **kwargs)
+
+        return model
+
+    @classmethod
+    def convert_quant(cls, model, q_k, *args, **kwargs):
+        from .convert import ggml_convert_quant
+        invalidInputError(q_k in ggml_tensor_qtype,
+                          f"Unknown load_in_low_bit value: {q_k},"
+                          f" expected q4_0, q4_1, q5_0, q5_1, q8_0.")
+        qtype = ggml_tensor_qtype[q_k]
+        model = cls.HF_Model.from_pretrained(*args, **kwargs)
+        model = model.to("cpu")
+        model = ggml_convert_quant(model, qtype)
+        model.config.update({"bigdl_transformers_low_bit": q_k})
         return model
diff --git a/python/llm/src/bigdl/llm/transformers/utils.py b/python/llm/src/bigdl/llm/transformers/utils.py
index 4720bffb..837fd5dc 100644
--- a/python/llm/src/bigdl/llm/transformers/utils.py
+++ b/python/llm/src/bigdl/llm/transformers/utils.py
@@ -52,9 +52,6 @@ WEIGHTS_NAME = "pytorch_model.bin"
 def extract_local_archive_file(pretrained_model_name_or_path, subfolder, variant):
     pretrained_model_name_or_path = str(pretrained_model_name_or_path)
-    print(os.path.join(pretrained_model_name_or_path,
-                       subfolder,
-                       _add_variant(WEIGHTS_NAME, variant)))
     if os.path.isfile(
         os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant))
     ):
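Usage sketch (not part of the patch): a minimal end-to-end flow for the load/save behavior changed above, assuming the public bigdl.llm.transformers AutoModelForCausalLM wrapper. The model paths are hypothetical, and persisting the quantized weights via save_pretrained is an assumption inferred from the reload path in this diff (extract_local_archive_file + load_state_dict).

    from bigdl.llm.transformers import AutoModelForCausalLM

    # First load: quantize the original checkpoint on the fly.
    # "q4_0" is one of the ggml_tensor_qtype keys accepted by load_in_low_bit
    # (load_in_4bit=True is treated as "q4_0" in convert_quant).
    model = AutoModelForCausalLM.from_pretrained("path/to/llama-7b-hf",  # hypothetical path
                                                 load_in_low_bit="q4_0")

    # convert_quant() records the chosen qtype in the model config, so saving
    # writes "bigdl_transformers_low_bit": "q4_0" into config.json.
    model.save_pretrained("path/to/llama-7b-q4_0")  # hypothetical output dir

    # Second load: from_pretrained sees bigdl_transformers_low_bit in config.json,
    # rebuilds the low-bit tensor shapes (convert_shape_only=True), then loads the
    # already-quantized state dict instead of re-quantizing.
    model = AutoModelForCausalLM.from_pretrained("path/to/llama-7b-q4_0")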