diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/bigdl/llm/transformers/convert.py
index d95e0a64..47df66dc 100644
--- a/python/llm/src/bigdl/llm/transformers/convert.py
+++ b/python/llm/src/bigdl/llm/transformers/convert.py
@@ -71,6 +71,8 @@ def _replace_with_int4_linear(model, modules_to_not_convert=None, current_key_na
                 # Force requires grad to False to avoid unexpected errors
                 model._modules[name].requires_grad_(False)
+                module.weight = None
+
         # Remove the last key for recursion
         if len(list(module.children())) > 0:
             _, has_been_replaced = _replace_with_int4_linear(
@@ -93,4 +95,6 @@ def ggml_convert_int4(model):
             "instead of Linear layers. Please double check your model architecture, or submit "
             "an issue on github if you think this is a bug."
         )
+    else:
+        model.to(torch.float32)
     return model
diff --git a/python/llm/src/bigdl/llm/transformers/linear_int4.py b/python/llm/src/bigdl/llm/transformers/linear_int4.py
index 63e5978a..437252a9 100644
--- a/python/llm/src/bigdl/llm/transformers/linear_int4.py
+++ b/python/llm/src/bigdl/llm/transformers/linear_int4.py
@@ -197,4 +197,4 @@ class LinearInt4(nn.Linear):
         if self.bias is not None:
             result += self.bias

-        return result
+        return result.to(x.dtype)
diff --git a/python/llm/src/bigdl/llm/transformers/model.py b/python/llm/src/bigdl/llm/transformers/model.py
index 6a31fa89..2dcfb89c 100644
--- a/python/llm/src/bigdl/llm/transformers/model.py
+++ b/python/llm/src/bigdl/llm/transformers/model.py
@@ -27,13 +27,14 @@ class _BaseAutoModelClass:
                         *args,
                         **kwargs):
         load_in_4bit = kwargs.pop("load_in_4bit", False)
+        if load_in_4bit:
+            kwargs["low_cpu_mem_usage"] = True
         model = cls.HF_Model.from_pretrained(*args, **kwargs)
         if load_in_4bit:
             from .convert import ggml_convert_int4
-            model = model.to("cpu", torch.float32)
+            model = model.to("cpu")
             model = ggml_convert_int4(model)
-
         return model
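For context, a minimal sketch of the user-facing path this patch touches, assuming the public `bigdl.llm.transformers.AutoModelForCausalLM` wrapper; the model path is a hypothetical placeholder and the snippet is illustration, not part of the diff:

```python
# Sketch of the load_in_4bit path after this change. from_pretrained now also
# sets low_cpu_mem_usage=True for 4-bit loads, moves the model to CPU without
# casting everything to float32 up front (ggml_convert_int4 handles the cast
# for non-converted modules), and LinearInt4.forward returns results in the
# input dtype.
from bigdl.llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "path/to/llama-model",  # hypothetical model path
    load_in_4bit=True,
)
```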