LLM: add default torch_dtype for fp16. (#10124)

* set default torch_dtype for fp16.

* fix style.

* bug fix.

* update bug fix.
Cengguang Zhang 2024-02-08 10:24:16 +08:00 committed by GitHub
parent 1aa0c623ce
commit 0cf6a12691

@@ -155,6 +155,7 @@ class _BaseAutoModelClass:
         optimize_model = kwargs.pop("optimize_model", True)
         user_quantization_config = kwargs.pop("quantization_config", None)
         speculative = kwargs.pop("speculative", False)
+        torch_dtype = kwargs.pop("torch_dtype", None)
         if user_quantization_config is not None and \
                 "BitsAndBytesConfig" in str(user_quantization_config.__class__):
@@ -250,8 +251,19 @@
             # load int x-bit
             kwargs["low_cpu_mem_usage"] = True
-            # set default torch_dtype='auto'
-            kwargs["torch_dtype"] = kwargs.get("torch_dtype", 'auto')
+            # set default torch_dtype='auto'.
+            # Note that when load_in_low_bit="fp16", set default torch_dtype=torch.float16
+            if load_in_low_bit == "fp16":
+                if torch_dtype is not None and torch_dtype != torch.float16:
+                    invalidInputError(
+                        False,
+                        f"Please use torch_dtype=torch.float16"
+                        f" when setting load_in_low_bit='fp16'."
+                    )
+                else:
+                    kwargs["torch_dtype"] = torch.float16
+            else:
+                kwargs["torch_dtype"] = torch_dtype or "auto"
             # Avoid tensor parallel F.Linear Operations
             if "pretraining_tp" in config_dict:
                 if "config" in kwargs: