From eb3fb18eb4d8da66346fae7c75de674b8436c541 Mon Sep 17 00:00:00 2001
From: binbin Deng <108676127+plusbang@users.noreply.github.com>
Date: Wed, 11 Oct 2023 15:03:39 +0800
Subject: [PATCH] LLM: improve PyTorch API doc (#9128)

---
 .../source/doc/PythonAPI/LLM/optimize.rst | 18 ++++++++++++++++--
 python/llm/src/bigdl/llm/optimize.py      | 43 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst b/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst
index fa6ba7fa..f28211ca 100644
--- a/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst
+++ b/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst
@@ -1,10 +1,24 @@
 BigDL-LLM PyTorch API
 =====================
 
-optimize model
+Optimize Model
 ----------------------------------------
 
-.. automodule:: bigdl.llm.optimize
+You can apply ``optimize_model`` to any PyTorch model with only a one-line code change, and benefit from BigDL-LLM optimization regardless of the library or API you are using.
+
+.. automodule:: bigdl.llm
     :members: optimize_model
     :undoc-members:
     :show-inheritance:
+
+
+
+Load Optimized Model
+----------------------------------------
+
+To avoid the high resource consumption of loading the original model again, we provide a save/load API: a model can be saved after low-bit optimization, and the saved low-bit model can later be loaded directly. Saving and loading are platform-independent, regardless of operating system.
+
+.. automodule:: bigdl.llm.optimize
+    :members: load_low_bit
+    :undoc-members:
+    :show-inheritance:
diff --git a/python/llm/src/bigdl/llm/optimize.py b/python/llm/src/bigdl/llm/optimize.py
index 3169660c..b10b8acc 100644
--- a/python/llm/src/bigdl/llm/optimize.py
+++ b/python/llm/src/bigdl/llm/optimize.py
@@ -133,6 +133,35 @@ def low_memory_init():
 
 
 def load_low_bit(model, model_path):
+    """
+    Load a PyTorch model that was optimized and saved with ``save_low_bit``.
+
+    :param model: The PyTorch model instance.
+    :param model_path: The path to the saved optimized model.
+
+    :return: The optimized model.
+
+    >>> # Example 1:
+    >>> # Take the ChatGLM2-6B model as an example.
+    >>> # Make sure you have saved the optimized model by calling 'save_low_bit'.
+    >>> from transformers import AutoModel
+    >>> from bigdl.llm.optimize import low_memory_init, load_low_bit
+    >>> with low_memory_init(): # Fast and low-cost: builds the model on the meta device
+    ...     model = AutoModel.from_pretrained(saved_dir,
+    ...                                       torch_dtype="auto",
+    ...                                       trust_remote_code=True)
+    >>> model = load_low_bit(model, saved_dir) # Load the optimized model
+
+    >>> # Example 2:
+    >>> # If the model is not compatible with the 'low_memory_init' method,
+    >>> # you can instead obtain the model instance through the usual loading method.
+    >>> # Take the OpenAI Whisper model as an example.
+    >>> # Make sure you have saved the optimized model by calling 'save_low_bit'.
+    >>> import whisper
+    >>> from bigdl.llm.optimize import load_low_bit
+    >>> model = whisper.load_model('tiny') # A model instance obtained through the usual loading method
+    >>> model = load_low_bit(model, saved_dir) # Load the optimized model
+    """
     low_bit = low_bit_sanity_check(model_path)
     invalidInputError(isinstance(model, torch.nn.Module),
                       "model should be a instance of "
@@ -167,14 +196,24 @@
 
 
 def optimize_model(model, low_bit='sym_int4', optimize_llm=True):
     """
-    A method to optimize any pytorch models.
+    A method to optimize any PyTorch model.
 
     :param model: The original PyTorch model (nn.module)
     :param low_bit: Supported low-bit options are "sym_int4", "asym_int4", "sym_int5", "asym_int5" or "sym_int8".
     :param optimize_llm: Whether to further optimize llm model.
 
-    return: The optimized model.
+    :return: The optimized model.
+
+    >>> # Take the OpenAI Whisper model as an example.
+    >>> import whisper
+    >>> from bigdl.llm import optimize_model
+    >>> model = whisper.load_model('tiny') # Load the Whisper model through its native API
+    >>> model = optimize_model(model) # With only one line of code changed
+    >>> # Use the optimized model without any other API changes
+    >>> result = model.transcribe(audio, verbose=True, language="English")
+    >>> # (Optional) you can also save the optimized model by calling 'save_low_bit'
+    >>> model.save_low_bit(saved_dir)
     """
     invalidInputError(low_bit in ggml_tensor_qtype,
                       f"Unknown load_in_low_bit value: {low_bit}, expected:"
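
For quick reference while reviewing, below is a minimal end-to-end sketch of
the optimize/save/load workflow these docstrings describe. It assumes
``bigdl-llm`` and ``openai-whisper`` are installed; ``saved_dir`` and the
audio file name are hypothetical placeholders.

    import whisper

    from bigdl.llm import optimize_model
    from bigdl.llm.optimize import load_low_bit

    saved_dir = "./whisper-tiny-low-bit"  # hypothetical output directory

    # Optimize once: load the original model, apply low-bit optimization,
    # and persist the optimized weights with save_low_bit.
    model = whisper.load_model("tiny")
    model = optimize_model(model, low_bit="sym_int4")
    model.save_low_bit(saved_dir)

    # Reload later without re-optimizing: rebuild a model instance through
    # the usual loading path, then load the saved low-bit weights into it.
    model = whisper.load_model("tiny")
    model = load_low_bit(model, saved_dir)
    result = model.transcribe("sample.wav", verbose=True, language="English")

For Hugging Face ``transformers`` models, the ``low_memory_init()`` context
manager shown in Example 1 of ``load_low_bit`` replaces the second
``whisper.load_model`` call above: it builds the instance on the meta device,
so the original FP32 weights are never materialized during reload.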