From eb3fb18eb4d8da66346fae7c75de674b8436c541 Mon Sep 17 00:00:00 2001
From: binbin Deng <108676127+plusbang@users.noreply.github.com>
Date: Wed, 11 Oct 2023 15:03:39 +0800
Subject: [PATCH] LLM: improve PyTorch API doc (#9128)

---
 .../source/doc/PythonAPI/LLM/optimize.rst | 18 ++++++++++++++++--
 python/llm/src/bigdl/llm/optimize.py      | 43 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst b/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst
index fa6ba7fa..f28211ca 100644
--- a/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst
+++ b/docs/readthedocs/source/doc/PythonAPI/LLM/optimize.rst
@@ -1,10 +1,24 @@
 BigDL-LLM PyTorch API
 =====================
 
-optimize model
+Optimize Model
 ----------------------------------------
 
-.. automodule:: bigdl.llm.optimize
+You can apply ``optimize_model`` to any PyTorch model with only a one-line code change, and benefit from BigDL-LLM optimization regardless of the library or API you are using.
+
+.. automodule:: bigdl.llm
     :members: optimize_model
     :undoc-members:
     :show-inheritance:
+
+
+
+Load Optimized Model
+----------------------------------------
+
+To avoid the high resource consumption of loading the original model again, we provide a save/load API: a model can be saved after low-bit optimization, and the saved low-bit model can later be loaded directly. Saving and loading are platform-independent, regardless of operating system.
+
+.. automodule:: bigdl.llm.optimize
+    :members: load_low_bit
+    :undoc-members:
+    :show-inheritance:
diff --git a/python/llm/src/bigdl/llm/optimize.py b/python/llm/src/bigdl/llm/optimize.py
index 3169660c..b10b8acc 100644
--- a/python/llm/src/bigdl/llm/optimize.py
+++ b/python/llm/src/bigdl/llm/optimize.py
@@ -133,6 +133,35 @@ def low_memory_init():
 
 
 def load_low_bit(model, model_path):
+    """
+    Load a PyTorch model that was optimized and saved with ``save_low_bit``.
+
+    :param model: The PyTorch model instance.
+    :param model_path: The path to the saved optimized model.
+
+    :return: The optimized model.
+
+    >>> # Example 1:
+    >>> # Take the ChatGLM2-6B model as an example.
+    >>> # Make sure you have saved the optimized model by calling 'save_low_bit'.
+    >>> from transformers import AutoModel
+    >>> from bigdl.llm.optimize import low_memory_init, load_low_bit
+    >>> with low_memory_init(): # Fast and low-cost: builds the model on the meta device
+    ...     model = AutoModel.from_pretrained(saved_dir,
+    ...                                       torch_dtype="auto",
+    ...                                       trust_remote_code=True)
+    >>> model = load_low_bit(model, saved_dir) # Load the optimized model
+
+    >>> # Example 2:
+    >>> # If the model is not compatible with the 'low_memory_init' method,
+    >>> # you can instead obtain the model instance through the usual loading method.
+    >>> # Take the OpenAI Whisper model as an example.
+    >>> # Make sure you have saved the optimized model by calling 'save_low_bit'.
+    >>> import whisper
+    >>> from bigdl.llm.optimize import load_low_bit
+    >>> model = whisper.load_model('tiny') # A model instance obtained through the usual loading method
+    >>> model = load_low_bit(model, saved_dir) # Load the optimized model
+    """
     low_bit = low_bit_sanity_check(model_path)
     invalidInputError(isinstance(model, torch.nn.Module),
                       "model should be a instance of "
@@ -167,14 +196,24 @@
 
 
 def optimize_model(model, low_bit='sym_int4', optimize_llm=True):
     """
-    A method to optimize any pytorch models.
+    A method to optimize any PyTorch model.
 
     :param model: The original PyTorch model (nn.module)
     :param low_bit: Supported low-bit options are "sym_int4", "asym_int4", "sym_int5", "asym_int5" or "sym_int8".
     :param optimize_llm: Whether to further optimize llm model.
 
-    return: The optimized model.
+    :return: The optimized model.
+
+    >>> # Take the OpenAI Whisper model as an example.
+    >>> import whisper
+    >>> from bigdl.llm import optimize_model
+    >>> model = whisper.load_model('tiny') # Load the Whisper model through its native API
+    >>> model = optimize_model(model) # With only one line of code changed
+    >>> # Use the optimized model without any other API changes
+    >>> result = model.transcribe(audio, verbose=True, language="English")
+    >>> # (Optional) you can also save the optimized model by calling 'save_low_bit'
+    >>> model.save_low_bit(saved_dir)
     """
     invalidInputError(low_bit in ggml_tensor_qtype,
                       f"Unknown load_in_low_bit value: {low_bit}, expected:"
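
For quick reference while reviewing, below is a minimal end-to-end sketch of
the optimize/save/load workflow these docstrings describe. It assumes
``bigdl-llm`` and ``openai-whisper`` are installed; ``saved_dir`` and the
audio file name are hypothetical placeholders.

    import whisper

    from bigdl.llm import optimize_model
    from bigdl.llm.optimize import load_low_bit

    saved_dir = "./whisper-tiny-low-bit"  # hypothetical output directory

    # Optimize once: load the original model, apply low-bit optimization,
    # and persist the optimized weights with save_low_bit.
    model = whisper.load_model("tiny")
    model = optimize_model(model, low_bit="sym_int4")
    model.save_low_bit(saved_dir)

    # Reload later without re-optimizing: rebuild a model instance through
    # the usual loading path, then load the saved low-bit weights into it.
    model = whisper.load_model("tiny")
    model = load_low_bit(model, saved_dir)
    result = model.transcribe("sample.wav", verbose=True, language="English")

For Hugging Face ``transformers`` models, the ``low_memory_init()`` context
manager shown in Example 1 of ``load_low_bit`` replaces the second
``whisper.load_model`` call above: it builds the instance on the meta device,
so the original FP32 weights are never materialized during reload.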