From 4be784a49d5095e0a36ea32ca2ad19d503c065d3 Mon Sep 17 00:00:00 2001
From: Ruonan Wang <105281011+rnwang04@users.noreply.github.com>
Date: Tue, 27 Jun 2023 12:12:11 +0800
Subject: [PATCH] LLM: add UT for starcoder (convert, inference) update examples and readme (#8379)

* first commit to add path
* update example and readme
* update path
* fix
* update based on comment
---
 .github/workflows/llm-nightly-test.yml         |  7 +++++++
 .github/workflows/llm_unit_tests_linux.yml     |  5 +++++
 python/llm/README.md                           |  6 ++++--
 python/llm/example/transformers/README.md      | 17 +++++++++++++++--
 .../llm/example/transformers/int4_pipeline.py  |  8 +++++---
 python/llm/test/convert/test_convert_model.py  | 10 ++++++++++
 python/llm/test/inference/test_call_models.py  | 12 +++++++++++-
 7 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/llm-nightly-test.yml b/.github/workflows/llm-nightly-test.yml
index 74ee1dd5..7dbef3b9 100644
--- a/.github/workflows/llm-nightly-test.yml
+++ b/.github/workflows/llm-nightly-test.yml
@@ -24,10 +24,12 @@ jobs:
       LLAMA_ORIGIN_PATH: ./llm/models/llama-7b-hf
       GPTNEOX_ORIGIN_PATH: ./llm/models/gptneox-7b-redpajama-bf16
       BLOOM_ORIGIN_PATH: ./llm/models/bloomz-7b1
+      STARCODER_ORIGIN_PATH: ./llm/models/gpt_bigcode-santacoder
       INT4_CKPT_DIR: ./llm/ggml-actions/nightly
       LLAMA_INT4_CKPT_PATH: ./llm/ggml-actions/nightly/bigdl_llm_llama_q4_0.bin
       GPTNEOX_INT4_CKPT_PATH: ./llm/ggml-actions/nightly/bigdl_llm_gptneox_q4_0.bin
       BLOOM_INT4_CKPT_PATH: ./llm/ggml-actions/nightly/bigdl_llm_bloom_q4_0.bin
+      STARCODER_INT4_CKPT_PATH: ./llm/ggml-actions/nightly/bigdl_llm_starcoder_q4_0.bin
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -57,6 +59,10 @@ jobs:
           echo "Directory $BLOOM_ORIGIN_PATH not found. Downloading from FTP server..."
           wget -r -nH --no-verbose --cut-dirs=1 $llm_ftp_url/llm/bloomz-7b1 -P $ORIGIN_DIR
         fi
+        if [ ! -d $STARCODER_ORIGIN_PATH ]; then
+          echo "Directory $STARCODER_ORIGIN_PATH not found. Downloading from FTP server..."
+          wget -r -nH --no-verbose --cut-dirs=1 $llm_ftp_url/llm/gpt_bigcode-santacoder -P $ORIGIN_DIR
+        fi
 
     - name: Run LLM convert test
       shell: bash
@@ -85,3 +91,4 @@ jobs:
         tnftp -u ${llm_ftp_url}/${INT4_CKPT_DIR:1}/bigdl_llm_llama_7b_q4_0.bin $LLAMA_INT4_CKPT_PATH
         tnftp -u ${llm_ftp_url}/${INT4_CKPT_DIR:1}/bigdl_llm_redpajama_7b_q4_0.bin $GPTNEOX_INT4_CKPT_PATH
         tnftp -u ${llm_ftp_url}/${INT4_CKPT_DIR:1}/bigdl_llm_bloom_7b_q4_0.bin $BLOOM_INT4_CKPT_PATH
+        tnftp -u ${llm_ftp_url}/${INT4_CKPT_DIR:1}/bigdl_llm_santacoder_1b_q4_0.bin $STARCODER_INT4_CKPT_PATH
diff --git a/.github/workflows/llm_unit_tests_linux.yml b/.github/workflows/llm_unit_tests_linux.yml
index 82ee6fcc..6a127993 100644
--- a/.github/workflows/llm_unit_tests_linux.yml
+++ b/.github/workflows/llm_unit_tests_linux.yml
@@ -32,6 +32,7 @@ jobs:
       LLAMA_INT4_CKPT_PATH: ./llm/ggml-actions/stable/bigdl_llm_llama_7b_q4_0.bin
       GPTNEOX_INT4_CKPT_PATH: ./llm/ggml-actions/stable/bigdl_llm_redpajama_7b_q4_0.bin
       BLOOM_INT4_CKPT_PATH: ./llm/ggml-actions/stable/bigdl_llm_bloom_7b_q4_0.bin
+      STARCODER_INT4_CKPT_PATH: ./llm/ggml-actions/stable/bigdl_llm_santacoder_1b_q4_0.bin
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -93,6 +94,10 @@ jobs:
           echo "Directory $BLOOM_INT4_CKPT_PATH not found. Downloading from FTP server..."
           wget --no-verbose $llm_ftp_url/${BLOOM_INT4_CKPT_PATH:1} -P $INT4_CKPT_DIR
         fi
+        if [ ! -d $STARCODER_INT4_CKPT_PATH ]; then
+          echo "Directory $STARCODER_INT4_CKPT_PATH not found. Downloading from FTP server..."
+          wget --no-verbose $llm_ftp_url/${STARCODER_INT4_CKPT_PATH:1} -P $INT4_CKPT_DIR
+        fi
 
     - name: Run LLM inference test
       shell: bash
diff --git a/python/llm/README.md b/python/llm/README.md
index f9adec8f..a30180c6 100644
--- a/python/llm/README.md
+++ b/python/llm/README.md
@@ -9,7 +9,7 @@ Users could use `bigdl-llm` to
 Currently `bigdl-llm` has supported
 
 - Precision: INT4
-- Model Family: llama, gptneox, bloom
+- Model Family: llama, gptneox, bloom, starcoder
 - Platform: Ubuntu 20.04 or later, CentOS 7 or later, Windows 10/11
 - Device: CPU
 - Python: 3.9 (recommended) or later
@@ -121,7 +121,7 @@ tokenizer.batch_decode(tokens_id)
 `llama-cpp-python` has become a popular pybinding for `llama.cpp` program. Some users may be familiar with this API so `bigdl-llm` reserve this API and extend it to other model families (e.g., gptneox, bloom)
 
 ```python
-from bigdl.llm.models import Llama, Bloom, Gptneox
+from bigdl.llm.models import Llama, Bloom, Gptneox, Starcoder
 
 llm = Llama("/path/to/llama-7b-int4/bigdl-llm-xxx.bin", n_threads=4)
 result = llm("what is ai")
@@ -144,3 +144,5 @@ To avoid difficaulties during the installtion. `bigdl-llm` release the C impleme
 | gptneox   | Windows  | MSVC 19.36.32532.0 |      |
 | bloom     | Linux    | GCC 9.4.0          | 2.31 |
 | bloom     | Windows  | MSVC 19.36.32532.0 |      |
+| starcoder | Linux    | GCC 9.4.0          | 2.31 |
+| starcoder | Windows  | MSVC 19.36.32532.0 |      |
diff --git a/python/llm/example/transformers/README.md b/python/llm/example/transformers/README.md
index 562f24dc..efed0629 100644
--- a/python/llm/example/transformers/README.md
+++ b/python/llm/example/transformers/README.md
@@ -2,7 +2,7 @@
 
 In this example, we show a pipeline to convert a large language model to low precision (INT4), and then conduct inference on the converted INT4 model, using BigDL-LLM transformers-like API.
 
-> **Note**: BigDL-LLM currently supports model family LLaMA, GPT-NeoX, and BLOOM.
+> **Note**: BigDL-LLM currently supports model family LLaMA, GPT-NeoX, BLOOM and StarCoder.
 
 ## Prepare Environment
 We suggest using conda to manage environment:
@@ -19,12 +19,13 @@ python ./int4_pipeline.py --thread-num THREAD_NUM --model-family MODEL_FAMILY
 ```
 arguments info:
 - `--thread-num THREAD_NUM`: **required** argument defining the number of threads to use for inference. It is default to be `2`.
-- `--model-family MODEL_FAMILY`: **required** argument defining the model family of the large language model (supported option: `'llama'`, `'gptneox'`, `'bloom'`). It is default to be `'llama'`.
+- `--model-family MODEL_FAMILY`: **required** argument defining the model family of the large language model (supported option: `'llama'`, `'gptneox'`, `'bloom'`, `'starcoder'`). It is default to be `'llama'`.
 - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: optional argument defining the huggingface repo id from which the large language model is downloaded, or the path to the huggingface checkpoint folder for the model.
   - When model family is `'llama'`, it is default to be `'decapoda-research/llama-7b-hf'`.
   - When model family is `'gptneox'`, it is default to be `'togethercomputer/RedPajama-INCITE-7B-Chat'`.
   - When model family is `'bloom'`, it is default to be `'bigscience/bloomz-7b1'`.
+  - When model family is `'starcoder'`, it is default to be `'bigcode/gpt_bigcode-santacoder'`.
 
   > **Note** `REPO_ID_OR_MODEL_PATH` should fits your inputed `MODEL_FAMILY`.
 
 - `--promp PROMPT`: optional argument defining the prompt to be infered. It is default to be `'Q: What is CPU? A:'`.
@@ -93,4 +94,16 @@ inference: total time = xxxx ms
 Inference time (fast forward): xxxx s
 Output:
 {'id': 'cmpl-a0ab2953-e08c-449c-b476-e21ad5bb84b0', 'object': 'text_completion', 'created': 1686557434, 'model': './bigdl_llm_bloom_q4_0.bin', 'choices': [{'text': 'Q: What is CPU? A: central processing unit', 'index': 0, 'logprobs': None, 'finish_reason': None}], 'usage': {'prompt_tokens': None, 'completion_tokens': None, 'total_tokens': None}}
+```
+
+### Model family StarCoder
+```log
+bigdl-llm: mem per token = 313912 bytes
+bigdl-llm: load time = xxxx ms
+bigdl-llm: sample time = xxxx ms
+bigdl-llm: predict time = xxxx ms / xxxx ms per token
+bigdl-llm: total time = xxxx ms
+Inference time (fast forward): xxxx s
+Output:
+{'id': 'cmpl-c2358898-cad0-47f4-af5b-84bb575eb942', 'object': 'text_completion', 'created': 1687864692, 'model': './output_starcoder/bigdl_llm_starcoder_q4_0.bin', 'choices': [{'text': ' %s" % cpu)\n print("Q: What is RAM? A: %s GB" % ram)\n print("Q: What is BAT? A: %s" % bat)\n print("Q: What is the number of CPU\'s? A: %s" % ncpus)', 'index': 0, 'logprobs': None, 'finish_reason': None}], 'usage': {'prompt_tokens': 8, 'completion_tokens': 64, 'total_tokens': 72}}
 ```
\ No newline at end of file
diff --git a/python/llm/example/transformers/int4_pipeline.py b/python/llm/example/transformers/int4_pipeline.py
index 7499a725..a6c271d8 100644
--- a/python/llm/example/transformers/int4_pipeline.py
+++ b/python/llm/example/transformers/int4_pipeline.py
@@ -80,9 +80,9 @@ def inference(llm, repo_id_or_model_path, model_family, prompt):
 
         print(f'Output:\n{output}')
 
-    if model_family in ['llama', 'gptneox', 'bloom']:
+    if model_family in ['llama', 'gptneox', 'bloom', 'starcoder']:
         # Option 3: fast forward
-        # note that currently Bloom family model only supports the fast forward inference method
+        # note that currently Bloom/Starcoder family model only supports the fast forward inference method
 
         print('-'*20, ' fast forward ', '-'*20)
         st = time.time()
@@ -99,7 +99,7 @@ def main():
                         help='Number of threads to use for inference')
     parser.add_argument('--model-family', type=str, default='llama', required=True,
                         help="The model family of the large language model (supported option: 'llama', "
-                             "'gptneox', 'bloom')")
+                             "'gptneox', 'bloom', 'starcoder')")
     parser.add_argument('--repo-id-or-model-path', type=str,
                         help='The huggingface repo id for the larga language model to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
@@ -117,6 +117,8 @@ def main():
         repo_id_or_model_path = 'togethercomputer/RedPajama-INCITE-7B-Chat'
     elif args.model_family == 'bloom':
         repo_id_or_model_path = 'bigscience/bloomz-7b1'
+    elif args.model_family == 'starcoder':
+        repo_id_or_model_path = 'bigcode/gpt_bigcode-santacoder'
 
     # Step 1: convert original model to BigDL llm model
     bigdl_llm_path = convert(repo_id_or_model_path=repo_id_or_model_path,
diff --git a/python/llm/test/convert/test_convert_model.py b/python/llm/test/convert/test_convert_model.py
index 6a39801c..434bad8f 100644
--- a/python/llm/test/convert/test_convert_model.py
+++ b/python/llm/test/convert/test_convert_model.py
@@ -25,6 +25,7 @@ from bigdl.llm import llm_convert
 llama_model_path = os.environ.get('LLAMA_ORIGIN_PATH')
 gptneox_model_path = os.environ.get('GPTNEOX_ORIGIN_PATH')
 bloom_model_path = os.environ.get('BLOOM_ORIGIN_PATH')
+starcoder_model_path = os.environ.get('STARCODER_ORIGIN_PATH')
 output_dir = os.environ.get('INT4_CKPT_DIR')
 
 class TestConvertModel(TestCase):
@@ -52,6 +53,15 @@ class TestConvertModel(TestCase):
                                            model_format="pth",
                                            outtype='int4')
         assert os.path.isfile(converted_model_path)
+
+    def test_convert_starcoder(self):
+        converted_model_path = llm_convert(model=starcoder_model_path,
+                                           outfile=output_dir,
+                                           model_family='starcoder',
+                                           model_format="pth",
+                                           outtype='int4')
+        assert os.path.isfile(converted_model_path)
+
 
 if __name__ == '__main__':
     pytest.main([__file__])
diff --git a/python/llm/test/inference/test_call_models.py b/python/llm/test/inference/test_call_models.py
index 88b77cfc..40613132 100644
--- a/python/llm/test/inference/test_call_models.py
+++ b/python/llm/test/inference/test_call_models.py
@@ -15,7 +15,7 @@
 #
 
-from bigdl.llm.models import Llama, Bloom, Gptneox
+from bigdl.llm.models import Llama, Bloom, Gptneox, Starcoder
 from bigdl.llm.utils import get_avx_flags
 import pytest
 from unittest import TestCase
@@ -28,6 +28,7 @@ class Test_Models_Basics(TestCase):
         self.llama_model_path = os.environ.get('LLAMA_INT4_CKPT_PATH')
         self.bloom_model_path = os.environ.get('BLOOM_INT4_CKPT_PATH')
         self.gptneox_model_path = os.environ.get('GPTNEOX_INT4_CKPT_PATH')
+        self.starcoder_model_path = os.environ.get('STARCODER_INT4_CKPT_PATH')
 
     def test_llama_completion_success(self):
         llm = Llama(self.llama_model_path)
@@ -58,6 +59,15 @@ class Test_Models_Basics(TestCase):
     def test_gptneox_completion_with_stream_success(self):
         llm = Gptneox(self.gptneox_model_path)
         output = llm("Q: What is the capital of France? A:", max_tokens=32, stream=True)
+
+    def test_starcoder_completion_success(self):
+        llm = Starcoder(self.starcoder_model_path)
+        output = llm("def print_hello_world(", max_tokens=32, stream=False)
+        # assert "Paris" in output['choices'][0]['text']
+
+    def test_starcoder_completion_with_stream_success(self):
+        llm = Starcoder(self.starcoder_model_path)
+        output = llm("def print_hello_world(", max_tokens=32, stream=True)
 
 
 if __name__ == '__main__':
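For reference, the snippet below is a minimal sketch of how the pieces touched by this patch fit together for the new `starcoder` family. It only uses calls that already appear in the tests above (`llm_convert` and the `Starcoder` class); the checkpoint and output paths are placeholders for a locally downloaded `bigcode/gpt_bigcode-santacoder` checkpoint and a writable output directory.

```python
import os

from bigdl.llm import llm_convert
from bigdl.llm.models import Starcoder

# Placeholder paths (assumptions for illustration): a local
# gpt_bigcode-santacoder checkpoint and an output directory.
starcoder_model_path = "./models/gpt_bigcode-santacoder"
output_dir = "./ggml-checkpoints"

# Convert the original PyTorch checkpoint to an INT4 GGML file,
# mirroring test_convert_starcoder in test_convert_model.py.
converted_model_path = llm_convert(model=starcoder_model_path,
                                   outfile=output_dir,
                                   model_family='starcoder',
                                   model_format="pth",
                                   outtype='int4')
assert os.path.isfile(converted_model_path)

# Load the converted model and run a short completion,
# mirroring test_starcoder_completion_success in test_call_models.py.
llm = Starcoder(converted_model_path)
output = llm("def print_hello_world(", max_tokens=32, stream=False)
print(output['choices'][0]['text'])
```

The same model can also be exercised through the example script updated in this patch, e.g. `python ./int4_pipeline.py --thread-num 2 --model-family starcoder`.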