From 411d896636c223abdc16023ebf4c852d25f1257c Mon Sep 17 00:00:00 2001
From: Song Jiaming
Date: Thu, 20 Jul 2023 10:16:27 +0800
Subject: [PATCH] LLM first transformers UT (#8514)

* ut
* transformers api first ut
* name
* dir issue
* use chatglm instead of chatglm2
* omp
* set omp in sh
* source
* taskset
* test
* test omp
* add test
---
 .github/workflows/llm_unit_tests_linux.yml  |  8 +++
 .../test/inference/test_transformers_api.py | 55 +++++++++++++++++++
 python/llm/test/run-llm-inference-tests.sh  |  6 +-
 3 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 python/llm/test/inference/test_transformers_api.py

diff --git a/.github/workflows/llm_unit_tests_linux.yml b/.github/workflows/llm_unit_tests_linux.yml
index eea0c139..313931b5 100644
--- a/.github/workflows/llm_unit_tests_linux.yml
+++ b/.github/workflows/llm_unit_tests_linux.yml
@@ -41,6 +41,9 @@ env:
   BLOOM_INT4_CKPT_PATH: ./llm/ggml-actions/stable/bigdl_llm_bloom_7b_q4_0.bin
   STARCODER_INT4_CKPT_PATH: ./llm/ggml-actions/stable/bigdl_llm_santacoder_1b_q4_0.bin
 
+  LLM_DIR: ./llm
+  ORIGINAL_CHATGLM_6B_PATH: ./llm/chatglm-6b/
+
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
   llm-cpp-build:
@@ -73,6 +76,7 @@ jobs:
         uses: ./.github/actions/llm/setup-llm-env
         env:
           ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
+          OMP_NUM_THREADS: 24
 
       - name: Download ckpt models
         run: |
@@ -92,6 +96,10 @@ jobs:
            echo "Directory $STARCODER_INT4_CKPT_PATH not found. Downloading from FTP server..."
            wget --no-verbose $LLM_FTP_URL/${STARCODER_INT4_CKPT_PATH:1} -P $INT4_CKPT_DIR
          fi
+         if [ ! -d $ORIGINAL_CHATGLM_6B_PATH ]; then
+           echo "Directory $ORIGINAL_CHATGLM_6B_PATH not found. Downloading from FTP server..."
+           wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/${ORIGINAL_CHATGLM_6B_PATH:1} -P $LLM_DIR
+         fi
 
       - name: Run LLM cli test
         uses: ./.github/actions/llm/cli-test

diff --git a/python/llm/test/inference/test_transformers_api.py b/python/llm/test/inference/test_transformers_api.py
new file mode 100644
index 00000000..f0f65fde
--- /dev/null
+++ b/python/llm/test/inference/test_transformers_api.py
@@ -0,0 +1,55 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+from bigdl.llm.models import Llama, Bloom, Gptneox, Starcoder
+from bigdl.llm.utils import get_avx_flags
+import unittest
+import os
+
+import time
+import torch
+from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel
+from transformers import LlamaTokenizer, AutoTokenizer
+
+class TestTransformersAPI(unittest.TestCase):
+
+    def setUp(self):
+        thread_num = os.environ.get('THREAD_NUM')
+        if thread_num is not None:
+            self.n_threads = int(thread_num)
+        else:
+            self.n_threads = 2
+
+    def test_transformers_int4(self):
+        model_path = os.environ.get('ORIGINAL_CHATGLM_6B_PATH')
+        model = AutoModel.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        input_str = "晚上睡不着应该怎么办"
+
+        with torch.inference_mode():
+            st = time.time()
+            input_ids = tokenizer.encode(input_str, return_tensors="pt")
+            output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
+            output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+            end = time.time()
+            print('Prompt:', input_str)
+            print('Output:', output_str)
+            print(f'Inference time: {end-st} s')
+
+
+if __name__ == '__main__':
+    unittest.main()

diff --git a/python/llm/test/run-llm-inference-tests.sh b/python/llm/test/run-llm-inference-tests.sh
index 9d535eb8..81358339 100644
--- a/python/llm/test/run-llm-inference-tests.sh
+++ b/python/llm/test/run-llm-inference-tests.sh
@@ -4,12 +4,16 @@ export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
 export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
 export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/inference
+source bigdl-nano-init
+
 
 set -e
 
 echo "# Start testing inference"
 start=$(date "+%s")
 
-python -m pytest -s ${LLM_INFERENCE_TEST_DIR}
+python -m pytest -s ${LLM_INFERENCE_TEST_DIR} -k "not test_transformers_int4"
+export OMP_NUM_THREADS=24
+taskset -c 0-23 python -m pytest -s ${LLM_INFERENCE_TEST_DIR} -k test_transformers_int4
 
 now=$(date "+%s")
 time=$((now-start))
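For reference, a minimal standalone sketch of the code path the new test_transformers_int4 case exercises: loading chatglm-6b through bigdl-llm's transformers-style INT4 API and running greedy generation. The fallback model path and the English prompt below are illustrative placeholders, not part of the patch.

import os
import time
import torch
from bigdl.llm.transformers import AutoModel
from transformers import AutoTokenizer

# Point this at a local chatglm-6b checkout; the fallback mirrors the CI env var value.
model_path = os.environ.get("ORIGINAL_CHATGLM_6B_PATH", "./llm/chatglm-6b/")

# Same loading calls as in the test: 4-bit quantized load via bigdl-llm.
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

with torch.inference_mode():
    start = time.time()
    input_ids = tokenizer.encode("What should I do if I cannot sleep at night?", return_tensors="pt")
    output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
    print(f"Inference time: {time.time() - start} s")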