From cb228c70eabea4c29d6bca9188995b259eecf570 Mon Sep 17 00:00:00 2001
From: "Chen, Zhentao"
Date: Fri, 1 Dec 2023 14:16:35 +0800
Subject: [PATCH] Add harness nightly (#9552)

* modify output_path as a directory
* schedule nightly at 21 on Friday
* add tasks and models for nightly
* add accuracy regression
* comment out if to test
* mixed fp4
* for test
* add missing delimiter
* remove comma
* fixed golden results
* add mixed 4 golden result
* add more options
* add mistral results
* get golden result of stable lm
* move nightly scripts and results to test folder
* add license
* add fp8 stable lm golden
* run on all available devices
* trigger only when ready for review
* fix new line
* update golden
* add mistral
---
 .github/workflows/llm-harness-evaluation.yml | 25 +++++++--
 python/llm/dev/benchmark/harness/run_llb.py | 2 +-
 .../harness_nightly/accuracy_regression.py | 56 +++++++++++++++++++
 .../harness_nightly/golden_results.json | 37 ++++++++++++
 4 files changed, 113 insertions(+), 7 deletions(-)
 create mode 100644 python/llm/test/benchmark/harness_nightly/accuracy_regression.py
 create mode 100644 python/llm/test/benchmark/harness_nightly/golden_results.json

diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml
index 2fcc0e73..be433550 100644
--- a/.github/workflows/llm-harness-evaluation.yml
+++ b/.github/workflows/llm-harness-evaluation.yml
@@ -7,9 +7,10 @@ concurrency:
 
 # Controls when the action will run.
 on:
-  # schedule:
-  #   - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China
+  schedule:
+    - cron: "00 13 * * 5" # GMT time, 13:00 GMT == 21:00 China
   pull_request:
+    types: ready_for_review
     branches: [main]
     paths:
       - ".github/workflows/llm-harness-evaluation.yml"
@@ -32,9 +33,10 @@ jobs:
         #   task: "arc"
         #   precision: "sym_int4" #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8
         python-version: ["3.9"]
-        model_name: [stablelm-3b-4e1t]
-        task: [winogrande, drop, gsm8k] # truthfulqa, arc, hellaswag, mmlu, winogrande, drop, gsm8k
-        precision: [sym_int4] #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8
+        model_name: [stablelm-3b-4e1t,Mistral-7B-v0.1]
+        task: [truthfulqa, arc]
+        precision: [mixed_fp4, fp8]
+        device: [xpu]
     runs-on: [self-hosted, llm, accuracy]
 
     env:
@@ -98,5 +100,16 @@ jobs:
         export HF_DATASETS=$HARNESS_HF_HOME/datasets
         export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
         source /opt/intel/oneapi/setvars.sh
-        python run_llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --batch_size 1 --no_cache
+        python run_llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device ${{ matrix.device }} --tasks ${{ matrix.task }} --batch_size 1 --no_cache --output_path results
+
+
+      - name: Compare with golden accuracy
+        shell: bash
+
+        working-directory: ${{ github.workspace }}/python/llm
+        run: |
+          python test/benchmark/harness_nightly/accuracy_regression.py dev/benchmark/harness/results/${{ matrix.model_name }}/${{ matrix.device }}/${{ matrix.precision }}/${{ matrix.task }}/result.json test/benchmark/harness_nightly/golden_results.json
+
+
+
 
diff --git a/python/llm/dev/benchmark/harness/run_llb.py b/python/llm/dev/benchmark/harness/run_llb.py
index c425d8df..3e8bd03a 100644
--- a/python/llm/dev/benchmark/harness/run_llb.py
+++ b/python/llm/dev/benchmark/harness/run_llb.py
@@ -64,7 +64,7 @@ def parse_args():
 
 def main():
     args = parse_args()
-    
+
     assert not args.provide_description  # not implemented
 
     if args.limit:
diff --git a/python/llm/test/benchmark/harness_nightly/accuracy_regression.py b/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
new file mode 100644
index 00000000..4c6cc0d1
--- /dev/null
+++ b/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
@@ -0,0 +1,56 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import json
+import sys
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def main(res_path, golden_path):
+    print(res_path, golden_path)
+    with open(res_path, "r") as f:
+        results = json.load(f)['results']
+    print(results)
+
+    model_name, device, precision, task = res_path.split('/')[-5:-1]
+
+    with open(golden_path, "r") as f:
+        golden_results = json.load(f)[model_name][device][precision]
+    print(golden_results)
+
+    identical = True
+    for task in results.keys():
+
+        if task not in golden_results:
+            identical = False
+            logger.error(f"Task {task} should be updated to golden results.")
+            continue
+        task_results = results[task]
+        task_golden = golden_results[task]
+        for m in task_results.keys():
+            if m in task_golden and task_results[m] != task_golden[m]:
+                if not m.endswith("_stderr"):
+                    identical = False
+                    logger.error(f"Different on metric '{m}' [golden acc/ current acc]: [{task_golden[m]}/{task_results[m]}]")
+                else:
+                    logger.warning(f"Diff on {m} [golden acc/ current acc]: [{task_golden[m]}/{task_results[m]}]")
+    if identical:
+        logger.info("Accuracy values are identical to golden results.")
+    else:
+        raise RuntimeError("Accuracy has changed, please check if any accuracy issue or update golden accuracy value.")
+
+main(*sys.argv[1:3])
\ No newline at end of file
diff --git a/python/llm/test/benchmark/harness_nightly/golden_results.json b/python/llm/test/benchmark/harness_nightly/golden_results.json
new file mode 100644
index 00000000..2d51ca44
--- /dev/null
+++ b/python/llm/test/benchmark/harness_nightly/golden_results.json
@@ -0,0 +1,37 @@
+{
+    "stablelm-3b-4e1t": {"xpu": {
+        "mixed_fp4": {
+            "truthfulqa_mc": {"mc1": 0.24357405140758873,"mc1_stderr": 0.015026354824910782,"mc2": 0.37399115063281224,"mc2_stderr": 0.013684003173581748},
+            "arc_challenge": {
+                "acc": 0.40102389078498296,
+                "acc_stderr": 0.014322255790719869,
+                "acc_norm": 0.44283276450511944,
+                "acc_norm_stderr": 0.014515573873348897
+            }
+        },
+        "fp8": {
+            "truthfulqa_mc": {
+                "mc1": 0.24479804161566707,
+                "mc1_stderr": 0.01505186948671501,
+                "mc2": 0.3747170112957169,
+                "mc2_stderr": 0.013516983188729865
+            },
+            "arc_challenge": {
+                "acc": 0.41552901023890787,
+                "acc_stderr": 0.014401366641216377,
+                "acc_norm": 0.46245733788395904,
+                "acc_norm_stderr": 0.014570144495075581
+            }
+        }
+    }},
+"Mistral-7B-v0.1": {"xpu": {
+    "mixed_fp4": {
+        "truthfulqa_mc": {"mc1": 0.27539779681762544,"mc1_stderr": 0.01563813566777552,"mc2": 0.41062244273774384,"mc2_stderr": 0.014067078150027909},
+        "arc_challenge": {"acc": 0.5674061433447098,"acc_stderr": 0.014478005694182528,"acc_norm": 0.5989761092150171,"acc_norm_stderr": 0.014322255790719867}
+    },
+    "fp8": {
+        "truthfulqa_mc": {"mc1": 0.2778457772337821,"mc1_stderr": 0.015680929364024643,"mc2": 0.4212635093545362,"mc2_stderr": 0.01414660694632397},
+        "arc_challenge": {"acc": 0.5639931740614335,"acc_stderr": 0.014491225699230916,"acc_norm": 0.5998293515358362,"acc_norm_stderr": 0.014317197787809174}
+    }
+}}
+}