From bfa98666a64550c4099855779be9adf7ecfcbe2a Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 16:33:38 +0800 Subject: [PATCH] fall back to make_table.py --- .github/workflows/llm-harness-evaluation.yml | 2 +- python/llm/dev/benchmark/harness/README.md | 4 + .../llm/dev/benchmark/harness/make_table.py | 108 ++++++++++++++++++ 3 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 python/llm/dev/benchmark/harness/make_table.py diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index fcfa2f3c..deb339ae 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -230,7 +230,7 @@ jobs: shell: bash run: | ls results - python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py results + python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table.py results # TODO: change machine to store the results later llm-harness-summary-html: diff --git a/python/llm/dev/benchmark/harness/README.md b/python/llm/dev/benchmark/harness/README.md index a373d66c..5ecc0d71 100644 --- a/python/llm/dev/benchmark/harness/README.md +++ b/python/llm/dev/benchmark/harness/README.md @@ -26,3 +26,7 @@ python run_multi_llb.py --model bigdl-llm --pretrained /path/to/model --precisio Taking example above, the script will fork 3 processes, each for one xpu, to execute the tasks. ## Results We follow [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to record our metrics, `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa` and `acc` for `mmlu`. For `mmlu`, there are 57 subtasks which means users may need to average them manually to get final result. 
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Summarize lm-evaluation-harness result JSON files into a Markdown table.

Usage:
    python make_table.py <results_dir>
"""

import logging
import json
import os
import sys

from pytablewriter import MarkdownTableWriter, LatexTableWriter

from harness_to_leaderboard import task_to_metric


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Column order of the summary table. The task names must match the
# task-level directory names collected by merge_results().
_TASKS = ["arc", "hellaswag", "mmlu", "truthfulqa", "winogrande", "gsm8k"]
_HEADERS = ["Model", "Precision", "Arc", "Hellaswag", "MMLU", "TruthfulQA",
            "Winogrande", "GSM8K"]


def make_table(result_dict):
    """Generate a Markdown table of results.

    :param result_dict: nested mapping produced by merge_results():
        ``{model: {precision: {task: harness_result_json}}}``.
    :return: the table rendered as a Markdown string.
    """
    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = list(_HEADERS)
    latex_writer.headers = list(_HEADERS)

    values = []
    for model, model_results in result_dict.items():
        for precision, prec_results in model_results.items():
            row = [model, precision]
            for task in _TASKS:
                task_results = prec_results.get(task, None)
                if task_results is None:
                    row.append("")
                    continue
                metric = task_to_metric[task]
                results = task_results["results"]
                if task in results:
                    score = results[task][metric]
                elif len(results) == 1:
                    score = next(iter(results.values()))[metric]
                else:
                    # Tasks such as MMLU are reported as many subtasks keyed
                    # by subtask name; indexing results[task] would raise
                    # KeyError. Follow the Open LLM Leaderboard convention
                    # and average the metric over all subtasks.
                    score = sum(r[metric] for r in results.values()) / len(results)
                row.append("%.2f" % (score * 100))
            values.append(row)
            # Blank out the model cell so repeated rows read as a group.
            model = ""

    md_writer.value_matrix = values
    latex_writer.value_matrix = values

    # TODO: make the LaTeX table look good before emitting it.
    # print(latex_writer.dumps())

    return md_writer.dumps()


def merge_results(path):
    """Recursively collect ``*result.json`` files under *path*.

    Expects the harness layout ``<model>/<device>/<precision>/<task>/``
    for the directory holding each result file.

    :param path: root directory to walk.
    :return: nested dict ``{model: {precision: {task: result_json}}}``.
    """
    print('Read from', path)
    merged_results = dict()
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in sorted(f for f in filenames if f.endswith("result.json")):
            # Do not rebind the `path` parameter here; keep a local name.
            file_path = os.path.join(dirpath, filename)
            # Split on the OS path separator so this also works on Windows.
            model, device, precision, task = \
                os.path.normpath(dirpath).split(os.sep)[-4:]
            with open(file_path, "r") as f:
                result_dict = json.load(f)
            merged_results.setdefault(model, dict()) \
                          .setdefault(precision, dict())[task] = result_dict
    return merged_results


def main(*args):
    """Entry point.

    :param args: argv-style tuple; ``args[1]`` must be the results directory.
    :raises ValueError: when no input path is given.
    """
    if len(args) > 1:
        input_path = args[1]
    else:
        raise ValueError("Input path is required")

    merged_results = merge_results(input_path)
    print(make_table(merged_results))


if __name__ == "__main__":
    # argv[0] is the script name; argv[1] must name the input results dir.
    main(*sys.argv)