From bfa98666a64550c4099855779be9adf7ecfcbe2a Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 16:33:38 +0800 Subject: [PATCH] fall back to make_table.py --- .github/workflows/llm-harness-evaluation.yml | 2 +- python/llm/dev/benchmark/harness/README.md | 4 + .../llm/dev/benchmark/harness/make_table.py | 108 ++++++++++++++++++ 3 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 python/llm/dev/benchmark/harness/make_table.py diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index fcfa2f3c..deb339ae 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -230,7 +230,7 @@ jobs: shell: bash run: | ls results - python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py results + python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table.py results # TODO: change machine to store the results later llm-harness-summary-html: diff --git a/python/llm/dev/benchmark/harness/README.md b/python/llm/dev/benchmark/harness/README.md index a373d66c..5ecc0d71 100644 --- a/python/llm/dev/benchmark/harness/README.md +++ b/python/llm/dev/benchmark/harness/README.md @@ -26,3 +26,7 @@ python run_multi_llb.py --model bigdl-llm --pretrained /path/to/model --precisio Taking example above, the script will fork 3 processes, each for one xpu, to execute the tasks. ## Results We follow [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to record our metrics, `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa` and `acc` for `mmlu`. For `mmlu`, there are 57 subtasks which means users may need to average them manually to get final result. 
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Summarize lm-evaluation-harness result JSON files into a Markdown table.

Usage:
    python make_table.py <results_dir>
"""

import logging
import json
import os
import sys

from pytablewriter import MarkdownTableWriter, LatexTableWriter

from harness_to_leaderboard import task_to_metric


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Column order of the summary table. The task names must match the
# task-level directory names collected by merge_results().
_TASKS = ["arc", "hellaswag", "mmlu", "truthfulqa", "winogrande", "gsm8k"]
_HEADERS = ["Model", "Precision", "Arc", "Hellaswag", "MMLU", "TruthfulQA",
            "Winogrande", "GSM8K"]


def make_table(result_dict):
    """Generate a Markdown table of results.

    :param result_dict: nested mapping produced by merge_results():
        ``{model: {precision: {task: harness_result_json}}}``.
    :return: the table rendered as a Markdown string.
    """
    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = list(_HEADERS)
    latex_writer.headers = list(_HEADERS)

    values = []
    for model, model_results in result_dict.items():
        for precision, prec_results in model_results.items():
            row = [model, precision]
            for task in _TASKS:
                task_results = prec_results.get(task, None)
                if task_results is None:
                    row.append("")
                    continue
                metric = task_to_metric[task]
                results = task_results["results"]
                if task in results:
                    score = results[task][metric]
                elif len(results) == 1:
                    score = next(iter(results.values()))[metric]
                else:
                    # Tasks such as MMLU are reported as many subtasks keyed
                    # by subtask name; indexing results[task] would raise
                    # KeyError. Follow the Open LLM Leaderboard convention
                    # and average the metric over all subtasks.
                    score = sum(r[metric] for r in results.values()) / len(results)
                row.append("%.2f" % (score * 100))
            values.append(row)
            # Blank out the model cell so repeated rows read as a group.
            model = ""

    md_writer.value_matrix = values
    latex_writer.value_matrix = values

    # TODO: make the LaTeX table look good before emitting it.
    # print(latex_writer.dumps())

    return md_writer.dumps()


def merge_results(path):
    """Recursively collect ``*result.json`` files under *path*.

    Expects the harness layout ``<model>/<device>/<precision>/<task>/``
    for the directory holding each result file.

    :param path: root directory to walk.
    :return: nested dict ``{model: {precision: {task: result_json}}}``.
    """
    print('Read from', path)
    merged_results = dict()
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in sorted(f for f in filenames if f.endswith("result.json")):
            # Do not rebind the `path` parameter here; keep a local name.
            file_path = os.path.join(dirpath, filename)
            # Split on the OS path separator so this also works on Windows.
            model, device, precision, task = \
                os.path.normpath(dirpath).split(os.sep)[-4:]
            with open(file_path, "r") as f:
                result_dict = json.load(f)
            merged_results.setdefault(model, dict()) \
                          .setdefault(precision, dict())[task] = result_dict
    return merged_results


def main(*args):
    """Entry point.

    :param args: argv-style tuple; ``args[1]`` must be the results directory.
    :raises ValueError: when no input path is given.
    """
    if len(args) > 1:
        input_path = args[1]
    else:
        raise ValueError("Input path is required")

    merged_results = merge_results(input_path)
    print(make_table(merged_results))


if __name__ == "__main__":
    # argv[0] is the script name; argv[1] must name the input results dir.
    main(*sys.argv)