def make_table(result_dict):
    """Generate table of results.

    Produces one row per (model, precision) pair and one column per
    benchmark task, in a fixed task order.

    Args:
        result_dict: Nested mapping ``model -> precision -> task -> result``
            where each ``result`` has a ``"results"`` mapping, as built by
            ``merge_results``.

    Returns:
        The table rendered by the Markdown writer's ``dumps()``.

    Raises:
        KeyError: If a present task has no entry in ``task_to_metric`` or
            its result lacks the expected metric.
    """
    headers = ["Model", "Precision", "Arc", "Hellaswag", "MMLU",
               "TruthfulQA", "Winogrande", "GSM8K"]
    # Column order of the task cells; must line up with ``headers[2:]``.
    tasks = ("arc", "hellaswag", "mmlu", "truthfulqa", "winogrande", "gsm8k")

    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = list(headers)
    latex_writer.headers = list(headers)

    values = []
    for model, model_results in result_dict.items():
        # Only the first row of a model carries its name; later rows for the
        # same model leave the cell blank.
        model_label = model
        for precision, prec_results in model_results.items():
            row = [model_label, precision]
            for task in tasks:
                task_results = prec_results.get(task)
                if task_results is None:
                    # Task was not run for this model/precision: empty cell.
                    row.append("")
                    continue
                # ``task_to_metric`` is a module-level name defined outside
                # this view; it is indexed by task name to get the metric key.
                metric = task_to_metric[task]
                results = task_results["results"]
                if len(results) > 1:
                    result = results[task]
                else:
                    result = next(iter(results.values()))
                # Scores are shown multiplied by 100, with two decimals.
                row.append(f"{result[metric] * 100:.2f}")
            values.append(row)
            model_label = ""

    md_writer.value_matrix = values
    latex_writer.value_matrix = values

    # NOTE(review): this span was recovered from a diff; a few lines between
    # the value_matrix assignments and the return are not visible here.
    # Confirm nothing besides comments lives there.
    return md_writer.dumps()


def merge_results(path):
    """Collect every ``*result.json`` under ``path`` into one nested dict.

    The last four components of each containing directory are read as
    ``<model>/<device>/<precision>/<task>``; the device component is not
    used for keying.

    Args:
        path: Root directory to walk recursively.

    Returns:
        ``{model: {precision: {task: loaded_json}}}``. Empty when ``path``
        contains no matching files.

    Raises:
        ValueError: If a matching file sits in a directory whose path has
            fewer than four components.
    """
    # Local import: the file's import block is outside the visible span.
    import json

    merged_results = {}
    # NOTE(review): the loop header and the JSON load were between diff hunks
    # and are reconstructed from the names the visible lines use -- confirm.
    for dirpath, _dirnames, filenames in os.walk(path):
        # Sorted so that, if several files match in one directory, the one
        # that wins (the last) is deterministic.
        for filename in sorted(f for f in filenames if f.endswith("result.json")):
            result_path = os.path.join(dirpath, filename)
            parts = os.path.normpath(dirpath).split(os.sep)
            model, _device, precision, task = parts[-4:]
            with open(result_path, "r", encoding="utf-8") as f:
                result_dict = json.load(f)
            merged_results.setdefault(model, {}).setdefault(precision, {})[task] = result_dict
    return merged_results


def main(*args):
    """Print the results table for the directory given as the first argument.

    Args:
        *args: ``args[0]`` is the root directory holding the result files.
    """
    merged_results = merge_results(args[0])
    print(make_table(merged_results))


if __name__ == "__main__":
    # sys.argv[0] is the script itself; only the remaining arguments are
    # meaningful to main().
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} RESULTS_DIR")
    main(*sys.argv[1:])