From c8e0c2ed4861712d2e25e3c2da9309a8c9fb75b4 Mon Sep 17 00:00:00 2001
From: "Chen, Zhentao"
Date: Thu, 30 Nov 2023 12:47:56 +0800
Subject: [PATCH] Fixed dumped logs in harness (#9549)

* install transformers==4.34.0
* modify output_path as a directory
* add device and task to output dir parents
---
 .github/workflows/llm-harness-evaluation.yml |  2 +-
 python/llm/dev/benchmark/harness/run_llb.py  | 17 +++++++++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml
index 1bed8aa7..24adf36a 100644
--- a/.github/workflows/llm-harness-evaluation.yml
+++ b/.github/workflows/llm-harness-evaluation.yml
@@ -87,7 +87,7 @@ jobs:
       - name: Upgrade packages
         shell: bash
         run: |
-          pip install --upgrade transformers
+          pip install --upgrade transformers==4.34.0
 
       - name: Run harness
         shell: bash
diff --git a/python/llm/dev/benchmark/harness/run_llb.py b/python/llm/dev/benchmark/harness/run_llb.py
index 56be54c5..c425d8df 100644
--- a/python/llm/dev/benchmark/harness/run_llb.py
+++ b/python/llm/dev/benchmark/harness/run_llb.py
@@ -86,6 +86,8 @@ def main():
 
     success = []
     fail = []
+    model_name = os.path.basename(os.path.realpath(args.pretrained))
+    output_path = args.output_path if args.output_path else "results"
     for prec in args.precision:
         prec_arg = parse_precision(prec, args.model)
         model_args = f"pretrained={args.pretrained},{prec_arg}"
@@ -94,6 +96,8 @@ def main():
         for task in args.tasks:
             task_names=task_map.get(task, task).split(',')
             num_fewshot = task_to_n_few_shots.get(task, args.num_fewshot)
+            log_dir = f"{output_path}/{model_name}/{args.device}/{prec}/{task}"
+            os.makedirs(log_dir, exist_ok=True)
             try:
                 results = evaluator.simple_evaluate(
                     model=args.model,
@@ -109,7 +113,7 @@ def main():
                     decontamination_ngrams_path=args.decontamination_ngrams_path,
                     check_integrity=args.check_integrity,
                     write_out=args.write_out,
-                    output_base_path=args.output_base_path,
+                    output_base_path=log_dir
                 )
                 if len(results['results']) > 1:
                     average = {}
@@ -117,18 +121,15 @@ def main():
                         for metric, value in subtask.items():
                             average[metric] = average.get(metric, []) + [value]
                     for k, v in average.items():
-                        average[k] = sum(average[k]) / len(average[k]) if not k.endswith("_stderr") else 0
-                    results['results'][f"avg_{task}"] = average
-                    results['versions'][f"avg_{task}"] = 1
+                        average[k] = sum(v) / len(v) if not k.endswith("_stderr") else 0
+                    results['results'][task] = average
+                    results['versions'][task] = 1
 
                 dumped = json.dumps(results, indent=2)
                 print(dumped)
 
                 if args.output_path:
-                    dirname = os.path.dirname(args.output_path)
-                    if dirname:
-                        os.makedirs(dirname, exist_ok=True)
-                    with open(args.output_path, "w") as f:
+                    with open(f"{log_dir}/result.json", "w") as f:
                         f.write(dumped)
                 success.append(results)
             except Exception as e:
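
For reference, a minimal sketch (not part of the patch) of the per-run log directory this change produces. The model, device, precision, and task values below are illustrative placeholders; what the patch fixes is that output_path now names a directory (falling back to "results" when --output_path is unset) and each run writes its own result.json under <output_path>/<model>/<device>/<precision>/<task>.

    import os

    # Illustrative placeholders, not values taken from the PR.
    output_path = "results"               # default when --output_path is not given
    model_name = "Llama-2-7b-chat-hf"     # basename of the --pretrained path
    device, prec, task = "xpu", "sym_int4", "hellaswag"

    # Mirrors the log_dir construction added in run_llb.py.
    log_dir = f"{output_path}/{model_name}/{device}/{prec}/{task}"
    os.makedirs(log_dir, exist_ok=True)

    # Each run now dumps its results to <log_dir>/result.json, e.g.
    # results/Llama-2-7b-chat-hf/xpu/sym_int4/hellaswag/result.json
    print(os.path.join(log_dir, "result.json"))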