Fixed dumped logs in harness (#9549)
* install transformers==4.34.0
* modify output_path as a directory
* add device and task to output dir parents
This commit is contained in:
parent
d85a430a8c
commit
c8e0c2ed48
2 changed files with 10 additions and 9 deletions
2
.github/workflows/llm-harness-evaluation.yml
vendored
2
.github/workflows/llm-harness-evaluation.yml
vendored
|
|
@@ -87,7 +87,7 @@ jobs:
|
|||
- name: Upgrade packages
|
||||
shell: bash
|
||||
run: |
|
||||
pip install --upgrade transformers
|
||||
pip install --upgrade transformers==4.34.0
|
||||
|
||||
- name: Run harness
|
||||
shell: bash
|
||||
|
|
|
|||
|
|
@@ -86,6 +86,8 @@ def main():
|
|||
|
||||
success = []
|
||||
fail = []
|
||||
model_name = os.path.basename(os.path.realpath(args.pretrained))
|
||||
output_path = args.output_path if args.output_path else "results"
|
||||
for prec in args.precision:
|
||||
prec_arg = parse_precision(prec, args.model)
|
||||
model_args = f"pretrained={args.pretrained},{prec_arg}"
|
||||
|
|
@@ -94,6 +96,8 @@ def main():
|
|||
for task in args.tasks:
|
||||
task_names=task_map.get(task, task).split(',')
|
||||
num_fewshot = task_to_n_few_shots.get(task, args.num_fewshot)
|
||||
log_dir = f"{output_path}/{model_name}/{args.device}/{prec}/{task}"
|
||||
os.makedirs(log_dir, exist_ok=True)
|
||||
try:
|
||||
results = evaluator.simple_evaluate(
|
||||
model=args.model,
|
||||
|
|
@@ -109,7 +113,7 @@ def main():
|
|||
decontamination_ngrams_path=args.decontamination_ngrams_path,
|
||||
check_integrity=args.check_integrity,
|
||||
write_out=args.write_out,
|
||||
output_base_path=args.output_base_path,
|
||||
output_base_path=log_dir
|
||||
)
|
||||
if len(results['results']) > 1:
|
||||
average = {}
|
||||
|
|
@@ -117,18 +121,15 @@ def main():
|
|||
for metric, value in subtask.items():
|
||||
average[metric] = average.get(metric, []) + [value]
|
||||
for k, v in average.items():
|
||||
average[k] = sum(average[k]) / len(average[k]) if not k.endswith("_stderr") else 0
|
||||
results['results'][f"avg_{task}"] = average
|
||||
results['versions'][f"avg_{task}"] = 1
|
||||
average[k] = sum(v) / len(v) if not k.endswith("_stderr") else 0
|
||||
results['results'][task] = average
|
||||
results['versions'][task] = 1
|
||||
|
||||
dumped = json.dumps(results, indent=2)
|
||||
print(dumped)
|
||||
|
||||
if args.output_path:
|
||||
dirname = os.path.dirname(args.output_path)
|
||||
if dirname:
|
||||
os.makedirs(dirname, exist_ok=True)
|
||||
with open(args.output_path, "w") as f:
|
||||
with open(f"{log_dir}/result.json", "w") as f:
|
||||
f.write(dumped)
|
||||
success.append(results)
|
||||
except Exception as e:
|
||||
|
|
|
|||
Loading…
Reference in a new issue