From 86a69e289cde743a1cd479b1a3b6b68724b86d26 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 22 Dec 2023 15:09:22 +0800 Subject: [PATCH] fix harness runner label of manual trigger (#9754) * fix runner * update golden --- .github/workflows/llm-harness-evaluation.yml | 2 +- .../harness_nightly/golden_results.json | 31 +++++++++++-------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index c0b49b45..a991c3b4 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -78,7 +78,7 @@ jobs: MANUAL_MATRIX_MODEL_NAME: ${{format('[ {0} ]', inputs.model_name)}} MANUAL_MATRIX_TASK: ${{format('[ {0} ]', inputs.task)}} MANUAL_MATRIX_PRECISION: ${{format('[ {0} ]', inputs.precision)}} - MANUAL_LABELS: ${{format('["self-hosted", "llm", {0}]', inputs.precision)}} + MANUAL_LABELS: ${{format('["self-hosted", "llm", {0}]', inputs.runs-on)}} run: | echo "model_name=$MANUAL_MATRIX_MODEL_NAME" >> $GITHUB_ENV echo "precision=$MANUAL_MATRIX_TASK" >> $GITHUB_ENV diff --git a/python/llm/test/benchmark/harness_nightly/golden_results.json b/python/llm/test/benchmark/harness_nightly/golden_results.json index 0d164cb2..15300011 100644 --- a/python/llm/test/benchmark/harness_nightly/golden_results.json +++ b/python/llm/test/benchmark/harness_nightly/golden_results.json @@ -27,25 +27,30 @@ "Mistral-7B-v0.1": {"xpu": { "mixed_fp4": { "truthfulqa_mc": { - "mc1": 0.27539779681762544, - "mc1_stderr": 0.01563813566777552, - "mc2": 0.41062756399348693, - "mc2_stderr": 0.014067612078490615 + "mc1": 0.2741738066095471, + "mc1_stderr": 0.015616518497219374, + "mc2": 0.4090424865843113, + "mc2_stderr": 0.014068835265546585 }, - "arc_challenge": {"acc": 0.5674061433447098,"acc_stderr": 0.014478005694182528,"acc_norm": 0.5989761092150171,"acc_norm_stderr": 0.014322255790719867} + "arc_challenge": { + "acc": 0.5674061433447098, + "acc_stderr": 0.014478005694182528, + "acc_norm": 0.6023890784982935, + "acc_norm_stderr": 0.01430175222327954 + } }, "fp8": { "truthfulqa_mc": { - "mc1": 0.2778457772337821, - "mc1_stderr": 0.015680929364024643, - "mc2": 0.42125519016651203, - "mc2_stderr": 0.014145367212406432 + "mc1": 0.2802937576499388, + "mc1_stderr": 0.015723139524608763, + "mc2": 0.4253576013662111, + "mc2_stderr": 0.014199215617062957 }, "arc_challenge": { - "acc": 0.5639931740614335, - "acc_stderr": 0.014491225699230916, - "acc_norm": 0.5989761092150171, - "acc_norm_stderr": 0.014322255790719867 + "acc": 0.5622866894197952, + "acc_stderr": 0.014497573881108283, + "acc_norm": 0.6032423208191127, + "acc_norm_stderr": 0.014296513020180646 } } }}