diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml
index be433550..ef8b1dc1 100644
--- a/.github/workflows/llm-harness-evaluation.yml
+++ b/.github/workflows/llm-harness-evaluation.yml
@@ -105,7 +105,7 @@ jobs:
 
       - name: Compare with golden accuracy
         shell: bash
-        
+        if: ${{github.event_name == 'schedule'}}
         working-directory: ${{ github.workspace }}/python/llm
         run: |
           python test/benchmark/harness_nightly/accuracy_regression.py dev/benchmark/harness/results/${{ matrix.model_name }}/${{ matrix.device }}/${{ matrix.precision }}/${{ matrix.task }}/result.json test/benchmark/harness_nightly/golden_results.json
diff --git a/python/llm/test/benchmark/harness_nightly/accuracy_regression.py b/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
index 4c6cc0d1..fc1da7b7 100644
--- a/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
+++ b/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
@@ -42,7 +42,7 @@ def main(res_path, golden_path):
         task_results = results[task]
         task_golden = golden_results[task]
         for m in task_results.keys():
-            if m in task_golden and task_results[m] != task_golden[m]:
+            if m in task_golden and abs(task_results[m] - task_golden[m]) > 0.001:
                 if not m.endswith("_stderr"):
                     identical = False
                     logger.error(f"Different on metric '{m}' [golden acc/ current acc]: [{task_golden[m]}/{task_results[m]}]")
diff --git a/python/llm/test/benchmark/harness_nightly/golden_results.json b/python/llm/test/benchmark/harness_nightly/golden_results.json
index 2d51ca44..0d164cb2 100644
--- a/python/llm/test/benchmark/harness_nightly/golden_results.json
+++ b/python/llm/test/benchmark/harness_nightly/golden_results.json
@@ -26,12 +26,27 @@
     }},
     "Mistral-7B-v0.1": {"xpu": {
         "mixed_fp4": {
-            "truthfulqa_mc": {"mc1": 0.27539779681762544,"mc1_stderr": 0.01563813566777552,"mc2": 0.41062244273774384,"mc2_stderr": 0.014067078150027909},
+            "truthfulqa_mc": {
+                "mc1": 0.27539779681762544,
+                "mc1_stderr": 0.01563813566777552,
+                "mc2": 0.41062756399348693,
+                "mc2_stderr": 0.014067612078490615
+            },
             "arc_challenge": {"acc": 0.5674061433447098,"acc_stderr": 0.014478005694182528,"acc_norm": 0.5989761092150171,"acc_norm_stderr": 0.014322255790719867}
         },
         "fp8": {
-            "truthfulqa_mc": {"mc1": 0.2778457772337821,"mc1_stderr": 0.015680929364024643,"mc2": 0.4212635093545362,"mc2_stderr": 0.01414660694632397},
-            "arc_challenge": {"acc": 0.5639931740614335,"acc_stderr": 0.014491225699230916,"acc_norm": 0.5998293515358362,"acc_norm_stderr": 0.014317197787809174}
+            "truthfulqa_mc": {
+                "mc1": 0.2778457772337821,
+                "mc1_stderr": 0.015680929364024643,
+                "mc2": 0.42125519016651203,
+                "mc2_stderr": 0.014145367212406432
+            },
+            "arc_challenge": {
+                "acc": 0.5639931740614335,
+                "acc_stderr": 0.014491225699230916,
+                "acc_norm": 0.5989761092150171,
+                "acc_norm_stderr": 0.014322255790719867
+            }
         }
     }}
 }