From b3647507c02c64227895cedb63a40bccf2a41ed6 Mon Sep 17 00:00:00 2001
From: "Chen, Zhentao" <zhentao.chen@intel.com>
Date: Mon, 18 Dec 2023 15:42:10 +0800
Subject: [PATCH] Fix harness workflow (#9704)

* error when larger than 0.001

* fix env setup

* fix typo

* fix typo
---
 .github/workflows/llm-harness-evaluation.yml         | 12 ++++++------
 .../benchmark/harness_nightly/accuracy_regression.py |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml
index b9fb6123..c0b49b45 100644
--- a/.github/workflows/llm-harness-evaluation.yml
+++ b/.github/workflows/llm-harness-evaluation.yml
@@ -17,20 +17,20 @@ on:
   workflow_dispatch:
     inputs:
       model_name:
-        description: 'A list of models added to the job matrix.'
+        description: 'Model names, separated by comma and must be quoted.'
         required: true
         type: string
       precision:
-        description: 'A list of precisions added to the job matrix'
+        description: 'Precisions, separated by comma and must be quoted.'
         required: true
         type: string
       task:
-        description: 'A list of precisions added to the job matrix'
+        description: 'Tasks, separated by comma and must be quoted.'
         required: true
         type: string
       runs-on:
-        description: 'Labels to filter the runners.'
-        default: 'accuracy'
+        description: 'Labels to filter the runners, separated by comma and must be quoted.'
+        default: "accuracy"
         required: false
         type: string
 
@@ -166,7 +166,7 @@ jobs:
           export HF_HOME=${HARNESS_HF_HOME}
           export HF_DATASETS=$HARNESS_HF_HOME/datasets
           export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
-          source /opt/intel/oneapi/setvars.sh
+          source $HOME/intel/oneapi/setvars.sh
           python run_llb.py \
           --model bigdl-llm \
           --pretrained ${MODEL_PATH} \
diff --git a/python/llm/test/benchmark/harness_nightly/accuracy_regression.py b/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
index fc1da7b7..ed793956 100644
--- a/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
+++ b/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
@@ -42,7 +42,7 @@ def main(res_path, golden_path):
         task_results = results[task]
         task_golden = golden_results[task]
         for m in task_results.keys():
-            if m in task_golden and abs(task_results[m] - task_golden[m]) < 0.001:
+            if m in task_golden and abs(task_results[m] - task_golden[m]) > 0.001:
                 if not m.endswith("_stderr"):
                     identical = False
                     logger.error(f"Different on metric '{m}' [golden acc/ current acc]: [{task_golden[m]}/{task_results[m]}]")