Fix harness workflow (#9704)

* report an error when a metric differs from the golden value by more than 0.001
* fix env setup (source oneAPI setvars.sh from $HOME)
* fix typo
* fix typo
2023-12-18 15:42:10 +08:00 · 2023-12-18 15:42:10 +08:00 · b3647507c0
commit b3647507c0
parent 12df70953e
2 changed files with 7 additions and 7 deletions
--- a/.github/workflows/llm-harness-evaluation.yml
+++ b/.github/workflows/llm-harness-evaluation.yml
@@ -17,20 +17,20 @@ on:
  workflow_dispatch:
    inputs:
      model_name:
-        description: 'A list of models added to the job matrix.'
+        description: 'Model names, separated by comma and must be quoted.'
        required: true
        type: string
      precision:
-        description: 'A list of precisions added to the job matrix'
+        description: 'Precisions, separated by comma and must be quoted.'
        required: true
        type: string
      task:
-        description: 'A list of precisions added to the job matrix'
+        description: 'Tasks, separated by comma and must be quoted.'
        required: true
        type: string
      runs-on:
-        description: 'Labels to filter the runners.'
+        description: 'Labels to filter the runners, separated by comma and must be quoted.'
-        default: 'accuracy'
+        default: "accuracy"
        required: false
        type: string
@@ -166,7 +166,7 @@ jobs:
          export HF_HOME=${HARNESS_HF_HOME}
          export HF_DATASETS=$HARNESS_HF_HOME/datasets
          export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
-          source /opt/intel/oneapi/setvars.sh
+          source $HOME/intel/oneapi/setvars.sh
          python run_llb.py \
          --model bigdl-llm \
          --pretrained ${MODEL_PATH} \
--- a/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
+++ b/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
@@ -42,7 +42,7 @@ def main(res_path, golden_path):
        task_results = results[task]
        task_golden = golden_results[task]
        for m in task_results.keys():
-            if m in task_golden and abs(task_results[m] - task_golden[m]) < 0.001:
+            if m in task_golden and abs(task_results[m] - task_golden[m]) > 0.001:
                if not m.endswith("_stderr"):
                    identical = False
                    logger.error(f"Different on metric '{m}' [golden acc/ current acc]: [{task_golden[m]}/{task_results[m]}]")