diff --git a/python/llm/test/benchmark/harness_nightly/accuracy_regression.py b/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
deleted file mode 100644
index ed793956..00000000
--- a/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import json
-import sys
-import logging
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-def main(res_path, golden_path):
-    print(res_path, golden_path)
-    with open(res_path, "r") as f:
-        results = json.load(f)['results']
-    print(results)
-
-    model_name, device, precision, task = res_path.split('/')[-5:-1]
-
-    with open(golden_path, "r") as f:
-        golden_results = json.load(f)[model_name][device][precision]
-    print(golden_results)
-
-    identical = True
-    for task in results.keys():
-
-        if task not in golden_results:
-            identical = False
-            logger.error(f"Task {task} should be updated to golden results.")
-            continue
-        task_results = results[task]
-        task_golden = golden_results[task]
-        for m in task_results.keys():
-            if m in task_golden and abs(task_results[m] - task_golden[m]) > 0.001:
-                if not m.endswith("_stderr"):
-                    identical = False
-                    logger.error(f"Different on metric '{m}' [golden acc/ current acc]: [{task_golden[m]}/{task_results[m]}]")
-                else:
-                    logger.warning(f"Diff on {m} [golden acc/ current acc]: [{task_golden[m]}/{task_results[m]}]")
-    if identical:
-        logger.info("Accuracy values are identical to golden results.")
-    else:
-        raise RuntimeError("Accuracy has changed, please check if any accuracy issue or update golden accuracy value.")
-
-main(*sys.argv[1:3])
\ No newline at end of file
diff --git a/python/llm/test/benchmark/harness_nightly/golden_results.json b/python/llm/test/benchmark/harness_nightly/golden_results.json
deleted file mode 100644
index 15300011..00000000
--- a/python/llm/test/benchmark/harness_nightly/golden_results.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-    "stablelm-3b-4e1t": {"xpu": {
-        "mixed_fp4": {
-            "truthfulqa_mc": {"mc1": 0.24357405140758873,"mc1_stderr": 0.015026354824910782,"mc2": 0.37399115063281224,"mc2_stderr": 0.013684003173581748},
-            "arc_challenge": {
-                "acc": 0.40102389078498296,
-                "acc_stderr": 0.014322255790719869,
-                "acc_norm": 0.44283276450511944,
-                "acc_norm_stderr": 0.014515573873348897
-            }
-        },
-        "fp8": {
-            "truthfulqa_mc": {
-                "mc1": 0.24479804161566707,
-                "mc1_stderr": 0.01505186948671501,
-                "mc2": 0.3747170112957169,
-                "mc2_stderr": 0.013516983188729865
-            },
-            "arc_challenge": {
-                "acc": 0.41552901023890787,
-                "acc_stderr": 0.014401366641216377,
-                "acc_norm": 0.46245733788395904,
-                "acc_norm_stderr": 0.014570144495075581
-            }
-        }
-    }},
-"Mistral-7B-v0.1": {"xpu": {
-        "mixed_fp4": {
-            "truthfulqa_mc": {
-                "mc1": 0.2741738066095471,
-                "mc1_stderr": 0.015616518497219374,
-                "mc2": 0.4090424865843113,
-                "mc2_stderr": 0.014068835265546585
-            },
-            "arc_challenge": {
-                "acc": 0.5674061433447098,
-                "acc_stderr": 0.014478005694182528,
-                "acc_norm": 0.6023890784982935,
-                "acc_norm_stderr": 0.01430175222327954
-            }
-        },
-        "fp8": {
-            "truthfulqa_mc": {
-                "mc1": 0.2802937576499388,
-                "mc1_stderr": 0.015723139524608763,
-                "mc2": 0.4253576013662111,
-                "mc2_stderr": 0.014199215617062957
-            },
-            "arc_challenge": {
-                "acc": 0.5622866894197952,
-                "acc_stderr": 0.014497573881108283,
-                "acc_norm": 0.6032423208191127,
-                "acc_norm_stderr": 0.014296513020180646
-            }
-        }
-    }}
-}