From cb228c70eabea4c29d6bca9188995b259eecf570 Mon Sep 17 00:00:00 2001
From: "Chen, Zhentao"
Date: Fri, 1 Dec 2023 14:16:35 +0800
Subject: [PATCH] Add harness nightly (#9552)

* modify output_path as a directory
* schedule nightly at 21 on Friday
* add tasks and models for nightly
* add accuracy regression
* comment out if to test
* mixed fp4
* for test
* add missing delimiter
* remove comma
* fixed golden results
* add mixed 4 golden result
* add more options
* add mistral results
* get golden result of stable lm
* move nightly scripts and results to test folder
* add license
* add fp8 stable lm golden
* run on all available devices
* trigger only when ready for review
* fix new line
* update golden
* add mistral
---
 .github/workflows/llm-harness-evaluation.yml | 25 +++++++--
 python/llm/dev/benchmark/harness/run_llb.py | 2 +-
 .../harness_nightly/accuracy_regression.py | 56 +++++++++++++++++++
 .../harness_nightly/golden_results.json | 37 ++++++++++++
 4 files changed, 113 insertions(+), 7 deletions(-)
 create mode 100644 python/llm/test/benchmark/harness_nightly/accuracy_regression.py
 create mode 100644 python/llm/test/benchmark/harness_nightly/golden_results.json

diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml
index 2fcc0e73..be433550 100644
--- a/.github/workflows/llm-harness-evaluation.yml
+++ b/.github/workflows/llm-harness-evaluation.yml
@@ -7,9 +7,10 @@ concurrency:
 
 # Controls when the action will run.
 on:
-  # schedule:
-  #   - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China
+  schedule:
+    - cron: "00 13 * * 5" # GMT time, 13:00 GMT == 21:00 China
   pull_request:
+    types: ready_for_review
     branches: [main]
     paths:
       - ".github/workflows/llm-harness-evaluation.yml"
@@ -32,9 +33,10 @@ jobs:
         #   task: "arc"
         #   precision: "sym_int4" #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8
         python-version: ["3.9"]
-        model_name: [stablelm-3b-4e1t]
-        task: [winogrande, drop, gsm8k] # truthfulqa, arc, hellaswag, mmlu, winogrande, drop, gsm8k
-        precision: [sym_int4] #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8
+        model_name: [stablelm-3b-4e1t,Mistral-7B-v0.1]
+        task: [truthfulqa, arc]
+        precision: [mixed_fp4, fp8]
+        device: [xpu]
     runs-on: [self-hosted, llm, accuracy]
 
     env:
@@ -98,5 +100,16 @@ jobs:
         export HF_DATASETS=$HARNESS_HF_HOME/datasets
         export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
         source /opt/intel/oneapi/setvars.sh
-        python run_llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --batch_size 1 --no_cache
+        python run_llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device ${{ matrix.device }} --tasks ${{ matrix.task }} --batch_size 1 --no_cache --output_path results
+
+
+      - name: Compare with golden accuracy
+        shell: bash
+
+        working-directory: ${{ github.workspace }}/python/llm
+        run: |
+          python test/benchmark/harness_nightly/accuracy_regression.py dev/benchmark/harness/results/${{ matrix.model_name }}/${{ matrix.device }}/${{ matrix.precision }}/${{ matrix.task }}/result.json test/benchmark/harness_nightly/golden_results.json
+
+
+
 
diff --git a/python/llm/dev/benchmark/harness/run_llb.py b/python/llm/dev/benchmark/harness/run_llb.py
index c425d8df..3e8bd03a 100644
--- a/python/llm/dev/benchmark/harness/run_llb.py
+++ b/python/llm/dev/benchmark/harness/run_llb.py
@@ -64,7 +64,7 @@ def parse_args():
 
 def main():
     args = parse_args()
-    
+
     assert not args.provide_description  # not implemented
 
     if args.limit:
diff --git a/python/llm/test/benchmark/harness_nightly/accuracy_regression.py b/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
new file mode 100644
index 00000000..4c6cc0d1
--- /dev/null
+++ b/python/llm/test/benchmark/harness_nightly/accuracy_regression.py
@@ -0,0 +1,56 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import json
+import sys
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def main(res_path, golden_path):
+    print(res_path, golden_path)
+    with open(res_path, "r") as f:
+        results = json.load(f)['results']
+    print(results)
+
+    model_name, device, precision, task = res_path.split('/')[-5:-1]
+
+    with open(golden_path, "r") as f:
+        golden_results = json.load(f)[model_name][device][precision]
+    print(golden_results)
+
+    identical = True
+    for task in results.keys():
+
+        if task not in golden_results:
+            identical = False
+            logger.error(f"Task {task} should be updated to golden results.")
+            continue
+        task_results = results[task]
+        task_golden = golden_results[task]
+        for m in task_results.keys():
+            if m in task_golden and task_results[m] != task_golden[m]:
+                if not m.endswith("_stderr"):
+                    identical = False
+                    logger.error(f"Different on metric '{m}' [golden acc/ current acc]: [{task_golden[m]}/{task_results[m]}]")
+                else:
+                    logger.warning(f"Diff on {m} [golden acc/ current acc]: [{task_golden[m]}/{task_results[m]}]")
+    if identical:
+        logger.info("Accuracy values are identical to golden results.")
+    else:
+        raise RuntimeError("Accuracy has changed, please check if any accuracy issue or update golden accuracy value.")
+
+main(*sys.argv[1:3])
\ No newline at end of file
diff --git a/python/llm/test/benchmark/harness_nightly/golden_results.json b/python/llm/test/benchmark/harness_nightly/golden_results.json
new file mode 100644
index 00000000..2d51ca44
--- /dev/null
+++ b/python/llm/test/benchmark/harness_nightly/golden_results.json
@@ -0,0 +1,37 @@
+{
+    "stablelm-3b-4e1t": {"xpu": {
+        "mixed_fp4": {
+            "truthfulqa_mc": {"mc1": 0.24357405140758873,"mc1_stderr": 0.015026354824910782,"mc2": 0.37399115063281224,"mc2_stderr": 0.013684003173581748},
+            "arc_challenge": {
+                "acc": 0.40102389078498296,
+                "acc_stderr": 0.014322255790719869,
+                "acc_norm": 0.44283276450511944,
+                "acc_norm_stderr": 0.014515573873348897
+            }
+        },
+        "fp8": {
+            "truthfulqa_mc": {
+                "mc1": 0.24479804161566707,
+                "mc1_stderr": 0.01505186948671501,
+                "mc2": 0.3747170112957169,
+                "mc2_stderr": 0.013516983188729865
+            },
+            "arc_challenge": {
+                "acc": 0.41552901023890787,
+                "acc_stderr": 0.014401366641216377,
+                "acc_norm": 0.46245733788395904,
+                "acc_norm_stderr": 0.014570144495075581
+            }
+        }
+    }},
+"Mistral-7B-v0.1": {"xpu": {
+    "mixed_fp4": {
+        "truthfulqa_mc": {"mc1": 0.27539779681762544,"mc1_stderr": 0.01563813566777552,"mc2": 0.41062244273774384,"mc2_stderr": 0.014067078150027909},
+        "arc_challenge": {"acc": 0.5674061433447098,"acc_stderr": 0.014478005694182528,"acc_norm": 0.5989761092150171,"acc_norm_stderr": 0.014322255790719867}
+    },
+    "fp8": {
+        "truthfulqa_mc": {"mc1": 0.2778457772337821,"mc1_stderr": 0.015680929364024643,"mc2": 0.4212635093545362,"mc2_stderr": 0.01414660694632397},
+        "arc_challenge": {"acc": 0.5639931740614335,"acc_stderr": 0.014491225699230916,"acc_norm": 0.5998293515358362,"acc_norm_stderr": 0.014317197787809174}
+    }
+}}
+}