Add harness nightly (#9552)
* modify output_path as a directory * schedule nightly at 21 on Friday * add tasks and models for nightly * add accuracy regression * comment out if to test * mixed fp4 * for test * add missing delimiter * remove comma * fixed golden results * add mixed 4 golden result * add more options * add mistral results * get golden result of stable lm * move nightly scripts and results to test folder * add license * add fp8 stable lm golden * run on all available devices * trigger only when ready for review * fix new line * update golden * add mistral
This commit is contained in:
parent
4d7d5d4c59
commit
cb228c70ea
4 changed files with 113 additions and 7 deletions
25
.github/workflows/llm-harness-evaluation.yml
vendored
25
.github/workflows/llm-harness-evaluation.yml
vendored
|
|
@ -7,9 +7,10 @@ concurrency:
|
||||||
|
|
||||||
# Controls when the action will run.
|
# Controls when the action will run.
|
||||||
on:
|
on:
|
||||||
# schedule:
|
schedule:
|
||||||
# - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China
|
- cron: "00 13 * * 5" # GMT time, 13:00 GMT == 21:00 China
|
||||||
pull_request:
|
pull_request:
|
||||||
|
types: ready_for_review
|
||||||
branches: [main]
|
branches: [main]
|
||||||
paths:
|
paths:
|
||||||
- ".github/workflows/llm-harness-evaluation.yml"
|
- ".github/workflows/llm-harness-evaluation.yml"
|
||||||
|
|
@ -32,9 +33,10 @@ jobs:
|
||||||
# task: "arc"
|
# task: "arc"
|
||||||
# precision: "sym_int4" #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8
|
# precision: "sym_int4" #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8
|
||||||
python-version: ["3.9"]
|
python-version: ["3.9"]
|
||||||
model_name: [stablelm-3b-4e1t]
|
model_name: [stablelm-3b-4e1t,Mistral-7B-v0.1]
|
||||||
task: [winogrande, drop, gsm8k] # truthfulqa, arc, hellaswag, mmlu, winogrande, drop, gsm8k
|
task: [truthfulqa, arc]
|
||||||
precision: [sym_int4] #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8
|
precision: [mixed_fp4, fp8]
|
||||||
|
device: [xpu]
|
||||||
|
|
||||||
runs-on: [self-hosted, llm, accuracy]
|
runs-on: [self-hosted, llm, accuracy]
|
||||||
env:
|
env:
|
||||||
|
|
@ -98,5 +100,16 @@ jobs:
|
||||||
export HF_DATASETS=$HARNESS_HF_HOME/datasets
|
export HF_DATASETS=$HARNESS_HF_HOME/datasets
|
||||||
export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
|
export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
|
||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
python run_llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device xpu --tasks ${{ matrix.task }} --batch_size 1 --no_cache
|
python run_llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device ${{ matrix.device }} --tasks ${{ matrix.task }} --batch_size 1 --no_cache --output_path results
|
||||||
|
|
||||||
|
|
||||||
|
- name: Compare with golden accuracy
|
||||||
|
shell: bash
|
||||||
|
|
||||||
|
working-directory: ${{ github.workspace }}/python/llm
|
||||||
|
run: |
|
||||||
|
python test/benchmark/harness_nightly/accuracy_regression.py dev/benchmark/harness/results/${{ matrix.model_name }}/${{ matrix.device }}/${{ matrix.precision }}/${{ matrix.task }}/result.json test/benchmark/harness_nightly/golden_results.json
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,56 @@
|
||||||
|
#
|
||||||
|
# Copyright 2016 The BigDL Authors.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
import json
import sys
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main(res_path, golden_path):
    """Compare a harness accuracy result file against recorded golden values.

    Args:
        res_path: Path to a ``result.json`` produced by a harness run. The
            path is expected to end with
            ``<model_name>/<device>/<precision>/<task>/result.json``; the
            model/device/precision components are parsed out of the path to
            locate the matching entry in the golden file.
        golden_path: Path to the golden-results JSON file, keyed by
            model name -> device -> precision -> task -> metric.

    Raises:
        RuntimeError: If any non-``_stderr`` metric differs from its golden
            value, or if a task present in the results is missing from the
            golden file.
    """
    print(res_path, golden_path)
    with open(res_path, "r") as f:
        results = json.load(f)['results']
    print(results)

    # The trailing path components encode where to look in the golden file.
    # NOTE: the task component of the path is unused (hence ``_task``);
    # tasks are iterated from the result file itself, which may hold several.
    model_name, device, precision, _task = res_path.split('/')[-5:-1]

    with open(golden_path, "r") as f:
        golden_results = json.load(f)[model_name][device][precision]
    print(golden_results)

    identical = True
    for task in results:
        if task not in golden_results:
            identical = False
            logger.error(f"Task {task} should be updated to golden results.")
            continue
        task_results = results[task]
        task_golden = golden_results[task]
        for m in task_results:
            # Exact (non-fuzzy) equality is intentional: golden values are
            # copied from previous runs, so any drift should be surfaced.
            # Metrics absent from the golden entry are silently skipped.
            if m in task_golden and task_results[m] != task_golden[m]:
                if not m.endswith("_stderr"):
                    identical = False
                    logger.error(f"Different on metric '{m}' [golden acc/ current acc]: [{task_golden[m]}/{task_results[m]}]")
                else:
                    # *_stderr metrics are informational only; never fail on them.
                    logger.warning(f"Diff on {m} [golden acc/ current acc]: [{task_golden[m]}/{task_results[m]}]")

    if identical:
        logger.info("Accuracy values are identical to golden results.")
    else:
        raise RuntimeError("Accuracy has changed, please check if any accuracy issue or update golden accuracy value.")


if __name__ == "__main__":
    # Guard the CLI entry point so importing this module has no side effects
    # (the original called main(*sys.argv[1:3]) unconditionally at import time).
    main(*sys.argv[1:3])
|
||||||
|
|
@ -0,0 +1,37 @@
|
||||||
|
{
    "stablelm-3b-4e1t": {"xpu": {
        "mixed_fp4": {
            "truthfulqa_mc": {"mc1": 0.24357405140758873, "mc1_stderr": 0.015026354824910782, "mc2": 0.37399115063281224, "mc2_stderr": 0.013684003173581748},
            "arc_challenge": {
                "acc": 0.40102389078498296,
                "acc_stderr": 0.014322255790719869,
                "acc_norm": 0.44283276450511944,
                "acc_norm_stderr": 0.014515573873348897
            }
        },
        "fp8": {
            "truthfulqa_mc": {
                "mc1": 0.24479804161566707,
                "mc1_stderr": 0.01505186948671501,
                "mc2": 0.3747170112957169,
                "mc2_stderr": 0.013516983188729865
            },
            "arc_challenge": {
                "acc": 0.41552901023890787,
                "acc_stderr": 0.014401366641216377,
                "acc_norm": 0.46245733788395904,
                "acc_norm_stderr": 0.014570144495075581
            }
        }
    }},
    "Mistral-7B-v0.1": {"xpu": {
        "mixed_fp4": {
            "truthfulqa_mc": {"mc1": 0.27539779681762544, "mc1_stderr": 0.01563813566777552, "mc2": 0.41062244273774384, "mc2_stderr": 0.014067078150027909},
            "arc_challenge": {"acc": 0.5674061433447098, "acc_stderr": 0.014478005694182528, "acc_norm": 0.5989761092150171, "acc_norm_stderr": 0.014322255790719867}
        },
        "fp8": {
            "truthfulqa_mc": {"mc1": 0.2778457772337821, "mc1_stderr": 0.015680929364024643, "mc2": 0.4212635093545362, "mc2_stderr": 0.01414660694632397},
            "arc_challenge": {"acc": 0.5639931740614335, "acc_stderr": 0.014491225699230916, "acc_norm": 0.5998293515358362, "acc_norm_stderr": 0.014317197787809174}
        }
    }}
}
|
||||||
Loading…
Reference in a new issue