diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index 5727f172..b5b08d82 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -171,7 +171,13 @@ jobs: export HF_DATASETS=$HARNESS_HF_HOME/datasets export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets source /opt/intel/oneapi/setvars.sh - python run_llb.py --model bigdl-llm --pretrained ${MODEL_PATH} --precision ${{ matrix.precision }} --device ${{ matrix.device }} --tasks ${{ matrix.task }} --batch_size 1 --no_cache --output_path results + python run_llb.py \ + --model bigdl-llm \ + --pretrained ${MODEL_PATH} \ + --precision ${{ matrix.precision }} \ + --device ${{ matrix.device }} \ + --tasks ${{ matrix.task }} \ + --batch_size 1 --no_cache --output_path results - name: Compare with golden accuracy @@ -179,8 +185,45 @@ jobs: if: ${{github.event_name == 'schedule'}} working-directory: ${{ github.workspace }}/python/llm run: | - python test/benchmark/harness_nightly/accuracy_regression.py dev/benchmark/harness/results/${{ matrix.model_name }}/${{ matrix.device }}/${{ matrix.precision }}/${{ matrix.task }}/result.json test/benchmark/harness_nightly/golden_results.json - - - + python test/benchmark/harness_nightly/accuracy_regression.py \ + dev/benchmark/harness/results/${{ matrix.model_name }}/${{ matrix.device }}/${{ matrix.precision }}/${{ matrix.task }}/result.json \ + test/benchmark/harness_nightly/golden_results.json + - uses: actions/upload-artifact@v3 + with: + name: harness_results + path: + ${{ github.workspace }}/python/llm/dev/benchmark/harness/results/** + + - name: echo single result + shell: bash + + working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness/results/ + run: | + cat ${{ matrix.model_name }}/${{ matrix.device }}/${{ matrix.precision }}/${{ matrix.task }}/result.json + + llm-harness-summary: + if: ${{ always() }} + needs: llm-harness-evalution + runs-on: 
ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.9 + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install dependencies + shell: bash + run: | + pip install --upgrade pip + pip install jsonlines pytablewriter regex + - name: Download all results + uses: actions/download-artifact@v3 + with: + name: harness_results + path: results + - name: Summarize the results + shell: bash + run: | + ls results + python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_results.py results \ No newline at end of file diff --git a/python/llm/dev/benchmark/harness/make_table_results.py b/python/llm/dev/benchmark/harness/make_table_results.py new file mode 100644 index 00000000..c90e95ae --- /dev/null +++ b/python/llm/dev/benchmark/harness/make_table_results.py @@ -0,0 +1,85 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#
"""
Summarize harness evaluation results as a Markdown table.

Usage:
    python make_table_results.py <results_dir>

``<results_dir>`` is walked recursively.  Every ``*.json`` file found is
attributed to a model / device / precision / task taken from the last four
components of the directory that contains it, i.e. the expected layout is
``<results_dir>/.../<model>/<device>/<precision>/<task>/<file>.json``.
The table is printed to stdout; diagnostics go through ``logging``.
"""

import json
import logging
import os
import sys

from pytablewriter import MarkdownTableWriter, LatexTableWriter

from harness_to_leaderboard import task_to_metric


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

_HEADERS = ["Model", "Precision", "Task", "Metric", "Value"]


def make_table(result_dict):
    """Generate a Markdown table of results.

    Args:
        result_dict: Nested mapping ``{model: {precision: {task: payload}}}``
            where each ``payload`` is a loaded result JSON containing a
            ``"results"`` mapping.

    Returns:
        The Markdown table as a string, one row per (model, precision, task).
        The model and precision cells are only filled in on the first row of
        their group and left empty on the following rows.

    Raises:
        KeyError: If a task has no entry in ``task_to_metric``, if a payload
            has no ``"results"`` key, or if the selected result lacks the
            metric looked up for the task.
    """
    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = list(_HEADERS)
    latex_writer.headers = list(_HEADERS)

    rows = []
    for model, model_results in result_dict.items():
        # Labels are blanked after the first row they appear on, so repeated
        # model / precision names are not printed on every line.
        model_label = model
        for precision, prec_results in model_results.items():
            precision_label = precision
            for task, task_results in prec_results.items():
                results = task_results["results"]
                metric = task_to_metric[task]
                # With several entries the one named after the task is used;
                # with a single entry it is taken whatever its key is.
                if len(results) > 1:
                    result = results[task]
                else:
                    result = list(results.values())[0]

                value = f"{result[metric] * 100:.2f}"
                rows.append([model_label, precision_label, task, metric, value])

                model_label = ""
                precision_label = ""

    md_writer.value_matrix = rows
    latex_writer.value_matrix = rows

    # TODO: make latex table look good
    # print(latex_writer.dumps())

    return md_writer.dumps()


def _collect_results(results_dir):
    """Load every result JSON below *results_dir* into a nested dict.

    Args:
        results_dir: Root directory to walk.

    Returns:
        ``{model: {precision: {task: loaded_json}}}``.  The device path
        component is not part of the key, so a later file for the same
        model / precision / task replaces an earlier one (a warning is logged
        when that happens).
    """
    merged_results = {}
    for dirpath, dirnames, filenames in os.walk(results_dir):
        # Sorting in place fixes the order in which os.walk descends, which
        # makes the row order of the final table reproducible.
        dirnames.sort()

        json_files = sorted(f for f in filenames if f.endswith(".json"))
        if not json_files:
            continue

        # normpath + os.sep instead of a hard-coded '/' keeps this working
        # with trailing separators and on platforms that use '\\'.
        parts = os.path.normpath(dirpath).split(os.sep)
        if len(parts) < 4:
            logger.warning(
                "Skipping %s: expected <model>/<device>/<precision>/<task> "
                "as the last four path components",
                dirpath,
            )
            continue
        model, device, precision, task = parts[-4:]

        for filename in json_files:
            path = os.path.join(dirpath, filename)
            with open(path, "r", encoding="utf-8") as f:
                result_dict = json.load(f)

            tasks = merged_results.setdefault(model, {}).setdefault(precision, {})
            if task in tasks:
                logger.warning(
                    "Result for %s/%s/%s is replaced by %s (device %s)",
                    model, precision, task, path, device,
                )
            tasks[task] = result_dict
    return merged_results


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.  Only the first argument (the results
            directory) is used.

    Returns:
        Process exit code: 0 on success, 1 if the results directory does not
        exist, 2 if no directory was given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print("Usage: python make_table_results.py <results_dir>", file=sys.stderr)
        return 2

    results_dir = args[0]
    if not os.path.isdir(results_dir):
        print(f"Results directory not found: {results_dir}", file=sys.stderr)
        return 1

    print(make_table(_collect_results(results_dir)))
    return 0


if __name__ == "__main__":
    sys.exit(main())