From 8472de90e88693491e713eb7d2370906ecabedde Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 16:33:34 +0800 Subject: [PATCH 01/15] use stable lm to test pr --- .github/workflows/llm-harness-evaluation.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index 5b6b7727..882f522e 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -68,9 +68,9 @@ jobs: - name: set-pr-env if: ${{github.event_name == 'pull_request'}} env: - PR_MATRIX_MODEL_NAME: '["Mistral-7B-v0.1"]' - PR_MATRIX_TASK: '["arc", "truthfulqa", "winogrande"]' - PR_MATRIX_PRECISION: '["fp8"]' + PR_MATRIX_MODEL_NAME: '["stablelm-3b-4e1t"]' + PR_MATRIX_TASK: '["truthfulqa", "winogrande"]' + PR_MATRIX_PRECISION: '["sym_int4"]' PR_LABELS: '["self-hosted", "llm", "temp-arc01"]' run: | @@ -294,7 +294,7 @@ jobs: - name: Download fp16.csv for summary shell: bash run: | - wget https://raw.githubusercontent.com/intel-analytics/BigDL/main/python/llm/test/benchmark/harness/fp16.csv -O ${{ env.NIGHTLY_FOLDER}}/../fp16.csv + wget https://raw.githubusercontent.com/intel-analytics/BigDL/main/python/llm/dev/benchmark/harness/fp16.csv -O ${{ env.NIGHTLY_FOLDER}}/../fp16.csv ls ${{ env.NIGHTLY_FOLDER}}/.. 
- name: Summarize the results for nightly run @@ -304,8 +304,8 @@ jobs: ls /home/arda/harness-action-runners/nightly-accuracy-data/${{ env.OUTPUT_PATH }} pip install pandas==1.5.3 python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }} ${{ env.NIGHTLY_FOLDER}} - python ${{ github.workspace }}/python/llm/test/benchmark/harness/harness_csv_to_html.py -f ${{ env.NIGHTLY_FOLDER}} - python ${{ github.workspace }}/python/llm/test/benchmark/harness/update_html_in_parent_folder.py -f ${{ env.NIGHTLY_FOLDER }} + python ${{ github.workspace }}/python/llm/dev/benchmark/harness/harness_csv_to_html.py -f ${{ env.NIGHTLY_FOLDER}} + python ${{ github.workspace }}/python/llm/dev/benchmark/harness/update_html_in_parent_folder.py -f ${{ env.NIGHTLY_FOLDER }} - name: Summarize the results for pull request if: github.event_name == 'pull_request' @@ -314,4 +314,4 @@ jobs: ls /home/arda/harness-action-runners/pr-accuracy-data/${{ env.OUTPUT_PATH }} pip install pandas==1.5.3 python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }} ${{ env.PR_FOLDER}} - python ${{ github.workspace }}/python/llm/test/benchmark/harness/harness_csv_to_html.py -f ${{ env.PR_FOLDER}} \ No newline at end of file + python ${{ github.workspace }}/python/llm/dev/benchmark/harness/harness_csv_to_html.py -f ${{ env.PR_FOLDER}} \ No newline at end of file From 9c8e349196983db052d10c46a492a9aebbefb6e8 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 16:33:34 +0800 Subject: [PATCH 02/15] remove harness job output --- .github/workflows/llm-harness-evaluation.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index 882f522e..66207bf5 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -110,8 +110,6 
@@ jobs: device: [xpu] runs-on: ${{ fromJson(needs.set-matrix.outputs.runner) }} - outputs: - output_path: ${{ steps.run_harness.outputs.output_path }} env: ANALYTICS_ZOO_ROOT: ${{ github.workspace }} ORIGIN_DIR: /mnt/disk1/models From 5399343adc819bc7818b904017110b4126f3e02d Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 16:33:35 +0800 Subject: [PATCH 03/15] fix harness installation --- .github/workflows/llm-harness-evaluation.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index 66207bf5..be119459 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -140,7 +140,10 @@ jobs: working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness/ shell: bash run: | - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b09 + git clone https://github.com/EleutherAI/lm-evaluation-harness.git + cd lm-evaluation-harness + git checkout b281b09 + pip install -e . 
- name: Download models and datasets shell: bash From e1fcf54a0ce12747b227f26edb7541d73c3b1732 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 16:33:36 +0800 Subject: [PATCH 04/15] reformat --- .github/workflows/llm-harness-evaluation.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index be119459..2992e3a6 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -161,9 +161,9 @@ jobs: run: | pip install --upgrade datasets==2.14.6 if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then - pip install --upgrade transformers==4.36 + pip install --upgrade transformers==4.36 else - pip install --upgrade transformers==4.31 + pip install --upgrade transformers==4.31 fi From 02cb96e7f611276c627f90ce0fd0b780822c3353 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 16:33:37 +0800 Subject: [PATCH 05/15] fix Run Harness job --- .github/workflows/llm-harness-evaluation.yml | 31 +++++--------------- 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index 2992e3a6..fcfa2f3c 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -167,8 +167,7 @@ jobs: fi - - name: Run harness nightly - if: ${{github.event_name == 'schedule'}} + - name: Run harness shell: bash working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness env: @@ -180,6 +179,11 @@ jobs: export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets source /opt/intel/oneapi/setvars.sh + # set --limit if it's pr-triggered to accelerate pr action + if ${{github.event_name == 'pull_request'}}; then + export LIMIT="--limit 4" + fi + python run_llb.py \ --model bigdl-llm \ --pretrained ${MODEL_PATH} \ @@ -187,28 +191,7 @@ jobs: --device ${{ 
matrix.device }} \ --tasks ${{ matrix.task }} \ --batch_size 1 --no_cache --output_path results \ - - - name: Run harness pr - if: ${{github.event_name == 'pull_request'}} - shell: bash - working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness - env: - USE_XETLA: OFF - # SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS: 1 - run: | - export HF_HOME=${HARNESS_HF_HOME} - export HF_DATASETS=$HARNESS_HF_HOME/datasets - export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets - source /opt/intel/oneapi/setvars.sh - - python run_llb.py \ - --model bigdl-llm \ - --pretrained ${MODEL_PATH} \ - --precision ${{ matrix.precision }} \ - --device ${{ matrix.device }} \ - --tasks ${{ matrix.task }} \ - --batch_size 1 --no_cache --output_path results \ - --limit 3 \ + $LIMIT - uses: actions/upload-artifact@v3 with: From bfa98666a64550c4099855779be9adf7ecfcbe2a Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 16:33:38 +0800 Subject: [PATCH 06/15] fall back to make_table.py --- .github/workflows/llm-harness-evaluation.yml | 2 +- python/llm/dev/benchmark/harness/README.md | 4 + .../llm/dev/benchmark/harness/make_table.py | 108 ++++++++++++++++++ 3 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 python/llm/dev/benchmark/harness/make_table.py diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index fcfa2f3c..deb339ae 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -230,7 +230,7 @@ jobs: shell: bash run: | ls results - python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py results + python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table.py results # TODO: change machine to store the results later llm-harness-summary-html: diff --git a/python/llm/dev/benchmark/harness/README.md b/python/llm/dev/benchmark/harness/README.md index a373d66c..5ecc0d71 100644 --- 
a/python/llm/dev/benchmark/harness/README.md +++ b/python/llm/dev/benchmark/harness/README.md @@ -26,3 +26,7 @@ python run_multi_llb.py --model bigdl-llm --pretrained /path/to/model --precisio Taking example above, the script will fork 3 processes, each for one xpu, to execute the tasks. ## Results We follow [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to record our metrics, `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa` and `acc` for `mmlu`. For `mmlu`, there are 57 subtasks which means users may need to average them manually to get final result. +## Summarize the results +```bash +python make_table.py <input_dir> +``` \ No newline at end of file diff --git a/python/llm/dev/benchmark/harness/make_table.py b/python/llm/dev/benchmark/harness/make_table.py new file mode 100644 index 00000000..cdad073c --- /dev/null +++ b/python/llm/dev/benchmark/harness/make_table.py @@ -0,0 +1,108 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +""" +Usage: + python make_table.py +""" + +import logging +from pytablewriter import MarkdownTableWriter, LatexTableWriter +import os +import json +import sys +import csv +import datetime +from harness_to_leaderboard import task_to_metric + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def make_table(result_dict): + """Generate table of results.""" + md_writer = MarkdownTableWriter() + latex_writer = LatexTableWriter() + md_writer.headers = ["Model", "Precision", "Arc", "Hellaswag", "MMLU", "TruthfulQA","Winogrande", "GSM8K"] + latex_writer.headers = ["Model", "Precision", "Arc", "Hellaswag", "MMLU", "TruthfulQA","Winogrande", "GSM8K"] + + tasks = ["arc", "hellaswag", "mmlu", "truthfulqa", "winogrande", "gsm8k"] + values = [] + for model, model_results in result_dict.items(): + for precision, prec_results in model_results.items(): + value = [model, precision] + for task in tasks: + + task_results = prec_results.get(task, None) + if task_results is None: + value.append("") + else: + m = task_to_metric[task] + results = task_results["results"] + if len(results) > 1: + result = results[task] + else: + result = list(results.values())[0] + value.append("%.2f" % (result[m] * 100)) + values.append(value) + model = "" + precision = "" + + md_writer.value_matrix = values + latex_writer.value_matrix = values + + # todo: make latex table look good + # print(latex_writer.dumps()) + + return md_writer.dumps() + + +def merge_results(path): + # loop dirs and subdirs in results dir + # for each dir, load json files + print('Read from', path) + merged_results = dict() + for dirpath, dirnames, filenames in os.walk(path): + # skip dirs without files + if not filenames: + continue + for filename in sorted([f for f in filenames if f.endswith("result.json")]): + path = os.path.join(dirpath, filename) + model, device, precision, task = dirpath.split('/')[-4:] + with open(path, "r") as f: + result_dict = json.load(f) + if model not in 
merged_results: + merged_results[model] = dict() + if precision not in merged_results[model]: + merged_results[model][precision] = dict() + merged_results[model][precision][task] = result_dict + return merged_results + + +def main(*args): + if len(args) > 1: + input_path = args[1] + else: + raise ValueError("Input path is required") + + merged_results = merge_results(input_path) + print(make_table(merged_results)) + + +if __name__ == "__main__": + # when running from the harness, the first argument is the script name + # the second argument is required and must be the input_dir holding the harness results + main(*sys.argv) From 6fe5344fa6e31597fb3e1803d15385e6708fd1a8 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 16:33:38 +0800 Subject: [PATCH 07/15] separate make_csv from the file --- .../{make_table_and_csv.py => make_csv.py} | 59 ++++--------------- 1 file changed, 10 insertions(+), 49 deletions(-) rename python/llm/dev/benchmark/harness/{make_table_and_csv.py => make_csv.py} (64%) diff --git a/python/llm/dev/benchmark/harness/make_table_and_csv.py b/python/llm/dev/benchmark/harness/make_csv.py similarity index 64% rename from python/llm/dev/benchmark/harness/make_table_and_csv.py rename to python/llm/dev/benchmark/harness/make_csv.py index d2d3b5af..621d4bd8 100644 --- a/python/llm/dev/benchmark/harness/make_table_and_csv.py +++ b/python/llm/dev/benchmark/harness/make_csv.py @@ -15,7 +15,7 @@ # """ Usage: - python make_table_results.py + python make_csv.py """ import logging @@ -32,43 +32,6 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -def make_table(result_dict): - """Generate table of results.""" - md_writer = MarkdownTableWriter() - latex_writer = LatexTableWriter() - md_writer.headers = ["Model", "Precision", "Arc", "Hellaswag", "MMLU", "TruthfulQA","Winogrande", "GSM8K"] - latex_writer.headers = ["Model", "Precision", "Arc", "Hellaswag", "MMLU", "TruthfulQA","Winogrande", "GSM8K"] 
- tasks = ["arc", "hellaswag", "mmlu", "truthfulqa", "winogrande", "gsm8k"] - values = [] - for model, model_results in result_dict.items(): - for precision, prec_results in model_results.items(): - value = [model, precision] - for task in tasks: - - task_results = prec_results.get(task, None) - if task_results is None: - value.append("") - else: - m = task_to_metric[task] - results = task_results["results"] - if len(results) > 1: - result = results[task] - else: - result = list(results.values())[0] - value.append("%.2f" % (result[m] * 100)) - values.append(value) - model = "" - precision = "" - - md_writer.value_matrix = values - latex_writer.value_matrix = values - - # todo: make latex table look good - # print(latex_writer.dumps()) - - return md_writer.dumps() - def make_csv(result_dict, output_path=None): current_date = datetime.datetime.now().strftime("%Y-%m-%d") file_name = f'results_{current_date}.csv' @@ -102,7 +65,7 @@ def merge_results(path): # for each dir, load json files print('Read from', path) merged_results = dict() - for dirpath, dirnames, filenames in os.walk(sys.argv[1]): + for dirpath, dirnames, filenames in os.walk(path): # skip dirs without files if not filenames: continue @@ -120,19 +83,17 @@ def merge_results(path): def main(*args): - if len(args) > 1: - input_path = args[1] - else: - raise ValueError("Input path is required") - - if len(args) > 2: - output_path = args[2] # use the third argument as the output path - else: - output_path = "./" # default to current directory + assert len(args) > 2, \ + """Usage: + python make_csv.py + """ + + input_path = args[1] + output_path = args[2] + merged_results = merge_results(input_path) make_csv(merged_results, output_path) - print(make_table(merged_results)) if __name__ == "__main__": From 88f7f569802a13b96fd291b37a54dab1796f4c88 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 16:33:39 +0800 Subject: [PATCH 08/15] rewrite html visualization --- 
.github/workflows/llm-harness-evaluation.yml | 59 +++++++++----------- 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index deb339ae..105cb5b7 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -233,7 +233,7 @@ jobs: python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table.py results # TODO: change machine to store the results later - llm-harness-summary-html: + llm-harness-html: if: ${{github.event_name == 'schedule' || github.event_name == 'pull_request'}} needs: [set-matrix, llm-harness-evaluation] runs-on: ["self-hosted", "llm", "accuracy1", "accuracy-nightly"] @@ -248,54 +248,45 @@ jobs: run: | pip install --upgrade pip pip install jsonlines pytablewriter regex + pip install pandas==1.5.3 - name: Set output path shell: bash run: | - DATE=$(date +%Y-%m-%d) - OUTPUT_PATH="results_$DATE" - echo "OUTPUT_PATH=$OUTPUT_PATH" >> $GITHUB_ENV - NIGHTLY_FOLDER="/home/arda/harness-action-runners/nightly-accuracy-data" - echo "NIGHTLY_FOLDER=$NIGHTLY_FOLDER" >> $GITHUB_ENV - PR_FOLDER="/home/arda/harness-action-runners/pr-accuracy-data" - echo "PR_FOLDER=$PR_FOLDER" >> $GITHUB_ENV + echo "DATE=$(date +%Y-%m-%d)" >> $GITHUB_ENV + if ${{github.event_name == 'pull_request'}}; then + echo 'ACC_FOLDER="/home/arda/harness-action-runners/pr-accuracy-data"' >> $GITHUB_ENV + fi + if ${{github.event_name == 'schedule'}}; then + echo 'ACC_FOLDER="/home/arda/harness-action-runners/nightly-accuracy-data"' >> $GITHUB_ENV + fi - - name: Download all results for nightly run - if: github.event_name == 'schedule' + - name: Download all results uses: actions/download-artifact@v3 with: name: harness_results - path: ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }} - - - name: Download all results for pr run - if: github.event_name == 'pull_request' - uses: actions/download-artifact@v3 - with: - name: 
harness_results - path: ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }} + path: ${{ env.ACC_FOLDER}}/${{ env.DATE }} + # Save fp16.csv in the parent folder of env.nightly_folder - - name: Download fp16.csv for summary + - name: Download FP16 results shell: bash run: | - wget https://raw.githubusercontent.com/intel-analytics/BigDL/main/python/llm/dev/benchmark/harness/fp16.csv -O ${{ env.NIGHTLY_FOLDER}}/../fp16.csv - ls ${{ env.NIGHTLY_FOLDER}}/.. + wget https://raw.githubusercontent.com/intel-analytics/BigDL/main/python/llm/test/benchmark/harness/fp16.csv -O $ACC_FOLDER/../fp16.csv + ls $ACC_FOLDER/.. - - name: Summarize the results for nightly run - if: github.event_name == 'schedule' + - name: Write to CSV + working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness shell: bash run: | - ls /home/arda/harness-action-runners/nightly-accuracy-data/${{ env.OUTPUT_PATH }} - pip install pandas==1.5.3 - python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }} ${{ env.NIGHTLY_FOLDER}} - python ${{ github.workspace }}/python/llm/dev/benchmark/harness/harness_csv_to_html.py -f ${{ env.NIGHTLY_FOLDER}} - python ${{ github.workspace }}/python/llm/dev/benchmark/harness/update_html_in_parent_folder.py -f ${{ env.NIGHTLY_FOLDER }} + ls $ACC_FOLDER/$DATE + python make_table_and_csv.py $ACC_FOLDER/$DATE $ACC_FOLDER - - name: Summarize the results for pull request - if: github.event_name == 'pull_request' + - name: Update HTML + working-directory: ${{ github.workspace }}/python/llm/test/benchmark/harness shell: bash run: | - ls /home/arda/harness-action-runners/pr-accuracy-data/${{ env.OUTPUT_PATH }} - pip install pandas==1.5.3 - python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }} ${{ env.PR_FOLDER}} - python ${{ github.workspace }}/python/llm/dev/benchmark/harness/harness_csv_to_html.py -f ${{ env.PR_FOLDER}} \ No 
newline at end of file + python harness_csv_to_html.py -f $ACC_FOLDER + if github.event_name == 'schedule'; then + python update_html_in_parent_folder.py -f $ACC_FOLDER + fi \ No newline at end of file From e838ec9e1460426e1bd5a929fde0f20a2717d43f Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 16:33:40 +0800 Subject: [PATCH 09/15] remove dependency --- .github/workflows/llm-harness-evaluation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index 105cb5b7..407b09a8 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -235,7 +235,7 @@ jobs: # TODO: change machine to store the results later llm-harness-html: if: ${{github.event_name == 'schedule' || github.event_name == 'pull_request'}} - needs: [set-matrix, llm-harness-evaluation] + needs: [llm-harness-evaluation] runs-on: ["self-hosted", "llm", "accuracy1", "accuracy-nightly"] steps: - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3 From 4fdf96dc8b426294690820f555dcf2c0883433f6 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 17:11:03 +0800 Subject: [PATCH 10/15] fix ACC_FOLDER --- .github/workflows/llm-harness-evaluation.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index 407b09a8..7b4ba66d 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -255,10 +255,10 @@ jobs: run: | echo "DATE=$(date +%Y-%m-%d)" >> $GITHUB_ENV if ${{github.event_name == 'pull_request'}}; then - echo 'ACC_FOLDER="/home/arda/harness-action-runners/pr-accuracy-data"' >> $GITHUB_ENV + echo 'ACC_FOLDER=/home/arda/harness-action-runners/pr-accuracy-data' >> $GITHUB_ENV fi if ${{github.event_name == 'schedule'}}; then - 
echo 'ACC_FOLDER="/home/arda/harness-action-runners/nightly-accuracy-data"' >> $GITHUB_ENV + echo 'ACC_FOLDER=/home/arda/harness-action-runners/nightly-accuracy-data' >> $GITHUB_ENV fi - name: Download all results From a204337cad5b7ddde6054000c86edc31412393d0 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 17:12:37 +0800 Subject: [PATCH 11/15] Rename results --- .github/workflows/llm-harness-evaluation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index 7b4ba66d..9f496fb1 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -261,7 +261,7 @@ jobs: echo 'ACC_FOLDER=/home/arda/harness-action-runners/nightly-accuracy-data' >> $GITHUB_ENV fi - - name: Download all results + - name: Download harness results uses: actions/download-artifact@v3 with: name: harness_results From a55cc91e1f2b42d41d2901a171a1513fe6853fa2 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Fri, 23 Feb 2024 20:25:46 +0800 Subject: [PATCH 12/15] fix make_csv.py --- .github/workflows/llm-harness-evaluation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index 9f496fb1..be23569e 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -280,7 +280,7 @@ jobs: shell: bash run: | ls $ACC_FOLDER/$DATE - python make_table_and_csv.py $ACC_FOLDER/$DATE $ACC_FOLDER + python make_csv.py $ACC_FOLDER/$DATE $ACC_FOLDER - name: Update HTML working-directory: ${{ github.workspace }}/python/llm/test/benchmark/harness From 85d13c65de70ddeab0b69a2cf8f2532fc1b994d0 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Sat, 24 Feb 2024 00:33:33 +0800 Subject: [PATCH 13/15] run one job only if triggered by pr --- .github/workflows/llm-harness-evaluation.yml | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index be23569e..0bcc8af4 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -69,7 +69,7 @@ jobs: if: ${{github.event_name == 'pull_request'}} env: PR_MATRIX_MODEL_NAME: '["stablelm-3b-4e1t"]' - PR_MATRIX_TASK: '["truthfulqa", "winogrande"]' + PR_MATRIX_TASK: '["winogrande"]' PR_MATRIX_PRECISION: '["sym_int4"]' PR_LABELS: '["self-hosted", "llm", "temp-arc01"]' From 213ef06691395f6c50c28d40c0abba9324013d08 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Sat, 24 Feb 2024 00:38:08 +0800 Subject: [PATCH 14/15] fix readme --- python/llm/dev/benchmark/harness/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/llm/dev/benchmark/harness/README.md b/python/llm/dev/benchmark/harness/README.md index 5ecc0d71..46b39865 100644 --- a/python/llm/dev/benchmark/harness/README.md +++ b/python/llm/dev/benchmark/harness/README.md @@ -5,7 +5,10 @@ Before running, make sure to have [bigdl-llm](../../../README.md) installed. ## Install Harness ```bash -pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b09 +git clone https://github.com/EleutherAI/lm-evaluation-harness.git +cd lm-evaluation-harness +git checkout b281b09 +pip install -e . 
``` ## Run From 62350a36f0bf917f57202b57ba19995e4840dd33 Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Mon, 26 Feb 2024 13:39:59 +0800 Subject: [PATCH 15/15] fix if in update html --- .github/workflows/llm-harness-evaluation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index 0bcc8af4..14180239 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -287,6 +287,6 @@ jobs: shell: bash run: | python harness_csv_to_html.py -f $ACC_FOLDER - if github.event_name == 'schedule'; then + if ${{github.event_name == 'schedule'}}; then python update_html_in_parent_folder.py -f $ACC_FOLDER fi \ No newline at end of file