diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index b65d3bd5..b4c7b521 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -70,9 +70,9 @@ jobs: - name: set-pr-env if: ${{github.event_name == 'pull_request'}} env: - PR_MATRIX_MODEL_NAME: '["Mistral-7B-v0.1"]' - PR_MATRIX_TASK: '["arc", "truthfulqa", "winogrande"]' - PR_MATRIX_PRECISION: '["fp8"]' + PR_MATRIX_MODEL_NAME: '["stablelm-3b-4e1t"]' + PR_MATRIX_TASK: '["winogrande"]' + PR_MATRIX_PRECISION: '["sym_int4"]' PR_LABELS: '["self-hosted", "llm", "temp-arc01"]' run: | @@ -112,8 +112,6 @@ jobs: device: [xpu] runs-on: ${{ fromJson(needs.set-matrix.outputs.runner) }} - outputs: - output_path: ${{ steps.run_harness.outputs.output_path }} env: ANALYTICS_ZOO_ROOT: ${{ github.workspace }} ORIGIN_DIR: /mnt/disk1/models @@ -146,7 +144,10 @@ jobs: working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness/ shell: bash run: | - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b09 + git clone https://github.com/EleutherAI/lm-evaluation-harness.git + cd lm-evaluation-harness + git checkout b281b09 + pip install -e .
- name: Download models and datasets shell: bash @@ -164,14 +165,13 @@ jobs: run: | pip install --upgrade datasets==2.14.6 if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then - pip install --upgrade transformers==4.36 + pip install --upgrade transformers==4.36 else - pip install --upgrade transformers==4.31 + pip install --upgrade transformers==4.31 fi - - name: Run harness nightly - if: ${{github.event_name == 'schedule'}} + - name: Run harness shell: bash working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness env: @@ -183,6 +183,11 @@ jobs: export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets source /opt/intel/oneapi/setvars.sh + # Pull-request runs pass "--limit 4" to run_llb.py to keep the check fast; other events leave LIMIT unset + if ${{github.event_name == 'pull_request'}}; then + export LIMIT="--limit 4" + fi + python run_llb.py \ --model bigdl-llm \ --pretrained ${MODEL_PATH} \ @@ -190,28 +195,7 @@ jobs: --device ${{ matrix.device }} \ --tasks ${{ matrix.task }} \ --batch_size 1 --no_cache --output_path results \ - - - name: Run harness pr - if: ${{github.event_name == 'pull_request'}} - shell: bash - working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness - env: - USE_XETLA: OFF - # SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS: 1 - run: | - export HF_HOME=${HARNESS_HF_HOME} - export HF_DATASETS=$HARNESS_HF_HOME/datasets - export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets - source /opt/intel/oneapi/setvars.sh - - python run_llb.py \ - --model bigdl-llm \ - --pretrained ${MODEL_PATH} \ - --precision ${{ matrix.precision }} \ - --device ${{ matrix.device }} \ - --tasks ${{ matrix.task }} \ - --batch_size 1 --no_cache --output_path results \ - --limit 3 \ + $LIMIT - uses: actions/upload-artifact@v3 with: @@ -250,12 +234,12 @@ jobs: shell: bash run: | ls results - python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py results + python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table.py results # TODO: change
machine to store the results later - llm-harness-summary-html: + llm-harness-html: if: ${{github.event_name == 'schedule' || github.event_name == 'pull_request'}} - needs: [set-matrix, llm-harness-evaluation] + needs: [llm-harness-evaluation] runs-on: ["self-hosted", "llm", "accuracy1", "accuracy-nightly"] steps: - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3 @@ -268,54 +252,45 @@ jobs: run: | pip install --upgrade pip pip install jsonlines pytablewriter regex + pip install pandas==1.5.3 - name: Set output path shell: bash run: | - DATE=$(date +%Y-%m-%d) - OUTPUT_PATH="results_$DATE" - echo "OUTPUT_PATH=$OUTPUT_PATH" >> $GITHUB_ENV - NIGHTLY_FOLDER="/home/arda/harness-action-runners/nightly-accuracy-data" - echo "NIGHTLY_FOLDER=$NIGHTLY_FOLDER" >> $GITHUB_ENV - PR_FOLDER="/home/arda/harness-action-runners/pr-accuracy-data" - echo "PR_FOLDER=$PR_FOLDER" >> $GITHUB_ENV + echo "DATE=$(date +%Y-%m-%d)" >> "$GITHUB_ENV" + if ${{github.event_name == 'pull_request'}}; then + echo 'ACC_FOLDER=/home/arda/harness-action-runners/pr-accuracy-data' >> "$GITHUB_ENV" + fi + if ${{github.event_name == 'schedule'}}; then + echo 'ACC_FOLDER=/home/arda/harness-action-runners/nightly-accuracy-data' >> "$GITHUB_ENV" + fi - - name: Download all results for nightly run - if: github.event_name == 'schedule' + - name: Download harness results uses: actions/download-artifact@v3 with: name: harness_results - path: ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }} - - - name: Download all results for pr run - if: github.event_name == 'pull_request' - uses: actions/download-artifact@v3 - with: - name: harness_results - path: ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }} + path: ${{ env.ACC_FOLDER}}/${{ env.DATE }} + # Save fp16.csv in the parent folder of env.nightly_folder - - name: Download fp16.csv for summary + - name: Download FP16 results shell: bash run: | - wget
https://raw.githubusercontent.com/intel-analytics/BigDL/main/python/llm/test/benchmark/harness/fp16.csv -O ${{ env.NIGHTLY_FOLDER}}/../fp16.csv - ls ${{ env.NIGHTLY_FOLDER}}/.. + wget https://raw.githubusercontent.com/intel-analytics/BigDL/main/python/llm/test/benchmark/harness/fp16.csv -O "$ACC_FOLDER/../fp16.csv" + ls "$ACC_FOLDER/.." - - name: Summarize the results for nightly run - if: github.event_name == 'schedule' + - name: Write to CSV + working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness shell: bash run: | - ls /home/arda/harness-action-runners/nightly-accuracy-data/${{ env.OUTPUT_PATH }} - pip install pandas==1.5.3 - python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }} ${{ env.NIGHTLY_FOLDER}} - python ${{ github.workspace }}/python/llm/test/benchmark/harness/harness_csv_to_html.py -f ${{ env.NIGHTLY_FOLDER}} - python ${{ github.workspace }}/python/llm/test/benchmark/harness/update_html_in_parent_folder.py -f ${{ env.NIGHTLY_FOLDER }} + ls "$ACC_FOLDER/$DATE" + python make_csv.py "$ACC_FOLDER/$DATE" "$ACC_FOLDER" - - name: Summarize the results for pull request - if: github.event_name == 'pull_request' + - name: Update HTML + working-directory: ${{ github.workspace }}/python/llm/test/benchmark/harness shell: bash run: | - ls /home/arda/harness-action-runners/pr-accuracy-data/${{ env.OUTPUT_PATH }} - pip install pandas==1.5.3 - python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }} ${{ env.PR_FOLDER}} - python ${{ github.workspace }}/python/llm/test/benchmark/harness/harness_csv_to_html.py -f ${{ env.PR_FOLDER}} \ No newline at end of file + python harness_csv_to_html.py -f "$ACC_FOLDER" + if ${{github.event_name == 'schedule'}}; then + python update_html_in_parent_folder.py -f "$ACC_FOLDER" + fi diff --git a/python/llm/dev/benchmark/harness/README.md
b/python/llm/dev/benchmark/harness/README.md index a373d66c..46b39865 100644 --- a/python/llm/dev/benchmark/harness/README.md +++ b/python/llm/dev/benchmark/harness/README.md @@ -5,7 +5,10 @@ Before running, make sure to have [bigdl-llm](../../../README.md) installed. ## Install Harness ```bash -pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b09 +git clone https://github.com/EleutherAI/lm-evaluation-harness.git +cd lm-evaluation-harness +git checkout b281b09 +pip install -e . ``` ## Run @@ -26,3 +29,7 @@ python run_multi_llb.py --model bigdl-llm --pretrained /path/to/model --precisio Taking example above, the script will fork 3 processes, each for one xpu, to execute the tasks. ## Results We follow [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to record our metrics, `acc_norm` for `hellaswag` and `arc_challenge`, `mc2` for `truthful_qa` and `acc` for `mmlu`. For `mmlu`, there are 57 subtasks which means users may need to average them manually to get final result. +## Summarize the results +```bash +python make_table.py /path/to/results +``` \ No newline at end of file diff --git a/python/llm/dev/benchmark/harness/make_csv.py b/python/llm/dev/benchmark/harness/make_csv.py new file mode 100644 index 00000000..621d4bd8 --- /dev/null +++ b/python/llm/dev/benchmark/harness/make_csv.py @@ -0,0 +1,102 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+#
+"""
+Usage:
+    python make_csv.py INPUT_DIR [OUTPUT_DIR]
+"""
+
+import logging
+from pytablewriter import MarkdownTableWriter, LatexTableWriter
+import os
+import json
+import sys
+import csv
+import datetime
+from harness_to_leaderboard import task_to_metric
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def make_csv(result_dict, output_path=None):
+    """Write one CSV row per (model, precision) pair to results_<date>.csv.
+
+    The file goes to output_path when given, otherwise the current directory.
+    Each score is the metric named by task_to_metric, times 100; missing tasks stay empty.
+    """
+    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
+    file_name = f'results_{current_date}.csv'
+    full_path = os.path.join(output_path, file_name) if output_path else file_name
+    print('Writing to', full_path)
+    headers = ["Index", "Model", "Precision", "Arc", "TruthfulQA", "Winogrande"]
+
+    with open(full_path, mode='w', newline='') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(headers)
+        index = 0
+        for model, model_results in result_dict.items():
+            for precision, prec_results in model_results.items():
+                row = [index, model, precision]
+                for task in headers[3:]:
+                    task_results = prec_results.get(task.lower(), None)
+                    if task_results is None:
+                        row.append("")
+                    else:
+                        m = task_to_metric[task.lower()]
+                        results = task_results["results"]
+                        result = list(results.values())[0] if len(results) == 1 else results[task.lower()]
+                        row.append("%.2f" % (result[m] * 100))
+                writer.writerow(row)
+                index += 1
+
+
+def merge_results(path):
+    """Collect every *result.json under path into {model: {precision: {task: result}}}.
+
+    Model, precision and task are taken from the last four directory components
+    (model/device/precision/task) of the folder that holds each result file.
+    """
+    print('Read from', path)
+    merged_results = dict()
+    for dirpath, dirnames, filenames in os.walk(path):
+        for filename in sorted(f for f in filenames if f.endswith("result.json")):
+            result_file = os.path.join(dirpath, filename)
+            model, device, precision, task = os.path.normpath(dirpath).split(os.sep)[-4:]
+            with open(result_file, "r") as f:
+                result_dict = json.load(f)
+            merged_results.setdefault(model, {}).setdefault(precision, {})[task] = result_dict
+    return merged_results
+
+
+def main(*args):
+    """Merge the results found under args[1] and write them to a CSV file.
+
+    args mirrors sys.argv: args[1] is the required input directory, args[2]
+    the optional output directory (make_csv falls back to the current one).
+    """
+    if len(args) < 2:
+        raise ValueError("Input path is required. Usage: python make_csv.py INPUT_DIR [OUTPUT_DIR]")
+    input_path = args[1]
+    output_path = args[2] if len(args) > 2 else None
+
+    merged_results = merge_results(input_path)
+    make_csv(merged_results, output_path)
+
+
+if __name__ == "__main__":
+    # sys.argv[0] is the script name; argv[1] is the input_dir and argv[2] the optional output_dir
+    main(*sys.argv)
diff --git a/python/llm/dev/benchmark/harness/make_table_and_csv.py b/python/llm/dev/benchmark/harness/make_table.py similarity index 69% rename from python/llm/dev/benchmark/harness/make_table_and_csv.py rename to python/llm/dev/benchmark/harness/make_table.py index d2d3b5af..cdad073c 100644 --- a/python/llm/dev/benchmark/harness/make_table_and_csv.py +++ b/python/llm/dev/benchmark/harness/make_table.py @@ -15,7 +15,7 @@ # """ Usage: - python make_table_results.py + python make_table.py INPUT_DIR """ import logging @@ -69,40 +69,13 @@ def make_table(result_dict): return md_writer.dumps() -def make_csv(result_dict, output_path=None): - current_date = datetime.datetime.now().strftime("%Y-%m-%d") - file_name = f'results_{current_date}.csv' - full_path = os.path.join(output_path, file_name) if output_path else file_name - print('Writing to', full_path) - file_name = full_path - headers = ["Index", "Model", "Precision", "Arc", "TruthfulQA", "Winogrande"] - - with open(file_name, mode='w', newline='') as csv_file: - writer = csv.writer(csv_file) - writer.writerow(headers) - index = 0 - for model, model_results in result_dict.items(): - for precision, prec_results in model_results.items(): - row = [index, model, precision] - for task in headers[3:]: - task_results = prec_results.get(task.lower(), None) - if task_results is None: - row.append("") - else: - m = task_to_metric[task.lower()] - results = task_results["results"] - result = list(results.values())[0]
if len(results) == 1 else results[task.lower()] - row.append("%.2f" % (result[m] * 100)) - writer.writerow(row) - index += 1 - def merge_results(path): # loop dirs and subdirs in results dir # for each dir, load json files print('Read from', path) merged_results = dict() - for dirpath, dirnames, filenames in os.walk(sys.argv[1]): + for dirpath, dirnames, filenames in os.walk(path): # skip dirs without files if not filenames: continue @@ -124,14 +97,8 @@ def main(*args): input_path = args[1] else: raise ValueError("Input path is required") - - if len(args) > 2: - output_path = args[2] # use the third argument as the output path - else: - output_path = "./" # default to current directory merged_results = merge_results(input_path) - make_csv(merged_results, output_path) print(make_table(merged_results))