From 0c8d3c98302533794d6f3a53f050d6f890d15b34 Mon Sep 17 00:00:00 2001 From: Yuxuan Xia <77518229+NovTi@users.noreply.github.com> Date: Thu, 7 Mar 2024 16:44:49 +0800 Subject: [PATCH] Add C-Eval HTML report (#10294) * Add C-Eval HTML report * Fix C-Eval workflow pr trigger path * Fix C-Eval workflow typos * Add permissions to C-Eval workflow * Fix C-Eval workflow typo * Add pandas dependency * Fix C-Eval workflow typo --- .github/workflows/llm-c-evaluation.yml | 94 ++++++++++-- .../dev/benchmark/ceval/organize_results.py | 40 ++++- .../test/benchmark/ceval/ceval_csv_to_html.py | 138 ++++++++++++++++++ .../ceval/update_html_in_parent_folder.py | 49 +++++++ 4 files changed, 299 insertions(+), 22 deletions(-) create mode 100644 python/llm/test/benchmark/ceval/ceval_csv_to_html.py create mode 100644 python/llm/test/benchmark/ceval/update_html_in_parent_folder.py diff --git a/.github/workflows/llm-c-evaluation.yml b/.github/workflows/llm-c-evaluation.yml index 56c8a7e9..0a2cd0dc 100644 --- a/.github/workflows/llm-c-evaluation.yml +++ b/.github/workflows/llm-c-evaluation.yml @@ -15,7 +15,7 @@ on: pull_request: branches: [main] paths: - - ".github/workflows/llm-ceval.yml" + - ".github/workflows/llm-c-evaluation.yml" # Allows you to run this workflow manually from the Actions tab workflow_dispatch: inputs: @@ -84,7 +84,8 @@ jobs: echo "model_name=$model_name" >> $GITHUB_OUTPUT echo "precision=$precision" >> $GITHUB_OUTPUT echo "runner=$runner" >> $GITHUB_OUTPUT - llm-ceval-evaluation: + + llm-c-evaluation: timeout-minutes: 1200 needs: [llm-cpp-build, set-matrix] strategy: @@ -94,7 +95,7 @@ jobs: model_name: ${{ fromJson(needs.set-matrix.outputs.model_name) }} precision: ${{ fromJson(needs.set-matrix.outputs.precision) }} device: [xpu] - + runs-on: ${{ fromJson(needs.set-matrix.outputs.runner) }} env: ANALYTICS_ZOO_ROOT: ${{ github.workspace }} @@ -112,6 +113,11 @@ jobs: python -m pip install --upgrade pip python -m pip install --upgrade setuptools==58.0.4 python -m pip 
install --upgrade wheel + pip install einops + pip install thefuzz + pip install tiktoken + pip install transformers==4.31.0 + pip install transformers_stream_generator - name: Download llm binary uses: ./.github/actions/llm/download-llm-binary @@ -150,15 +156,6 @@ jobs: DATA_PATH=$CEVAL_HF_HOME/data unzip -o "$CEVAL_HF_HOME/data/ceval-exam.zip" -d "$CEVAL_HF_HOME/data" wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR} - - - name: Install Dependencies - shell: bash - run: | - pip install einops - pip install thefuzz - pip install tiktoken - pip install transformers==4.31.0 - pip install transformers_stream_generator - name: Run C-Eval shell: bash @@ -183,7 +180,7 @@ jobs: llm-ceval-summary: if: ${{ always() }} - needs: llm-ceval-evaluation + needs: llm-c-evaluation runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -191,13 +188,80 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.9 - - name: Download all results + - name: Install dependencies + shell: bash + run: | + pip install --upgrade pip + pip install pandas==1.5.3 + + - name: Download ceval results uses: actions/download-artifact@v3 with: name: ceval_results path: results + - name: Summarize the results shell: bash run: | ls results - python ${{ github.workspace }}/python/llm/dev/benchmark/ceval/organize_results.py results/ + echo "DATE=$(date +%Y-%m-%d)" >> $GITHUB_ENV + python ${{ github.workspace }}/python/llm/dev/benchmark/ceval/organize_results.py results/ results/ + + - name: Set artifact file path + run: echo "ARTIFACT_PATH=results/results_${{ env.DATE }}.csv" >> $GITHUB_ENV + + - uses: actions/upload-artifact@v3 + with: + name: results_${{ env.DATE }} + path: ${{ env.ARTIFACT_PATH }} + + llm-ceval-html: + if: ${{github.event_name == 'schedule' || github.event_name == 'pull_request'}} + needs: [llm-c-evaluation, llm-ceval-summary] + runs-on: ["self-hosted", "llm", "accuracy1", "accuracy-nightly"] + steps: + - uses: 
actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3 + - name: Set up Python 3.9 + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install dependencies + shell: bash + run: | + pip install --upgrade pip + pip install numpy + pip install pandas==1.5.3 + pip install jsonlines pytablewriter regex + + - name: Set output path + shell: bash + run: | + echo "DATE=$(date +%Y-%m-%d)" >> $GITHUB_ENV + if ${{github.event_name == 'pull_request'}}; then + echo 'ACC_FOLDER=/home/arda/ceval-action-runners/pr-accuracy-data' >> $GITHUB_ENV + fi + if ${{github.event_name == 'schedule'}}; then + echo 'ACC_FOLDER=/home/arda/ceval-action-runners/nightly-accuracy-data' >> $GITHUB_ENV + fi + + - name: Create ceval results directory if not exists + run: | + if [ ! -d "${{ env.ACC_FOLDER }}" ]; then + mkdir -p "${{ env.ACC_FOLDER }}" + fi + + - name: Download ceval results + uses: actions/download-artifact@v3 + with: + name: results_${{ env.DATE }} + path: ${{ env.ACC_FOLDER }} + rename: results_${{ env.DATE }}.csv + + - name: Update HTML + working-directory: ${{ github.workspace }}/python/llm/test/benchmark/ceval/ + shell: bash + run: | + python ceval_csv_to_html.py -f $ACC_FOLDER + if ${{github.event_name == 'schedule'}}; then + python update_html_in_parent_folder.py -f $ACC_FOLDER + fi \ No newline at end of file diff --git a/python/llm/dev/benchmark/ceval/organize_results.py b/python/llm/dev/benchmark/ceval/organize_results.py index a3c41110..57d759da 100644 --- a/python/llm/dev/benchmark/ceval/organize_results.py +++ b/python/llm/dev/benchmark/ceval/organize_results.py @@ -15,11 +15,17 @@ # import os +import pdb import sys +import csv import json +import datetime +import pandas as pd + if __name__ == '__main__': result_path = sys.argv[1] + output_path = sys.argv[2] column_size = [25, 15, 10, 18, 15, 10, 10, 10] pad_string = lambda x, l: [i.ljust(j) for i, j in zip(x, l)] @@ -49,20 +55,40 @@ if __name__ == '__main__': 
organized_dict[data['Model Name']] = {} organized_dict[data['Model Name']][data['Precision']] = result_lst - # define the print precision order - precision_order = ['sym_int4', 'mixed_fp4', 'fp4', 'sym_int8', 'fp8_e4m3', 'fp8_e5m2', 'mixed_fp8'] - + # define the print precision order + model_order = ['chatglm2-6b', 'chinese-llama2-7b', 'Qwen-7B-Chat'] + precision_order = ['sym_int4', 'fp8_e5m2'] + # print the results for model_name in organized_dict.keys(): for precision in precision_order: try: + # print the result print(' '.join(pad_string(organized_dict[model_name][precision], column_size))) except KeyError: - continue - + pass # separate between models print() - - + + # initialize the csv file + current_date = datetime.datetime.now().strftime("%Y-%m-%d") + file_name = f'results_{current_date}.csv' + file_name = os.path.join(output_path, file_name) if output_path else file_name + print('Writing to', file_name) + + with open(file_name, mode='w', newline='') as csv_file: + writer = csv.writer(csv_file) + + headers = ["Model Name", "Precision", 'STEM', 'Social Science', 'Humanities', 'Other', 'Hard', 'Average'] + writer.writerow(headers) + + # print the results + for model_name in model_order: + for precision in precision_order: + try: + # write the result to the csv row + writer.writerow(organized_dict[model_name][precision]) + except KeyError: + writer.writerow([model_name, precision]+[pd.NA for i in range(len(headers[2:]))]) diff --git a/python/llm/test/benchmark/ceval/ceval_csv_to_html.py b/python/llm/test/benchmark/ceval/ceval_csv_to_html.py new file mode 100644 index 00000000..ed37d2a0 --- /dev/null +++ b/python/llm/test/benchmark/ceval/ceval_csv_to_html.py @@ -0,0 +1,138 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Python program to convert CSV to HTML Table

import os
import sys
import argparse
import numpy as np
import pandas as pd

# Score columns of a C-Eval result CSV that are compared between two runs.
SUBJECTS = ('STEM', 'Social Science', 'Humanities', 'Other', 'Hard', 'Average')
# Columns that together identify one row (one model at one precision).
KEY_COLUMNS = ('Model Name', 'Precision')


def highlight_vals(val, max=3.0, color1='red', color2='green', color3='yellow', is_last=False):
    """Return the CSS used to highlight one percentage-difference cell.

    Args:
        val: Cell value. Only real (non-NaN) floats are ever highlighted.
        max: Threshold; values above ``max`` get ``color2`` and values at or
            below ``-max`` get ``color1``. (The name shadows the builtin but is
            kept because callers pass it by keyword.)
        color1: Background colour for large regressions.
        color2: Background colour for large improvements.
        color3: Background colour for any other non-zero change.
        is_last: When true, non-zero changes inside the threshold get ``color3``.

    Returns:
        A CSS declaration string, or ``''`` when the cell is not highlighted.
        A string is always returned because ``Styler.applymap`` expects one.
    """
    # Missing scores (NaN) and non-float cells carry no signal to highlight.
    if not isinstance(val, float) or pd.isna(val):
        return ''
    if val > max:
        return 'background-color: %s' % color2
    if val <= -max:
        return 'background-color: %s' % color1
    if val != 0.0 and is_last:
        return 'background-color: %s' % color3
    return ''


def calculate_percentage_difference(cur_array, previous_array):
    """Compute the element-wise percentage change from previous to current.

    Args:
        cur_array: Sequence of current scores.
        previous_array: Sequence of previous scores, same length as ``cur_array``.

    Returns:
        A numpy array with one entry per element: the change in percent rounded
        to two decimals, or ``pd.NA`` where either score is missing (``pd.NA``
        or NaN) or the previous score is zero (the ratio is undefined).

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(cur_array) != len(previous_array):
        raise ValueError("current and previous results must have the same length")

    differences = []
    # zip iterates values positionally, so the result does not depend on the
    # index labels of pandas inputs.
    for current, previous in zip(cur_array, previous_array):
        if pd.isna(current) or pd.isna(previous) or previous == 0:
            differences.append(pd.NA)
        else:
            differences.append(round((current - previous) * 100 / previous, 2))
    return np.array(differences)


def check_diffs_within_normal_range(latest_csv, highlight_set, threshold):
    """Tell whether every available diff stays within ``threshold``.

    Args:
        latest_csv: DataFrame holding the diff columns.
        highlight_set: Names of the diff columns to inspect.
        threshold: Largest accepted absolute percentage difference.

    Returns:
        ``True`` when no non-missing value exceeds ``threshold`` in absolute
        value; missing values (``pd.NA`` / NaN) are ignored.
    """
    for column in highlight_set:
        for value in latest_csv[column]:
            if not pd.isna(value) and abs(value) > threshold:
                return False
    return True


def _align_previous(latest_csv, previous_csv):
    """Return the previous results re-ordered row-for-row to match ``latest_csv``."""
    previous = previous_csv
    if previous.index.name is not None:
        # read_csv(index_col=0) moved the first column into the index; restore it.
        previous = previous.reset_index()

    keys = list(KEY_COLUMNS)
    if all(key in latest_csv.columns and key in previous.columns for key in keys):
        # Keep a single row per (model, precision) so the left join below
        # cannot multiply rows of the latest results.
        previous = previous.drop_duplicates(subset=keys, keep='last')
        return latest_csv[keys].merge(previous, on=keys, how='left')

    # Without the key columns the best available pairing is by row position.
    return previous.reset_index(drop=True).reindex(range(len(latest_csv)))


def insert_comparison_columns(latest_csv, previous_csv, subjects=SUBJECTS):
    """Insert ``last_<subject>`` and ``diff_<subject>(%)`` columns in place.

    The new columns are inserted in front of the last column of ``latest_csv``.
    Rows are paired on ``Model Name`` / ``Precision`` when both frames have
    those columns, otherwise by position.

    Args:
        latest_csv: DataFrame of the newest results; modified in place.
        previous_csv: DataFrame of the results to compare against.
        subjects: Score columns to compare.

    Returns:
        The list of inserted diff column names.
    """
    aligned_previous = _align_previous(latest_csv, previous_csv)
    highlight_set = []
    insert_column = latest_csv.shape[-1] - 1

    for subject in subjects:
        # Plain arrays are inserted so pandas does not re-align on index labels.
        previous_values = aligned_previous[subject].to_numpy()
        latest_csv.insert(loc=insert_column, column=f'last_{subject}',
                          value=previous_values)

        diff_column = f'diff_{subject}(%)'
        latest_csv.insert(
            loc=insert_column + 1,
            column=diff_column,
            value=calculate_percentage_difference(latest_csv[subject], previous_values))
        highlight_set.append(diff_column)

        insert_column += 2

    return highlight_set


def _format_cell(value):
    """Render numeric cells with two decimals and every other cell as text."""
    is_number = isinstance(value, (int, float, np.integer, np.floating))
    if is_number and not isinstance(value, bool):
        return '{:.2f}'.format(value)
    return str(value)


def main(argv=None):
    """Convert the newest result CSV of a folder into an HTML report.

    When a baseline file is given, or an older CSV exists in the folder, the
    report also shows the previous scores and the percentage differences.

    Args:
        argv: Command-line arguments; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit code: ``1`` when no CSV is found or when a baseline was
        given and a diff exceeds the threshold, ``0`` otherwise.
    """
    parser = argparse.ArgumentParser(description="convert .csv file to .html file")
    parser.add_argument("-f", "--folder_path", type=str, dest="folder_path",
                        help="The directory which stores the .csv file",
                        default="/home/arda/BigDL/python/llm/dev/benchmark/ceval")
    parser.add_argument("-t", "--threshold", type=float, dest="threshold",
                        help="the threshold of highlight values", default=3.0)
    parser.add_argument("-b", "--baseline_path", type=str, dest="baseline_path",
                        help="the baseline path which stores the baseline.csv file")
    args = parser.parse_args(argv)

    csv_files = []
    for file_name in os.listdir(args.folder_path):
        file_path = os.path.join(args.folder_path, file_name)
        if os.path.isfile(file_path) and file_name.endswith(".csv"):
            csv_files.append(file_path)
    # File names embed the date, so reverse lexicographic order is newest first.
    csv_files.sort(reverse=True)

    if not csv_files:
        print("No .csv file found in " + args.folder_path)
        return 1

    highlight_threshold = args.threshold

    # get the newest csv file
    latest_csv = pd.read_csv(csv_files[0], index_col=0)

    # create daily html file; splitext only strips the extension, so dots
    # elsewhere in the path are preserved
    daily_html = os.path.splitext(csv_files[0])[0] + ".html"

    # add index column
    latest_csv.reset_index(inplace=True)

    # an explicit baseline wins over the second-newest csv file
    if args.baseline_path:
        previous_path = args.baseline_path
    elif len(csv_files) > 1:
        previous_path = csv_files[1]
    else:
        previous_path = None

    highlight_set = []
    if previous_path is not None:
        previous_csv = pd.read_csv(previous_path, index_col=0)
        highlight_set = insert_comparison_columns(latest_csv, previous_csv)

        styled_df = latest_csv.style.format(_format_cell).applymap(
            lambda val: highlight_vals(val, max=highlight_threshold, is_last=True),
            subset=highlight_set)

        # add css style to restrict width and wrap text
        styled_df.set_table_styles([{
            'selector': 'th, td',
            'props': [('max-width', '88px'), ('word-wrap', 'break-word')]
        }], overwrite=False)

        html_output = styled_df.set_table_attributes("border=1").to_html()

        with open(daily_html, 'w', encoding='utf-8') as f:
            f.write(html_output)
    else:
        latest_csv.to_html(daily_html)

    if args.baseline_path and not check_diffs_within_normal_range(
            latest_csv, highlight_set, highlight_threshold):
        print("The diffs are outside the normal range: %" + str(highlight_threshold))
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())

# diff --git a/python/llm/test/benchmark/ceval/update_html_in_parent_folder.py b/python/llm/test/benchmark/ceval/update_html_in_parent_folder.py
# new file mode 100644
# index 00000000..9e972e9d
# --- /dev/null
# +++ b/python/llm/test/benchmark/ceval/update_html_in_parent_folder.py
# @@ -0,0 +1,49 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Python program to update Html in parent folder

import os
import shutil
import argparse
from pathlib import Path


def update_html_in_parent_folder(folder_path):
    """Publish the newest HTML report of ``folder_path`` into its parent folder.

    Every ``*.html`` file directly under the parent folder is removed and
    replaced by a copy of the newest ``*.html`` file found in ``folder_path``.
    When ``folder_path`` holds no HTML file the parent folder is left untouched,
    so an existing report is never deleted without a replacement.

    Args:
        folder_path: Folder containing the generated HTML reports.

    Returns:
        Path of the published copy, or ``None`` when there was nothing to
        publish.
    """
    folder = Path(folder_path)
    parent_folder = folder.parent

    # Find the latest html file first, before anything is deleted.
    # NOTE: os.path.getctime is the inode-change time on Unix, not the
    # creation time.
    latest_html_file = max(folder.glob('*.html'), key=os.path.getctime, default=None)
    if latest_html_file is None:
        print(f"No .html file found in {folder}; parent folder left unchanged")
        return None

    if parent_folder.resolve() == folder.resolve():
        # The folder is its own parent (e.g. "." or "/"): cleaning the "parent"
        # would delete the reports themselves, so there is nothing to do.
        print(latest_html_file.name)
        return latest_html_file

    # Remove the previously published reports, then copy the latest one.
    for html_file in parent_folder.glob('*.html'):
        html_file.unlink()
    published = Path(shutil.copy(latest_html_file, parent_folder))

    print(latest_html_file.name)
    return published


def main(argv=None):
    """Parse the command line and update the HTML in the parent folder.

    Args:
        argv: Command-line arguments; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Update HTML in parent folder.")
    # required: without a folder there is nothing to operate on
    parser.add_argument("-f", "--folder", type=str, required=True,
                        help="Path to the folder")
    args = parser.parse_args(argv)

    update_html_in_parent_folder(args.folder)


if __name__ == "__main__":
    main()