LLM: add whisper models into nightly test (#10193)

* LLM: add whisper models into nightly test * small fix * small fix * add more whisper models * test all cases * test specific cases * collect the csv * store the result * to html * small fix * small test * test all cases * modify whisper_csv_to_html
2024-03-11 20:00:47 +08:00 · 2024-03-11 20:00:47 +08:00 · 17bdb1a60b
commit 17bdb1a60b
parent dbcfc5c2fa
4 changed files with 400 additions and 4 deletions
--- a/.github/workflows/llm-whisper-evaluation.yml
+++ b/.github/workflows/llm-whisper-evaluation.yml
@ -0,0 +1,207 @@
 name: LLM Whisper Models Evaluation
 # Cancel previous runs in the PR when you push new commits
 concurrency:
  group: ${{ github.workflow }}-llm-nightly-test-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true
 permissions:
  contents: read
 # Controls when the action will run.
 on:
  # Nightly trigger.
  schedule:
    - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China
  # Pull requests against main run this workflow only when they touch this workflow file.
  pull_request:
    branches: [main]
    paths:
      - ".github/workflows/llm-whisper-evaluation.yml"
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
    # NOTE(review): none of the inputs below is referenced by the jobs visible in this
    # file (set-matrix always emits fixed lists) - confirm whether manual runs are
    # expected to honor them.
    inputs:
      model_name:
        description: 'Model names, separated by comma and must be quoted.'
        required: true
        type: string
      precision:
        description: 'Precisions, separated by comma and must be quoted.'
        required: true
        type: string
      task:
        description: 'Tasks, separated by comma and must be quoted.'
        required: true
        type: string
      runs-on:
        description: 'Labels to filter the runners, separated by comma and must be quoted.'
        default: "accuracy"
        required: false
        type: string
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
  llm-cpp-build: # please uncomment it for PR tests
    uses: ./.github/workflows/llm-binary-build.yml
  # Set the testing matrix based on the event (schedule, PR, or manual dispatch)
  set-matrix:
    # Publishes the model, task, precision and runner-label lists as JSON strings;
    # llm-whisper-evaluation expands them with fromJson() to build its matrix and runs-on.
    # NOTE(review): the same fixed lists are emitted for every event type; the
    # workflow_dispatch inputs declared under `on:` are not read here - confirm intent.
    runs-on: ubuntu-latest
    outputs:
      model_name: ${{ steps.set-matrix.outputs.model_name }}
      precision: ${{ steps.set-matrix.outputs.precision }}
      task: ${{ steps.set-matrix.outputs.task }}
      runner: ${{ steps.set-matrix.outputs.runner }}
    steps:
      # Copies the JSON lists into GITHUB_ENV so the next step can read them as
      # the environment variables model_name, task, precision and runner.
      - name: set-env
        env:
          MATRIX_MODEL_NAME: '["whisper-tiny", "whisper-small", "whisper-medium", "whisper-base"]'
          MATRIX_TASK: '["librispeech"]'
          MATRIX_PRECISION: '["sym_int4", "fp8_e5m2"]'
          LABELS: '["self-hosted", "llm", "perf"]'
        run: |
            echo "model_name=$MATRIX_MODEL_NAME" >> $GITHUB_ENV
            echo "task=$MATRIX_TASK" >> $GITHUB_ENV
            echo "precision=$MATRIX_PRECISION" >> $GITHUB_ENV
            echo "runner=$LABELS" >> $GITHUB_ENV
      # Re-exports the variables written above as step outputs, which the job-level
      # `outputs:` mapping forwards to dependent jobs.
      - name: set-matrix
        id: set-matrix
        run: |
            echo "model_name=$model_name" >> $GITHUB_OUTPUT
            echo "task=$task" >> $GITHUB_OUTPUT
            echo "precision=$precision" >> $GITHUB_OUTPUT
            echo "runner=$runner" >> $GITHUB_OUTPUT
  llm-whisper-evaluation:
    # Runs run_whisper.py once per matrix combination on the runner labels published
    # by set-matrix, then uploads the CSV files written under results/.
    # if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-whisper-evaluation' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
    needs: [llm-cpp-build, set-matrix] # please uncomment it for PR tests
    # needs: [set-matrix] # please comment it for PR tests
    strategy:
      # One failing combination does not cancel the remaining ones.
      fail-fast: false
      matrix:
        # NOTE(review): matrix.task and matrix.device are not referenced by any step
        # below; the run step passes `--data_type other` and `--device xpu` literally.
        python-version: ["3.9"]
        model_name: ${{ fromJson(needs.set-matrix.outputs.model_name) }}
        task: ${{ fromJson(needs.set-matrix.outputs.task) }}
        precision: ${{ fromJson(needs.set-matrix.outputs.precision) }}
        device: [xpu]
    runs-on: ${{ fromJson(needs.set-matrix.outputs.runner) }}
    env:
      ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
      # Directory that holds one sub-directory per model name (see MODEL_PATH below).
      ORIGIN_DIR: /mnt/disk1/models
    steps:
      - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        shell: bash
        run: |
          python -m pip install --upgrade pip
          python -m pip install --upgrade wheel
          python -m pip install --upgrade pandas
          python -m pip install --upgrade datasets
          python -m pip install --upgrade evaluate
          python -m pip install --upgrade soundfile
          python -m pip install --upgrade librosa
          python -m pip install --upgrade jiwer
      # please uncomment it and comment the "Install BigDL-LLM from Pypi" part for PR tests
      - name: Download llm binary
        uses: ./.github/actions/llm/download-llm-binary
      - name: Run LLM install (all) test
        uses: ./.github/actions/llm/setup-llm-env
        with:
          extra-dependency: "xpu_2.1"
      # - name: Install BigDL-LLM from Pypi
      #   shell: bash
      #   run: |
      #     pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
      # - name: Test installed xpu version
      #   shell: bash
      #   run: |
      #     source /opt/intel/oneapi/setvars.sh
      #     bash python/llm/test/run-llm-install-tests.sh
      # MODEL_PATH is appended to GITHUB_ENV and also assigned as a plain shell variable,
      # which is the one the python command in this step expands.
      # Keep the trailing slash: run_whisper.py derives the model name from the
      # second-to-last '/'-separated component of --model_path.
      - name: Run whisper evaluation
        shell: bash
        run: |
          source /opt/intel/oneapi/setvars.sh
          export USE_XETLA=OFF
          export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
          echo "MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/" >> "$GITHUB_ENV"
          MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/
          export LIBRISPEECH_DATASET_PATH=/mnt/disk1/datasets/librispeech
          cd python/llm/dev/benchmark/whisper
          python run_whisper.py --model_path ${MODEL_PATH} --data_type other --device xpu --load_in_low_bit ${{ matrix.precision }} --save_result
      # Every matrix job uploads under the same artifact name, which is the name the
      # llm-whisper-summary job downloads.
      - uses: actions/upload-artifact@v3
        with:
          name: whisper_results
          path:
            ${{ github.workspace }}/python/llm/dev/benchmark/whisper/results/**
  llm-whisper-summary:
    # Downloads the CSV files uploaded by llm-whisper-evaluation into a dated folder,
    # copies the files from /mnt/disk1/datasets/whisper_fp16_results next to them,
    # then merges everything into one CSV and renders it as HTML.
    # Only scheduled and pull-request runs are summarized.
    if: ${{ github.event_name == 'schedule' || github.event_name == 'pull_request' }}
    needs: [set-matrix, llm-whisper-evaluation]
    runs-on: ["self-hosted", "llm", "perf"]
    steps:
      - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML keeps the version a string instead of parsing it as a float
          # (an unquoted 3.10 would silently become 3.1).
          python-version: "3.9"
      # Exports OUTPUT_PATH, NIGHTLY_FOLDER and PR_FOLDER for the steps below.
      - name: Set output path
        shell: bash
        run: |
          DATE=$(date +%Y-%m-%d)
          OUTPUT_PATH="results_$DATE"
          echo "OUTPUT_PATH=$OUTPUT_PATH" >> $GITHUB_ENV
          NIGHTLY_FOLDER="/mnt/disk1/whisper_nightly_gpu"
          echo "NIGHTLY_FOLDER=$NIGHTLY_FOLDER" >> $GITHUB_ENV
          PR_FOLDER="/mnt/disk1/whisper_pr_gpu"
          echo "PR_FOLDER=$PR_FOLDER" >> $GITHUB_ENV
      - name: Download all results for nightly run
        if: github.event_name == 'schedule'
        uses: actions/download-artifact@v3
        with:
          name: whisper_results
          path: ${{ env.NIGHTLY_FOLDER }}/${{ env.OUTPUT_PATH }}
      - name: Download all results for pr run
        if: github.event_name == 'pull_request'
        uses: actions/download-artifact@v3
        with:
          name: whisper_results
          path: ${{ env.PR_FOLDER }}/${{ env.OUTPUT_PATH }}
      # The copy destination uses NIGHTLY_FOLDER (same value as the previously
      # hard-coded path) so the folder is defined in exactly one place.
      - name: Summarize the results for nightly run
        if: github.event_name == 'schedule'
        shell: bash
        run: |
          cp -r /mnt/disk1/datasets/whisper_fp16_results/* ${{ env.NIGHTLY_FOLDER }}/${{ env.OUTPUT_PATH }}
          pip install pandas==1.5.3
          python ${{ github.workspace }}/python/llm/dev/benchmark/whisper/whisper_concat_csv.py -i ${{ env.NIGHTLY_FOLDER }}/${{ env.OUTPUT_PATH }} -o ${{ env.NIGHTLY_FOLDER }}
          python ${{ github.workspace }}/python/llm/dev/benchmark/whisper/whisper_csv_to_html.py -f ${{ env.NIGHTLY_FOLDER }}
      # Same as the nightly summary, but rooted at PR_FOLDER.
      - name: Summarize the results for pull request
        if: github.event_name == 'pull_request'
        shell: bash
        run: |
          cp -r /mnt/disk1/datasets/whisper_fp16_results/* ${{ env.PR_FOLDER }}/${{ env.OUTPUT_PATH }}
          pip install pandas==1.5.3
          python ${{ github.workspace }}/python/llm/dev/benchmark/whisper/whisper_concat_csv.py -i ${{ env.PR_FOLDER }}/${{ env.OUTPUT_PATH }} -o ${{ env.PR_FOLDER }}
          python ${{ github.workspace }}/python/llm/dev/benchmark/whisper/whisper_csv_to_html.py -f ${{ env.PR_FOLDER }}
--- a/python/llm/dev/benchmark/whisper/run_whisper.py
+++ b/python/llm/dev/benchmark/whisper/run_whisper.py
@ -21,12 +21,20 @@ import torch
 from evaluate import load
 import time
 import argparse
 import pandas as pd
 import os
 import csv
 from datetime import date
 current_dir = os.path.dirname(os.path.realpath(__file__))
 def get_args():
    """Parse the command-line options of the Whisper benchmark script.

    Returns:
        argparse.Namespace carrying ``model_path`` and ``data_type`` (both
        required), ``device`` (optional, no default), ``load_in_low_bit``
        (defaults to ``'sym_int4'``) and the ``save_result`` flag.
    """
    # One (flag, keyword-arguments) entry per option, registered in order below.
    option_specs = (
        ('--model_path', dict(required=True, help='pretrained model path')),
        ('--data_type', dict(required=True, help='clean, other')),
        ('--device', dict(required=False, help='cpu, xpu')),
        ('--load_in_low_bit', dict(default='sym_int4', help='Specify whether to load data in low bit format (e.g., 4-bit)')),
        ('--save_result', dict(action='store_true', help='Save the results to a CSV file')),
    )
    cli = argparse.ArgumentParser(description="Evaluate Whisper performance and accuracy")
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
@ -40,7 +48,7 @@ if __name__ == '__main__':
    processor = WhisperProcessor.from_pretrained(args.model_path)
    forced_decoder_ids = processor.get_decoder_prompt_ids(language='en', task='transcribe')
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(args.model_path, load_in_low_bit="sym_int4", optimize_model=True).eval().to(args.device)
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(args.model_path, load_in_low_bit=args.load_in_low_bit, optimize_model=True).eval().to(args.device)
    model.config.forced_decoder_ids = None
    def map_to_pred(batch):
@ -67,6 +75,24 @@ if __name__ == '__main__':
    wer = load("./wer")
    speech_length = sum(result["length"][1:])
    prc_time = sum(result["time"][1:])
-    print("Realtime Factor(RTF) is : %.4f" % (prc_time/speech_length))
+
-    print("Realtime X(RTX) is : %.2f" % (speech_length/prc_time))
+    MODEL = args.model_path.split('/')[-2]
-    print(f'WER is {100 * wer.compute(references=result["reference"], predictions=result["prediction"])}')
+    RTF = prc_time/speech_length
    RTX = speech_length/prc_time
    WER = 100 * wer.compute(references=result["reference"], predictions=result["prediction"])
    today = date.today()
    if args.save_result:
        csv_name = f'{current_dir}/results/{MODEL}-{args.data_type}-{args.device}-{args.load_in_low_bit}-{today}.csv'
        os.makedirs(os.path.dirname(csv_name), exist_ok=True)
        with open(csv_name, mode='a', newline='') as file:
            csv_writer = csv.writer(file)
            file.seek(0, os.SEEK_END)
            if file.tell() == 0:
                csv_writer.writerow(["models","precision","WER","RTF"])
            csv_writer.writerow([MODEL, args.load_in_low_bit, WER, RTF])
        print(f'Results saved to {csv_name}')
    print("Realtime Factor(RTF) is : %.4f" % RTF)
    print("Realtime X(RTX) is : %.2f" % RTX)
    print(f'WER is {WER}')
--- a/python/llm/dev/benchmark/whisper/whisper_concat_csv.py
+++ b/python/llm/dev/benchmark/whisper/whisper_concat_csv.py
@ -0,0 +1,50 @@
 #
 # Copyright 2016 The BigDL Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Python program to concat CSVs
 import os
 import sys
 import argparse
 import pandas as pd
 from datetime import date
 def main():
    """Merge every CSV file of a directory into one dated CSV file.

    Command-line options:
        -i/--input_path:  directory holding the original CSV files (default "./").
        -o/--output_path: directory receiving ``whisper-<today>.csv`` (default "./");
                          it is created when missing.

    Returns:
        0 on success, 1 when the input directory contains no ``.csv`` file.
    """
    parser = argparse.ArgumentParser(description="concat .csv files")
    parser.add_argument("-i", "--input_path", type=str, dest="input_path",
                        help="The directory which stores the original CSV files", default="./")
    parser.add_argument("-o", "--output_path", type=str, dest="output_path",
                        help="The directory which stores the concated CSV file", default="./")
    args = parser.parse_args()

    # Only regular files directly inside input_path are considered; sorting keeps
    # the row order of the merged file stable between runs.
    csv_files = sorted(
        os.path.join(args.input_path, file_name)
        for file_name in os.listdir(args.input_path)
        if file_name.endswith(".csv") and os.path.isfile(os.path.join(args.input_path, file_name))
    )
    if not csv_files:
        # pd.concat() raises an opaque ValueError on an empty list; report the real cause.
        print(f"No .csv file found in {args.input_path}", file=sys.stderr)
        return 1

    # ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.
    merged_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

    if args.output_path:
        os.makedirs(args.output_path, exist_ok=True)
    today = date.today()
    csv_name = f'whisper-{today}.csv'
    output_file_path = os.path.join(args.output_path, csv_name)
    # The index is written on purpose: whisper_csv_to_html.py reads it back with index_col=0.
    merged_df.to_csv(output_file_path)
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/python/llm/dev/benchmark/whisper/whisper_csv_to_html.py
+++ b/python/llm/dev/benchmark/whisper/whisper_csv_to_html.py
@ -0,0 +1,113 @@
 #
 # Copyright 2016 The BigDL Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Python program to convert CSV to HTML Table
 import os
 import sys
 import argparse
 import pandas as pd
 def highlight_vals(val, max=3.0, color1='red', color2='green'):
    """Return the CSS background style for one table cell.

    Args:
        val: the cell value; only ``float`` values are ever highlighted.
        max: highlight threshold (name kept for keyword callers although it
            shadows the builtin).
        color1: background color used when ``val <= -max``.
        color2: background color used when ``val > max``.

    Returns:
        A ``'background-color: <color>'`` string for out-of-band floats and an
        empty string for everything else.
    """
    if isinstance(val, float):
        if val > max:
            return 'background-color: %s' % color2
        if val <= -max:
            return 'background-color: %s' % color1
    # Always hand back a string: floats inside the band (and NaN) used to fall
    # through and return None instead of an empty style.
    return ''
 def main():
    """Render the newest CSV of a folder as an HTML table.

    The CSV files directly inside the folder are sorted by path in descending
    order; the first one is the "latest" report and, when present, the second one
    is the "previous" report. With a previous report, each latest row is matched
    on (models, precision) and the columns ``last1``/``diff1(%)`` (WER) and
    ``last2``/``diff2(%)`` (RTF) are added, with the diff columns highlighted
    beyond the threshold. The HTML file is written next to the latest CSV.

    Command-line options:
        -f/--folder_path: directory holding the CSV files.
        -t/--threshold:   highlight threshold for the diff columns (default 1.0).

    Returns:
        0 on success, 1 when the folder contains no ``.csv`` file.
    """
    parser = argparse.ArgumentParser(description="convert .csv file to .html file")
    parser.add_argument("-f", "--folder_path", type=str, dest="folder_path",
                        help="The directory which stores the .csv file", default="/mnt/disk1/whisper_pr_gpu/")
    parser.add_argument("-t", "--threshold", type=float, dest="threshold",
                        help="the threshold of highlight values", default=1.0)
    args = parser.parse_args()

    csv_files = sorted(
        (os.path.join(args.folder_path, file_name)
         for file_name in os.listdir(args.folder_path)
         if file_name.endswith(".csv") and os.path.isfile(os.path.join(args.folder_path, file_name))),
        reverse=True,
    )
    if not csv_files:
        print(f"No .csv file found in {args.folder_path}", file=sys.stderr)
        return 1

    latest_csv = pd.read_csv(csv_files[0], index_col=0)
    # splitext only strips the extension; splitting on the first "." would cut
    # the path short whenever a directory or file name contains another dot.
    daily_html = os.path.splitext(csv_files[0])[0] + ".html"

    if len(csv_files) > 1:
        previous_csv = pd.read_csv(csv_files[1], index_col=0)

        # (model, precision) -> (WER, RTF) of the previous report. Rows whose
        # metrics are not strictly positive are skipped (they cannot serve as a
        # divisor); a later duplicate row replaces an earlier one.
        reference = {}
        for _, prev_row in previous_csv.iterrows():
            prev_wer = prev_row['WER']
            prev_rtf = prev_row['RTF']
            if prev_wer > 0.0 and prev_rtf > 0.0:
                key = (prev_row['models'].strip(), prev_row['precision'].strip())
                reference[key] = (prev_wer, prev_rtf)

        last1, diff1, last2, diff2 = [], [], [], []
        # Values are appended in row order, so the result does not depend on the
        # index labels of the latest CSV.
        for _, row in latest_csv.iterrows():
            key = (row['models'].strip(), row['precision'].strip())
            if key in reference:
                prev_wer, prev_rtf = reference[key]
                last1.append(prev_wer)
                diff1.append(round((prev_wer - row['WER']) * 100 / prev_wer, 2))
                last2.append(prev_rtf)
                diff2.append(round((prev_rtf - row['RTF']) * 100 / prev_rtf, 2))
            else:
                last1.append(pd.NA)
                diff1.append(pd.NA)
                last2.append(pd.NA)
                diff2.append(pd.NA)

        latest_csv.insert(loc=4, column='last1', value=last1)
        latest_csv.insert(loc=5, column='diff1(%)', value=diff1)
        latest_csv.insert(loc=6, column='last2', value=last2)
        latest_csv.insert(loc=7, column='diff2(%)', value=diff2)

        subset1 = ['diff1(%)', 'diff2(%)']
        columns = {'WER': '{:.6f}', 'RTF': '{:.6f}', 'last1': '{:.6f}', 'diff1(%)': '{:.6f}',
                   'last2': '{:.6f}', 'diff2(%)': '{:.6f}'}
        # Honor --threshold (its default equals the previously hard-coded 1.0).
        styled_df = latest_csv.style.format(columns).applymap(
            lambda val: highlight_vals(val, max=args.threshold, color1='red', color2='green'),
            subset=subset1)
        styled_df = styled_df.set_table_attributes("border=1")
        # Styler.render() was deprecated in pandas 1.4 and removed in 2.0;
        # fall back to it only on versions that predate Styler.to_html().
        to_html = getattr(styled_df, "to_html", None) or styled_df.render
        html_output = to_html()
        with open(daily_html, 'w') as f:
            f.write(html_output)
    else:
        latest_csv.to_html(daily_html)
    return 0
 if __name__ == "__main__":
    sys.exit(main())