Modify harness evaluation workflow (#10174)
* Modify table head in harness * Specify the file path of fp16.csv * Change run to "run nightly" and "run pr" to debug * Change the way fp16.csv is obtained to downloading it from GitHub * Change the method to calculate diff in html table * Change the method to calculate diff in html table * Re-arrange job order * Re-arrange job order * Change limit * Change fp16.csv path * Change highlight rules * Change limit
This commit is contained in:
		
							parent
							
								
									b55fd00fb1
								
							
						
					
					
						commit
						de3dc609ee
					
				
					 2 changed files with 49 additions and 14 deletions
				
			
		
							
								
								
									
										36
									
								
								.github/workflows/llm-harness-evaluation.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										36
									
								
								.github/workflows/llm-harness-evaluation.yml
									
									
									
									
										vendored
									
									
								
							| 
						 | 
				
			
			@ -166,7 +166,8 @@ jobs:
 | 
			
		|||
          fi
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
      - name: Run harness
 | 
			
		||||
      - name: Run harness nightly
 | 
			
		||||
        if: ${{github.event_name == 'schedule'}}
 | 
			
		||||
        shell: bash
 | 
			
		||||
        working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness
 | 
			
		||||
        env:
 | 
			
		||||
| 
						 | 
				
			
			@ -185,6 +186,28 @@ jobs:
 | 
			
		|||
            --device ${{ matrix.device }} \
 | 
			
		||||
            --tasks ${{ matrix.task }} \
 | 
			
		||||
            --batch_size 1 --no_cache --output_path results \
 | 
			
		||||
      
 | 
			
		||||
      - name: Run harness pr
 | 
			
		||||
        if: ${{github.event_name == 'pull_request'}}
 | 
			
		||||
        shell: bash
 | 
			
		||||
        working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness
 | 
			
		||||
        env:
 | 
			
		||||
          USE_XETLA: OFF
 | 
			
		||||
          # SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS: 1
 | 
			
		||||
        run: |
 | 
			
		||||
          export HF_HOME=${HARNESS_HF_HOME}
 | 
			
		||||
          export HF_DATASETS=$HARNESS_HF_HOME/datasets
 | 
			
		||||
          export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
 | 
			
		||||
          source /opt/intel/oneapi/setvars.sh
 | 
			
		||||
          
 | 
			
		||||
          python run_llb.py \
 | 
			
		||||
            --model bigdl-llm \
 | 
			
		||||
            --pretrained ${MODEL_PATH} \
 | 
			
		||||
            --precision ${{ matrix.precision }} \
 | 
			
		||||
            --device ${{ matrix.device }} \
 | 
			
		||||
            --tasks ${{ matrix.task }} \
 | 
			
		||||
            --batch_size 1 --no_cache --output_path results \
 | 
			
		||||
            --limit 3 \
 | 
			
		||||
 | 
			
		||||
      - uses: actions/upload-artifact@v3
 | 
			
		||||
        with:
 | 
			
		||||
| 
						 | 
				
			
			@ -226,7 +249,7 @@ jobs:
 | 
			
		|||
          python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py results
 | 
			
		||||
 | 
			
		||||
  # TODO: change machine to store the results later        
 | 
			
		||||
  llm-harness-summary-nightly:
 | 
			
		||||
  llm-harness-summary-html:
 | 
			
		||||
    if: ${{github.event_name == 'schedule' || github.event_name == 'pull_request'}}
 | 
			
		||||
    needs: [set-matrix, llm-harness-evaluation]
 | 
			
		||||
    runs-on: ["self-hosted", "llm", "accuracy1", "accuracy-nightly"]
 | 
			
		||||
| 
						 | 
				
			
			@ -267,6 +290,13 @@ jobs:
 | 
			
		|||
          name: harness_results
 | 
			
		||||
          path: ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }}
 | 
			
		||||
 | 
			
		||||
      # Save fp16.csv in the parent folder of env.nightly_folder
 | 
			
		||||
      - name: Download fp16.csv for summary
 | 
			
		||||
        shell: bash
 | 
			
		||||
        run: |
 | 
			
		||||
          wget https://raw.githubusercontent.com/intel-analytics/BigDL/main/python/llm/dev/benchmark/harness/fp16.csv -O ${{ env.NIGHTLY_FOLDER}}/../fp16.csv
 | 
			
		||||
          ls ${{ env.NIGHTLY_FOLDER}}/..
 | 
			
		||||
 | 
			
		||||
      - name: Summarize the results for nightly run
 | 
			
		||||
        if: github.event_name == 'schedule'
 | 
			
		||||
        shell: bash
 | 
			
		||||
| 
						 | 
				
			
			@ -275,7 +305,7 @@ jobs:
 | 
			
		|||
          pip install pandas==1.5.3
 | 
			
		||||
          python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }} ${{ env.NIGHTLY_FOLDER}}
 | 
			
		||||
          python ${{ github.workspace }}/python/llm/dev/benchmark/harness/harness_csv_to_html.py -f ${{ env.NIGHTLY_FOLDER}}
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
      - name: Summarize the results for pull request
 | 
			
		||||
        if: github.event_name == 'pull_request'
 | 
			
		||||
        shell: bash
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -21,12 +21,14 @@ import sys
 | 
			
		|||
import argparse
 | 
			
		||||
import pandas as pd
 | 
			
		||||
 | 
			
		||||
def highlight_vals(val, max=3.0, color1='red', color2='green'):
 | 
			
		||||
def highlight_vals(val, max=3.0, color1='red', color2='green', color3='yellow'):
 | 
			
		||||
    if isinstance(val, float):
 | 
			
		||||
        if val > max:
 | 
			
		||||
            return 'background-color: %s' % color2
 | 
			
		||||
        elif val <= -max:
 | 
			
		||||
            return 'background-color: %s' % color1
 | 
			
		||||
        elif val != 0.0:
 | 
			
		||||
            return 'background-color: %s' % color3
 | 
			
		||||
    else:
 | 
			
		||||
        return ''
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -80,7 +82,10 @@ def main():
 | 
			
		|||
                        help="the baseline path which stores the baseline.csv file")
 | 
			
		||||
    args = parser.parse_args()
 | 
			
		||||
 | 
			
		||||
    fp16_dict = create_fp16_dict('fp16.csv')
 | 
			
		||||
    # fp16.csv is downloaded previously under the parent folder of the folder_path
 | 
			
		||||
    parent_dir = os.path.dirname((args.folder_path))
 | 
			
		||||
    fp16_path = os.path.join(parent_dir, 'fp16.csv')
 | 
			
		||||
    fp16_dict = create_fp16_dict(fp16_path)
 | 
			
		||||
 | 
			
		||||
    csv_files = []
 | 
			
		||||
    for file_name in os.listdir(args.folder_path):
 | 
			
		||||
| 
						 | 
				
			
			@ -157,11 +162,11 @@ def main():
 | 
			
		|||
                    previous_winogrande=previous_csv_row[Winogrande]
 | 
			
		||||
                    if previous_arc > 0.0 and previous_truthfulqa > 0.0 and previous_winogrande > 0.0:
 | 
			
		||||
                        last_Arc[latest_csv_ind]=previous_arc
 | 
			
		||||
                        diff_Arc[latest_csv_ind]=round((previous_arc-latest_arc)*100/previous_arc,2)
 | 
			
		||||
                        diff_Arc[latest_csv_ind]=round((latest_arc-previous_arc)*100/previous_arc,2)
 | 
			
		||||
                        last_TruthfulQA[latest_csv_ind]=previous_truthfulqa
 | 
			
		||||
                        diff_TruthfulQA[latest_csv_ind]=round((previous_truthfulqa-latest_truthfulqa)*100/previous_truthfulqa,2)
 | 
			
		||||
                        diff_TruthfulQA[latest_csv_ind]=round((latest_truthfulqa-previous_truthfulqa)*100/previous_truthfulqa,2)
 | 
			
		||||
                        last_Winogrande[latest_csv_ind]=previous_winogrande
 | 
			
		||||
                        diff_Winogrande[latest_csv_ind]=round((previous_winogrande-latest_winogrande)*100/previous_winogrande,2)
 | 
			
		||||
                        diff_Winogrande[latest_csv_ind]=round((latest_winogrande-previous_winogrande)*100/previous_winogrande,2)
 | 
			
		||||
                        in_previous_flag=True
 | 
			
		||||
 | 
			
		||||
            if not in_previous_flag:
 | 
			
		||||
| 
						 | 
				
			
			@ -172,12 +177,12 @@ def main():
 | 
			
		|||
                last_Winogrande[latest_csv_ind]=pd.NA
 | 
			
		||||
                diff_Winogrande[latest_csv_ind]=pd.NA
 | 
			
		||||
 | 
			
		||||
        latest_csv.insert(loc=5,column='last_Arc',value=last_Arc)
 | 
			
		||||
        latest_csv.insert(loc=6,column='diff_Arc(%)',value=diff_Arc)
 | 
			
		||||
        latest_csv.insert(loc=7,column='last_TruthfulQA',value=last_TruthfulQA)
 | 
			
		||||
        latest_csv.insert(loc=8,column='diff_TruthfulQA(%)',value=diff_TruthfulQA)
 | 
			
		||||
        latest_csv.insert(loc=9,column='last_Winogrande',value=last_Winogrande)
 | 
			
		||||
        latest_csv.insert(loc=10,column='diff_Winogrande(%)',value=diff_Winogrande)
 | 
			
		||||
        latest_csv.insert(loc=6,column='last_Arc',value=last_Arc)
 | 
			
		||||
        latest_csv.insert(loc=7,column='diff_Arc(%)',value=diff_Arc)
 | 
			
		||||
        latest_csv.insert(loc=8,column='last_TruthfulQA',value=last_TruthfulQA)
 | 
			
		||||
        latest_csv.insert(loc=9,column='diff_TruthfulQA(%)',value=diff_TruthfulQA)
 | 
			
		||||
        latest_csv.insert(loc=10,column='last_Winogrande',value=last_Winogrande)
 | 
			
		||||
        latest_csv.insert(loc=11,column='diff_Winogrande(%)',value=diff_Winogrande)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        diffs_within_normal_range = is_diffs_within_normal_range(diff_Arc, diff_TruthfulQA, diff_Winogrande, threshold=highlight_threshold)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue