diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index 6cc9802c..96473621 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -8,7 +8,7 @@ concurrency: # Controls when the action will run. on: schedule: - - cron: "00 16 * * *" # GMT time, 16:00 GMT == 00:00 China + - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China pull_request: branches: [main] paths: @@ -52,7 +52,7 @@ jobs: if: ${{github.event_name == 'schedule'}} env: NIGHTLY_MATRIX_MODEL_NAME: '["Llama2-7b-guanaco-dolphin-500", "falcon-7b-instruct-with-patch", - "Mistral-7B-v0.1", "mpt-7b-chat", "Baichuan2-7B-Chat-LLaMAfied", "stablelm-3b"]' + "Mistral-7B-v0.1", "mpt-7b-chat", "Baichuan2-7B-Chat-LLaMAfied"]' NIGHTLY_MATRIX_TASK: '["arc", "truthfulqa", "winogrande"]' NIGHTLY_MATRIX_PRECISION: '["sym_int4", "fp8"]' NIGHTLY_LABELS: '["self-hosted", "llm", "accuracy-nightly"]' @@ -65,9 +65,9 @@ jobs: - name: set-pr-env if: ${{github.event_name == 'pull_request'}} env: - PR_MATRIX_MODEL_NAME: '["stablelm-3b-4e1t", "Mistral-7B-v0.1"]' + PR_MATRIX_MODEL_NAME: '["Llama2-7b-guanaco-dolphin-500"]' PR_MATRIX_TASK: '["truthfulqa"]' - PR_MATRIX_PRECISION: '["fp8"]' + PR_MATRIX_PRECISION: '["sym_int4"]' PR_LABELS: '["self-hosted", "llm", "temp-arc01"]' run: | echo "model_name=$PR_MATRIX_MODEL_NAME" >> $GITHUB_ENV @@ -99,11 +99,6 @@ jobs: strategy: fail-fast: false matrix: - # include: - # python-version: "3.9" - # model_name: "stablelm-3b-4e1t" - # task: "arc" - # precision: "sym_int4" #options: sym_int4, fp4, mixed_fp4, sym_int8, fp8, mixed_fp8 python-version: ["3.9"] model_name: ${{ fromJson(needs.set-matrix.outputs.model_name) }} task: ${{ fromJson(needs.set-matrix.outputs.task) }} @@ -111,6 +106,8 @@ jobs: device: [xpu] runs-on: ${{ fromJson(needs.set-matrix.outputs.runner) }} + outputs: + output_path: ${{ steps.run_harness.outputs.output_path }} env: ANALYTICS_ZOO_ROOT: ${{ github.workspace }} 
ORIGIN_DIR: /mnt/disk1/models @@ -176,24 +173,16 @@ jobs: export HF_DATASETS=$HARNESS_HF_HOME/datasets export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets source /opt/intel/oneapi/setvars.sh + python run_llb.py \ - --model bigdl-llm \ - --pretrained ${MODEL_PATH} \ - --precision ${{ matrix.precision }} \ - --device ${{ matrix.device }} \ - --tasks ${{ matrix.task }} \ - --batch_size 1 --no_cache --output_path results + --model bigdl-llm \ + --pretrained ${MODEL_PATH} \ + --precision ${{ matrix.precision }} \ + --device ${{ matrix.device }} \ + --tasks ${{ matrix.task }} \ + --batch_size 1 --no_cache --output_path results - - name: Compare with golden accuracy - shell: bash - if: ${{github.event_name == 'schedule'}} - working-directory: ${{ github.workspace }}/python/llm - run: | - python test/benchmark/harness_nightly/accuracy_regression.py \ - dev/benchmark/harness/results/${{ matrix.model_name }}/${{ matrix.device }}/${{ matrix.precision }}/${{ matrix.task }}/result.json \ - test/benchmark/harness_nightly/golden_results.json - - uses: actions/upload-artifact@v3 with: name: harness_results @@ -222,13 +211,18 @@ jobs: run: | pip install --upgrade pip pip install jsonlines pytablewriter regex + + DATE=$(date +%Y-%m-%d) + OUTPUT_PATH="results_$DATE" + echo "OUTPUT_PATH=$OUTPUT_PATH" >> $GITHUB_ENV - name: Download all results uses: actions/download-artifact@v3 with: name: harness_results - path: results + path: ${{ env.OUTPUT_PATH }} - name: Summarize the results shell: bash run: | - ls results - python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_results.py results + echo ${{ env.OUTPUT_PATH }} + ls ${{ env.OUTPUT_PATH }} + python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_results.py ${{ env.OUTPUT_PATH }} \ No newline at end of file