diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index ced9d75d..511c3e4f 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -9,72 +9,23 @@ concurrency:
 on:
   schedule:
     - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China
-  pull_request:
-    branches: [main]
-    paths:
-      - ".github/workflows/llm_performance_tests.yml"
-      - "python/llm/test/benchmark/**"
-      - "python/llm/dev/benchmark/all-in-one/**"
+  # pull_request:
+  #   branches: [main]
+  #   paths:
+  #     - ".github/workflows/llm_performance_tests.yml"
+  #     - "python/llm/test/benchmark/**"
+  #     - "python/llm/dev/benchmark/all-in-one/**"
   workflow_dispatch:
   workflow_call:
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
   llm-cpp-build:
+    if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-cpp-build' || github.event.inputs.artifact == 'all' }}
     uses: ./.github/workflows/llm-binary-build.yml
-  llm-performance-test:
-    if: false # skip cpu performance test for now; may add it back with separated runner
-    needs: llm-cpp-build
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.9"]
-        instruction: ["AVX512"]
-    runs-on: [self-hosted, llm, perf]
-    env:
-      THREAD_NUM: 24
-    steps:
-      - name: Set environment variables
-        shell: bash
-        run: |
-          echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
-      - uses: actions/checkout@v2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install --upgrade setuptools==58.0.4
-          python -m pip install --upgrade wheel
-      - name: Download llm binary
-        uses: ./.github/actions/llm/download-llm-binary
-
-      - name: Run LLM install (all) test
-        uses: ./.github/actions/llm/setup-llm-env
-        env:
-          ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
-
-      - name: Download LLMs
-        shell: bash
-        run: |
-          if [ ! -d $LLAMA2_7B_ORIGIN_PATH ]; then
-            echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
-            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR
-          fi
-
-      - name: Run LLM Performance test
-        env:
-          ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
-        run: bash python/llm/dev/benchmark/run-benchmark-tests.sh
-
-      # - name: Clean up test environment
-      #   uses: ./.github/actions/llm/remove-llm-env
-      #   env:
-      #     ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
 
   llm-performance-test-on-arc:
+    if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }}
     needs: llm-cpp-build
     strategy:
       fail-fast: false
@@ -142,6 +93,7 @@ jobs:
           fi
 
   llm-performance-test-on-spr:
+    if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-spr' || github.event.inputs.artifact == 'all' }}
     needs: llm-cpp-build
     strategy:
       fail-fast: false
@@ -190,6 +142,7 @@ jobs:
           python csv_to_html.py -f /mnt/disk1/nightly_perf_cpu/
 
   llm-performance-test-on-core:
+    if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-core' || github.event.inputs.artifact == 'all' }}
     needs: llm-cpp-build
     strategy:
       fail-fast: false
diff --git a/python/llm/dev/benchmark/pipelines/llama2_test.py b/python/llm/dev/benchmark/pipelines/llama2_test.py
deleted file mode 100644
index 0eadf9e2..00000000
--- a/python/llm/dev/benchmark/pipelines/llama2_test.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-
-# this code is copied from llama2 example test, and added performance test
-import torch
-import time
-import argparse
-
-from bigdl.llm.transformers import AutoModelForCausalLM
-from transformers import LlamaTokenizer
-
-
-import os
-benchmark_util_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
-import sys
-sys.path.append(benchmark_util_path)
-from benchmark_util import BenchmarkWrapper
-
-# you could tune the prompt based on your own model,
-# here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style
-LLAMA2_PROMPT_FORMAT = """### HUMAN:
-{prompt}
-
-### RESPONSE:
-"""
-
-if __name__ == '__main__':
-
-    parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model')
-    parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
-                        help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-chat-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
-                             ', or the path to the huggingface checkpoint folder')
-    parser.add_argument('--prompt', type=str, default="What is AI?",
-                        help='Prompt to infer')
-    parser.add_argument('--n-predict', type=int, default=32,
-                        help='Max tokens to predict')
-
-    args = parser.parse_args()
-    model_path = args.repo_id_or_model_path
-
-    # Load model in 4 bit,
-    # which convert the relevant layers in the model into INT4 format
-    model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                 load_in_4bit=True,
-                                                 trust_remote_code=True)
-
-
-    model = BenchmarkWrapper(model, do_print=False)
-
-    # Load tokenizer
-    tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
-
-    # Generate predicted tokens
-    with torch.inference_mode():
-        prompt = LLAMA2_PROMPT_FORMAT.format(prompt=args.prompt)
-        input_ids = tokenizer.encode(prompt, return_tensors="pt")
-        st = time.time()
-        # if your selected model is capable of utilizing previous key/value attentions
-        # to enhance decoding speed, but has `"use_cache": false` in its model config,
-        # it is important to set `use_cache=True` explicitly in the `generate` function
-        # to obtain optimal performance with BigDL-LLM INT4 optimizations
-        output = model.generate(input_ids,
-                                max_new_tokens=args.n_predict)
-        end = time.time()
-        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
-        print(f'Inference time: {end-st} s')
-        print('-'*20, 'Prompt', '-'*20)
-        print(prompt)
-        print('-'*20, 'Output', '-'*20)
-        print(output_str)
-
-        assert "AI is a term" in output_str, "output is not as expected, the correctness may be wrong."
-        llama2_baseline = os.getenv('LLAMA2_BASELINE')
-        if llama2_baseline is None:
-            print('baseline is not set, skipping baseline validation')
-        else:
-            llama2_baseline = float(llama2_baseline)
-            ratio = model.rest_cost_mean / llama2_baseline
-            assert ratio < 1.1, f"performance did not meet baseline, the cost is {(ratio - 1) * 100}% higher than the baseline"
-
diff --git a/python/llm/dev/benchmark/run-benchmark-tests.sh b/python/llm/dev/benchmark/run-benchmark-tests.sh
deleted file mode 100644
index 5ec5c489..00000000
--- a/python/llm/dev/benchmark/run-benchmark-tests.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-# Performance tests usually use dedicated machines, see below to set env vars, e.g. model paths
-# The following environment variables should be ready
-#   ORIGINAL_LLAMA2_PATH
-#   LLAMA2_BASELINE
-#   LLM_DIR
-
-if [ -z "$THREAD_NUM" ]; then
-  THREAD_NUM=2
-fi
-export OMP_NUM_THREADS=$THREAD_NUM
-
-######## LLAMA2
-# transformers
-
-echo ">>> Testing LLAMA2 transformers API"
-taskset -c 0-$((THREAD_NUM - 1)) python python/llm/dev/benchmark/pipelines/llama2_test.py --repo-id-or-model-path $LLAMA2_7B_ORIGIN_PATH
-
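
Note: the `if:` conditions added above gate each job on `github.event.schedule` or on `github.event.inputs.artifact`, which presumes a `workflow_dispatch` input named `artifact`. That input declaration is not part of this diff; as a rough sketch only (names, description, and default are assumptions, not taken from this change), it could look like:

on:
  workflow_dispatch:
    inputs:
      artifact:
        # Assumed values: 'all', 'llm-cpp-build', or one of the llm-performance-test-on-* job names
        description: "Which job(s) to run when triggered manually"
        required: false
        default: "all"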