bigdl-llm stress test for stable version (#9781)

* 1k-512 2k-512 baseline

* add cpu stress test

* update yaml name

* update

* update

* clean up

* test

* update

* update

* update

* test

* update
Shaojun Liu 2023-12-27 15:40:53 +08:00 committed by GitHub
parent 5cfb4c4f5b
commit 6c75c689ea
6 changed files with 850 additions and 7 deletions


@@ -1,4 +1,4 @@
-name: LLM Performance Test for Stable Version
+name: LLM Test for Stable Version

# Cancel previous runs in the PR when you push new commits
concurrency:
@@ -21,7 +21,7 @@ jobs:
  llm-cpp-build:
    uses: ./.github/workflows/llm-binary-build.yml
-  llm-performance-test-on-arc:
+  llm-perf-regression-test-on-arc:
    needs: llm-cpp-build
    strategy:
      fail-fast: false
@@ -104,7 +104,7 @@ jobs:
          python csv_to_html.py -f $CSV_SAVE_PATH/fp8 -b $CSV_SAVE_PATH/fp8/transformer_int4_gpu-results-1baseline.csv -t 5.0
-  llm-performance-test-on-spr:
+  llm-perf-regression-test-on-spr:
    needs: llm-cpp-build
    strategy:
      fail-fast: false
@@ -152,9 +152,61 @@ jobs:
          # hide time info
          sed -i 's/str(end - st)/"xxxxxx"/g' run.py
          python run.py
-          cp ./*.csv /models/nightly_perf_cpu/
+          cp ./*.csv /models/stable_version_perf_regression_test_cpu/
          cd ../../../test/benchmark
          python -m pip install pandas==1.5.3
-          python csv_to_html.py -f /models/nightly_perf_cpu/ -b /models/nightly_perf_cpu/transformer_int4-results-1baseline.csv -t 5.0
+          python csv_to_html.py -f /models/stable_version_perf_regression_test_cpu/ -b /models/stable_version_perf_regression_test_cpu/transformer_int4-results-1baseline.csv -t 5.0
llm-stress-test-on-spr:
needs: llm-perf-regression-test-on-spr
strategy:
fail-fast: false
matrix:
python-version: ["3.9"]
runs-on: [self-hosted, llm, spr01-perf]
env:
OMP_NUM_THREADS: 16
THREAD_NUM: 16
ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
shell: bash
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade wheel
python -m pip install --upgrade omegaconf
python -m pip install --upgrade pandas
python -m pip install --upgrade einops
python -m pip install --upgrade tiktoken
python -m pip install --upgrade transformers_stream_generator
- name: Download llm binary
uses: ./.github/actions/llm/download-llm-binary
- name: Run LLM install (all) test
uses: ./.github/actions/llm/setup-llm-env
- name: Test on cpu
shell: bash
run: |
mv python/llm/test/benchmark/stable-version-cpu-stress-test.yaml python/llm/dev/benchmark/all-in-one/config.yaml
cd python/llm/dev/benchmark/all-in-one
export http_proxy=${HTTP_PROXY}
export https_proxy=${HTTPS_PROXY}
source bigdl-llm-init -t
export OMP_NUM_THREADS=48
# hide time info
sed -i 's/str(end - st)/"xxxxxx"/g' run-stress-test.py
python run-stress-test.py
cp ./*.csv /models/stable_version_stress_test_cpu/
cd ../../../test/benchmark
python -m pip install pandas==1.5.3
python csv_to_html.py -f /models/stable_version_stress_test_cpu/
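
Note that this new stress-test job mirrors the perf-regression job but feeds the stress-test config and skips the baseline comparison (csv_to_html.py is called without -b or -t). For a rough local reproduction of the "hide time info" step, a Python equivalent of the sed command might look like the sketch below; the checkout-relative path is an assumption, not part of the workflow:

from pathlib import Path

# Redact raw wall-clock prints in run-stress-test.py, as the workflow's sed step does.
script = Path("python/llm/dev/benchmark/all-in-one/run-stress-test.py")
script.write_text(script.read_text().replace("str(end - st)", '"xxxxxx"'))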

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -0,0 +1,256 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This code is adapted from the llama2 example test, with performance testing added.
import torch
import time
import gc
import traceback
import threading
import numpy as np
from datetime import date
import os
current_dir = os.path.dirname(os.path.realpath(__file__))
benchmark_util_path = os.path.join(current_dir, '..')
import sys
sys.path.append(benchmark_util_path)
from benchmark_util import BenchmarkWrapper
from bigdl.llm.utils.common.log4Error import invalidInputError
LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
'meta-llama/Llama-2-70b-chat-hf','decapoda-research/llama-7b-hf',
'decapoda-research/llama-65b-hf','lmsys/vicuna-7b-v1.5',
'lmsys/vicuna-13b-v1.3','project-baize/merged-baize-30b']
CHATGLM_IDS = ['THUDM/chatglm-6b', 'THUDM/chatglm2-6b', 'THUDM/chatglm3-6b']
LLAVA_IDS = ['liuhaotian/llava-v1.5-7b']
results = []
excludes = []
def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials):
for i in range(num_trials + warm_up):
st = time.perf_counter()
output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
num_beams=num_beams)
torch.xpu.synchronize()
end = time.perf_counter()
output_ids = output_ids.cpu()
print("model generate cost: " + str(end - st))
output = tokenizer.batch_decode(output_ids)
print(output[0])
actual_out_len = output_ids.shape[1] - actual_in_len
if i >= warm_up:
result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
actual_in_len, actual_out_len])
def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False):
# TODO: make a parameter
    result = {}
if test_api == 'transformer_int4':
result = run_transformer_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
elif test_api == 'transformer_int4_gpu':
result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit)
for in_out_pair in in_out_pairs:
if result and result[in_out_pair]:
results.append([repo_id,
round(np.mean(result[in_out_pair], axis=0)[0]*1000.0, 2),
round(np.mean(result[in_out_pair], axis=0)[1]*1000.0, 2),
round(np.mean(result[in_out_pair], axis=0)[2]*1000.0, 2),
in_out_pair,
f'{int(np.mean(result[in_out_pair], axis=0)[3])}' +
f'-{int(np.mean(result[in_out_pair], axis=0)[4])}',
num_beams,
low_bit,
cpu_embedding if 'win' in test_api else 'N/A',
result[in_out_pair][-1][5] if 'win' in test_api else 'N/A']) # currently only peak mem for win gpu is caught here
def get_model_path(repo_id, local_model_hub):
if local_model_hub:
repo_model_name = repo_id.split("/")[1]
local_model_path = local_model_hub + os.path.sep + repo_model_name
invalidInputError(os.path.isdir(local_model_path),
local_model_path + " not exists!, Please check your models' folder.")
return local_model_path
else:
return repo_id
def run_transformer_int4(repo_id,
local_model_hub,
in_out_pairs,
warm_up,
num_trials,
num_beams,
low_bit):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
model_path = get_model_path(repo_id, local_model_hub)
    # Load the model in 4-bit,
    # which converts the relevant layers in the model into INT4 format
st = time.perf_counter()
if repo_id in CHATGLM_IDS:
model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype='auto').eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
elif repo_id in LLAMA_IDS:
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
use_cache=True).eval()
tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
else:
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
end = time.perf_counter()
print(">> loading of model costs {}s".format(end - st))
model = BenchmarkWrapper(model)
result = {}
with torch.inference_mode():
for in_out in in_out_pairs:
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
i = 0
with open("prompt/stress_test.txt", 'r') as file:
for input_str in file:
                    # As different tokenizers have different encodings,
                    # slice input_ids so the prompt is exactly the required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
input_ids = tokenizer.encode(true_str, return_tensors="pt")
actual_in_len = input_ids.shape[1]
result[in_out] = []
st = time.perf_counter()
output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
num_beams=num_beams)
end = time.perf_counter()
print("model generate cost: " + str(end - st))
output = tokenizer.batch_decode(output_ids)
print(output[0])
actual_out_len = output_ids.shape[1] - actual_in_len
if i >= warm_up:
result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
actual_in_len, actual_out_len])
i += 1
return result
def run_transformer_int4_gpu(repo_id,
local_model_hub,
in_out_pairs,
warm_up,
num_trials,
num_beams,
low_bit):
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
import intel_extension_for_pytorch as ipex
model_path = get_model_path(repo_id, local_model_hub)
    # Load the model in 4-bit,
    # which converts the relevant layers in the model into INT4 format
st = time.perf_counter()
if repo_id in CHATGLM_IDS:
model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
trust_remote_code=True, use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
elif repo_id in LLAMA_IDS:
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
use_cache=True).eval()
tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
else:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
trust_remote_code=True, use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
if isinstance(model, GPTJForCausalLM):
            # For the gpt-j model family, this optimization can provide better performance.
model = ipex.optimize(model.eval(), inplace=True)
end = time.perf_counter()
print(">> loading of model costs {}s".format(end - st))
model = BenchmarkWrapper(model)
result = {}
with torch.inference_mode():
for in_out in in_out_pairs:
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
            # As different tokenizers have different encodings,
            # the in_len.txt prompt may be shorter than we need,
            # so use a much longer context to guarantee the input length.
test_length = min(in_len*2, 8192)
while test_length not in [32, 256, 1024, 2048, 8192]:
test_length = test_length * 2
input_str = open(f"prompt/{test_length}.txt", 'r').read()
            # As different tokenizers have different encodings,
            # slice input_ids so the prompt is exactly the required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
actual_in_len = input_ids.shape[1]
result[in_out] = []
thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials))
thread.start()
thread.join()
del model
torch.xpu.empty_cache()
return result
if __name__ == '__main__':
from omegaconf import OmegaConf
conf = OmegaConf.load(f'{current_dir}/config.yaml')
today = date.today()
if 'exclude' in conf:
excludes = conf['exclude']
import pandas as pd
for api in conf.test_api:
for model in conf.repo_id:
in_out_pairs = conf['in_out_pairs'].copy()
if excludes:
for in_out in conf['in_out_pairs']:
model_id_input = model + ':' + in_out.split('-')[0]
if model_id_input in excludes:
in_out_pairs.remove(in_out)
run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
conf['low_bit'], conf['cpu_embedding'])
df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
'input/output tokens', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
'peak mem (GB)'])
df.to_csv(f'{current_dir}/{api}-results-{today}.csv')
results = []


@@ -31,7 +31,7 @@ def highlight_vals(val, max=3.0):
        return ''

def is_diffs_within_normal_range(diff1, diff2, threshold=5.0):
-    return not any(diff < (-threshold) for diff in diff1 + diff2)
+    return not any(diff < (-threshold) for diff in diff1 + diff2 if isinstance(diff, float))

def main():
    parser = argparse.ArgumentParser(description="convert .csv file to .html file")
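
The added isinstance filter appears intended to skip non-numeric entries in the diff lists (the results tables can carry placeholder strings such as 'N/A' in some columns), so only float regressions are checked against the threshold. A minimal sketch of the updated check, with illustrative inputs of my own choosing:

def is_diffs_within_normal_range(diff1, diff2, threshold=5.0):
    # Only float entries are compared; placeholder values (e.g. 'N/A') are ignored.
    return not any(diff < (-threshold) for diff in diff1 + diff2 if isinstance(diff, float))

print(is_diffs_within_normal_range([-1.2, 3.0], ['N/A', -0.5]))  # True: no float drop beyond -5.0
print(is_diffs_within_normal_range([-6.0], [0.0]))               # False: -6.0 exceeds the threshold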


@@ -0,0 +1,20 @@
repo_id:
- 'meta-llama/Llama-2-7b-chat-hf'
- 'meta-llama/Llama-2-13b-chat-hf'
- 'THUDM/chatglm2-6b'
- 'THUDM/chatglm3-6b'
- 'baichuan-inc/Baichuan2-7B-Chat'
- 'baichuan-inc/Baichuan2-13B-Chat'
- 'Qwen/Qwen-14B-Chat'
local_model_hub: '/models'
warm_up: 1
num_trials: 4
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
in_out_pairs:
- '1024-512'
- '2048-512'
test_api:
- "transformer_int4"
# - "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether to put the embedding on CPU (currently only available for gpu win related test_api)
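
For reference, a minimal sketch of how run-stress-test.py consumes this file once the workflow has copied it to all-in-one/config.yaml; the path below is an assumption for a local checkout:

from omegaconf import OmegaConf

# Enumerate the stress-test matrix defined by the config above.
conf = OmegaConf.load("python/llm/dev/benchmark/all-in-one/config.yaml")
for api in conf.test_api:
    for model in conf.repo_id:
        for in_out in conf.in_out_pairs:
            print(f"{api}: {model} {in_out}, "
                  f"{conf.warm_up} warm-up + {conf.num_trials} trials, "
                  f"num_beams={conf.num_beams}, low_bit={conf.low_bit}")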