From 231b968ababedc2921c2ea664fd27f7c9981cd8c Mon Sep 17 00:00:00 2001
From: Wenjing Margaret Mao
Date: Wed, 5 Jun 2024 15:04:55 +0800
Subject: [PATCH] Modify the check_results.py to support batch 2&4 (#11133)

* add batch 2&4 and exclude to perf_test
* modify the perf-test&437 yaml
* modify llm_performance_test.yml
* remove batch 4
* modify check_results.py to support batch 2&4
* change the batch_size format
* remove genxir
* add str(batch_size)
* change actual_test_cases in check_results file to support batch_size
* change html highlight
* less models to test html and html_path
* delete the moe model
* split batch html
* split
* use installing from pypi
* use installing from pypi - batch2
* revert cpp
* revert cpp
* merge two jobs into one, test batch_size in one job
* merge two jobs into one, test batch_size in one job
* change file directory in workflow
* try catch deal with odd file without batch_size
* modify pandas version
* change the dir
* organize the code
* organize the code
* remove Qwen-MOE
* modify based on feedback
* modify based on feedback
* modify based on second round of feedback
* modify based on second round of feedback + change run-arc.sh mode
* modify based on second round of feedback + revert config
* modify based on second round of feedback + revert config
* modify based on second round of feedback + remove comments
* modify based on second round of feedback + remove comments
* modify based on second round of feedback + revert arc-perf-test
* modify based on third round of feedback
* change error type
* change error type
* modify check_results.html
* split batch into two folders
* add all models
* move csv_name
* revert pr test
* revert pr test

---------

Co-authored-by: Yishuo Wang
---
 .github/workflows/llm_performance_tests.yml  | 67 ++++++++++++++++---
 python/llm/dev/benchmark/all-in-one/run.py   | 27 +++++---
 .../test/benchmark/arc-perf-test-batch2.yaml | 38 +++++++++++
 .../arc-perf-transformers-437-batch2.yaml    | 19 ++++++
 python/llm/test/benchmark/check_results.py   | 12 ++--
 python/llm/test/benchmark/csv_to_html.py     | 21 ++++--
 6 files changed, 152 insertions(+), 32 deletions(-)
 create mode 100644 python/llm/test/benchmark/arc-perf-test-batch2.yaml
 create mode 100644 python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml
diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 9cea1ca1..c9f2c830 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -97,11 +97,23 @@ jobs:
           export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
           cp python/llm/test/benchmark/arc-perf-test.yaml python/llm/dev/benchmark/all-in-one/config.yaml
           cd python/llm/dev/benchmark/all-in-one
+          mkdir test_batch1
+          mkdir test_batch2
+          # batch_size 1
           # hide time info
           sed -i 's/str(end - st)/"xxxxxx"/g' run.py
           # change csv name
-          sed -i 's/{today}/{today}_test1/g' run.py
+          sed -i 's/{today}/{today}_test1_batch1/g' run.py
           python run.py
+          mv *.csv test_batch1
+          # batch_size 2
+          cd ../../../../../
+          cp python/llm/test/benchmark/arc-perf-test-batch2.yaml python/llm/dev/benchmark/all-in-one/config.yaml
+          cd python/llm/dev/benchmark/all-in-one
+          # change csv name
+          sed -i 's/batch1/batch2/g' run.py
+          python run.py
+          mv *.csv test_batch2

       - name: Test on xpu(transformers==4.37.0)
         shell: bash
@@ -111,33 +123,68 @@ jobs:
          export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
          # upgrade transformers for model Qwen/Qwen1.5-7B-Chat
          python -m pip install transformers==4.37.0
+         # batch_size 1
          cp python/llm/test/benchmark/arc-perf-transformers-437.yaml python/llm/dev/benchmark/all-in-one/config.yaml
          cd python/llm/dev/benchmark/all-in-one
          # change csv name
-         sed -i 's/test1/test2/g' run.py
+         sed -i 's/test1_batch2/test2_batch1/g' run.py
          python run.py
+         mv *.csv test_batch1
+         # batch_size 2
+         cd ../../../../../
+         cp python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml python/llm/dev/benchmark/all-in-one/config.yaml
+         cd python/llm/dev/benchmark/all-in-one
+         # change csv name
+         sed -i 's/batch1/batch2/g' run.py
+         python run.py
+         mv *.csv test_batch2

      - name: Concat csv and generate html
        shell: bash
        run: |
-         cd python/llm/dev/benchmark/all-in-one
-         python ../../../test/benchmark/concat_csv.py
+         # batch_size 1
+         cd python/llm/dev/benchmark/all-in-one/test_batch1
+         python ../../../../test/benchmark/concat_csv.py
          for file in *.csv; do
              if [[ $file != *test* ]]; then
-                 cp "$file" $CSV_SAVE_PATH
+                 cp "$file" $CSV_SAVE_PATH/batch_size_1
              fi
          done
          python -m pip install pandas==1.5.3
-         cd ../../../test/benchmark
-         python csv_to_html.py -f $CSV_SAVE_PATH
+         cd ../../../../test/benchmark
+         python csv_to_html.py -f $CSV_SAVE_PATH/batch_size_1
+         # batch_size 2
+         cd ../../../../
+         cd python/llm/dev/benchmark/all-in-one/test_batch2
+         python ../../../../test/benchmark/concat_csv.py
+         for file in *.csv; do
+             if [[ $file != *test* ]]; then
+                 cp "$file" $CSV_SAVE_PATH/batch_size_2
+             fi
+         done
+         cd ../../../../test/benchmark
+         python csv_to_html.py -f $CSV_SAVE_PATH/batch_size_2

      - name: Check and upload results to ftp
        shell: bash
        run: |
-         cd python/llm/dev/benchmark/all-in-one
-         python ../../../test/benchmark/check_results.py -c test1 -y ../../../test/benchmark/arc-perf-test.yaml
-         python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-437.yaml
+         # batch_size 1
+         cd python/llm/dev/benchmark/all-in-one/test_batch1
+         python ../../../../test/benchmark/check_results.py -c test1 -y ../../../../test/benchmark/arc-perf-test.yaml
+         python ../../../../test/benchmark/check_results.py -c test2 -y ../../../../test/benchmark/arc-perf-transformers-437.yaml
          find . -name "*test*.csv" -delete
+         cd ../
+         rm -r test_batch1
+         if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then
+             curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/
+         fi
+         # batch_size 2
+         cd test_batch2
+         python ../../../../test/benchmark/check_results.py -c test1 -y ../../../../test/benchmark/arc-perf-test-batch2.yaml
+         python ../../../../test/benchmark/check_results.py -c test2 -y ../../../../test/benchmark/arc-perf-transformers-437-batch2.yaml
+         find . -name "*test*.csv" -delete
+         cd ../
+         rm -r test_batch2
          if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then
              curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/
          fi
diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index d0b3b363..dfbae2de 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -1844,17 +1844,22 @@ if __name__ == '__main__':
     import pandas as pd
     for api in conf.test_api:
         global csv_name
-        csv_name = f'{current_dir}/{api}-results-{today}.csv'
-        for model in conf.repo_id:
-            in_out_pairs = conf['in_out_pairs'].copy()
-            if excludes:
-                for in_out in conf['in_out_pairs']:
-                    model_id_input = model + ':' + in_out.split('-')[0]
-                    model_id_input_batch_size = model_id_input + ':' + str(conf['batch_size'])
-                    if model_id_input in excludes or model_id_input_batch_size in excludes:
-                        in_out_pairs.remove(in_out)
-            run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
-                      conf['low_bit'], conf['cpu_embedding'], conf['batch_size'], streaming, use_fp16_torch_dtype, n_gpu)
+        csv_name = f'{current_dir}/{api}-results-{today}.csv'
+        if not OmegaConf.is_list(conf["batch_size"]):
+            batch_list = [conf["batch_size"]]
+        else:
+            batch_list = conf["batch_size"]
+        for batch_size in batch_list:
+            for model in conf.repo_id:
+                in_out_pairs = conf['in_out_pairs'].copy()
+                if excludes:
+                    for in_out in conf['in_out_pairs']:
+                        model_id_input = model + ':' + in_out.split('-')[0]
+                        model_id_input_batch_size = model_id_input + ':' + str(batch_size)
+                        if model_id_input in excludes or model_id_input_batch_size in excludes:
+                            in_out_pairs.remove(in_out)
+                run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
+                          conf['low_bit'], conf['cpu_embedding'], batch_size, streaming, use_fp16_torch_dtype, n_gpu)
     df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
                                         'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
                                         'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype'])
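The run.py hunk above rests on one small pattern: `batch_size` in config.yaml may now be either a single integer or a YAML list, so it is normalized to a list before the benchmark loop. A minimal, self-contained sketch of that normalization, assuming only that omegaconf is installed (the inline config literal is illustrative, not taken from the patch):

    from omegaconf import OmegaConf

    # Illustrative config; in the benchmark the values come from config.yaml.
    # batch_size may be a scalar (batch_size: 2) or a list (batch_size: [1, 2]).
    conf = OmegaConf.create({"batch_size": [1, 2]})

    if not OmegaConf.is_list(conf["batch_size"]):
        batch_list = [conf["batch_size"]]  # wrap the scalar so both shapes iterate the same way
    else:
        batch_list = conf["batch_size"]

    for batch_size in batch_list:
        print(f"would benchmark with batch_size={batch_size}")

The same scalar-or-list guard is repeated in check_results.py below, so both YAML layouts stay valid without a schema change.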
diff --git a/python/llm/test/benchmark/arc-perf-test-batch2.yaml b/python/llm/test/benchmark/arc-perf-test-batch2.yaml
new file mode 100644
index 00000000..00b2e4c1
--- /dev/null
+++ b/python/llm/test/benchmark/arc-perf-test-batch2.yaml
@@ -0,0 +1,38 @@
+repo_id:
+  - 'meta-llama/Llama-2-7b-chat-hf'
+  - 'meta-llama/Llama-2-13b-chat-hf'
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b-4bit'
+  - 'tiiuae/falcon-7b-instruct-with-patch'
+  - 'mosaicml/mpt-7b-chat'
+  - 'redpajama/gptneox-7b-redpajama-bf16'
+  - 'bigcode/starcoder-15.5b-4bit'
+  - 'databricks/dolly-v1-6b'
+  - 'databricks/dolly-v2-7b'
+  - 'databricks/dolly-v2-12b'
+  - 'internlm/internlm-chat-7b'
+  - 'Qwen/Qwen-7B-Chat'
+  - 'BAAI/AquilaChat-7B'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
+  - 'bigscience/bloomz-7b1'
+# - 'fnlp/moss-moon-003-sft-4bit' # moss-moon-003-sft cannot work on transformers 4.34+
+  - 'mistralai/Mistral-7B-v0.1'
+local_model_hub: '/mnt/disk1/models'
+warm_up: 1
+num_trials: 3
+num_beams: 1 # default to greedy search
+low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 2 # default to 1
+in_out_pairs:
+  - '32-32'
+  - '1024-128'
+  - '2048-256'
+test_api:
+  - "transformer_int4_gpu" # on Intel GPU
+cpu_embedding: False # whether to put embedding on CPU (only available now for gpu win related test_api)
+exclude:
+  - 'bigcode/starcoder-15.5b-4bit:2048'
+  - 'databricks/dolly-v2-12b:2048'
+  - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
+  - 'bigscience/bloomz-7b1:2048'
\ No newline at end of file
diff --git a/python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml b/python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml
new file mode 100644
index 00000000..c9644dc9
--- /dev/null
+++ b/python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml
@@ -0,0 +1,19 @@
+# For the models that require transformers 4.37.0
+repo_id:
+  - 'Qwen/Qwen1.5-7B-Chat'
+  - 'microsoft/phi-2'
+  - 'microsoft/Phi-3-mini-4k-instruct'
+  - 'meta-llama/Meta-Llama-3-8B-Instruct'
+local_model_hub: '/mnt/disk1/models'
+warm_up: 1
+num_trials: 3
+num_beams: 1 # default to greedy search
+low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 2 # default to 1
+in_out_pairs:
+  - '32-32'
+  - '1024-128'
+  - '2048-256'
+test_api:
+  - "transformer_int4_gpu" # on Intel GPU
+cpu_embedding: False # whether to put embedding on CPU (only available now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/check_results.py b/python/llm/test/benchmark/check_results.py
index 528c41df..861c3ecf 100644
--- a/python/llm/test/benchmark/check_results.py
+++ b/python/llm/test/benchmark/check_results.py
@@ -34,16 +34,20 @@ def main():
     actual_test_num = len(csv_dataframe)
     actual_test_cases = []
     for index, row in csv_dataframe.iterrows():
-        actual_test_cases.append(row['model'] + ":" + row['input/output tokens'].split('-')[0])
-
+        actual_test_cases.append(row['model'] + ":" + row['input/output tokens'].split('-')[0] + ":" + str(row['batch_size']))
     if args.yaml_name:
         yaml_name = args.yaml_name
         conf = OmegaConf.load(yaml_name)
         all_test_cases = []
         for model in conf.repo_id:
             for in_out in conf['in_out_pairs']:
-                model_id_input = model + ':' + in_out.split('-')[0]
-                all_test_cases.append(model_id_input)
+                if not OmegaConf.is_list(conf["batch_size"]):
+                    batch_list = [conf["batch_size"]]
+                else:
+                    batch_list = conf["batch_size"]
+                for batch_size in batch_list:
+                    model_id_input = model + ':' + in_out.split('-')[0] + ':' + str(batch_size)
+                    all_test_cases.append(model_id_input)
         exclude_test_cases = []
         if 'exclude' in conf and conf['exclude'] is not None:
             exclude_test_cases = conf['exclude']
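With batch_size folded into the test-case key, check_results.py can compare expected and actual runs as plain `model:input_tokens:batch_size` strings. A small illustration of that matching under made-up values (the model name and latencies are examples only, not output from a real run):

    # Expected cases come from the YAML (repo_id x in_out_pairs x batch_size);
    # actual cases come from the result CSV's model/input/batch_size columns.
    # All values below are hypothetical.
    expected = {
        "meta-llama/Llama-2-7b-chat-hf:32:1",
        "meta-llama/Llama-2-7b-chat-hf:32:2",
    }
    actual = {"meta-llama/Llama-2-7b-chat-hf:32:1"}

    missing = expected - actual  # configured runs that produced no CSV row
    print(missing)               # {'meta-llama/Llama-2-7b-chat-hf:32:2'}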
diff --git a/python/llm/test/benchmark/csv_to_html.py b/python/llm/test/benchmark/csv_to_html.py
index 9b146f9a..2720b338 100644
--- a/python/llm/test/benchmark/csv_to_html.py
+++ b/python/llm/test/benchmark/csv_to_html.py
@@ -99,10 +99,15 @@ def main():
         for current_csv_ind,current_csv_row in current_csv.iterrows():
             current_csv_model=current_csv_row['model'].strip()
             current_csv_input_output_pairs=current_csv_row['input/output tokens'].strip()
-            current_csv_model_input_1st=current_csv_model+'-'+current_csv_input_output_pairs+'-'+'1st'
-            current_csv_model_input_2nd=current_csv_model+'-'+current_csv_input_output_pairs+'-'+'2nd'
-            add_to_dict(csv_dict, current_csv_model_input_1st, current_csv_row[latency_1st_token])
-            add_to_dict(csv_dict, current_csv_model_input_2nd, current_csv_row[latency_2_avg])
+            try:
+                current_csv_batch_size=str(current_csv_row['batch_size'])
+                current_csv_model_input_1st=current_csv_model+'-'+current_csv_input_output_pairs+'-'+current_csv_batch_size+'-'+'1st'
+                current_csv_model_input_2nd=current_csv_model+'-'+current_csv_input_output_pairs+'-'+current_csv_batch_size+'-'+'2nd'
+                add_to_dict(csv_dict, current_csv_model_input_1st, current_csv_row[latency_1st_token])
+                add_to_dict(csv_dict, current_csv_model_input_2nd, current_csv_row[latency_2_avg])
+            except KeyError:
+                # Old csv/html files didn't include 'batch_size'
+                pass

     for latest_csv_ind,latest_csv_row in latest_csv.iterrows():

@@ -110,9 +115,10 @@ def main():
         latest_csv_input_output_pairs=latest_csv_row['input/output tokens'].strip()
         latest_1st_token_latency=latest_csv_row[latency_1st_token]
         latest_2_avg_latency=latest_csv_row[latency_2_avg]
+        latest_csv_batch_size=str(latest_csv_row['batch_size'])

-        key1=latest_csv_model+'-'+latest_csv_input_output_pairs+'-'+'1st'
-        key2=latest_csv_model+'-'+latest_csv_input_output_pairs+'-'+'2nd'
+        key1=latest_csv_model+'-'+latest_csv_input_output_pairs+'-'+latest_csv_batch_size+'-'+'1st'
+        key2=latest_csv_model+'-'+latest_csv_input_output_pairs+'-'+latest_csv_batch_size+'-'+'2nd'

         best_last1_value=best_in_dict(csv_dict, key1, latest_1st_token_latency)
         best_last2_value=best_in_dict(csv_dict, key2, latest_2_avg_latency)
@@ -128,8 +134,9 @@ def main():
             previous_csv_model=previous_csv_row['model'].strip()
             previous_csv_input_output_pairs=previous_csv_row['input/output tokens'].strip()
+            previous_csv_batch_size=str(previous_csv_row['batch_size'])

-            if latest_csv_model==previous_csv_model and latest_csv_input_output_pairs==previous_csv_input_output_pairs:
+            if latest_csv_model==previous_csv_model and latest_csv_input_output_pairs==previous_csv_input_output_pairs and latest_csv_batch_size==previous_csv_batch_size:
                 previous_1st_token_latency=previous_csv_row[latency_1st_token]
                 previous_2_avg_latency=previous_csv_row[latency_2_avg]
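csv_to_html.py applies the same idea to its comparison keys: the batch size becomes one more segment of the `model-inout-batchsize-1st/2nd` key, and rows from older CSVs that predate the 'batch_size' column are skipped via the KeyError handler rather than aborting the report. A compact sketch of that scheme with rows reduced to plain dicts (model names and latencies are invented for illustration):

    # Invented example rows: a new-style row carries a 'batch_size' field,
    # an old-style row does not.
    rows = [
        {"model": "llama-2-7b", "input/output tokens": "32-32", "batch_size": 2, "1st": 21.0},
        {"model": "llama-2-7b", "input/output tokens": "32-32", "1st": 23.5},  # old csv, no batch_size
    ]

    latencies = {}
    for row in rows:
        try:
            key = f"{row['model']}-{row['input/output tokens']}-{row['batch_size']}-1st"
            latencies[key] = row["1st"]
        except KeyError:
            # old csv/html files didn't include 'batch_size'; skip them, as the patch does
            pass

    print(latencies)  # {'llama-2-7b-32-32-2-1st': 21.0}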