From 36c9442c6d730207a7b474f0e27df88e2a4f4d3a Mon Sep 17 00:00:00 2001 From: dingbaorong Date: Tue, 6 Feb 2024 10:23:50 +0800 Subject: [PATCH] Arc Stable version test (#10087) * add batch_size in stable version test * add batch_size in excludes * add excludes for batch_size * fix ci * triger regression test * fix xpu version * disable ci * address kai's comment --------- Co-authored-by: Ariadne --- .../llm_tests_for_stable_version_on_arc.yml | 66 +++++++++++++++---- python/llm/dev/benchmark/all-in-one/run.py | 3 +- .../stable-version-arc-perf-test-fp8.yaml | 23 +++++++ ...stable-version-arc-perf-test-sym_int4.yaml | 19 ++++++ 4 files changed, 97 insertions(+), 14 deletions(-) diff --git a/.github/workflows/llm_tests_for_stable_version_on_arc.yml b/.github/workflows/llm_tests_for_stable_version_on_arc.yml index e8e716a7..a7114223 100644 --- a/.github/workflows/llm_tests_for_stable_version_on_arc.yml +++ b/.github/workflows/llm_tests_for_stable_version_on_arc.yml @@ -61,47 +61,87 @@ jobs: - name: Run LLM install (all) test uses: ./.github/actions/llm/setup-llm-env with: - extra-dependency: "xpu" + extra-dependency: "xpu_2.1" - name: Test installed xpu version shell: bash run: | - source /home/arda/intel/oneapi/setvars.sh + source /opt/intel/oneapi/setvars.sh bash python/llm/test/run-llm-install-tests.sh - name: Test on xpu (int4) shell: bash run: | - source /home/arda/intel/oneapi/setvars.sh + source /opt/intel/oneapi/setvars.sh export USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 mv python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml python/llm/dev/benchmark/all-in-one/config.yaml cd python/llm/dev/benchmark/all-in-one # hide time info sed -i 's/str(end - st)/"xxxxxx"/g' run.py + # batch_size = 1 + sed -i '/batch_size/c\batch_size: 1' config.yaml python run.py - cp ./*.csv $CSV_SAVE_PATH/int4 + cp ./*.csv $CSV_SAVE_PATH/int4/batch_size_1 + rm ./*.csv + # batch_size = 2 + sed -i '/batch_size/c\batch_size: 2' config.yaml + python run.py + cp ./*.csv $CSV_SAVE_PATH/int4/batch_size_2 + rm ./*.csv + # batch_size = 4 + sed -i '/batch_size/c\batch_size: 4' config.yaml + python run.py + cp ./*.csv $CSV_SAVE_PATH/int4/batch_size_4 + rm ./*.csv + # batch_size = 8 + sed -i '/batch_size/c\batch_size: 8' config.yaml + python run.py + cp ./*.csv $CSV_SAVE_PATH/int4/batch_size_8 rm ./*.csv cd ../../../test/benchmark python -m pip install pandas==1.5.3 - python csv_to_html.py -f $CSV_SAVE_PATH/int4 -b $CSV_SAVE_PATH/int4/transformer_int4_gpu-results-1baseline.csv -t 5.0 - + python csv_to_html.py -f $CSV_SAVE_PATH/int4/batch_size_1 -b $CSV_SAVE_PATH/int4/batch_size_1/transformer_int4_gpu-results-1baseline.csv -t 5.0 + python csv_to_html.py -f $CSV_SAVE_PATH/int4/batch_size_2 -b $CSV_SAVE_PATH/int4/batch_size_2/transformer_int4_gpu-results-1baseline.csv -t 5.0 + python csv_to_html.py -f $CSV_SAVE_PATH/int4/batch_size_4 -b $CSV_SAVE_PATH/int4/batch_size_4/transformer_int4_gpu-results-1baseline.csv -t 5.0 + python csv_to_html.py -f $CSV_SAVE_PATH/int4/batch_size_8 -b $CSV_SAVE_PATH/int4/batch_size_8/transformer_int4_gpu-results-1baseline.csv -t 5.0 + - name: Test on xpu (fp8) shell: bash run: | - source /home/arda/intel/oneapi/setvars.sh + source /opt/intel/oneapi/setvars.sh export USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 mv python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml python/llm/dev/benchmark/all-in-one/config.yaml cd python/llm/dev/benchmark/all-in-one # hide time info sed -i 's/str(end - st)/"xxxxxx"/g' run.py + # batch_size = 1 + sed -i '/batch_size/c\batch_size: 1' config.yaml python run.py - cp ./*.csv $CSV_SAVE_PATH/fp8 + cp ./*.csv $CSV_SAVE_PATH/fp8/batch_size_1 + rm ./*.csv + # batch_size = 2 + sed -i '/batch_size/c\batch_size: 2' config.yaml + python run.py + cp ./*.csv $CSV_SAVE_PATH/fp8/batch_size_2 + rm ./*.csv + # batch_size = 4 + sed -i '/batch_size/c\batch_size: 4' config.yaml + python run.py + cp ./*.csv $CSV_SAVE_PATH/fp8/batch_size_4 + rm ./*.csv + # batch_size = 8 + sed -i '/batch_size/c\batch_size: 8' config.yaml + python run.py + cp ./*.csv $CSV_SAVE_PATH/fp8/batch_size_8 rm ./*.csv cd ../../../test/benchmark python -m pip install pandas==1.5.3 - python csv_to_html.py -f $CSV_SAVE_PATH/fp8 -b $CSV_SAVE_PATH/fp8/transformer_int4_gpu-results-1baseline.csv -t 5.0 + python csv_to_html.py -f $CSV_SAVE_PATH/fp8/batch_size_1 -b $CSV_SAVE_PATH/fp8/batch_size_1/transformer_int4_gpu-results-1baseline.csv -t 5.0 + python csv_to_html.py -f $CSV_SAVE_PATH/fp8/batch_size_2 -b $CSV_SAVE_PATH/fp8/batch_size_2/transformer_int4_gpu-results-1baseline.csv -t 5.0 + python csv_to_html.py -f $CSV_SAVE_PATH/fp8/batch_size_4 -b $CSV_SAVE_PATH/fp8/batch_size_4/transformer_int4_gpu-results-1baseline.csv -t 5.0 + python csv_to_html.py -f $CSV_SAVE_PATH/fp8/batch_size_8 -b $CSV_SAVE_PATH/fp8/batch_size_8/transformer_int4_gpu-results-1baseline.csv -t 5.0 llm-stress-test-on-arc: needs: llm-perf-regression-test-on-arc @@ -143,18 +183,18 @@ jobs: - name: Run LLM install (all) test uses: ./.github/actions/llm/setup-llm-env with: - extra-dependency: "xpu" + extra-dependency: "xpu_2.1" - name: Test installed xpu version shell: bash run: | - source /home/arda/intel/oneapi/setvars.sh + source /opt/intel/oneapi/setvars.sh bash python/llm/test/run-llm-install-tests.sh - name: Test on xpu (int4) shell: bash run: | - source /home/arda/intel/oneapi/setvars.sh + source /opt/intel/oneapi/setvars.sh export USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 mv python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml python/llm/dev/benchmark/all-in-one/config.yaml @@ -171,7 +211,7 @@ jobs: - name: Test on xpu (fp8) shell: bash run: | - source /home/arda/intel/oneapi/setvars.sh + source /opt/intel/oneapi/setvars.sh export USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 mv python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml python/llm/dev/benchmark/all-in-one/config.yaml diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py index 05574365..d56cea06 100644 --- a/python/llm/dev/benchmark/all-in-one/run.py +++ b/python/llm/dev/benchmark/all-in-one/run.py @@ -956,7 +956,8 @@ if __name__ == '__main__': if excludes: for in_out in conf['in_out_pairs']: model_id_input = model + ':' + in_out.split('-')[0] - if model_id_input in excludes: + model_id_input_batch_size = model_id_input + ':' + str(conf['batch_size']) + if model_id_input in excludes or model_id_input_batch_size in excludes: in_out_pairs.remove(in_out) run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'], conf['low_bit'], conf['cpu_embedding'], conf['batch_size']) diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml index db44e31a..00884dbe 100644 --- a/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml +++ b/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml @@ -12,8 +12,31 @@ low_bit: 'fp8' # default to use 'sym_int4' (i.e. symmetric int4) batch_size: 1 # default to 1 in_out_pairs: - '32-32' + - '512-256' - '1024-128' - '2048-256' test_api: - "transformer_int4_gpu" # on Intel GPU cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) +exclude: + - 'meta-llama/Llama-2-7b-chat-hf:2048:4' + - 'meta-llama/Llama-2-7b-chat-hf:512:8' + - 'meta-llama/Llama-2-7b-chat-hf:1024:8' + - 'meta-llama/Llama-2-7b-chat-hf:2048:8' + - 'THUDM/chatglm2-6b:2048:8' + - 'THUDM/chatglm3-6b:2048:8' + - 'baichuan-inc/Baichuan2-7B-Chat:2048:2' + - 'baichuan-inc/Baichuan2-7B-Chat:1024:4' + - 'baichuan-inc/Baichuan2-7B-Chat:2048:4' + - 'baichuan-inc/Baichuan2-7B-Chat:512:8' + - 'baichuan-inc/Baichuan2-7B-Chat:1024:8' + - 'baichuan-inc/Baichuan2-7B-Chat:2048:8' + - 'Qwen/Qwen-7B-Chat:2048:1' + - 'Qwen/Qwen-7B-Chat:1024:2' + - 'Qwen/Qwen-7B-Chat:2048:2' + - 'Qwen/Qwen-7B-Chat:512:4' + - 'Qwen/Qwen-7B-Chat:1024:4' + - 'Qwen/Qwen-7B-Chat:2048:4' + - 'Qwen/Qwen-7B-Chat:512:8' + - 'Qwen/Qwen-7B-Chat:1024:8' + - 'Qwen/Qwen-7B-Chat:2048:8' diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml index 4c8b0904..cb9f7b30 100644 --- a/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml +++ b/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml @@ -12,8 +12,27 @@ low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) batch_size: 1 # default to 1 in_out_pairs: - '32-32' + - '512-256' - '1024-128' - '2048-256' test_api: - "transformer_int4_gpu" # on Intel GPU cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) +exclude: + - 'meta-llama/Llama-2-7b-chat-hf:2048:4' + - 'meta-llama/Llama-2-7b-chat-hf:1024:8' + - 'meta-llama/Llama-2-7b-chat-hf:2048:8' + - 'THUDM/chatglm2-6b:2048:8' + - 'THUDM/chatglm3-6b:2048:8' + - 'baichuan-inc/Baichuan2-7B-Chat:2048:2' + - 'baichuan-inc/Baichuan2-7B-Chat:1024:4' + - 'baichuan-inc/Baichuan2-7B-Chat:2048:4' + - 'baichuan-inc/Baichuan2-7B-Chat:512:8' + - 'baichuan-inc/Baichuan2-7B-Chat:1024:8' + - 'baichuan-inc/Baichuan2-7B-Chat:2048:8' + - 'Qwen/Qwen-7B-Chat:2048:2' + - 'Qwen/Qwen-7B-Chat:1024:4' + - 'Qwen/Qwen-7B-Chat:2048:4' + - 'Qwen/Qwen-7B-Chat:512:8' + - 'Qwen/Qwen-7B-Chat:1024:8' + - 'Qwen/Qwen-7B-Chat:2048:8'