diff --git a/.github/workflows/llm_performance_tests_stable_version.yml b/.github/workflows/llm_performance_tests_stable_version.yml
new file mode 100644
index 00000000..2159c5dc
--- /dev/null
+++ b/.github/workflows/llm_performance_tests_stable_version.yml
@@ -0,0 +1,155 @@
+name: LLM Performance Test for Stable Version
+
+# Cancel previous runs in the PR when you push new commits
+concurrency:
+  group: ${{ github.workflow }}-llm-performance-tests-${{ github.event.pull_request.number || github.run_id }}
+  cancel-in-progress: true
+
+# Controls when the action will run.
+on:
+  # pull_request:
+  #   branches: [main]
+  #   paths:
+  #     - ".github/workflows/llm_performance_tests.yml"
+  #     - "python/llm/test/benchmark/**"
+  #     - "python/llm/dev/benchmark/all-in-one/**"
+  workflow_dispatch:
+  workflow_call:
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  llm-cpp-build:
+    uses: ./.github/workflows/llm-binary-build.yml
+
+  llm-performance-test-on-arc:
+    needs: llm-cpp-build
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9"]
+    runs-on: [self-hosted, llm, perf]
+    env:
+      OMP_NUM_THREADS: 16
+      THREAD_NUM: 16
+      ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
+      CSV_SAVE_PATH: ${{ github.event.schedule && '/mnt/disk1/nightly_perf_gpu/' || '/mnt/disk1/pr_perf_gpu/' }}
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        shell: bash
+        # pip install transformers_stream_generator for model internlm-chat-7b-8k
+        # pip install tiktoken for model Qwen-7B-Chat-10-12
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install --upgrade wheel
+          python -m pip install --upgrade omegaconf
+          python -m pip install --upgrade pandas
+          python -m pip install --upgrade einops
+          python -m pip install --upgrade transformers_stream_generator
+          python -m pip install --upgrade tiktoken
+
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
+
+      - name: Run LLM install (all) test
+        uses: ./.github/actions/llm/setup-llm-env
+        with:
+          extra-dependency: "xpu"
+
+      - name: Test installed xpu version
+        shell: bash
+        run: |
+          source /home/arda/intel/oneapi/setvars.sh
+          bash python/llm/test/run-llm-install-tests.sh
+
+      - name: Test on xpu
+        shell: bash
+        run: |
+          source /home/arda/intel/oneapi/setvars.sh
+          export USE_XETLA=OFF
+          export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+          mv python/llm/test/benchmark/stable-version-arc-perf-test.yaml python/llm/dev/benchmark/all-in-one/config.yaml
+          cd python/llm/dev/benchmark/all-in-one
+          # hide time info
+          sed -i 's/str(end - st)/"xxxxxx"/g' run.py
+          # change csv name
+          sed -i 's/{today}/{today}_test1/g' run.py
+          python run.py
+          # upgrade transformers for model Mistral-7B-v0.1
+          python -m pip install transformers==4.34.0
+          mv ../../../test/benchmark/arc-perf-transformers-434.yaml ./config.yaml
+          # change csv name
+          sed -i 's/test1/test2/g' run.py
+          python run.py
+          python ../../../test/benchmark/concat_csv.py
+          cp ./*.csv $CSV_SAVE_PATH
+          cd ../../../test/benchmark
+          python -m pip install pandas==1.5.3
+          python csv_to_html.py -f $CSV_SAVE_PATH
+          cd ../../dev/benchmark/all-in-one/
+          python ../../../test/benchmark/check_results.py -n 45
+          if [ ${{ github.event.schedule }} ]; then
+            curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/
+          fi
+
+  llm-performance-test-on-spr:
+    needs: llm-cpp-build
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9"]
+    runs-on: [self-hosted, llm, spr01-perf]
+    env:
+      OMP_NUM_THREADS: 16
+      THREAD_NUM: 16
+      ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        shell: bash
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install --upgrade wheel
+          python -m pip install --upgrade omegaconf
+          python -m pip install --upgrade pandas
+          python -m pip install --upgrade einops
+          python -m pip install --upgrade tiktoken
+          python -m pip install --upgrade transformers_stream_generator
+
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
+
+      - name: Run LLM install (all) test
+        uses: ./.github/actions/llm/setup-llm-env
+
+      - name: Test on cpu
+        shell: bash
+        run: |
+          mv python/llm/test/benchmark/stable-version-cpu-perf-test.yaml python/llm/dev/benchmark/all-in-one/config.yaml
+          cd python/llm/dev/benchmark/all-in-one
+          export http_proxy=${HTTP_PROXY}
+          export https_proxy=${HTTPS_PROXY}
+          source bigdl-llm-init -t
+          export OMP_NUM_THREADS=48
+          # hide time info
+          sed -i 's/str(end - st)/"xxxxxx"/g' run.py
+          python run.py
+          cp ./*.csv /models/nightly_perf_cpu/
+          cd ../../../test/benchmark
+          python -m pip install pandas==1.5.3
+          python csv_to_html.py -f /models/nightly_perf_cpu/
+
\ No newline at end of file
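Note: the "Test on xpu" step above relies on two helper scripts, `concat_csv.py` and `check_results.py`, that are not part of this diff. A minimal Python sketch of what that post-processing plausibly looks like, assuming the per-run CSVs (`*_test1.csv`, `*_test2.csv`) share a header and that `-n 45` is simply an expected row count:

```python
# Hypothetical sketch of concat_csv.py + check_results.py -n 45; the real
# scripts live in python/llm/test/benchmark and are not shown in this diff.
import glob
import sys

import pandas as pd


def concat_run_csvs(pattern: str, out_path: str) -> pd.DataFrame:
    # Merge the per-run CSVs (e.g. *_test1.csv, *_test2.csv) into one file.
    frames = [pd.read_csv(path) for path in sorted(glob.glob(pattern))]
    merged = pd.concat(frames, ignore_index=True)
    merged.to_csv(out_path, index=False)
    return merged


def check_row_count(df: pd.DataFrame, expected: int) -> None:
    # Fail the job when benchmark cases are missing from the results.
    if len(df) != expected:
        sys.exit(f"expected {expected} result rows, got {len(df)}")


if __name__ == "__main__":
    check_row_count(concat_run_csvs("*_test*.csv", "merged.csv"), expected=45)
```

The expected count of 45 is taken directly from the `check_results.py -n 45` invocation in the workflow; how the real script counts rows is an assumption here.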
diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test.yaml
new file mode 100644
index 00000000..17967038
--- /dev/null
+++ b/python/llm/test/benchmark/stable-version-arc-perf-test.yaml
@@ -0,0 +1,22 @@
+repo_id:
+  - 'meta-llama/Llama-2-7b-chat-hf'
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'Qwen/Qwen-7B-Chat'
+local_model_hub: '/mnt/disk1/models'
+warm_up: 1
+num_trials: 3
+num_beams: 1 # default to greedy search
+low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+in_out_pairs:
+  - '32-32'
+  - '1024-128'
+  - '2048-256'
+test_api:
+  - "transformer_int4_gpu" # on Intel GPU
+cpu_embedding: False # whether to put embedding on CPU (only available now for GPU Windows related test_api)
+exclude:
+  - 'fnlp/moss-moon-003-sft:1024'
+  - 'fnlp/moss-moon-003-sft:2048'
+  - 'bigscience/bloomz-7b1:2048'
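This config is consumed by `run.py` (not shown in this diff). A rough sketch of how such a runner could expand it into test cases, using omegaconf (installed by the workflow above); the `'repo:1024'` exclude format is assumed here to match an `in_out_pair`'s input length:

```python
# Rough sketch of config expansion; run.py's actual parsing may differ.
from omegaconf import OmegaConf

conf = OmegaConf.load("config.yaml")
excluded = set(conf.get("exclude", []) or [])

cases = []
for model in conf.repo_id:
    for pair in conf.in_out_pairs:
        in_len = pair.split("-")[0]           # '2048-256' -> input length '2048'
        if f"{model}:{in_len}" in excluded:   # e.g. 'bigscience/bloomz-7b1:2048'
            continue
        for api in conf.test_api:
            cases.append((model, pair, api))

print(f"{len(cases)} benchmark cases to run")
```

Under that assumption this config yields 5 models x 3 in_out_pairs x 1 API = 15 cases per pass; none of the exclude entries name a listed repo_id, so they would have no effect on this particular run.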
diff --git a/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml b/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml
new file mode 100644
index 00000000..ff11c034
--- /dev/null
+++ b/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml
@@ -0,0 +1,28 @@
+repo_id:
+  - 'meta-llama/Llama-2-7b-chat-hf'
+  - 'meta-llama/Llama-2-13b-chat-hf'
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'baichuan-inc/Baichuan2-13B-Chat'
+  - 'Qwen/Qwen-14B-Chat'
+local_model_hub: '/models'
+warm_up: 1
+num_trials: 3
+num_beams: 1 # default to greedy search
+low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+in_out_pairs:
+  - '32-32'
+  - '1024-128'
+  - '2048-256'
+test_api:
+  - "transformer_int4"
+  # - "native_int4"
+  # - "optimize_model"
+  # - "pytorch_autocast_bf16"
+  # - "ipex_fp16_gpu" # on Intel GPU
+  # - "transformer_int4_gpu" # on Intel GPU
+  # - "optimize_model_gpu" # on Intel GPU
+  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
+  # - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
+cpu_embedding: False # whether to put embedding on CPU (only available now for GPU Windows related test_api)
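Both jobs finish by rendering results with `csv_to_html.py -f <folder>` (after pinning `pandas==1.5.3`). A minimal sketch of that kind of conversion, assuming one HTML table per CSV in the target folder; the real script is not in this diff and may do more, such as comparing against a previous run:

```python
# Minimal csv_to_html.py-style sketch: one HTML table per result CSV.
import argparse
import glob
import os

import pandas as pd


def folder_to_html(folder: str) -> None:
    # Render each result CSV in the folder as a standalone HTML table.
    for csv_path in glob.glob(os.path.join(folder, "*.csv")):
        html_path = os.path.splitext(csv_path)[0] + ".html"
        pd.read_csv(csv_path).to_html(html_path, index=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--folder", required=True)
    folder_to_html(parser.parse_args().folder)
```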