Arc stress test (#9795)

* add arc stress test

* trigger ci

* trigger CI

* trigger ci

* disable ci
dingbaorong 2023-12-27 21:02:41 +08:00 committed by GitHub
parent 40eaf76ae3
commit f6bb4ab313
7 changed files with 668 additions and 559 deletions

@@ -102,6 +102,88 @@ jobs:
cd ../../../test/benchmark
python -m pip install pandas==1.5.3
python csv_to_html.py -f $CSV_SAVE_PATH/fp8 -b $CSV_SAVE_PATH/fp8/transformer_int4_gpu-results-1baseline.csv -t 5.0
llm-stress-test-on-arc:
needs: llm-perf-regression-test-on-arc
strategy:
fail-fast: false
matrix:
python-version: ["3.9"]
runs-on: [self-hosted, llm, perf]
env:
OMP_NUM_THREADS: 16
THREAD_NUM: 16
ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
CSV_SAVE_PATH: '/mnt/disk1/stable_version_stress_test_gpu/'
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
shell: bash
# pip install transformers_stream_generator for model internlm-chat-7b-8k
# pip install tiktoken for model Qwen-7B-Chat-10-12
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade wheel
python -m pip install --upgrade omegaconf
python -m pip install --upgrade pandas
python -m pip install --upgrade einops
python -m pip install --upgrade transformers_stream_generator
python -m pip install --upgrade tiktoken
- name: Download llm binary
uses: ./.github/actions/llm/download-llm-binary
- name: Run LLM install (all) test
uses: ./.github/actions/llm/setup-llm-env
with:
extra-dependency: "xpu"
- name: Test installed xpu version
shell: bash
run: |
source /home/arda/intel/oneapi/setvars.sh
bash python/llm/test/run-llm-install-tests.sh
- name: Test on xpu (int4)
shell: bash
run: |
source /home/arda/intel/oneapi/setvars.sh
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
mv python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml python/llm/dev/benchmark/all-in-one/config.yaml
cd python/llm/dev/benchmark/all-in-one
# hide time info
sed -i 's/str(end - st)/"xxxxxx"/g' run-stress-test.py
python run-stress-test.py
cp ./*.csv $CSV_SAVE_PATH/int4
rm ./*.csv
cd ../../../test/benchmark
python -m pip install pandas==1.5.3
python csv_to_html.py -f $CSV_SAVE_PATH/int4
- name: Test on xpu (fp8)
shell: bash
run: |
source /home/arda/intel/oneapi/setvars.sh
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
mv python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml python/llm/dev/benchmark/all-in-one/config.yaml
cd python/llm/dev/benchmark/all-in-one
# hide time info
sed -i 's/str(end - st)/"xxxxxx"/g' run-stress-test.py
python run-stress-test.py
cp ./*.csv $CSV_SAVE_PATH/fp8
rm ./*.csv
cd ../../../test/benchmark
python -m pip install pandas==1.5.3
python csv_to_html.py -f $CSV_SAVE_PATH/fp8
llm-perf-regression-test-on-spr:
@@ -209,4 +291,4 @@ jobs:
cp ./*.csv /models/stable_version_stress_test_cpu/
cd ../../../test/benchmark
python -m pip install pandas==1.5.3
python csv_to_html.py -f /models/stable_version_stress_test_cpu/
python csv_to_html.py -f /models/stable_version_stress_test_cpu/
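
Note: the sed step above ("# hide time info") is a plain string substitution that masks the raw wall-clock value printed for each generation, so run-to-run timing noise does not end up in the logs and generated reports. A minimal Python sketch of the same substitution, using the print statement from run.py shown further below (it is assumed run-stress-test.py prints the same line):

original = 'print("model generate cost: " + str(end - st))'
masked = original.replace('str(end - st)', '"xxxxxx"')
print(masked)  # print("model generate cost: " + "xxxxxx")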

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@@ -45,22 +45,6 @@ LLAVA_IDS = ['liuhaotian/llava-v1.5-7b']
results = []
excludes = []
def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials):
for i in range(num_trials + warm_up):
st = time.perf_counter()
output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
num_beams=num_beams)
torch.xpu.synchronize()
end = time.perf_counter()
output_ids = output_ids.cpu()
print("model generate cost: " + str(end - st))
output = tokenizer.batch_decode(output_ids)
print(output[0])
actual_out_len = output_ids.shape[1] - actual_in_len
if i >= warm_up:
result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
actual_in_len, actual_out_len])
def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False):
# TODO: make a parameter
result= {}
@@ -82,8 +66,7 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
num_beams,
low_bit,
cpu_embedding if 'win' in test_api else 'N/A',
result[in_out_pair][-1][5] if 'win' in test_api else 'N/A']) # currently only peak mem for win gpu is caught here
result[in_out_pair][-1][5] if 'int4_gpu' in test_api else 'N/A']) # currently only peak mem for win gpu is caught here
def get_model_path(repo_id, local_model_hub):
if local_model_hub:
@@ -95,10 +78,6 @@ def get_model_path(repo_id, local_model_hub):
else:
return repo_id
def run_transformer_int4(repo_id,
local_model_hub,
in_out_pairs,
@@ -158,10 +137,11 @@ def run_transformer_int4(repo_id,
result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
actual_in_len, actual_out_len])
i += 1
if i >= warm_up+num_trials:
break
return result
def run_transformer_int4_gpu(repo_id,
local_model_hub,
in_out_pairs,
@@ -172,6 +152,7 @@ def run_transformer_int4_gpu(repo_id,
from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
import intel_extension_for_pytorch as ipex
reserved_mem_list = []
model_path = get_model_path(repo_id, local_model_hub)
# Load model in 4 bit,
# which convert the relevant layers in the model into INT4 format
@@ -196,6 +177,7 @@ def run_transformer_int4_gpu(repo_id,
model = ipex.optimize(model.eval(), inplace=True)
end = time.perf_counter()
print(">> loading of model costs {}s".format(end - st))
reserved_mem_list.append(torch.xpu.memory.memory_reserved()/(1024**3))
model = BenchmarkWrapper(model)
@@ -205,31 +187,42 @@ def run_transformer_int4_gpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
# As different tokenizer has different encodings,
# in_len.txt maybe shorter than we need,
# use much longer context to make sure input length
test_length = min(in_len*2, 8192)
while test_length not in [32, 256, 1024, 2048, 8192]:
test_length = test_length * 2
input_str = open(f"prompt/{test_length}.txt", 'r').read()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
actual_in_len = input_ids.shape[1]
result[in_out] = []
thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials))
thread.start()
thread.join()
del model
i = 0
with open("prompt/stress_test.txt", 'r') as file:
for input_str in file:
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
input_ids = input_ids[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
actual_in_len = input_ids.shape[1]
result[in_out] = []
st = time.perf_counter()
output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
num_beams=num_beams)
torch.xpu.synchronize()
end = time.perf_counter()
reserved_mem_list.append(torch.xpu.memory.memory_reserved()/(1024**3))
gpu_peak_mem = max(reserved_mem_list) # always keep the peak gpu mem at current stage
output_ids = output_ids.cpu()
print("model generate cost: " + str(end - st))
output = tokenizer.batch_decode(output_ids)
print(output[0])
actual_out_len = output_ids.shape[1] - actual_in_len
if i >= warm_up:
result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
actual_in_len, actual_out_len, gpu_peak_mem])
i += 1
if i >= warm_up+num_trials:
break
model.to('cpu')
torch.xpu.synchronize()
torch.xpu.empty_cache()
del model
gc.collect()
return result
if __name__ == '__main__':
from omegaconf import OmegaConf
conf = OmegaConf.load(f'{current_dir}/config.yaml')
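
For reference, the new bookkeeping in run_transformer_int4_gpu appends torch.xpu.memory.memory_reserved()/(1024**3) to reserved_mem_list once after model load and once after every generate call, and takes the running maximum as gpu_peak_mem. A minimal standalone sketch of that pattern, assuming intel_extension_for_pytorch is installed so the xpu backend is registered; generate_once is a hypothetical stand-in for the model.generate(...) call:

import torch
import intel_extension_for_pytorch as ipex  # noqa: F401, registers the torch.xpu backend

def peak_reserved_gib(generate_once, num_runs):
    reserved_mem_list = []
    for _ in range(num_runs):
        generate_once()
        torch.xpu.synchronize()
        # reserved device memory in GiB, same call as in the stress test above
        reserved_mem_list.append(torch.xpu.memory.memory_reserved() / (1024 ** 3))
    return max(reserved_mem_list)  # peak across all runs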

@@ -0,0 +1,20 @@
repo_id:
- 'meta-llama/Llama-2-7b-chat-hf'
- 'THUDM/chatglm2-6b'
- 'THUDM/chatglm3-6b'
- 'baichuan-inc/Baichuan2-7B-Chat'
- 'Qwen/Qwen-7B-Chat'
local_model_hub: '/mnt/disk1/models'
warm_up: 10
num_trials: 100
num_beams: 1 # default to greedy search
low_bit: 'fp8' # default to use 'sym_int4' (i.e. symmetric int4)
in_out_pairs:
- '1024-512'
- '2048-512'
test_api:
- "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether to put embedding on CPU (only available now for gpu win related test_api)
exclude:
- 'baichuan-inc/Baichuan2-7B-Chat:2048'
- 'Qwen/Qwen-7B-Chat:2048'

@@ -0,0 +1,19 @@
repo_id:
- 'meta-llama/Llama-2-7b-chat-hf'
- 'THUDM/chatglm2-6b'
- 'THUDM/chatglm3-6b'
- 'baichuan-inc/Baichuan2-7B-Chat'
- 'Qwen/Qwen-7B-Chat'
local_model_hub: '/mnt/disk1/models'
warm_up: 10
num_trials: 100
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
in_out_pairs:
- '1024-512'
- '2048-512'
test_api:
- "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether to put embedding on CPU (only available now for gpu win related test_api)
exclude:
- 'Qwen/Qwen-7B-Chat:2048'
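
The exclude entries in both stress-test configs take the form '<repo_id>:<input length>'. The code that consumes them is not part of this diff (only the empty excludes list in run.py appears above), so the following is just a guess at how such an entry might be matched against a model and an in_out_pair:

def is_excluded(repo_id, in_out_pair, excludes):
    # hypothetical matching: '2048-512' -> input length '2048'
    in_len = in_out_pair.split('-')[0]
    return f"{repo_id}:{in_len}" in excludes

print(is_excluded('Qwen/Qwen-7B-Chat', '2048-512', ['Qwen/Qwen-7B-Chat:2048']))  # True

Under that reading, the sym_int4 config above skips the 2048-512 pair for Qwen-7B-Chat while the 1024-512 pair still runs.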

@@ -7,8 +7,8 @@ repo_id:
- 'baichuan-inc/Baichuan2-13B-Chat'
- 'Qwen/Qwen-14B-Chat'
local_model_hub: '/models'
warm_up: 1
num_trials: 4
warm_up: 10
num_trials: 100
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
in_out_pairs: