fix nightly perf test (#11251)

commit 85df5e7699 (parent 2f809116e2)
Author: Shaojun Liu
Date:   2024-06-07 09:33:14 +08:00 (committed via GitHub)
2 changed files with 20 additions and 20 deletions


@@ -210,7 +210,7 @@ def run_native_int4(repo_id,
     in_out_len = in_out.split("-")
     in_len = int(in_out_len[0])
     out_len = int(in_out_len[1])
-    input_str = open(f"prompt/{in_len}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{in_len}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     n_ctx = in_len + out_len if in_len + out_len > 512 else 512
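In this first runner the prompt file is keyed directly on in_len, and the native context size is floored at 512. A minimal sketch of that selection logic, using a hypothetical "1024-128" in/out pair (the real pairs come from the perf-test config, not this diff):

    # Hypothetical in/out pair for illustration only.
    in_out = "1024-128"
    in_out_len = in_out.split("-")
    in_len = int(in_out_len[0])   # 1024
    out_len = int(in_out_len[1])  # 128
    # Equivalent to max(in_len + out_len, 512) -> 1152 here.
    n_ctx = in_len + out_len if in_len + out_len > 512 else 512
    # After this commit, prompt files are read from prompt/continuation/.
    input_str = open(f"prompt/continuation/{in_len}.txt", 'r').read()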
@@ -272,7 +272,7 @@ def run_transformer_int4(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
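Every remaining runner in this file selects its prompt file the same way: test_length starts at min(in_len*2, 8192) and doubles until it lands on one of the bundled prompt sizes, and this commit simply retargets the lookup at the relocated prompt/continuation/ directory. A sketch of that selection, assuming the size list shown in the hunks:

    def pick_prompt_file(in_len):
        # Double until one of the bundled prompt sizes is hit,
        # mirroring the loop repeated in the hunks below.
        test_length = min(in_len * 2, 8192)
        while test_length not in [32, 256, 1024, 2048, 8192]:
            test_length = test_length * 2
        return f"prompt/continuation/{test_length}.txt"

    pick_prompt_file(512)   # 'prompt/continuation/1024.txt'
    pick_prompt_file(2048)  # 4096 is not listed, so it doubles to 8192.txt

Note the loop only terminates when the doubling sequence actually lands on one of those sizes; the run_deepspeed_optimize_model_gpu hunk further down carries an extra test_length < 8192 guard that bounds the loop.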
@@ -337,7 +337,7 @@ def run_pytorch_autocast_bf16(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -408,7 +408,7 @@ def run_optimize_model(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -597,7 +597,7 @@ def run_optimize_model_gpu(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -669,7 +669,7 @@ def run_ipex_fp16_gpu(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -747,7 +747,7 @@ def run_bigdl_fp16_gpu(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -839,7 +839,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -926,7 +926,7 @@ def run_transformer_int4_gpu_win(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1035,7 +1035,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1139,7 +1139,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1228,7 +1228,7 @@ def run_transformer_autocast_bf16( repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1299,7 +1299,7 @@ def run_bigdl_ipex_bf16(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1369,7 +1369,7 @@ def run_bigdl_ipex_int4(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1439,7 +1439,7 @@ def run_bigdl_ipex_int8(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1550,7 +1550,7 @@ def run_deepspeed_optimize_model_gpu(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192] and test_length < 8192:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
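This is the one runner whose loop carries the extra test_length < 8192 condition, so the doubling always stops once it reaches the 8192 cap, even when no listed size is ever hit:

    # Hypothetical in_len whose doublings never land on a listed size.
    in_len = 3000
    test_length = min(in_len * 2, 8192)  # 6000
    while test_length not in [32, 256, 1024, 2048, 8192] and test_length < 8192:
        test_length = test_length * 2
    print(test_length)  # 12000 -- the guard exits the loop instead of spinning forever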
@@ -1627,7 +1627,7 @@ def run_speculative_cpu(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1705,7 +1705,7 @@ def run_speculative_gpu(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1825,7 +1825,7 @@ def run_pipeline_parallel_gpu(repo_id,
     test_length = min(in_len*2, 8192)
     while test_length not in [32, 256, 1024, 2048, 8192]:
         test_length = test_length * 2
-    input_str = open(f"prompt/{test_length}.txt", 'r').read()
+    input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read()
     # As different tokenizer has different encodings,
     # slice the input_ids to ensure the prompt length is required length.
     input_ids = tokenizer.encode(input_str, return_tensors="pt")


@@ -247,7 +247,7 @@ parser.add_argument(
 )
 args = parser.parse_args()
 PROMPT_LENGTH = args.prompt_length
-PROMPT = open(f"prompt/{PROMPT_LENGTH}.txt", "r").read()
+PROMPT = open(f"prompt/continuation/{PROMPT_LENGTH}.txt", "r").read()
 MAX_TOKENS = args.max_new_tokens
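The second file applies the same path relocation to a module-level constant. A minimal, self-contained sketch of the surrounding setup; the flag names and defaults here are assumptions inferred from the variable names in the hunk, not confirmed by the diff:

    import argparse

    parser = argparse.ArgumentParser()
    # Flag names and defaults are hypothetical, inferred from the hunk's variables.
    parser.add_argument("--prompt-length", type=int, default=512, dest="prompt_length")
    parser.add_argument("--max-new-tokens", type=int, default=128, dest="max_new_tokens")
    args = parser.parse_args()

    PROMPT_LENGTH = args.prompt_length
    # Prompt files live under prompt/continuation/ after this commit.
    PROMPT = open(f"prompt/continuation/{PROMPT_LENGTH}.txt", "r").read()
    MAX_TOKENS = args.max_new_tokens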