Update all-in-one benchmark for continuation task input preparation (#11760)
* All use 8192.txt for prompt preparation for now
* Small fix
* Fix text encoding mode to utf-8
* Small update
This commit is contained in:
parent
1b05caba2b
commit
f97a77ea4e
1 changed file with 25 additions and 32 deletions
|
|
@ -75,16 +75,9 @@ def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, in
|
|||
actual_in_len, actual_out_len, load_time, model.peak_memory])
|
||||
|
||||
|
||||
def get_continuation_input_str(in_len):
    """Return the text of a continuation prompt file long enough for ``in_len``.

    Prompt files live under ``prompt/continuation/`` (relative to the current
    working directory) and are named ``32.txt``, ``256.txt``, ``1024.txt``,
    ``2048.txt`` and ``8192.txt``.

    Args:
        in_len: Requested prompt length; must be a positive number.

    Returns:
        The full contents of the selected prompt file, decoded as UTF-8.

    Raises:
        ValueError: If ``in_len`` is not positive (doubling a non-positive
            length can never reach a bucket, so the search would not end).
        OSError: If the selected prompt file cannot be opened.
    """
    if in_len <= 0:
        raise ValueError(f"in_len must be positive, got {in_len}")

    # <in_len>.txt may be shorter than we need, so start from a context twice
    # as long to make sure the input is long enough.
    test_length = min(in_len * 2, 8192)
    # Keep doubling until we land on an available bucket or pass 8192.
    while test_length not in (32, 256, 1024, 2048, 8192) and test_length < 8192:
        test_length *= 2
    # Force anything past the largest bucket back to 8192 so 8192.txt is used.
    test_length = min(test_length, 8192)

    # Close the handle deterministically and decode with a fixed encoding
    # rather than the platform default.
    with open(f"prompt/continuation/{test_length}.txt", 'r', encoding='utf-8') as prompt_file:
        return prompt_file.read()
|
||||
def get_continuation_input_str():
    """Return the full text of the shared continuation prompt file.

    The prompt is read from ``prompt/continuation/8192.txt``, resolved
    relative to the current working directory.

    Returns:
        The entire file contents as a ``str``, decoded as UTF-8.

    Raises:
        OSError: If the prompt file cannot be opened.
    """
    # all use 8192.txt for prompt preparation for now; and keep 'utf-8' as character encoding mode
    # The ``with`` block closes the handle as soon as the text has been read.
    with open("prompt/continuation/8192.txt", 'r', encoding='utf-8') as prompt_file:
        return prompt_file.read()
|
||||
|
||||
|
||||
def preprocess_prompt(tokenizer, in_len, task):
|
||||
|
|
@ -231,7 +224,7 @@ def run_native_int4(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
n_ctx = in_len + out_len if in_len + out_len > 512 else 512
|
||||
|
|
@ -287,7 +280,7 @@ def run_transformer_int4(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -346,7 +339,7 @@ def run_pytorch_autocast_bf16(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -411,7 +404,7 @@ def run_optimize_model(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -520,7 +513,7 @@ def run_transformer_int4_gpu(repo_id,
|
|||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
if task == 'continuation':
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -603,7 +596,7 @@ def transformers_int4_npu_win(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -674,7 +667,7 @@ def run_optimize_model_gpu(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -739,7 +732,7 @@ def run_ipex_fp16_gpu(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -810,7 +803,7 @@ def run_bigdl_fp16_gpu(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -896,7 +889,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -989,7 +982,7 @@ def run_transformer_int4_gpu_win(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -1106,7 +1099,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -1216,7 +1209,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -1326,7 +1319,7 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -1409,7 +1402,7 @@ def run_transformer_autocast_bf16( repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -1474,7 +1467,7 @@ def run_bigdl_ipex_bf16(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -1538,7 +1531,7 @@ def run_bigdl_ipex_int4(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -1602,7 +1595,7 @@ def run_bigdl_ipex_int8(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -1706,7 +1699,7 @@ def run_deepspeed_optimize_model_gpu(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -1777,7 +1770,7 @@ def run_speculative_cpu(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -1849,7 +1842,7 @@ def run_speculative_gpu(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
@ -1930,7 +1923,7 @@ def run_pipeline_parallel_gpu(repo_id,
|
|||
in_out_len = in_out.split("-")
|
||||
in_len = int(in_out_len[0])
|
||||
out_len = int(in_out_len[1])
|
||||
input_str = get_continuation_input_str(in_len)
|
||||
input_str = get_continuation_input_str()
|
||||
# As different tokenizer has different encodings,
|
||||
# slice the input_ids to ensure the prompt length is required length.
|
||||
input_ids = tokenizer.encode(input_str, return_tensors="pt")
|
||||
|
|
|
|||
Loading…
Reference in a new issue