LLM: fix inaccurate input / output tokens of current all-in-one benchmark (#9137)

* first fix

* fix all apis

* fix
Authored by Ruonan Wang on 2023-10-11 17:13:34 +08:00; committed by GitHub
parent e02fbb40cc
commit 62ac7ae444

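The change is the same across every run_* API in the benchmark script: instead of reading prompt/{in_len}.txt directly, each API now reads a prompt file comfortably longer than the requested input length, slices the tokenized prompt down to in_len tokens, and records the actual input and output token counts alongside the latency numbers. Below is a minimal standalone sketch (not part of the commit) of the new prompt-file selection, assuming, as the diff implies, that prompt files of 32, 256, 1024, 2048 and 8192 tokens exist under prompt/; pick_prompt_file is an illustrative helper that does not exist in the script.

# Illustrative sketch of the test_length selection added in this commit.
def pick_prompt_file(in_len):
    # Start from roughly double the requested length so the tokenized prompt
    # can always be sliced down to exactly in_len tokens, capped at 8192.
    test_length = min(in_len * 2, 8192)
    # Round up to the nearest available prompt file size.
    while test_length not in [32, 256, 1024, 2048, 8192]:
        test_length = test_length * 2
    return f"prompt/{test_length}.txt"

for requested in [32, 256, 1024, 2048, 8192]:
    print(requested, "->", pick_prompt_file(requested))
# prints: 32 -> prompt/256.txt, 256 -> prompt/1024.txt, 1024 -> prompt/2048.txt,
#         2048 -> prompt/8192.txt, 8192 -> prompt/8192.txt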

@@ -60,7 +60,10 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
                         np.mean(result[in_out_pair], axis=0)[0],
                         np.mean(result[in_out_pair], axis=0)[1],
                         np.mean(result[in_out_pair], axis=0)[2],
-                        in_out_pair])
+                        in_out_pair,
+                        f'{int(np.mean(result[in_out_pair], axis=0)[3])}' +
+                        f'-{int(np.mean(result[in_out_pair], axis=0)[4])}'])
 
 def get_model_path(repo_id, local_model_hub):
     if local_model_hub:
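Each per-trial record collected by the run_* helpers now carries five values instead of three, and run_model averages them per in/out pair before appending a summary row. A small sketch with made-up numbers showing how indices 3 and 4 of the averaged record become the new 'actual input/output tokens' string:

import numpy as np

# Hypothetical per-trial records in the shape collected after this change:
# [1st token latency, 2+ token latency, encoder time, actual_in_len, actual_out_len]
trials = [[0.52, 0.031, 0.0, 1024, 512],
          [0.50, 0.030, 0.0, 1024, 511]]
mean = np.mean(trials, axis=0)
# Indices 3 and 4 are joined into the summary column, here "1024-511".
print(f'{int(mean[3])}-{int(mean[4])}')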
@@ -144,13 +147,20 @@ def run_transformer_int4(repo_id,
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
             input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
@@ -159,8 +169,10 @@ def run_transformer_int4(repo_id,
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
                 print(output[0])
+                actual_out_len = output_ids.shape[1] - actual_in_len
                 if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
     return result
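The 'actual' counts can differ from the requested in_len and out_len because slicing token ids, decoding them, and re-encoding the decoded text does not always round-trip to the same number of tokens, and generation may stop early at an end-of-sequence token. A hedged sketch of the input-side effect, using the GPT-2 tokenizer purely as an example (it is not the tokenizer used by the benchmark):

# Requires `pip install transformers`; downloads the small gpt2 tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
in_len = 32
input_str = "The quick brown fox jumps over the lazy dog. " * 20  # stand-in for prompt/<test_length>.txt
input_ids = tokenizer.encode(input_str, return_tensors="pt")[:, :in_len]
true_str = tokenizer.batch_decode(input_ids)[0]
input_ids = tokenizer.encode(true_str, return_tensors="pt")
actual_in_len = input_ids.shape[1]  # what the benchmark now reports; can differ from in_len
print(in_len, actual_in_len)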
@@ -192,13 +204,20 @@ def run_pytorch_autocast_bf16(repo_id,
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
             input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            actual_in_len = input_ids.shape[1]
             result[in_out] = []
             print("input tokens: {}".format(input_ids.shape[1]))
             for i in range(num_trials + warm_up):
@@ -208,8 +227,10 @@ def run_pytorch_autocast_bf16(repo_id,
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
                 print(output[0])
+                actual_out_len = output_ids.shape[1] - actual_in_len
                 if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
     return result
@@ -248,13 +269,20 @@ def run_optimize_model(repo_id,
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
             input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
@@ -263,8 +291,10 @@ def run_optimize_model(repo_id,
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
                 print(output[0])
+                actual_out_len = output_ids.shape[1] - actual_in_len
                 if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
     return result
@@ -309,13 +339,20 @@ def run_transformer_int4_gpu(repo_id,
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
             input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
@@ -326,8 +363,10 @@ def run_transformer_int4_gpu(repo_id,
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
                 print(output[0])
+                actual_out_len = output_ids.shape[1] - actual_in_len
                 if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
         torch.xpu.empty_cache()
     return result
@@ -376,13 +415,20 @@ def run_optimize_model_gpu(repo_id,
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
             input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
@@ -392,9 +438,11 @@ def run_optimize_model_gpu(repo_id,
                 output_ids = output_ids.cpu()
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
+                actual_out_len = output_ids.shape[1] - actual_in_len
                 print(output[0])
                 if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
         torch.xpu.empty_cache()
     return result
@@ -436,13 +484,20 @@ def run_ipex_fp16_gpu(repo_id,
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
             input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
@@ -452,9 +507,11 @@ def run_ipex_fp16_gpu(repo_id,
                 output_ids = output_ids.cpu()
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
+                actual_out_len = output_ids.shape[1] - actual_in_len
                 print(output[0])
                 if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
         torch.xpu.empty_cache()
     return result
@@ -468,6 +525,6 @@ if __name__ == '__main__':
     for api in conf.test_api:
         for model in conf.repo_id:
             run_model(model, api, conf['in_out_pairs'], conf['local_model_hub'], conf['warm_up'], conf['num_trials'])
-        df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)', '2+ avg latency (s/token)', 'encoder time (s)', 'input/output tokens'])
+        df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)', '2+ avg latency (s/token)', 'encoder time (s)', 'input/output tokens', 'actual input/output tokens'])
         df.to_csv(f'{current_dir}/{api}-results-{today}.csv')
         results = []
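With the last hunk, the per-API CSV gains an 'actual input/output tokens' column next to the configured 'input/output tokens' pair, which makes it easy to spot runs whose effective prompt or generation length deviated from the configured one. A quick, hedged example of inspecting that column; the file name below is a placeholder following the script's {api}-results-{today}.csv pattern:

import pandas as pd

# Placeholder file name; substitute the CSV actually written by the benchmark script.
df = pd.read_csv('transformer_int4-results-2023-10-11.csv', index_col=0)
# 'input/output tokens' is the configured pair (e.g. "1024-512");
# 'actual input/output tokens' is the measured average (e.g. "1023-509").
print(df[['model', 'input/output tokens', 'actual input/output tokens']])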