LLM: fix inaccurate input / output tokens of current all-in-one benchmark (#9137)
* first fix
* fix all apis
* fix
This commit is contained in:
parent e02fbb40cc
commit 62ac7ae444

1 changed file with 71 additions and 14 deletions
@@ -60,7 +60,10 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
                         np.mean(result[in_out_pair], axis=0)[0],
                         np.mean(result[in_out_pair], axis=0)[1],
                         np.mean(result[in_out_pair], axis=0)[2],
-                        in_out_pair])
+                        in_out_pair,
+                        f'{int(np.mean(result[in_out_pair], axis=0)[3])}' +
+                        f'-{int(np.mean(result[in_out_pair], axis=0)[4])}'])


 def get_model_path(repo_id, local_model_hub):
     if local_model_hub:
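With the two extra per-trial fields (indices 3 and 4: the measured input and output token counts), run_model can average them across trials and render an "input-output" string for the new CSV column. A minimal sketch of that aggregation, using made-up trial numbers rather than real benchmark output:

    import numpy as np

    # Hypothetical per-trial records: [1st token latency, 2+ latency,
    # encoder time, actual input tokens, actual output tokens]
    trials = [
        [0.51, 0.032, 0.004, 1026, 512],
        [0.49, 0.031, 0.004, 1026, 511],
    ]
    means = np.mean(trials, axis=0)                 # column-wise means
    actual_in_out = f'{int(means[3])}' + f'-{int(means[4])}'
    print(actual_in_out)                            # prints "1026-511"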
@@ -144,13 +147,20 @@ def run_transformer_int4(repo_id,
         in_out_len = in_out.split("-")
         in_len = int(in_out_len[0])
         out_len = int(in_out_len[1])
-        input_str = open(f"prompt/{in_len}.txt", 'r').read()
+        # As different tokenizers have different encodings,
+        # in_len.txt may be shorter than we need;
+        # use a much longer context to make sure the input is long enough.
+        test_length = min(in_len*2, 8192)
+        while test_length not in [32, 256, 1024, 2048, 8192]:
+            test_length = test_length * 2
+        input_str = open(f"prompt/{test_length}.txt", 'r').read()
         # As different tokenizer has different encodings,
         # slice the input_ids to ensure the prompt length is required length.
         input_ids = tokenizer.encode(input_str, return_tensors="pt")
         input_ids = input_ids[:, :in_len]
         true_str = tokenizer.batch_decode(input_ids)[0]
         input_ids = tokenizer.encode(true_str, return_tensors="pt")
+        actual_in_len = input_ids.shape[1]
         result[in_out] = []
         for i in range(num_trials + warm_up):
             st = time.perf_counter()
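The benchmark ships fixed-size prompt files (prompt/32.txt through prompt/8192.txt). The new selection loop starts from twice the requested input length and doubles until it lands on a shipped size, so slicing to in_len tokens never runs out of text; for the input lengths the config uses, the doubling always reaches a listed size. A standalone sketch of that logic; pick_prompt_length is a name introduced here for illustration, the script inlines the loop:

    AVAILABLE_PROMPT_LENGTHS = [32, 256, 1024, 2048, 8192]  # assumed shipped files

    def pick_prompt_length(in_len: int) -> int:
        # Start at twice the requested length (capped at the largest file),
        # then keep doubling until a shipped prompt size is hit.
        test_length = min(in_len * 2, 8192)
        while test_length not in AVAILABLE_PROMPT_LENGTHS:
            test_length *= 2
        return test_length

    print(pick_prompt_length(32))    # 256
    print(pick_prompt_length(1024))  # 2048
    print(pick_prompt_length(2048))  # 8192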
@@ -159,8 +169,10 @@ def run_transformer_int4(repo_id,
             print("model generate cost: " + str(end - st))
             output = tokenizer.batch_decode(output_ids)
             print(output[0])
+            actual_out_len = output_ids.shape[1] - actual_in_len
             if i >= warm_up:
-                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                       actual_in_len, actual_out_len])
     return result


 def run_pytorch_autocast_bf16(repo_id,
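The measured counts come from an encode-slice-decode-re-encode round trip: the sliced prompt is decoded back to text, the token count of the re-encoded text is what the model actually sees (which can differ slightly from the requested in_len), and the output count is the generated sequence length minus actual_in_len. A self-contained sketch of the same pattern, using plain transformers with gpt2 purely as a stand-in for the benchmarked model:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")     # stand-in tokenizer
    model = AutoModelForCausalLM.from_pretrained("gpt2")  # stand-in model

    in_len, out_len = 32, 32
    input_str = "The quick brown fox jumps over the lazy dog. " * 64  # stand-in prompt text

    input_ids = tokenizer.encode(input_str, return_tensors="pt")
    input_ids = input_ids[:, :in_len]                     # slice to the requested length
    true_str = tokenizer.batch_decode(input_ids)[0]       # the text actually benchmarked
    input_ids = tokenizer.encode(true_str, return_tensors="pt")
    actual_in_len = input_ids.shape[1]                    # may differ from in_len

    output_ids = model.generate(input_ids, max_new_tokens=out_len)
    actual_out_len = output_ids.shape[1] - actual_in_len
    print(f"actual input/output tokens: {actual_in_len}-{actual_out_len}")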
@@ -192,13 +204,20 @@ def run_pytorch_autocast_bf16(repo_id,
         in_out_len = in_out.split("-")
         in_len = int(in_out_len[0])
         out_len = int(in_out_len[1])
-        input_str = open(f"prompt/{in_len}.txt", 'r').read()
+        # As different tokenizers have different encodings,
+        # in_len.txt may be shorter than we need;
+        # use a much longer context to make sure the input is long enough.
+        test_length = min(in_len*2, 8192)
+        while test_length not in [32, 256, 1024, 2048, 8192]:
+            test_length = test_length * 2
+        input_str = open(f"prompt/{test_length}.txt", 'r').read()
         # As different tokenizer has different encodings,
         # slice the input_ids to ensure the prompt length is required length.
         input_ids = tokenizer.encode(input_str, return_tensors="pt")
         input_ids = input_ids[:, :in_len]
         true_str = tokenizer.batch_decode(input_ids)[0]
         input_ids = tokenizer.encode(true_str, return_tensors="pt")
+        actual_in_len = input_ids.shape[1]
         result[in_out] = []
         print("input tokens: {}".format(input_ids.shape[1]))
         for i in range(num_trials + warm_up):
@@ -208,8 +227,10 @@ def run_pytorch_autocast_bf16(repo_id,
             print("model generate cost: " + str(end - st))
             output = tokenizer.batch_decode(output_ids)
             print(output[0])
+            actual_out_len = output_ids.shape[1] - actual_in_len
             if i >= warm_up:
-                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                       actual_in_len, actual_out_len])
     return result


 def run_optimize_model(repo_id,
@@ -248,13 +269,20 @@ def run_optimize_model(repo_id,
         in_out_len = in_out.split("-")
         in_len = int(in_out_len[0])
         out_len = int(in_out_len[1])
-        input_str = open(f"prompt/{in_len}.txt", 'r').read()
+        # As different tokenizers have different encodings,
+        # in_len.txt may be shorter than we need;
+        # use a much longer context to make sure the input is long enough.
+        test_length = min(in_len*2, 8192)
+        while test_length not in [32, 256, 1024, 2048, 8192]:
+            test_length = test_length * 2
+        input_str = open(f"prompt/{test_length}.txt", 'r').read()
         # As different tokenizer has different encodings,
         # slice the input_ids to ensure the prompt length is required length.
         input_ids = tokenizer.encode(input_str, return_tensors="pt")
         input_ids = input_ids[:, :in_len]
         true_str = tokenizer.batch_decode(input_ids)[0]
         input_ids = tokenizer.encode(true_str, return_tensors="pt")
+        actual_in_len = input_ids.shape[1]
         result[in_out] = []
         for i in range(num_trials + warm_up):
             st = time.perf_counter()
@@ -263,8 +291,10 @@ def run_optimize_model(repo_id,
             print("model generate cost: " + str(end - st))
             output = tokenizer.batch_decode(output_ids)
             print(output[0])
+            actual_out_len = output_ids.shape[1] - actual_in_len
             if i >= warm_up:
-                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                       actual_in_len, actual_out_len])
     return result

@@ -309,13 +339,20 @@ def run_transformer_int4_gpu(repo_id,
         in_out_len = in_out.split("-")
         in_len = int(in_out_len[0])
         out_len = int(in_out_len[1])
-        input_str = open(f"prompt/{in_len}.txt", 'r').read()
+        # As different tokenizers have different encodings,
+        # in_len.txt may be shorter than we need;
+        # use a much longer context to make sure the input is long enough.
+        test_length = min(in_len*2, 8192)
+        while test_length not in [32, 256, 1024, 2048, 8192]:
+            test_length = test_length * 2
+        input_str = open(f"prompt/{test_length}.txt", 'r').read()
         # As different tokenizer has different encodings,
         # slice the input_ids to ensure the prompt length is required length.
         input_ids = tokenizer.encode(input_str, return_tensors="pt")
         input_ids = input_ids[:, :in_len]
         true_str = tokenizer.batch_decode(input_ids)[0]
         input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+        actual_in_len = input_ids.shape[1]
         result[in_out] = []
         for i in range(num_trials + warm_up):
             st = time.perf_counter()
@@ -326,8 +363,10 @@ def run_transformer_int4_gpu(repo_id,
             print("model generate cost: " + str(end - st))
             output = tokenizer.batch_decode(output_ids)
             print(output[0])
+            actual_out_len = output_ids.shape[1] - actual_in_len
             if i >= warm_up:
-                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                       actual_in_len, actual_out_len])
         torch.xpu.empty_cache()
     return result

@@ -376,13 +415,20 @@ def run_optimize_model_gpu(repo_id,
         in_out_len = in_out.split("-")
         in_len = int(in_out_len[0])
         out_len = int(in_out_len[1])
-        input_str = open(f"prompt/{in_len}.txt", 'r').read()
+        # As different tokenizers have different encodings,
+        # in_len.txt may be shorter than we need;
+        # use a much longer context to make sure the input is long enough.
+        test_length = min(in_len*2, 8192)
+        while test_length not in [32, 256, 1024, 2048, 8192]:
+            test_length = test_length * 2
+        input_str = open(f"prompt/{test_length}.txt", 'r').read()
         # As different tokenizer has different encodings,
         # slice the input_ids to ensure the prompt length is required length.
         input_ids = tokenizer.encode(input_str, return_tensors="pt")
         input_ids = input_ids[:, :in_len]
         true_str = tokenizer.batch_decode(input_ids)[0]
         input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+        actual_in_len = input_ids.shape[1]
         result[in_out] = []
         for i in range(num_trials + warm_up):
             st = time.perf_counter()
@@ -392,9 +438,11 @@ def run_optimize_model_gpu(repo_id,
             output_ids = output_ids.cpu()
             print("model generate cost: " + str(end - st))
             output = tokenizer.batch_decode(output_ids)
+            actual_out_len = output_ids.shape[1] - actual_in_len
             print(output[0])
             if i >= warm_up:
-                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                       actual_in_len, actual_out_len])
         torch.xpu.empty_cache()
     return result

@@ -436,13 +484,20 @@ def run_ipex_fp16_gpu(repo_id,
         in_out_len = in_out.split("-")
         in_len = int(in_out_len[0])
         out_len = int(in_out_len[1])
-        input_str = open(f"prompt/{in_len}.txt", 'r').read()
+        # As different tokenizers have different encodings,
+        # in_len.txt may be shorter than we need;
+        # use a much longer context to make sure the input is long enough.
+        test_length = min(in_len*2, 8192)
+        while test_length not in [32, 256, 1024, 2048, 8192]:
+            test_length = test_length * 2
+        input_str = open(f"prompt/{test_length}.txt", 'r').read()
         # As different tokenizer has different encodings,
         # slice the input_ids to ensure the prompt length is required length.
         input_ids = tokenizer.encode(input_str, return_tensors="pt")
         input_ids = input_ids[:, :in_len]
         true_str = tokenizer.batch_decode(input_ids)[0]
         input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+        actual_in_len = input_ids.shape[1]
         result[in_out] = []
         for i in range(num_trials + warm_up):
             st = time.perf_counter()
@@ -452,9 +507,11 @@ def run_ipex_fp16_gpu(repo_id,
             output_ids = output_ids.cpu()
             print("model generate cost: " + str(end - st))
             output = tokenizer.batch_decode(output_ids)
+            actual_out_len = output_ids.shape[1] - actual_in_len
             print(output[0])
             if i >= warm_up:
-                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                       actual_in_len, actual_out_len])
         torch.xpu.empty_cache()
     return result

@@ -468,6 +525,6 @@ if __name__ == '__main__':
     for api in conf.test_api:
         for model in conf.repo_id:
             run_model(model, api, conf['in_out_pairs'], conf['local_model_hub'], conf['warm_up'], conf['num_trials'])
-        df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)', '2+ avg latency (s/token)', 'encoder time (s)', 'input/output tokens'])
+        df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)', '2+ avg latency (s/token)', 'encoder time (s)', 'input/output tokens', 'actual input/output tokens'])
         df.to_csv(f'{current_dir}/{api}-results-{today}.csv')
         results = []
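Each run still writes one CSV per API via df.to_csv; the new 'actual input/output tokens' column sits next to the configured 'input/output tokens' pair, so any discrepancy is easy to spot. A short example of inspecting such a file; the file name below is purely illustrative, since the real name depends on the API and the run date:

    import pandas as pd

    # Hypothetical output file produced by the benchmark script.
    df = pd.read_csv("transformer_int4-results-2023-10-11.csv", index_col=0)
    print(df[['model', 'input/output tokens', 'actual input/output tokens']])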