diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 597563c5..3f26a886 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -60,7 +60,10 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
                         np.mean(result[in_out_pair], axis=0)[0],
                         np.mean(result[in_out_pair], axis=0)[1],
                         np.mean(result[in_out_pair], axis=0)[2],
-                        in_out_pair])
+                        in_out_pair,
+                        f'{int(np.mean(result[in_out_pair], axis=0)[3])}' +
+                        f'-{int(np.mean(result[in_out_pair], axis=0)[4])}'])
+

 def get_model_path(repo_id, local_model_hub):
     if local_model_hub:
@@ -144,13 +147,20 @@ def run_transformer_int4(repo_id,
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
             input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
@@ -159,8 +169,10 @@ def run_transformer_int4(repo_id,
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
                 print(output[0])
+                actual_out_len = output_ids.shape[1] - actual_in_len
                 if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
     return result

 def run_pytorch_autocast_bf16(repo_id,
@@ -192,13 +204,20 @@ def run_pytorch_autocast_bf16(repo_id,
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
             input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            actual_in_len = input_ids.shape[1]
             result[in_out] = []
             print("input tokens: {}".format(input_ids.shape[1]))
             for i in range(num_trials + warm_up):
@@ -208,8 +227,10 @@ def run_pytorch_autocast_bf16(repo_id,
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
                 print(output[0])
+                actual_out_len = output_ids.shape[1] - actual_in_len
                 if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
     return result

 def run_optimize_model(repo_id,
@@ -248,13 +269,20 @@ def run_optimize_model(repo_id,
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
             input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
@@ -263,8 +291,10 @@ def run_optimize_model(repo_id,
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
                 print(output[0])
+                actual_out_len = output_ids.shape[1] - actual_in_len
                 if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
     return result


@@ -309,13 +339,20 @@ def run_transformer_int4_gpu(repo_id,
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
             input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
@@ -326,8 +363,10 @@ def run_transformer_int4_gpu(repo_id,
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
                 print(output[0])
+                actual_out_len = output_ids.shape[1] - actual_in_len
                 if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
     torch.xpu.empty_cache()
     return result

@@ -376,13 +415,20 @@ def run_optimize_model_gpu(repo_id,
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
             input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
@@ -392,9 +438,11 @@ def run_optimize_model_gpu(repo_id,
                 output_ids = output_ids.cpu()
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
+                actual_out_len = output_ids.shape[1] - actual_in_len
                 print(output[0])
                 if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
     torch.xpu.empty_cache()
     return result

@@ -436,13 +484,20 @@ def run_ipex_fp16_gpu(repo_id,
             in_out_len = in_out.split("-")
             in_len = int(in_out_len[0])
             out_len = int(in_out_len[1])
-            input_str = open(f"prompt/{in_len}.txt", 'r').read()
+            # As different tokenizer has different encodings,
+            # in_len.txt maybe shorter than we need,
+            # use much longer context to make sure input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
             # As different tokenizer has different encodings,
             # slice the input_ids to ensure the prompt length is required length.
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
             input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
@@ -452,9 +507,11 @@ def run_ipex_fp16_gpu(repo_id,
                 output_ids = output_ids.cpu()
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
+                actual_out_len = output_ids.shape[1] - actual_in_len
                 print(output[0])
                 if i >= warm_up:
-                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time])
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
     torch.xpu.empty_cache()
     return result

@@ -468,6 +525,6 @@ if __name__ == '__main__':
     for api in conf.test_api:
         for model in conf.repo_id:
             run_model(model, api, conf['in_out_pairs'], conf['local_model_hub'], conf['warm_up'], conf['num_trials'])
-        df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)', '2+ avg latency (s/token)', 'encoder time (s)', 'input/output tokens'])
+        df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)', '2+ avg latency (s/token)', 'encoder time (s)', 'input/output tokens', 'actual input/output tokens'])
         df.to_csv(f'{current_dir}/{api}-results-{today}.csv')
         results = []
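Note on the prompt-length handling repeated in each run_* function above: because different tokenizers emit different numbers of tokens for the same text, the prompt file named after in_len may tokenize to fewer than in_len tokens, so the patch reads a longer bundled prompt file and slices the token ids down to the requested length. The standalone sketch below restates that logic outside run.py for readability; it is illustrative only, the helper names pick_prompt_length and prepare_input_ids are hypothetical, and it assumes the bundled prompt files prompt/32.txt, prompt/256.txt, prompt/1024.txt, prompt/2048.txt, prompt/8192.txt plus an in_len taken from the benchmark's usual in_out_pairs values.

    # Illustrative sketch only; not part of run.py. Helper names are hypothetical.
    def pick_prompt_length(in_len, available=(32, 256, 1024, 2048, 8192)):
        # Start from roughly twice the requested length (capped at 8192) and keep
        # doubling until it matches one of the bundled prompt files. This assumes
        # in_len is one of the benchmark's standard sizes, so the loop terminates.
        test_length = min(in_len * 2, 8192)
        while test_length not in available:
            test_length = test_length * 2
        return test_length

    def prepare_input_ids(tokenizer, in_len):
        # Read a prompt that is long enough, slice the token ids to in_len, then
        # re-encode the decoded text so the ids match what the model actually sees.
        test_length = pick_prompt_length(in_len)
        input_str = open(f"prompt/{test_length}.txt", 'r').read()
        input_ids = tokenizer.encode(input_str, return_tensors="pt")
        input_ids = input_ids[:, :in_len]
        true_str = tokenizer.batch_decode(input_ids)[0]
        input_ids = tokenizer.encode(true_str, return_tensors="pt")
        actual_in_len = input_ids.shape[1]  # may differ slightly from in_len
        return input_ids, actual_in_len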
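The per-API summary CSV written in the __main__ block gains one column, 'actual input/output tokens', built in run_model as f'{int(mean actual_in_len)}-{int(mean actual_out_len)}'. A rough sketch of the resulting schema, with made-up latency numbers, a placeholder model id, and a placeholder file name purely for illustration:

    import pandas as pd

    # Made-up values; only the column layout follows the patch.
    row = ['example/model-7b', 0.52, 0.041, 0.0, '1024-128', '1021-128']
    df = pd.DataFrame([row],
                      columns=['model', '1st token avg latency (s)',
                               '2+ avg latency (s/token)', 'encoder time (s)',
                               'input/output tokens', 'actual input/output tokens'])
    df.to_csv('example-results.csv')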