LLM: support num_beams in all-in-one benchmark (#9141)

* support num_beams

* fix
Ruonan Wang 2023-10-12 13:35:12 +08:00 committed by GitHub
parent 62ac7ae444
commit 4f34557224
3 changed files with 52 additions and 32 deletions


@@ -19,6 +19,7 @@ repo_id:
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
+num_beams: 1 # default to greedy search
 in_out_pairs:
   - '32-32'
   - '1024-128'


@@ -5,6 +5,7 @@ repo_id:
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
+num_beams: 1 # default to greedy search
 in_out_pairs:
   - '32-32'
   - '1024-128'
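For reference, here is a minimal sketch of how a config like the two above can be consumed, assuming it is loaded with plain PyYAML from a hypothetical config.yaml path (the benchmark script itself may use a different loader and iteration order); num_beams falls back to 1 so existing configs keep running greedy search:

import yaml

with open('config.yaml') as f:  # hypothetical path to the YAML shown above
    conf = yaml.safe_load(f)

# Fall back to 1 (greedy search) if the key is missing, so older configs still work.
num_beams = conf.get('num_beams', 1)

for api in conf['test_api']:
    for model in conf['repo_id']:
        print(f"would benchmark {model} with {api}, num_beams={num_beams}")
        # run_model(model, api, conf['in_out_pairs'], conf['local_model_hub'],
        #           conf['warm_up'], conf['num_trials'], num_beams)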


@@ -38,22 +38,22 @@ LLAMA_IDS = ['meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-13b-chat-hf',
 results = []


-def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3):
+def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1):
     # TODO: make a parameter
     if test_api == 'transformer_int4':
-        result = run_transformer_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
+        result = run_transformer_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
     elif test_api == 'native_int4':
         run_native_int4(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
     elif test_api == 'optimize_model':
-        result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
+        result = run_optimize_model(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
     elif test_api == 'transformer_int4_gpu':
-        result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
+        result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
     elif test_api == 'optimize_model_gpu':
-        result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
+        result = run_optimize_model_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
     elif test_api == 'pytorch_autocast_bf16':
-        result = run_pytorch_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
+        result = run_pytorch_autocast_bf16(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)
     elif test_api == 'ipex_fp16_gpu':
-        result = run_ipex_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials)
+        result = run_ipex_fp16_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams)

     for in_out_pair in in_out_pairs:
         results.append([repo_id,
@@ -62,7 +62,8 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1,
                         np.mean(result[in_out_pair], axis=0)[2],
                         in_out_pair,
                         f'{int(np.mean(result[in_out_pair], axis=0)[3])}' +
-                        f'-{int(np.mean(result[in_out_pair], axis=0)[4])}'])
+                        f'-{int(np.mean(result[in_out_pair], axis=0)[4])}',
+                        num_beams])


 def get_model_path(repo_id, local_model_hub):
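Since each results row now ends with the num_beams value that was used, the per-API CSV written at the end of the run gains a matching column. A tiny illustration of that layout, with placeholder numbers rather than real measurements:

import pandas as pd

# Placeholder rows in the same shape as `results` above; the numeric values are dummies.
results = [
    ['meta-llama/Llama-2-7b-chat-hf', 0.0, 0.0, 0.0, '32-32', '32-32', 1],
    ['meta-llama/Llama-2-7b-chat-hf', 0.0, 0.0, 0.0, '32-32', '32-32', 4],
]
df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)',
                                    '2+ avg latency (s/token)', 'encoder time (s)',
                                    'input/output tokens', 'actual input/output tokens',
                                    'num_beams'])
print(df)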
@@ -119,7 +120,8 @@ def run_transformer_int4(repo_id,
                          local_model_hub,
                          in_out_pairs,
                          warm_up,
-                         num_trials):
+                         num_trials,
+                         num_beams):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, LlamaTokenizer

@@ -131,10 +133,12 @@ def run_transformer_int4(repo_id,
         model = AutoModel.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True, torch_dtype='auto')
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
-        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True,
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
-        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True,
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()
     print(">> loading of model costs {}s".format(end - st))
@@ -159,12 +163,13 @@ def run_transformer_int4(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_ids = tokenizer.encode(true_str, return_tensors="pt")[:, :in_len]
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
-                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len, use_cache=True)
+                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
+                                            num_beams=num_beams)
                 end = time.perf_counter()
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
@@ -179,7 +184,8 @@ def run_pytorch_autocast_bf16(repo_id,
                               local_model_hub,
                               in_out_pairs,
                               warm_up,
-                              num_trials):
+                              num_trials,
+                              num_beams):
     from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, LlamaTokenizer

     model_path = get_model_path(repo_id, local_model_hub)
@@ -188,11 +194,13 @@ def run_pytorch_autocast_bf16(repo_id,
         # TODO: need verify chatglm family run bf16.
         invalidInputError(False, "Currently pytorch do not support bfloat16 on cpu for chatglm models.")
     elif repo_id in LLAMA_IDS:
-        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
+        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,
+                                                     use_cache=True)
         # Need to use LlamaTokenizer, reason please refer to issue: https://github.com/intel-analytics/BigDL/issues/8944
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
-        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
+        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()
     print(">> loading of model costs {}s".format(end - st))
@@ -216,13 +224,14 @@ def run_pytorch_autocast_bf16(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_ids = tokenizer.encode(true_str, return_tensors="pt")[:, :in_len]
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             print("input tokens: {}".format(input_ids.shape[1]))
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
-                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len, use_cache=True)
+                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
+                                            num_beams=num_beams)
                 end = time.perf_counter()
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
@@ -237,7 +246,8 @@ def run_optimize_model(repo_id,
                        local_model_hub,
                        in_out_pairs,
                        warm_up,
-                       num_trials):
+                       num_trials,
+                       num_beams):
     from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
     from bigdl.llm import optimize_model

@@ -281,12 +291,13 @@ def run_optimize_model(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt")
+            input_ids = tokenizer.encode(true_str, return_tensors="pt")[:, :in_len]
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
-                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len)
+                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
+                                            num_beams=num_beams)
                 end = time.perf_counter()
                 print("model generate cost: " + str(end - st))
                 output = tokenizer.batch_decode(output_ids)
@@ -302,7 +313,8 @@ def run_transformer_int4_gpu(repo_id,
                              local_model_hub,
                              in_out_pairs,
                              warm_up,
-                             num_trials):
+                             num_trials,
+                             num_beams):
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -351,12 +363,13 @@ def run_transformer_int4_gpu(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            input_ids = tokenizer.encode(true_str, return_tensors="pt")[:, :in_len].to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
-                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len)
+                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
+                                            num_beams=num_beams)
                 torch.xpu.synchronize()
                 end = time.perf_counter()
                 output_ids = output_ids.cpu()
@@ -375,7 +388,8 @@ def run_optimize_model_gpu(repo_id,
                            local_model_hub,
                            in_out_pairs,
                            warm_up,
-                           num_trials):
+                           num_trials,
+                           num_beams):
     from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     from bigdl.llm import optimize_model
     import intel_extension_for_pytorch as ipex
@@ -427,12 +441,13 @@ def run_optimize_model_gpu(repo_id,
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             input_ids = input_ids[:, :in_len]
             true_str = tokenizer.batch_decode(input_ids)[0]
-            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            input_ids = tokenizer.encode(true_str, return_tensors="pt")[:, :in_len].to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
             for i in range(num_trials + warm_up):
                 st = time.perf_counter()
-                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len)
+                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
+                                            num_beams=num_beams)
                 torch.xpu.synchronize()
                 end = time.perf_counter()
                 output_ids = output_ids.cpu()
@@ -451,7 +466,8 @@ def run_ipex_fp16_gpu(repo_id,
                       local_model_hub,
                       in_out_pairs,
                       warm_up,
-                      num_trials):
+                      num_trials,
+                      num_beams):
     from transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
@@ -496,12 +512,13 @@ def run_ipex_fp16_gpu(repo_id,
            input_ids = tokenizer.encode(input_str, return_tensors="pt")
            input_ids = input_ids[:, :in_len]
            true_str = tokenizer.batch_decode(input_ids)[0]
-           input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+           input_ids = tokenizer.encode(true_str, return_tensors="pt")[:, :in_len].to('xpu')
            actual_in_len = input_ids.shape[1]
            result[in_out] = []
            for i in range(num_trials + warm_up):
                st = time.perf_counter()
-               output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len)
+               output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
+                                           num_beams=num_beams)
                torch.xpu.synchronize()
                end = time.perf_counter()
                output_ids = output_ids.cpu()
@@ -524,7 +541,8 @@ if __name__ == '__main__':
     import pandas as pd
     for api in conf.test_api:
         for model in conf.repo_id:
-            run_model(model, api, conf['in_out_pairs'], conf['local_model_hub'], conf['warm_up'], conf['num_trials'])
-        df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)', '2+ avg latency (s/token)', 'encoder time (s)', 'input/output tokens', 'actual input/output tokens'])
+            run_model(model, api, conf['in_out_pairs'], conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'])
+        df = pd.DataFrame(results, columns=['model', '1st token avg latency (s)', '2+ avg latency (s/token)', 'encoder time (s)',
+                                            'input/output tokens', 'actual input/output tokens', 'num_beams'])
         df.to_csv(f'{current_dir}/{api}-results-{today}.csv')
         results = []
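To see what the new knob actually changes at generation time, here is a standalone sketch that mirrors the benchmark's timing loop using stock Hugging Face transformers and a small stand-in model (not one of the benchmarked repo_ids, and without the bigdl-llm loaders): num_beams=1 is plain greedy decoding, while larger values switch model.generate to beam search, which typically adds latency per generated token.

import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"  # small stand-in model for illustration only
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model.eval()

input_ids = tokenizer.encode("Once upon a time", return_tensors="pt")

for num_beams in (1, 4):  # 1 = greedy search, >1 = beam search
    st = time.perf_counter()
    with torch.inference_mode():
        output_ids = model.generate(input_ids, do_sample=False,
                                    max_new_tokens=32, num_beams=num_beams)
    end = time.perf_counter()
    new_tokens = output_ids.shape[1] - input_ids.shape[1]
    print(f"num_beams={num_beams}: {end - st:.3f}s for {new_tokens} new tokens")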