[NPU] Fix load-low-bit benchmark script (#12502)

This commit is contained in:
binbin Deng 2024-12-05 10:01:32 +08:00 committed by GitHub
parent 84f1c4ad57
commit f56a111aa2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -789,7 +789,8 @@ def run_transformer_int4_loadlowbit_npu_win(repo_id,
         load_time = end - st
         print(">> loading of model costs {}s".format(load_time))
-        model = BenchmarkWrapper(model)
+        if not hasattr(model, "model_ptr"):
+            model = BenchmarkWrapper(model)
 
         result = {}
         with torch.inference_mode():
@@ -802,10 +803,11 @@ def run_transformer_int4_loadlowbit_npu_win(repo_id,
                 # slice the input_ids to ensure the prompt length is required length.
                 input_ids = tokenizer.encode(input_str, return_tensors="pt")
                 input_ids = input_ids[:, :in_len]
-                true_str = tokenizer.batch_decode(input_ids)[0]
-                input_list = [true_str] * batch_size
-                input_ids = tokenizer(input_list, return_tensors="pt").input_ids
-                input_ids = input_ids[:, :in_len]
+                if repo_id not in MINICPM_IDS:
+                    true_str = tokenizer.batch_decode(input_ids)[0]
+                    input_list = [true_str] * batch_size
+                    input_ids = tokenizer(input_list, return_tensors="pt").input_ids
+                    input_ids = input_ids[:, :in_len]
                 actual_in_len = input_ids.shape[1]
                 result[in_out] = []
                 for i in range(num_trials + warm_up):