update benchmark (#8899)

Author: Xin Qiu
Date:   2023-09-06 15:11:43 +08:00 (committed by GitHub)
parent 2d97827ec5
commit 49a39452c6
2 changed files with 38 additions and 4704 deletions

@@ -510,9 +510,12 @@ class BenchmarkWrapper:
     learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
     """
-    def __init__(self, model, do_print=True):
+    def __init__(self, model, do_print=False):
         self.model = model
         self.do_print = do_print
+        self.encoder_time = 0.0
+        self.first_cost = 0.0
+        self.rest_cost_mean = 0.0
         print(self.model.__class__)

     def __getattr__(self, attr):
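
For context, a minimal usage sketch of the patched wrapper (the model name and
generation arguments are illustrative, and it assumes BenchmarkWrapper forwards
other attribute access to the wrapped model via the __getattr__ shown above):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")   # illustrative model choice
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    wrapped = BenchmarkWrapper(model, do_print=True)    # enable the timing printouts

    inputs = tokenizer("Hello, benchmark!", return_tensors="pt")
    wrapped.generate(**inputs, max_new_tokens=32)

    # Timings collected by this commit are then available as attributes:
    #   wrapped.first_cost      - first-token latency in seconds
    #   wrapped.rest_cost_mean  - mean per-token latency after the first
    #   wrapped.encoder_time    - encoder cost (stays 0.0 for decoder-only models)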
@@ -1360,9 +1363,14 @@ class BenchmarkWrapper:
         if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
             # if model is encoder decoder encoder_outputs are created
             # and added to `model_kwargs`
+            enc_st = time.perf_counter()
             model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
                 inputs_tensor, model_kwargs, model_input_name
             )
+            enc_end = time.perf_counter()
+            self.encoder_time = enc_end - enc_st
+            if self.do_print:
+                print(f"=====================encoder cost {enc_end - enc_st} s=======================")

         # 5. Prepare `input_ids` which will be used for auto-regressive generation
         if self.config.is_encoder_decoder:
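
The hunk above brackets the encoder pass with time.perf_counter(), a monotonic,
high-resolution clock suited to short intervals (unlike time.time(), which can
jump with wall-clock adjustments). The same pattern in isolation, as a generic
sketch (the helper name `timed` is hypothetical):

    import time

    def timed(fn, *args, **kwargs):
        # Bracket a call with perf_counter, as the encoder timing above does.
        st = time.perf_counter()
        result = fn(*args, **kwargs)
        return result, time.perf_counter() - st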
@@ -2359,6 +2367,7 @@ class BenchmarkWrapper:
         first_token_time = None
         last_token_time = []
         while True:
+            st = time.perf_counter()
             if synced_gpus:
                 # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
                 # The following logic allows an early break if all peers finished generating their sequence
@@ -2373,19 +2382,12 @@ class BenchmarkWrapper:
             model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

             # forward pass to get next token
-            st = time.perf_counter()
             outputs = self(
                 **model_inputs,
                 return_dict=True,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
             )
-            end = time.perf_counter()
-            if first_token_time is None:
-                first_token_time = end - st
-            else:
-                last_token_time.append(end - st)

             if synced_gpus and this_peer_finished:
                 continue  # don't waste resources running the code we don't need
@@ -2439,6 +2441,14 @@ class BenchmarkWrapper:
             if unfinished_sequences.max() == 0:
                 this_peer_finished = True

+            if self.device.type == "xpu":
+                torch.xpu.synchronize()
+            end = time.perf_counter()
+            if first_token_time is None:
+                first_token_time = end - st
+            else:
+                last_token_time.append(end - st)
+
             # stop if we exceed the maximum length
             if stopping_criteria(input_ids, scores):
                 this_peer_finished = True
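
Note why this hunk synchronizes before the second perf_counter() read:
accelerator kernels launch asynchronously, so without a host-device sync the
timer would stop after kernel launch rather than after the computation
finishes. A generic sketch of the pattern (the helper name and the CUDA branch
are illustrative additions, not part of this commit):

    import time
    import torch

    def timed_step(step_fn, device):
        st = time.perf_counter()
        out = step_fn()
        # Wait for queued device work to finish before reading the clock;
        # otherwise only the (cheap) launch overhead is measured.
        if device.type == "xpu":
            torch.xpu.synchronize()
        elif device.type == "cuda":
            torch.cuda.synchronize()
        return out, time.perf_counter() - st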
@@ -2447,12 +2457,13 @@ class BenchmarkWrapper:
                 break

         if self.do_print:
-            print(f"=========First token cost {first_token_time:.4f}s=========")
+            print(f"=========First token cost {first_token_time:.4f} s=========")
         if len(last_token_time) > 1:
             self.first_cost = first_token_time
             self.rest_cost_mean = np.mean(last_token_time)
             if self.do_print:
-                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f}s ({len(last_token_time)} tokens in all)=========")
+                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                      f" tokens in all)=========")

         if streamer is not None:
             streamer.end()
@@ -2947,6 +2958,7 @@ class BenchmarkWrapper:
         last_token_time = []
         this_peer_finished = False  # used by synced_gpus only
         while True:
+            st = time.perf_counter()
             if synced_gpus:
                 # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
                 # The following logic allows an early break if all peers finished generating their sequence
@@ -2959,18 +2971,12 @@ class BenchmarkWrapper:
             model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

-            st = time.perf_counter()
             outputs = self(
                 **model_inputs,
                 return_dict=True,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
             )
-            end = time.perf_counter()
-            if first_token_time is None:
-                first_token_time = end - st
-            else:
-                last_token_time.append(end - st)

             if synced_gpus and this_peer_finished:
                 cur_len = cur_len + 1
@@ -3046,6 +3052,14 @@ class BenchmarkWrapper:
             # increase cur_len
             cur_len = cur_len + 1

+            if self.device.type == "xpu":
+                torch.xpu.synchronize()
+            end = time.perf_counter()
+            if first_token_time is None:
+                first_token_time = end - st
+            else:
+                last_token_time.append(end - st)
+
             if beam_scorer.is_done or stopping_criteria(input_ids, scores):
                 if not synced_gpus:
                     break
@@ -3063,9 +3077,14 @@ class BenchmarkWrapper:
                 beam_indices=beam_indices,
             )

-        print(f"=========First token cost {first_token_time}s=========")
+        if self.do_print:
+            print(f"=========First token cost {first_token_time:.4f} s=========")
         if len(last_token_time) > 1:
-            print(f"=========Rest token cost average {np.mean(last_token_time)}s ({len(last_token_time)}tokens in all)=========")
+            self.first_cost = first_token_time
+            self.rest_cost_mean = np.mean(last_token_time)
+            if self.do_print:
+                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                      f" tokens in all)=========")

         if return_dict_in_generate:
             if not output_scores:
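
Finally, a hedged sketch of turning the collected attributes into throughput
figures (the `summarize` helper and its formulas are illustrative, not part of
this commit; first_cost and rest_cost_mean are set by the hunks above):

    def summarize(wrapper, num_new_tokens):
        first = wrapper.first_cost        # first-token latency (s)
        rest = wrapper.rest_cost_mean     # mean latency of each later token (s)
        return {
            "first_token_s": first,
            "decode_tokens_per_s": (1.0 / rest) if rest > 0 else float("inf"),
            "approx_total_s": first + rest * max(num_new_tokens - 1, 0),
        }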

File diff suppressed because it is too large.