update benchmark (#8899)

parent 2d97827ec5
commit 49a39452c6

2 changed files with 38 additions and 4704 deletions
@@ -510,9 +510,12 @@ class BenchmarkWrapper:
     learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
     """

-    def __init__(self, model, do_print=True):
+    def __init__(self, model, do_print=False):
         self.model = model
         self.do_print = do_print
+        self.encoder_time = 0.0
+        self.first_cost = 0.0
+        self.rest_cost_mean = 0.0
         print(self.model.__class__)

     def __getattr__(self, attr):
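The new zeroed attributes (`encoder_time`, `first_cost`, `rest_cost_mean`) let callers read the measured timings off the wrapper instead of scraping its printed output, and `do_print` now defaults to quiet. A minimal sketch of the delegation pattern this class relies on — any name not visible in the diff is illustrative:

```python
class DelegatingWrapper:
    """Illustrative sketch, not the real BenchmarkWrapper: wraps a model
    and exposes timing results as plain attributes."""

    def __init__(self, model, do_print=False):
        self.model = model
        self.do_print = do_print
        self.encoder_time = 0.0    # encoder pass, encoder-decoder models only
        self.first_cost = 0.0      # latency of the first generated token
        self.rest_cost_mean = 0.0  # mean latency of every later token

    def __getattr__(self, attr):
        # Only invoked when normal attribute lookup fails, so the fields set
        # in __init__ resolve locally and everything else falls through to
        # the wrapped model (config, forward, ...).
        return getattr(self.model, attr)
```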
@@ -1360,9 +1363,14 @@ class BenchmarkWrapper:
         if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
             # if model is encoder decoder encoder_outputs are created
             # and added to `model_kwargs`
+            enc_st = time.perf_counter()
             model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
                 inputs_tensor, model_kwargs, model_input_name
             )
+            enc_end = time.perf_counter()
+            self.encoder_time = enc_end - enc_st
+            if self.do_print:
+                print(f"=====================encoder cost {enc_end - enc_st} s=======================")

         # 5. Prepare `input_ids` which will be used for auto-regressive generation
         if self.config.is_encoder_decoder:
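Here the encoder pass of an encoder-decoder model is bracketed with `time.perf_counter()`, a monotonic high-resolution clock, and the elapsed time is stored in `self.encoder_time`. The same bracketing pattern in isolation (a generic sketch; `fn` stands in for `_prepare_encoder_decoder_kwargs_for_generation`):

```python
import time

def timed(fn, *args, **kwargs):
    # perf_counter is monotonic and high resolution, so it is the right clock
    # for short intervals; time.time() can jump if the wall clock is adjusted.
    st = time.perf_counter()
    result = fn(*args, **kwargs)
    return result, time.perf_counter() - st
```

On devices that execute asynchronously this only measures host-side time unless the device is synchronized first, which is exactly what the decoding-loop changes below add for XPU.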
@@ -2359,6 +2367,7 @@ class BenchmarkWrapper:
         first_token_time = None
         last_token_time = []
         while True:
+            st = time.perf_counter()
             if synced_gpus:
                 # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
                 # The following logic allows an early break if all peers finished generating their sequence
@@ -2373,19 +2382,12 @@ class BenchmarkWrapper:
             model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

             # forward pass to get next token
-            st = time.perf_counter()
             outputs = self(
                 **model_inputs,
                 return_dict=True,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
             )
-            end = time.perf_counter()
-            if first_token_time is None:
-                first_token_time = end - st
-            else:
-                last_token_time.append(end - st)
-
             if synced_gpus and this_peer_finished:
                 continue  # don't waste resources running the code we don't need

@@ -2439,6 +2441,14 @@ class BenchmarkWrapper:
             if unfinished_sequences.max() == 0:
                 this_peer_finished = True

+            if self.device.type == "xpu":
+                torch.xpu.synchronize()
+            end = time.perf_counter()
+            if first_token_time is None:
+                first_token_time = end - st
+            else:
+                last_token_time.append(end - st)
+
             # stop if we exceed the maximum length
             if stopping_criteria(input_ids, scores):
                 this_peer_finished = True
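The two hunks above move the stopwatch: instead of bracketing only the `forward` call, `st` is now taken at the top of the `while True:` loop and `end` only after `torch.xpu.synchronize()`, so each sample covers the full decoding step (forward, logits processing, sampling) and, on XPU, includes any queued device work. The first sample becomes the first-token latency; the rest form the steady-state distribution. A self-contained sketch of that pattern, with `step` standing in for one decoding iteration:

```python
import time
import numpy as np

def time_decode_steps(step, n_steps, synchronize=None):
    """Sketch of the first-token / rest-token timing split. `synchronize`
    (e.g. torch.xpu.synchronize or torch.cuda.synchronize) flushes pending
    device work so the interval reflects real execution time."""
    first_token_time = None
    last_token_time = []
    for _ in range(n_steps):
        st = time.perf_counter()
        step()
        if synchronize is not None:
            synchronize()
        end = time.perf_counter()
        if first_token_time is None:
            first_token_time = end - st   # prefill + first token
        else:
            last_token_time.append(end - st)
    return first_token_time, float(np.mean(last_token_time))
```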
@@ -2452,7 +2462,8 @@ class BenchmarkWrapper:
             self.first_cost = first_token_time
             self.rest_cost_mean = np.mean(last_token_time)
             if self.do_print:
-                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f}s ({len(last_token_time)} tokens in all)=========")
+                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                      f" tokens in all)=========")

         if streamer is not None:
             streamer.end()
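Because the means are stored on `self.first_cost` and `self.rest_cost_mean` before printing, derived metrics are a one-liner; for example, steady-state decoding throughput (a hypothetical helper, not part of this commit):

```python
def decode_tokens_per_second(wrapper):
    # Steady-state throughput is the reciprocal of the mean per-token latency.
    return 1.0 / wrapper.rest_cost_mean
```

The remaining hunks apply the same per-iteration timing to the beam-search loop.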
@@ -2947,6 +2958,7 @@ class BenchmarkWrapper:
         last_token_time = []
         this_peer_finished = False  # used by synced_gpus only
         while True:
+            st = time.perf_counter()
            if synced_gpus:
                 # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
                 # The following logic allows an early break if all peers finished generating their sequence
@ -2959,18 +2971,12 @@ class BenchmarkWrapper:
|
||||||
|
|
||||||
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
|
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
|
||||||
|
|
||||||
st = time.perf_counter()
|
|
||||||
outputs = self(
|
outputs = self(
|
||||||
**model_inputs,
|
**model_inputs,
|
||||||
return_dict=True,
|
return_dict=True,
|
||||||
output_attentions=output_attentions,
|
output_attentions=output_attentions,
|
||||||
output_hidden_states=output_hidden_states,
|
output_hidden_states=output_hidden_states,
|
||||||
)
|
)
|
||||||
end = time.perf_counter()
|
|
||||||
if first_token_time is None:
|
|
||||||
first_token_time = end - st
|
|
||||||
else:
|
|
||||||
last_token_time.append(end - st)
|
|
||||||
|
|
||||||
if synced_gpus and this_peer_finished:
|
if synced_gpus and this_peer_finished:
|
||||||
cur_len = cur_len + 1
|
cur_len = cur_len + 1
|
||||||
|
|
@@ -3046,6 +3052,14 @@ class BenchmarkWrapper:
             # increase cur_len
             cur_len = cur_len + 1

+            if self.device.type == "xpu":
+                torch.xpu.synchronize()
+            end = time.perf_counter()
+            if first_token_time is None:
+                first_token_time = end - st
+            else:
+                last_token_time.append(end - st)
+
             if beam_scorer.is_done or stopping_criteria(input_ids, scores):
                 if not synced_gpus:
                     break
@@ -3063,9 +3077,14 @@ class BenchmarkWrapper:
                 beam_indices=beam_indices,
             )

-        print(f"=========First token cost {first_token_time}s=========")
+        if self.do_print:
+            print(f"=========First token cost {first_token_time:.4f} s=========")
         if len(last_token_time) > 1:
-            print(f"=========Rest token cost average {np.mean(last_token_time)}s ({len(last_token_time)}tokens in all)=========")
+            self.first_cost = first_token_time
+            self.rest_cost_mean = np.mean(last_token_time)
+            if self.do_print:
+                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                      f" tokens in all)=========")

         if return_dict_in_generate:
             if not output_scores:
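A typical use of the wrapper, assuming it is attached by construction as in `__init__` above (model name and prompt are illustrative, and how the wrapper is wired into `generate` lives in the file whose diff is suppressed below, so treat this as a sketch):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2")   # illustrative model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
wrapped = BenchmarkWrapper(model, do_print=True)

inputs = tokenizer("Benchmark prompt", return_tensors="pt")
with torch.no_grad():
    wrapped.generate(**inputs, max_new_tokens=32)

print(f"first token cost:     {wrapped.first_cost:.4f} s")
print(f"mean rest-token cost: {wrapped.rest_cost_mean:.4f} s")
```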
File diff suppressed because it is too large