diff --git a/python/llm/dev/benchmark/benchmark_util.py b/python/llm/dev/benchmark/benchmark_util.py
index de6cafa1..e2a202da 100644
--- a/python/llm/dev/benchmark/benchmark_util.py
+++ b/python/llm/dev/benchmark/benchmark_util.py
@@ -2662,8 +2662,12 @@ class BenchmarkWrapper:
         unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
 
         this_peer_finished = False  # used by synced_gpus only
+
+        first_token_time = None
+        last_token_time = []
         # auto-regressive generation
         while True:
+            st = time.perf_counter()
             if synced_gpus:
                 # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
                 # The following logic allows an early break if all peers finished generating their sequence
@@ -2740,6 +2744,14 @@ class BenchmarkWrapper:
                 if unfinished_sequences.max() == 0:
                     this_peer_finished = True
 
+            if self.device.type == "xpu":
+                torch.xpu.synchronize()
+            end = time.perf_counter()
+            if first_token_time is None:
+                first_token_time = end - st
+            else:
+                last_token_time.append(end - st)
+
             # stop if we exceed the maximum length
             if stopping_criteria(input_ids, scores):
                 this_peer_finished = True
@@ -2747,6 +2759,15 @@ class BenchmarkWrapper:
             if this_peer_finished and not synced_gpus:
                 break
 
+        if self.do_print:
+            print(f"=========First token cost {first_token_time:.4f} s=========")
+        if len(last_token_time) > 1:
+            self.first_cost = first_token_time
+            self.rest_cost_mean = np.mean(last_token_time)
+            if self.do_print:
+                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                      f" tokens in all)=========")
+
         if streamer is not None:
             streamer.end()
 
@@ -3301,7 +3322,11 @@ class BenchmarkWrapper:
         beam_scores = beam_scores.view((batch_size * num_beams,))
 
         this_peer_finished = False  # used by synced_gpus only
+
+        first_token_time = None
+        last_token_time = []
         while True:
+            st = time.perf_counter()
             if synced_gpus:
                 # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
                 # The following logic allows an early break if all peers finished generating their sequence
@@ -3408,6 +3433,14 @@ class BenchmarkWrapper:
                 else:
                     this_peer_finished = True
 
+            if self.device.type == "xpu":
+                torch.xpu.synchronize()
+            end = time.perf_counter()
+            if first_token_time is None:
+                first_token_time = end - st
+            else:
+                last_token_time.append(end - st)
+
         sequence_outputs = beam_scorer.finalize(
             input_ids,
             beam_scores,
@@ -3419,6 +3452,15 @@ class BenchmarkWrapper:
             beam_indices=beam_indices,
         )
 
+        if self.do_print:
+            print(f"=========First token cost {first_token_time:.4f} s=========")
+        if len(last_token_time) > 1:
+            self.first_cost = first_token_time
+            self.rest_cost_mean = np.mean(last_token_time)
+            if self.do_print:
+                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                      f" tokens in all)=========")
+
         if return_dict_in_generate:
             if not output_scores:
                 sequence_outputs["sequence_scores"] = None
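
For context, here is a minimal, self-contained sketch of the timing pattern the hunks above inline into the wrapper's `greedy_search` and `beam_search` loops: time each iteration with `time.perf_counter()`, treat the first iteration as prefill plus first token, and average the rest. The helper name `timed_decode_loop` and the `time.sleep` stand-in workload are illustrative, not part of `benchmark_util.py`:

```python
import time
import numpy as np
import torch

def timed_decode_loop(step_fn, n_tokens, device_type="cpu"):
    # Mirrors the patch: first iteration -> first_token_time (prefill + one token),
    # later iterations -> last_token_time (steady-state decode).
    first_token_time = None
    last_token_time = []
    for _ in range(n_tokens):
        st = time.perf_counter()
        step_fn()  # one forward pass / decoding step
        if device_type == "xpu":
            # XPU kernels launch asynchronously; without a synchronize here,
            # perf_counter() would mostly measure host-side launch overhead.
            torch.xpu.synchronize()
        end = time.perf_counter()
        if first_token_time is None:
            first_token_time = end - st
        else:
            last_token_time.append(end - st)
    return first_token_time, float(np.mean(last_token_time))

first, rest_mean = timed_decode_loop(lambda: time.sleep(0.01), n_tokens=8)
print(f"first token: {first:.4f} s, rest mean: {rest_mean:.4f} s")
```

The `torch.xpu.synchronize()` call is the important detail: it forces all queued device kernels to finish before the timestamp is taken, so the measured interval reflects actual device work rather than kernel dispatch.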
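And a hedged usage sketch showing how the new measurements surface to callers. The constructor signature and import path are assumptions inferred from this file's location and from the `self.do_print` / `self.first_cost` / `self.rest_cost_mean` attributes used in the hunks above; they are not verified API:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from benchmark_util import BenchmarkWrapper  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
model = BenchmarkWrapper(model, do_print=True)  # do_print gates the banners added above (assumed kwarg)

inputs = tokenizer("Once upon a time", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)
# With do_print=True the patched loops print, e.g.:
#   =========First token cost 0.1234 s=========
#   =========Rest tokens cost average 0.0456 s (30 tokens in all)=========
print(model.first_cost, model.rest_cost_mean)  # set by the hunks above
```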