update benchmark (#8899)
parent 2d97827ec5
commit 49a39452c6
2 changed files with 38 additions and 4704 deletions
@@ -510,9 +510,12 @@ class BenchmarkWrapper:
     learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
     """

-    def __init__(self, model, do_print=True):
+    def __init__(self, model, do_print=False):
         self.model = model
         self.do_print = do_print
+        self.encoder_time = 0.0
+        self.first_cost = 0.0
+        self.rest_cost_mean = 0.0
         print(self.model.__class__)

     def __getattr__(self, attr):
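For context, a minimal usage sketch of the wrapper after this change (not part of the diff). It assumes a standard Hugging Face transformers causal LM and that, as the __getattr__ delegation above suggests, the wrapper forwards generate to the wrapped model; the model id, prompt, and generation arguments are placeholders, and do_print is passed explicitly so the sketch does not depend on the default.

# Hypothetical usage sketch; model id and prompt are placeholders.
# BenchmarkWrapper is the class patched in this diff (import path omitted);
# encoder_time / first_cost / rest_cost_mean are the attributes added above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")      # placeholder model
model = AutoModelForCausalLM.from_pretrained("gpt2")

wrapper = BenchmarkWrapper(model, do_print=False)      # printing disabled explicitly
inputs = tokenizer("Hello, my name is", return_tensors="pt")

with torch.inference_mode():
    wrapper.generate(**inputs, max_new_tokens=32)

# Latencies are exposed as attributes rather than only printed.
print(f"first token       : {wrapper.first_cost:.4f} s")
print(f"rest tokens (mean): {wrapper.rest_cost_mean:.4f} s")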
@@ -1360,9 +1363,14 @@ class BenchmarkWrapper:
         if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
             # if model is encoder decoder encoder_outputs are created
             # and added to `model_kwargs`
+            enc_st = time.perf_counter()
             model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
                 inputs_tensor, model_kwargs, model_input_name
             )
+            enc_end = time.perf_counter()
+            self.encoder_time = enc_end - enc_st
+            if self.do_print:
+                print(f"=====================encoder cost {enc_end - enc_st} s=======================")

         # 5. Prepare `input_ids` which will be used for auto-regressive generation
         if self.config.is_encoder_decoder:
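For encoder-decoder models the encoder pass is now timed separately and stored in encoder_time. A hedged sketch of reading it; the checkpoint name and prompt are placeholders, everything else comes from the hunk above.

# Hypothetical sketch for an encoder-decoder checkpoint ("t5-small" is a placeholder).
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("t5-small")
seq2seq = BenchmarkWrapper(AutoModelForSeq2SeqLM.from_pretrained("t5-small"),
                           do_print=True)
batch = tok("translate English to German: Hello world", return_tensors="pt")
seq2seq.generate(**batch, max_new_tokens=16)

print(f"encoder pass: {seq2seq.encoder_time:.4f} s")  # set once per generate call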
@@ -2359,6 +2367,7 @@ class BenchmarkWrapper:
         first_token_time = None
         last_token_time = []
         while True:
+            st = time.perf_counter()
             if synced_gpus:
                 # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
                 # The following logic allows an early break if all peers finished generating their sequence
@@ -2373,19 +2382,12 @@ class BenchmarkWrapper:
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

            # forward pass to get next token
-            st = time.perf_counter()
            outputs = self(
                **model_inputs,
                return_dict=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
-            end = time.perf_counter()
-            if first_token_time is None:
-                first_token_time = end - st
-            else:
-                last_token_time.append(end - st)
-
            if synced_gpus and this_peer_finished:
                continue  # don't waste resources running the code we don't need

@@ -2439,6 +2441,14 @@ class BenchmarkWrapper:
             if unfinished_sequences.max() == 0:
                 this_peer_finished = True

+            if self.device.type == "xpu":
+                torch.xpu.synchronize()
+            end = time.perf_counter()
+            if first_token_time is None:
+                first_token_time = end - st
+            else:
+                last_token_time.append(end - st)
+
             # stop if we exceed the maximum length
             if stopping_criteria(input_ids, scores):
                 this_peer_finished = True
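Note that the per-token timestamp is taken only after an explicit device synchronization: XPU kernels are launched asynchronously, so stopping the clock right after the forward call would mostly measure kernel launch rather than execution. A generic sketch of that pattern, assuming a torch build where the xpu backend is present (on CUDA devices torch.cuda.synchronize() plays the same role):

import time
import torch

def timed_step(step_fn, device):
    # Minimal sketch of the measurement pattern used above: start the clock,
    # run one decoding step, block until the device has drained its queue,
    # then read the clock again.
    start = time.perf_counter()
    result = step_fn()
    if device.type == "xpu" and hasattr(torch, "xpu"):
        torch.xpu.synchronize()   # XPU work is asynchronous
    elif device.type == "cuda":
        torch.cuda.synchronize()  # same idea on CUDA
    return result, time.perf_counter() - start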
@@ -2447,12 +2457,13 @@ class BenchmarkWrapper:
                break

        if self.do_print:
-            print(f"=========First token cost {first_token_time:.4f}s=========")
+            print(f"=========First token cost {first_token_time:.4f} s=========")
        if len(last_token_time) > 1:
            self.first_cost = first_token_time
            self.rest_cost_mean = np.mean(last_token_time)
            if self.do_print:
-                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f}s ({len(last_token_time)} tokens in all)=========")
+                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                      f" tokens in all)=========")

        if streamer is not None:
            streamer.end()
@@ -2947,6 +2958,7 @@ class BenchmarkWrapper:
            last_token_time = []
        this_peer_finished = False  # used by synced_gpus only
        while True:
+            st = time.perf_counter()
            if synced_gpus:
                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
                # The following logic allows an early break if all peers finished generating their sequence
@@ -2959,18 +2971,12 @@ class BenchmarkWrapper:

            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

-            st = time.perf_counter()
            outputs = self(
                **model_inputs,
                return_dict=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
-            end = time.perf_counter()
-            if first_token_time is None:
-                first_token_time = end - st
-            else:
-                last_token_time.append(end - st)

            if synced_gpus and this_peer_finished:
                cur_len = cur_len + 1
@@ -3046,6 +3052,14 @@ class BenchmarkWrapper:
            # increase cur_len
            cur_len = cur_len + 1

+            if self.device.type == "xpu":
+                torch.xpu.synchronize()
+            end = time.perf_counter()
+            if first_token_time is None:
+                first_token_time = end - st
+            else:
+                last_token_time.append(end - st)
+
            if beam_scorer.is_done or stopping_criteria(input_ids, scores):
                if not synced_gpus:
                    break
@@ -3063,9 +3077,14 @@ class BenchmarkWrapper:
                beam_indices=beam_indices,
            )

-        print(f"=========First token cost {first_token_time}s=========")
+        if self.do_print:
+            print(f"=========First token cost {first_token_time:.4f} s=========")
        if len(last_token_time) > 1:
-            print(f"=========Rest token cost average {np.mean(last_token_time)}s ({len(last_token_time)}tokens in all)=========")
+            self.first_cost = first_token_time
+            self.rest_cost_mean = np.mean(last_token_time)
+            if self.do_print:
+                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                      f" tokens in all)=========")

        if return_dict_in_generate:
            if not output_scores:
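Because the latencies are now stored on the wrapper rather than only printed, a driver script can aggregate them across runs. A hedged sketch of such a loop; the warm-up and repeat counts are arbitrary choices, and wrapper / inputs are assumed to be set up as in the usage sketch near the top of this page.

import numpy as np

# Hypothetical aggregation loop over repeated generate calls.
warmup, repeat = 1, 3
first, rest = [], []
for i in range(warmup + repeat):
    wrapper.generate(**inputs, max_new_tokens=32)
    if i >= warmup:  # discard warm-up iterations
        first.append(wrapper.first_cost)
        rest.append(wrapper.rest_cost_mean)

print(f"first token (mean): {np.mean(first):.4f} s")
print(f"rest tokens (mean): {np.mean(rest):.4f} s")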
File diff suppressed because it is too large