Fix error when running benchmark_util.py on CPU (#9949)

2024-01-22 10:14:40 +08:00 · 2024-01-22 10:14:40 +08:00 · 6fb3f40f7e
commit 6fb3f40f7e
parent fb91c97fe8
1 changed file with 48 additions and 20 deletions
--- a/python/llm/dev/benchmark/benchmark_util.py
+++ b/python/llm/dev/benchmark/benchmark_util.py
@@ -2443,6 +2443,7 @@ class BenchmarkWrapper:
            if self.device.type == "xpu":
                torch.xpu.synchronize()
                memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024**3))
+                self.peak_memory = np.max(memory_every_token)
            end = time.perf_counter()
            if first_token_time is None:
                first_token_time = end - st
@@ -2457,15 +2458,21 @@ class BenchmarkWrapper:
                break

        if self.do_print:
+            if self.device.type == "xpu":
                print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
+            else:
+                print(f"=========First token cost {first_token_time:.4f} s=========")
        if len(last_token_time) > 1:
            self.first_cost = first_token_time
            self.rest_cost_mean = np.mean(last_token_time)
-            self.peak_memory = np.max(memory_every_token[1:])
            if self.do_print:
+                if self.device.type == "xpu":
                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
-                      f" tokens in all) and {self.peak_memory} GB=========")
+                          f" tokens in all) and {np.max(memory_every_token[1:])} GB=========")
                    print(f"Peak memory for every token: {memory_every_token}")
+                else:
+                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                          f" tokens in all)=========")

        if streamer is not None:
            streamer.end()
@@ -2750,6 +2757,7 @@ class BenchmarkWrapper:
            if self.device.type == "xpu":
                torch.xpu.synchronize()
                memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024 ** 3))
+                self.peak_memory = np.max(memory_every_token)
            end = time.perf_counter()
            if first_token_time is None:
                first_token_time = end - st
@@ -2764,15 +2772,21 @@ class BenchmarkWrapper:
                break

        if self.do_print:
+            if self.device.type == "xpu":
                print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
+            else:
+                print(f"=========First token cost {first_token_time:.4f} s=========")
        if len(last_token_time) > 1:
            self.first_cost = first_token_time
            self.rest_cost_mean = np.mean(last_token_time)
-            self.peak_memory = np.max(memory_every_token[1:])
            if self.do_print:
+                if self.device.type == "xpu":
                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
-                      f" tokens in all) and {self.peak_memory} GB=========")
+                          f" tokens in all) and {np.max(memory_every_token[1:])} GB=========")
                    print(f"Peak memory for every token: {memory_every_token}")
+                else:
+                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                          f" tokens in all)=========")

        if streamer is not None:
            streamer.end()
@@ -3083,6 +3097,7 @@ class BenchmarkWrapper:
            if self.device.type == "xpu":
                torch.xpu.synchronize()
                memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024 ** 3))
+                self.peak_memory = np.max(memory_every_token)
            end = time.perf_counter()
            if first_token_time is None:
                first_token_time = end - st
@@ -3107,15 +3122,21 @@ class BenchmarkWrapper:
        )

        if self.do_print:
+            if self.device.type == "xpu":
                print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
+            else:
+                print(f"=========First token cost {first_token_time:.4f} s=========")
        if len(last_token_time) > 1:
            self.first_cost = first_token_time
            self.rest_cost_mean = np.mean(last_token_time)
-            self.peak_memory = np.max(memory_every_token[1:])
            if self.do_print:
+                if self.device.type == "xpu":
                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
-                      f" tokens in all) and {self.peak_memory} GB=========")
+                          f" tokens in all) and {np.max(memory_every_token[1:])} GB=========")
                    print(f"Peak memory for every token: {memory_every_token}")
+                else:
+                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                          f" tokens in all)=========")

        if return_dict_in_generate:
            if not output_scores:
@@ -3447,6 +3468,7 @@ class BenchmarkWrapper:
            if self.device.type == "xpu":
                torch.xpu.synchronize()
                memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024 ** 3))
+                self.peak_memory = np.max(memory_every_token)
            end = time.perf_counter()
            if first_token_time is None:
                first_token_time = end - st
@@ -3465,15 +3487,21 @@ class BenchmarkWrapper:
        )

        if self.do_print:
+            if self.device.type == "xpu":
                print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
+            else:
+                print(f"=========First token cost {first_token_time:.4f} s=========")
        if len(last_token_time) > 1:
            self.first_cost = first_token_time
            self.rest_cost_mean = np.mean(last_token_time)
-            self.peak_memory = np.max(memory_every_token[1:])
            if self.do_print:
+                if self.device.type == "xpu":
                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
-                      f" tokens in all) and {self.peak_memory} GB=========")
+                          f" tokens in all) and {np.max(memory_every_token[1:])} GB=========")
                    print(f"Peak memory for every token: {memory_every_token}")
+                else:
+                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                          f" tokens in all)=========")

        if return_dict_in_generate:
            if not output_scores: