diff --git a/python/llm/dev/benchmark/benchmark_util.py b/python/llm/dev/benchmark/benchmark_util.py index a5f0a372..36f2bcdb 100644 --- a/python/llm/dev/benchmark/benchmark_util.py +++ b/python/llm/dev/benchmark/benchmark_util.py @@ -2443,6 +2443,7 @@ class BenchmarkWrapper: if self.device.type == "xpu": torch.xpu.synchronize() memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024**3)) + self.peak_memory = np.max(memory_every_token) end = time.perf_counter() if first_token_time is None: first_token_time = end - st @@ -2457,15 +2458,21 @@ class BenchmarkWrapper: break if self.do_print: - print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========") + if self.device.type == "xpu": + print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========") + else: + print(f"=========First token cost {first_token_time:.4f} s=========") if len(last_token_time) > 1: self.first_cost = first_token_time self.rest_cost_mean = np.mean(last_token_time) - self.peak_memory = np.max(memory_every_token[1:]) if self.do_print: - print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}" - f" tokens in all) and {self.peak_memory} GB=========") - print(f"Peak memory for every token: {memory_every_token}") + if self.device.type == "xpu": + print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}" + f" tokens in all) and {np.max(memory_every_token[1:])} GB=========") + print(f"Peak memory for every token: {memory_every_token}") + else: + print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}" + f" tokens in all)=========") if streamer is not None: streamer.end() @@ -2750,6 +2757,7 @@ class BenchmarkWrapper: if self.device.type == "xpu": torch.xpu.synchronize() memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024 ** 3)) + self.peak_memory = np.max(memory_every_token) end = time.perf_counter() if first_token_time is None: first_token_time = end - st @@ -2764,15 +2772,21 @@ class BenchmarkWrapper: break if self.do_print: - print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========") + if self.device.type == "xpu": + print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========") + else: + print(f"=========First token cost {first_token_time:.4f} s=========") if len(last_token_time) > 1: self.first_cost = first_token_time self.rest_cost_mean = np.mean(last_token_time) - self.peak_memory = np.max(memory_every_token[1:]) if self.do_print: - print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}" - f" tokens in all) and {self.peak_memory} GB=========") - print(f"Peak memory for every token: {memory_every_token}") + if self.device.type == "xpu": + print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}" + f" tokens in all) and {np.max(memory_every_token[1:])} GB=========") + print(f"Peak memory for every token: {memory_every_token}") + else: + print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}" + f" tokens in all)=========") if streamer is not None: streamer.end() @@ -3083,6 +3097,7 @@ class BenchmarkWrapper: if self.device.type == "xpu": torch.xpu.synchronize() memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024 ** 3)) + self.peak_memory = np.max(memory_every_token) end = time.perf_counter() if first_token_time is None: first_token_time = end - st @@ -3107,15 +3122,21 @@ class BenchmarkWrapper: ) if self.do_print: - print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========") + if self.device.type == "xpu": + print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========") + else: + print(f"=========First token cost {first_token_time:.4f} s=========") if len(last_token_time) > 1: self.first_cost = first_token_time self.rest_cost_mean = np.mean(last_token_time) - self.peak_memory = np.max(memory_every_token[1:]) if self.do_print: - print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}" - f" tokens in all) and {self.peak_memory} GB=========") - print(f"Peak memory for every token: {memory_every_token}") + if self.device.type == "xpu": + print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}" + f" tokens in all) and {np.max(memory_every_token[1:])} GB=========") + print(f"Peak memory for every token: {memory_every_token}") + else: + print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}" + f" tokens in all)=========") if return_dict_in_generate: if not output_scores: @@ -3447,6 +3468,7 @@ class BenchmarkWrapper: if self.device.type == "xpu": torch.xpu.synchronize() memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024 ** 3)) + self.peak_memory = np.max(memory_every_token) end = time.perf_counter() if first_token_time is None: first_token_time = end - st @@ -3465,15 +3487,21 @@ class BenchmarkWrapper: ) if self.do_print: - print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========") + if self.device.type == "xpu": + print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========") + else: + print(f"=========First token cost {first_token_time:.4f} s=========") if len(last_token_time) > 1: self.first_cost = first_token_time self.rest_cost_mean = np.mean(last_token_time) - self.peak_memory = np.max(memory_every_token[1:]) if self.do_print: - print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}" - f" tokens in all) and {self.peak_memory} GB=========") - print(f"Peak memory for every token: {memory_every_token}") + if self.device.type == "xpu": + print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}" + f" tokens in all) and {np.max(memory_every_token[1:])} GB=========") + print(f"Peak memory for every token: {memory_every_token}") + else: + print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}" + f" tokens in all)=========") if return_dict_in_generate: if not output_scores: