diff --git a/python/llm/dev/benchmark/benchmark_util.py b/python/llm/dev/benchmark/benchmark_util.py
index a5f0a372..36f2bcdb 100644
--- a/python/llm/dev/benchmark/benchmark_util.py
+++ b/python/llm/dev/benchmark/benchmark_util.py
@@ -2443,6 +2443,7 @@ class BenchmarkWrapper:
             if self.device.type == "xpu":
                 torch.xpu.synchronize()
                 memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024**3))
+                self.peak_memory = np.max(memory_every_token)  # running peak across every sample so far, first-token sample included
             end = time.perf_counter()
             if first_token_time is None:
                 first_token_time = end - st
@@ -2457,15 +2458,21 @@ class BenchmarkWrapper:
                 break
 
         if self.do_print:
-            print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
+            if self.device.type == "xpu":  # memory_every_token is appended under an xpu guard; presumably empty otherwise
+                print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
+            else:
+                print(f"=========First token cost {first_token_time:.4f} s=========")
         if len(last_token_time) > 1:
             self.first_cost = first_token_time
             self.rest_cost_mean = np.mean(last_token_time)
-            self.peak_memory = np.max(memory_every_token[1:])
             if self.do_print:
-                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
-                      f" tokens in all) and {self.peak_memory} GB=========")
-                print(f"Peak memory for every token: {memory_every_token}")
+                if self.device.type == "xpu":  # xpu only: [1:] below leaves the first-token sample out of the printed max
+                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                          f" tokens in all) and {np.max(memory_every_token[1:])} GB=========")
+                    print(f"Peak memory for every token: {memory_every_token}")
+                else:
+                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                          f" tokens in all)=========")
 
         if streamer is not None:
             streamer.end()
@@ -2750,6 +2757,7 @@ class BenchmarkWrapper:
             if self.device.type == "xpu":
                 torch.xpu.synchronize()
                 memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024 ** 3))
+                self.peak_memory = np.max(memory_every_token)  # running peak across every sample so far, first-token sample included
             end = time.perf_counter()
             if first_token_time is None:
                 first_token_time = end - st
@@ -2764,15 +2772,21 @@ class BenchmarkWrapper:
                 break
 
         if self.do_print:
-            print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
+            if self.device.type == "xpu":  # memory_every_token is appended under an xpu guard; presumably empty otherwise
+                print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
+            else:
+                print(f"=========First token cost {first_token_time:.4f} s=========")
         if len(last_token_time) > 1:
             self.first_cost = first_token_time
             self.rest_cost_mean = np.mean(last_token_time)
-            self.peak_memory = np.max(memory_every_token[1:])
             if self.do_print:
-                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
-                      f" tokens in all) and {self.peak_memory} GB=========")
-                print(f"Peak memory for every token: {memory_every_token}")
+                if self.device.type == "xpu":  # xpu only: [1:] below leaves the first-token sample out of the printed max
+                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                          f" tokens in all) and {np.max(memory_every_token[1:])} GB=========")
+                    print(f"Peak memory for every token: {memory_every_token}")
+                else:
+                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                          f" tokens in all)=========")
 
         if streamer is not None:
             streamer.end()
@@ -3083,6 +3097,7 @@ class BenchmarkWrapper:
             if self.device.type == "xpu":
                 torch.xpu.synchronize()
                 memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024 ** 3))
+                self.peak_memory = np.max(memory_every_token)  # running peak across every sample so far, first-token sample included
             end = time.perf_counter()
             if first_token_time is None:
                 first_token_time = end - st
@@ -3107,15 +3122,21 @@ class BenchmarkWrapper:
         )
 
         if self.do_print:
-            print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
+            if self.device.type == "xpu":  # memory_every_token is appended under an xpu guard; presumably empty otherwise
+                print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
+            else:
+                print(f"=========First token cost {first_token_time:.4f} s=========")
         if len(last_token_time) > 1:
             self.first_cost = first_token_time
             self.rest_cost_mean = np.mean(last_token_time)
-            self.peak_memory = np.max(memory_every_token[1:])
             if self.do_print:
-                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
-                      f" tokens in all) and {self.peak_memory} GB=========")
-                print(f"Peak memory for every token: {memory_every_token}")
+                if self.device.type == "xpu":  # xpu only: [1:] below leaves the first-token sample out of the printed max
+                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                          f" tokens in all) and {np.max(memory_every_token[1:])} GB=========")
+                    print(f"Peak memory for every token: {memory_every_token}")
+                else:
+                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                          f" tokens in all)=========")
 
         if return_dict_in_generate:
             if not output_scores:
@@ -3447,6 +3468,7 @@ class BenchmarkWrapper:
             if self.device.type == "xpu":
                 torch.xpu.synchronize()
                 memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024 ** 3))
+                self.peak_memory = np.max(memory_every_token)  # running peak across every sample so far, first-token sample included
             end = time.perf_counter()
             if first_token_time is None:
                 first_token_time = end - st
@@ -3465,15 +3487,21 @@ class BenchmarkWrapper:
         )
 
         if self.do_print:
-            print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
+            if self.device.type == "xpu":  # memory_every_token is appended under an xpu guard; presumably empty otherwise
+                print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
+            else:
+                print(f"=========First token cost {first_token_time:.4f} s=========")
         if len(last_token_time) > 1:
             self.first_cost = first_token_time
             self.rest_cost_mean = np.mean(last_token_time)
-            self.peak_memory = np.max(memory_every_token[1:])
             if self.do_print:
-                print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
-                      f" tokens in all) and {self.peak_memory} GB=========")
-                print(f"Peak memory for every token: {memory_every_token}")
+                if self.device.type == "xpu":  # xpu only: [1:] below leaves the first-token sample out of the printed max
+                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                          f" tokens in all) and {np.max(memory_every_token[1:])} GB=========")
+                    print(f"Peak memory for every token: {memory_every_token}")
+                else:
+                    print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
+                          f" tokens in all)=========")
 
         if return_dict_in_generate:
             if not output_scores: