diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 00d6161c..c1cc2a76 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -45,23 +45,22 @@ LLAVA_IDS = ['liuhaotian/llava-v1.5-7b']
 results = []
 excludes = []
 
-def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials, reserved_mem_list=[]):
+def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials):
     for i in range(num_trials + warm_up):
         st = time.perf_counter()
         output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
                                     num_beams=num_beams)
         torch.xpu.synchronize()
         end = time.perf_counter()
-        reserved_mem_list.append(torch.xpu.memory.memory_reserved()/(1024**3))
-        gpu_peak_mem = max(reserved_mem_list) # always keep the peak gpu mem at current stage
         output_ids = output_ids.cpu()
         print("model generate cost: " + str(end - st))
         output = tokenizer.batch_decode(output_ids)
         print(output[0])
+        torch.xpu.empty_cache()
         actual_out_len = output_ids.shape[1] - actual_in_len
         if i >= warm_up:
             result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
-                                   actual_in_len, actual_out_len, gpu_peak_mem])
+                                   actual_in_len, actual_out_len, model.peak_memory])
 
 def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False):
     # TODO: make a parameter
@@ -360,7 +359,6 @@ def run_transformer_int4_gpu(repo_id,
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
-    reserved_mem_list = []
     model_path = get_model_path(repo_id, local_model_hub)
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
@@ -396,8 +394,7 @@ def run_transformer_int4_gpu(repo_id,
             # For gpt-j model family, this optimization can provide a better performance.
             model = ipex.optimize(model.eval(), inplace=True)
     end = time.perf_counter()
-    print(">> loading of model costs {}s".format(end - st))
-    reserved_mem_list.append(torch.xpu.memory.memory_reserved()/(1024**3))
+    print(">> loading of model costs {}s and {}GB".format(end - st, torch.xpu.memory.memory_reserved()/(1024**3)))
 
     model = BenchmarkWrapper(model)
 
@@ -424,7 +421,7 @@ def run_transformer_int4_gpu(repo_id,
             input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
             actual_in_len = input_ids.shape[1]
             result[in_out] = []
-            thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials, reserved_mem_list))
+            thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials))
             thread.start()
             thread.join()
     model.to('cpu')
@@ -760,7 +757,6 @@ def run_transformer_int4_gpu_win(repo_id,
     from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
     from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
     import intel_extension_for_pytorch as ipex
-    reserved_mem_list = []
     model_path = get_model_path(repo_id, local_model_hub)
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
@@ -792,8 +788,7 @@ def run_transformer_int4_gpu_win(repo_id,
             # For gpt-j model family, this optimization can provide a better performance.
             model = ipex.optimize(model.eval(), inplace=True)
     end = time.perf_counter()
-    print(">> loading of model costs {}s".format(end - st))
-    reserved_mem_list.append(torch.xpu.memory.memory_reserved()/(1024**3))
+    print(">> loading of model costs {}s and {}GB".format(end - st, torch.xpu.memory.memory_reserved()/(1024**3)))
 
     model = BenchmarkWrapper(model)
 
@@ -825,8 +820,6 @@ def run_transformer_int4_gpu_win(repo_id,
                                                 num_beams=num_beams)
                     torch.xpu.synchronize()
                     end = time.perf_counter()
-                    reserved_mem_list.append(torch.xpu.memory.memory_reserved()/(1024**3))
-                    gpu_peak_mem = max(reserved_mem_list) # always keep the peak gpu mem at current stage
                     output_ids = output_ids.cpu()
                     print("model generate cost: " + str(end - st))
                     output = tokenizer.batch_decode(output_ids)
@@ -834,7 +827,7 @@ def run_transformer_int4_gpu_win(repo_id,
                     actual_out_len = output_ids.shape[1] - actual_in_len
                     if i >= warm_up:
                         result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
-                                               actual_in_len, actual_out_len, gpu_peak_mem])
+                                               actual_in_len, actual_out_len, model.peak_memory])
                     # torch.xpu.empty_cache() # this may make first token slower
         except RuntimeError:
             traceback.print_exc()
diff --git a/python/llm/dev/benchmark/benchmark_util.py b/python/llm/dev/benchmark/benchmark_util.py
index 984bc762..a5f0a372 100644
--- a/python/llm/dev/benchmark/benchmark_util.py
+++ b/python/llm/dev/benchmark/benchmark_util.py
@@ -516,6 +516,7 @@ class BenchmarkWrapper:
         self.encoder_time = 0.0
         self.first_cost = 0.0
         self.rest_cost_mean = 0.0
+        self.peak_memory = 0.0
         print(self.model.__class__)
 
     def __getattr__(self, attr):
@@ -2363,6 +2364,7 @@ class BenchmarkWrapper:
 
         first_token_time = None
         last_token_time = []
+        memory_every_token = []
         while True:
             st = time.perf_counter()
             if synced_gpus:
@@ -2440,6 +2442,7 @@ class BenchmarkWrapper:
 
             if self.device.type == "xpu":
                 torch.xpu.synchronize()
+                memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024**3))
             end = time.perf_counter()
             if first_token_time is None:
                 first_token_time = end - st
@@ -2454,13 +2457,15 @@ class BenchmarkWrapper:
                 break
 
         if self.do_print:
-            print(f"=========First token cost {first_token_time:.4f} s=========")
+            print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
         if len(last_token_time) > 1:
             self.first_cost = first_token_time
             self.rest_cost_mean = np.mean(last_token_time)
+            self.peak_memory = np.max(memory_every_token[1:])
             if self.do_print:
                 print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
-                      f" tokens in all)=========")
+                      f" tokens in all) and {self.peak_memory} GB=========")
+                print(f"Peak memory for every token: {memory_every_token}")
 
         if streamer is not None:
             streamer.end()
@@ -2662,6 +2667,7 @@ class BenchmarkWrapper:
 
         first_token_time = None
         last_token_time = []
+        memory_every_token = []
         # auto-regressive generation
         while True:
             st = time.perf_counter()
@@ -2743,6 +2749,7 @@ class BenchmarkWrapper:
 
             if self.device.type == "xpu":
                 torch.xpu.synchronize()
+                memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024 ** 3))
             end = time.perf_counter()
             if first_token_time is None:
                 first_token_time = end - st
@@ -2757,13 +2764,15 @@ class BenchmarkWrapper:
                 break
 
         if self.do_print:
-            print(f"=========First token cost {first_token_time:.4f} s=========")
+            print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
         if len(last_token_time) > 1:
             self.first_cost = first_token_time
             self.rest_cost_mean = np.mean(last_token_time)
+            self.peak_memory = np.max(memory_every_token[1:])
             if self.do_print:
                 print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
-                      f" tokens in all)=========")
+                      f" tokens in all) and {self.peak_memory} GB=========")
+                print(f"Peak memory for every token: {memory_every_token}")
 
         if streamer is not None:
             streamer.end()
@@ -2975,6 +2984,7 @@ class BenchmarkWrapper:
         first_token_time = None
         last_token_time = []
         this_peer_finished = False  # used by synced_gpus only
+        memory_every_token = []
         while True:
             st = time.perf_counter()
             if synced_gpus:
@@ -3072,6 +3082,7 @@ class BenchmarkWrapper:
 
             if self.device.type == "xpu":
                 torch.xpu.synchronize()
+                memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024 ** 3))
             end = time.perf_counter()
             if first_token_time is None:
                 first_token_time = end - st
@@ -3096,13 +3107,15 @@ class BenchmarkWrapper:
         )
 
         if self.do_print:
-            print(f"=========First token cost {first_token_time:.4f} s=========")
+            print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
         if len(last_token_time) > 1:
             self.first_cost = first_token_time
             self.rest_cost_mean = np.mean(last_token_time)
+            self.peak_memory = np.max(memory_every_token[1:])
             if self.do_print:
                 print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
-                      f" tokens in all)=========")
+                      f" tokens in all) and {self.peak_memory} GB=========")
+                print(f"Peak memory for every token: {memory_every_token}")
 
         if return_dict_in_generate:
             if not output_scores:
@@ -3322,6 +3335,7 @@ class BenchmarkWrapper:
 
         first_token_time = None
         last_token_time = []
+        memory_every_token = []
         while True:
             st = time.perf_counter()
             if synced_gpus:
@@ -3432,6 +3446,7 @@ class BenchmarkWrapper:
 
             if self.device.type == "xpu":
                 torch.xpu.synchronize()
+                memory_every_token.append(torch.xpu.memory.memory_reserved() / (1024 ** 3))
             end = time.perf_counter()
             if first_token_time is None:
                 first_token_time = end - st
@@ -3450,13 +3465,15 @@ class BenchmarkWrapper:
         )
 
         if self.do_print:
-            print(f"=========First token cost {first_token_time:.4f} s=========")
+            print(f"=========First token cost {first_token_time:.4f} s and {memory_every_token[0]} GB=========")
         if len(last_token_time) > 1:
             self.first_cost = first_token_time
             self.rest_cost_mean = np.mean(last_token_time)
+            self.peak_memory = np.max(memory_every_token[1:])
             if self.do_print:
                 print(f"=========Rest tokens cost average {self.rest_cost_mean:.4f} s ({len(last_token_time)}"
-                      f" tokens in all)=========")
+                      f" tokens in all) and {self.peak_memory} GB=========")
+                print(f"Peak memory for every token: {memory_every_token}")
 
         if return_dict_in_generate:
             if not output_scores: