From ea0853c0b5e2f13e6ff5c550eee60fbb15d315bc Mon Sep 17 00:00:00 2001 From: Xin Qiu Date: Fri, 8 Sep 2023 10:30:26 +0800 Subject: [PATCH] update benchmark_utils readme (#8925) * update readme * meet code review --- python/llm/dev/benchmark/README.md | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/python/llm/dev/benchmark/README.md b/python/llm/dev/benchmark/README.md index 2cdba0fc..89c2fa6c 100644 --- a/python/llm/dev/benchmark/README.md +++ b/python/llm/dev/benchmark/README.md @@ -1,24 +1,19 @@ # Benchmark tool for transformers int4 (separate 1st token and rest) -`benchmark_util.py` is used to provide a simple benchmark tool for transformer int4 model to calculate 1st token performance and the rest on CPU. - -`gpu_benchmark_util.py` is used to provide a simple benchmark tool for transformer int4 model to calculate 1st token performance and the rest on GPU. +`benchmark_util.py` is used to provide a simple benchmark tool for transformer int4 models to calculate the 1st token performance and the rest on CPU and GPU. ## CPU Usage Just put this file into your benchmark directory, and then wrap your transformer int4 model with `BenchmarkWrapper` (`model = BenchmarkWrapper(model)`). 
Take `chatglm-6b` as an example: ```python import torch -import os from bigdl.llm.transformers import AutoModel from transformers import AutoTokenizer -import time -import numpy as np from benchmark_util import BenchmarkWrapper model_path ='THUDM/chatglm-6b' model = AutoModel.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True) -model = BenchmarkWrapper(model) +model = BenchmarkWrapper(model, do_print=True) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) prompt = "今天睡不着怎么办" @@ -38,18 +33,15 @@ Just put this file into your benchmark directory, and then wrap your transformer Take `chatglm-6b` as an example: ```python import torch -import os import intel_extension_for_pytorch as ipex from bigdl.llm.transformers import AutoModel from transformers import AutoTokenizer -import time -import numpy as np -from gpu_benchmark_util import BenchmarkWrapper +from benchmark_util import BenchmarkWrapper model_path ='THUDM/chatglm-6b' model = AutoModel.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True) -model = model.half().to('xpu') -model = BenchmarkWrapper(model) +model = model.to('xpu') +model = BenchmarkWrapper(model, do_print=True) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) prompt = "今天睡不着怎么办"