diff --git a/python/llm/dev/benchmark/README.md b/python/llm/dev/benchmark/README.md
index c371949b..2cdba0fc 100644
--- a/python/llm/dev/benchmark/README.md
+++ b/python/llm/dev/benchmark/README.md
@@ -54,9 +54,16 @@ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 prompt = "今天睡不着怎么办"
 
 with torch.inference_mode():
-    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
-    output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
-    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+    # warm up two times since ipex is used
+    for i in range(2):
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
+        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+    # collect performance data now
+    for i in range(5):
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+        output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
+        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
 ```
 Output will be like:
 ```bash
diff --git a/python/llm/example/transformers/transformers_int4/GPU/chatglm2/generate.py b/python/llm/example/transformers/transformers_int4/GPU/chatglm2/generate.py
index 4d89a0cd..20c27a25 100644
--- a/python/llm/example/transformers/transformers_int4/GPU/chatglm2/generate.py
+++ b/python/llm/example/transformers/transformers_int4/GPU/chatglm2/generate.py
@@ -46,7 +46,7 @@ if __name__ == '__main__':
                                       load_in_4bit=True,
                                       optimize_model=False,
                                       trust_remote_code=True)
-    model = model.half().to('xpu')
+    model = model.to('xpu')
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/transformers/transformers_int4/GPU/llama2/generate.py b/python/llm/example/transformers/transformers_int4/GPU/llama2/generate.py
index 73d2282b..0a97a67a 100644
--- a/python/llm/example/transformers/transformers_int4/GPU/llama2/generate.py
+++ b/python/llm/example/transformers/transformers_int4/GPU/llama2/generate.py
@@ -49,7 +49,7 @@ if __name__ == '__main__':
                                       load_in_4bit=True,
                                       optimize_model=False,
                                       trust_remote_code=True)
-    model = model.half().to('xpu')
+    model = model.to('xpu')
 
     # Load tokenizer
     tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
diff --git a/python/llm/src/bigdl/llm/transformers/linear_quant.py b/python/llm/src/bigdl/llm/transformers/linear_quant.py
index b1235b76..80244a57 100644
--- a/python/llm/src/bigdl/llm/transformers/linear_quant.py
+++ b/python/llm/src/bigdl/llm/transformers/linear_quant.py
@@ -169,7 +169,6 @@ class ParamsQuant(torch.nn.Parameter):
                               quantized=self.quantized,
                               _shape=self._shape,
                               qtype=self.qtype)
-
         return new_param
 
 
@@ -244,6 +243,9 @@ class LinearQuant(nn.Linear):
         if x_2d.is_contiguous() is False:
             x_2d = x_2d.contiguous()
 
+        # current workaround to reduce first token latency of fp32 input
+        if x_2d.shape[0] > 1 and x_2d.dtype == torch.float32:
+            x_2d = x_2d.half()
         # input format of linear_q4.forward is 1: input, 2: weight
         result = linear_q4_0.forward(x_2d, x0)
         new_shape = x_shape[:-1] + (self.out_len,)
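
For reference, a minimal sketch of the warmup-then-measure pattern the README change above establishes, with explicit timing added. `model` and `tokenizer` are assumed to be loaded on the `xpu` device as in the README; `torch.xpu.synchronize()` assumes `intel_extension_for_pytorch` has been imported. The timing harness itself (`run_once`, `latencies`) is illustrative and not part of the patch.

```python
# Illustrative only: warmup-then-measure timing around the README's loops.
# Assumes `model` and `tokenizer` are already loaded on the xpu device and
# that intel_extension_for_pytorch (ipex) is imported, which provides
# torch.xpu.synchronize().
import time

import torch

def run_once(model, tokenizer, prompt, max_new_tokens=32):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
    output = model.generate(input_ids, do_sample=False,
                            max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

prompt = "今天睡不着怎么办"
with torch.inference_mode():
    # warm up: the first ipex runs include one-time JIT/kernel setup cost
    for _ in range(2):
        run_once(model, tokenizer, prompt)
    # measure: time only the steady-state runs
    latencies = []
    for _ in range(5):
        torch.xpu.synchronize()          # drain queued XPU work first
        start = time.perf_counter()
        run_once(model, tokenizer, prompt)
        torch.xpu.synchronize()          # wait for generation to finish
        latencies.append(time.perf_counter() - start)
    print(f"avg latency: {sum(latencies) / len(latencies):.3f} s")
```

Synchronizing before starting and stopping the clock keeps queued asynchronous XPU work out of the measured interval.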
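
Likewise, a self-contained sketch of the idea behind the `linear_quant.py` workaround: during prefill the flattened activation has many rows (`shape[0] > 1`), so one fp32-to-fp16 cast before the 4-bit kernel is cheap relative to the matmul it speeds up, while single-token decode steps are left untouched. `quant_forward` below is a hypothetical stand-in for `linear_q4_0.forward`, under the assumption that the kernel handles fp16 activations faster than fp32 ones.

```python
# Illustrative sketch only; `quant_forward` is a hypothetical stand-in
# for the real 4-bit kernel (linear_q4_0.forward).
import torch

def maybe_halve(x_2d: torch.Tensor) -> torch.Tensor:
    # Prefill feeds the whole prompt at once (shape[0] > 1), so a single
    # fp32 -> fp16 cast is amortized over many rows; single-token decode
    # steps (shape[0] == 1) are left as-is.
    if x_2d.shape[0] > 1 and x_2d.dtype == torch.float32:
        return x_2d.half()
    return x_2d

def quantized_linear_forward(x_2d, weight, quant_forward):
    if not x_2d.is_contiguous():
        x_2d = x_2d.contiguous()
    x_2d = maybe_halve(x_2d)
    # argument order matches the patch: 1: input, 2: quantized weight
    return quant_forward(x_2d, weight)
```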