diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 9b73d729..23148cce 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -1210,15 +1210,15 @@ def run_bigdl_ipex_bf16(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1280,15 +1280,15 @@ def run_bigdl_ipex_int4(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int4', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1350,15 +1350,15 @@ def run_bigdl_ipex_int8(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                          use_cache=True, torchscript=True)
+                                          use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='sym_int8', trust_remote_code=True, torch_dtype='auto',
-                                                     use_cache=True, torchscript=True)
+                                                     use_cache=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if not hasattr(model.config, "token_latency"):
         model.config.token_latency = True
@@ -1537,15 +1537,15 @@ def run_speculative_cpu(repo_id,
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
         model = AutoModel.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                          use_cache=True, torchscript=True, speculative=True)
+                                          use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='bf16', trust_remote_code=True, torch_dtype=torch.bfloat16,
-                                                     use_cache=True, torchscript=True, speculative=True)
+                                                     use_cache=True, speculative=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
diff --git a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
index 84cb9112..fa59c410 100644
--- a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
@@ -59,7 +59,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
diff --git a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
index 5ec9a67c..7df302aa 100644
--- a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
@@ -57,7 +57,6 @@ if __name__ == '__main__':
                                       load_in_low_bit="bf16",
                                       speculative=True,
                                       trust_remote_code=True,
-                                      torchscript=True,
                                       use_cache=True)
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
diff --git a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
index f870a094..43da3fd6 100644
--- a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
@@ -74,7 +74,6 @@ if __name__ == '__main__':
                                                  optimize_model=True,
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
-                                                 torchscript=True,
                                                  speculative=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
diff --git a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
index 1968ccaa..16563b81 100644
--- a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
@@ -65,7 +65,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
diff --git a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
index c35b0b65..852c528e 100644
--- a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
diff --git a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
index 73970f67..9c9e36ba 100644
--- a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
@@ -69,7 +69,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
diff --git a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
index a8a82474..4a957db6 100644
--- a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
@@ -51,7 +51,6 @@ if __name__ == '__main__':
                                                  torch_dtype=torch.bfloat16,
                                                  load_in_low_bit="bf16",
                                                  speculative=True,
-                                                 torchscript=True,
                                                  trust_remote_code=True,
                                                  use_cache=True)
diff --git a/python/llm/src/ipex_llm/transformers/convert_ipex.py b/python/llm/src/ipex_llm/transformers/convert_ipex.py
index 6368d13a..a934a1dd 100644
--- a/python/llm/src/ipex_llm/transformers/convert_ipex.py
+++ b/python/llm/src/ipex_llm/transformers/convert_ipex.py
@@ -150,7 +150,8 @@ def _ipex_optimize_model(model, rms_classes, qtype):
     _ipex_optimize_attention(model, is_tpp=is_tpp, is_woq=is_woq)
     _ipex_optimize_decoder(model, is_tpp=is_tpp, is_woq=is_woq)
 
-    model.register_forward_hook(output_hook, with_kwargs=True)
+    # need to register_forward_hook after torch.jit.trace
+    # model.register_forward_hook(output_hook, with_kwargs=True)
 
     return model
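
Note: the comment added in convert_ipex.py states the rationale for disabling the register_forward_hook call there: the output hook should be registered after torch.jit.trace, i.e. on the module object that is actually called at inference time, not inside _ipex_optimize_model, which runs before tracing. The snippet below is only a minimal, self-contained sketch of that ordering under stated assumptions; ToyModel, optimize() and Wrapped are hypothetical stand-ins and not ipex-llm code, and the eager wrapper is just one way to attach a Python-side hook to a traced module.

# Ordering sketch: trace first, then attach the forward hook.
import torch
import torch.nn as nn


class ToyModel(nn.Module):
    def forward(self, x):
        return x + 1


def output_hook(module, inputs, output):
    # A forward hook may return a value, which replaces the module's output;
    # here it just scales the result for demonstration.
    return output * 2


def optimize(model):
    # Stand-in for _ipex_optimize_model: rewrites only, no hook registration
    # here any more (as in this patch).
    return model


model = optimize(ToyModel().eval())
example = torch.randn(2, 3)

# 1) trace first
traced = torch.jit.trace(model, example)

# 2) then attach the hook to the object used at inference time.
#    Whether a traced ScriptModule accepts Python hooks directly depends on
#    the PyTorch version, so a thin eager wrapper keeps the sketch portable.
class Wrapped(nn.Module):
    def __init__(self, mod):
        super().__init__()
        self.mod = mod

    def forward(self, x):
        return self.mod(x)


wrapped = Wrapped(traced)
wrapped.register_forward_hook(output_hook)
print(wrapped(example))  # prints (x + 1) * 2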