diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llava/README.md b/python/llm/example/CPU/PyTorch-Models/Model/llava/README.md
index db7cec5b..0dde00f1 100644
--- a/python/llm/example/CPU/PyTorch-Models/Model/llava/README.md
+++ b/python/llm/example/CPU/PyTorch-Models/Model/llava/README.md
@@ -16,11 +16,13 @@ conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
-
-git clone -b v1.1.1 --depth=1 https://github.com/haotian-liu/LLaVA.git # clone the llava libary
 pip install einops # install dependencies required by llava
+pip install transformers==4.36.2
+
+git clone https://github.com/haotian-liu/LLaVA.git # clone the llava library
 cp generate.py ./LLaVA/ # copy our example to the LLaVA folder
 cd LLaVA # change the working directory to the LLaVA folder
+git checkout tags/v1.2.0 -b 1.2.0 # Get the branch which is compatible with transformers 4.36
 ```
 
 ### 2. Run
diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py
index 5f3316e3..780ba963 100644
--- a/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py
+++ b/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py
@@ -39,6 +39,7 @@ import time
 from transformers import AutoModelForCausalLM
 from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
 from transformers import AutoTokenizer
+from transformers import TextStreamer
 
 from llava.constants import (
     DEFAULT_IMAGE_PATCH_TOKEN,
@@ -311,11 +312,14 @@ if __name__ == '__main__':
             print("exit...")
             break
 
+        print(f"{roles[1]}: ", end="")
+
         prompt = get_prompt(model.config.mm_use_im_start_end, first_round, conv, user_input)
         first_round = False
         input_ids = tokenizer_image_token(
             prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0)
         stopping_criteria = get_stopping_criteria(conv, tokenizer, input_ids)
+        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
         # Generate predicted tokens
         with torch.inference_mode():
@@ -325,13 +329,11 @@
                 images=image_tensor,
                 do_sample=True,
                 max_new_tokens=args.n_predict,
+                streamer=streamer,
                 use_cache=True,
                 stopping_criteria=[stopping_criteria])
             end = time.time()
             #print(f'Inference time: {end-st} s')
-        outputs = tokenizer.decode(
-            output_ids[0, input_ids.shape[1]:], skip_special_tokens=True).strip()
+        outputs = tokenizer.decode(output_ids[0, :], skip_special_tokens=True).strip()
         conv.messages[-1][-1] = outputs
 
-        print(f"{roles[1]}: ", end="")
-        print(outputs)
diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llava/README.md b/python/llm/example/GPU/PyTorch-Models/Model/llava/README.md
index 4eefd714..668c63a8 100644
--- a/python/llm/example/GPU/PyTorch-Models/Model/llava/README.md
+++ b/python/llm/example/GPU/PyTorch-Models/Model/llava/README.md
@@ -17,11 +17,13 @@ conda activate llm
 
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-
-git clone -b v1.1.1 --depth=1 https://github.com/haotian-liu/LLaVA.git # clone the llava libary
 pip install einops # install dependencies required by llava
+pip install transformers==4.36.2
+
+git clone https://github.com/haotian-liu/LLaVA.git # clone the llava library
 cp generate.py ./LLaVA/ # copy our example to the LLaVA folder
 cd LLaVA # change the working directory to the LLaVA folder
+git checkout tags/v1.2.0 -b 1.2.0 # Get the branch which is compatible with transformers 4.36
 ```
 
 #### 1.2 Installation on Windows
@@ -31,11 +33,14 @@ conda create -n llm python=3.11 libuv
 conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-
-git clone -b v1.1.1 --depth=1 https://github.com/haotian-liu/LLaVA.git # clone the llava libary
 pip install einops # install dependencies required by llava
+pip install transformers==4.36.2
+
+git clone https://github.com/haotian-liu/LLaVA.git # clone the llava library
 cp generate.py ./LLaVA/ # copy our example to the LLaVA folder
 cd LLaVA # change the working directory to the LLaVA folder
+git checkout tags/v1.2.0 -b 1.2.0 # Get the branch which is compatible with transformers 4.36
+
 ```
 
 ### 2. Configures OneAPI environment variables
diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py
index ce3275df..84c8b726 100644
--- a/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py
+++ b/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py
@@ -39,6 +39,7 @@ import time
 from transformers import AutoModelForCausalLM
 from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
 from transformers import AutoTokenizer
+from transformers import TextStreamer
 
 from llava.constants import (
     DEFAULT_IMAGE_PATCH_TOKEN,
@@ -312,11 +313,14 @@ if __name__ == '__main__':
             print("exit...")
             break
 
+        print(f"{roles[1]}: ", end="")
+
         prompt = get_prompt(model.config.mm_use_im_start_end, first_round, conv, user_input)
         first_round = False
         input_ids = tokenizer_image_token(
             prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to('xpu')
         stopping_criteria = get_stopping_criteria(conv, tokenizer, input_ids)
+        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
         # Generate predicted tokens
         with torch.inference_mode():
@@ -326,13 +330,11 @@
                 images=image_tensor,
                 do_sample=True,
                 max_new_tokens=args.n_predict,
+                streamer=streamer,
                 use_cache=True,
                 stopping_criteria=[stopping_criteria])
             end = time.time()
             #print(f'Inference time: {end-st} s')
-        outputs = tokenizer.decode(
-            output_ids[0, input_ids.shape[1]:].cpu(), skip_special_tokens=True).strip()
+        outputs = tokenizer.decode(output_ids[0, :].cpu(), skip_special_tokens=True).strip()
         conv.messages[-1][-1] = outputs
 
-        print(f"{roles[1]}: ", end="")
-        print(outputs)
diff --git a/python/llm/src/ipex_llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py
index 57763d8f..47f5aec7 100644
--- a/python/llm/src/ipex_llm/transformers/models/llama.py
+++ b/python/llm/src/ipex_llm/transformers/models/llama.py
@@ -114,7 +114,8 @@ def llama_model_forward_4_36(
 ) -> Union[Tuple, BaseModelOutputWithPast]:
     from ipex_llm.transformers.kv import DynamicFp8Cache
     use_cache = use_cache if use_cache is not None else self.config.use_cache
-    if use_cache and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids):
+    input = input_ids if input_ids is not None else inputs_embeds
+    if use_cache and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input):
         if not isinstance(past_key_values, DynamicFp8Cache):
             past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
         return llama_model_forward_4_36_internal(
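
Note: the `generate.py` changes above switch from decoding the whole response after generation to streaming it token by token with transformers' `TextStreamer`. Below is a minimal sketch of that pattern, assuming a plain text-only causal LM (`"gpt2"` is only an illustrative stand-in); LLaVA's image tensors, conversation template, and stopping criteria are omitted.

```python
# Minimal streaming sketch (illustrative only; not the LLaVA example itself).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

model_path = "gpt2"  # stand-in checkpoint; the real example loads a LLaVA model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# skip_prompt=True suppresses the echoed prompt; skip_special_tokens=True drops
# special tokens from the printed stream, matching the flags used in the diff.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

input_ids = tokenizer("Describe a sunny beach.", return_tensors="pt").input_ids
with torch.inference_mode():
    # Passing streamer= makes generate() print tokens as they are produced;
    # the returned ids can still be decoded afterwards to update chat history.
    output_ids = model.generate(input_ids, max_new_tokens=32, streamer=streamer)
print()  # newline after the streamed text
```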
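
The `llama.py` change guards against `input_ids` being `None`: multimodal front ends such as LLaVA inject image features by calling the language model with `inputs_embeds` only. A tiny sketch of the fallback, using a hypothetical helper name and made-up tensor shapes:

```python
import torch

def pick_reference_tensor(input_ids, inputs_embeds):
    # Mirrors the added line in llama_model_forward_4_36: prefer token ids when
    # present, otherwise fall back to the embeddings, so the quantized-KV-cache
    # check always receives a real tensor.
    return input_ids if input_ids is not None else inputs_embeds

# Text-only call: token ids are available.
assert pick_reference_tensor(torch.tensor([[1, 2, 3]]), None) is not None

# LLaVA-style call: only embeddings are passed (shape here is illustrative).
embeds = torch.randn(1, 8, 32)
assert pick_reference_tensor(None, embeds) is embeds
```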