Fix llava example to support transformers 4.36 (#10614)

* fix llava example

* update
Jiao Wang 2024-04-09 13:47:07 -07:00 committed by GitHub
parent 1e817926ba
commit 878a97077b
5 changed files with 27 additions and 15 deletions


@@ -16,11 +16,13 @@ conda create -n llm python=3.11 # recommended to use Python 3.11
conda activate llm
pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
git clone -b v1.1.1 --depth=1 https://github.com/haotian-liu/LLaVA.git # clone the llava library
pip install einops # install dependencies required by llava
pip install transformers==4.36.2
git clone https://github.com/haotian-liu/LLaVA.git # clone the llava library
cp generate.py ./LLaVA/ # copy our example to the LLaVA folder
cd LLaVA # change the working directory to the LLaVA folder
git checkout tags/v1.2.0 -b 1.2.0 # Get the branch which is compatible with transformers 4.36
```
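
Before moving on, a quick sanity check can confirm that the environment matches what the example expects; this is a minimal sketch, assuming it is run from inside the `LLaVA` checkout with the `llm` environment active:

```python
# Minimal environment check (assumption: run from inside the LLaVA folder at tag v1.2.0,
# with the `llm` conda env active).
import transformers

print(transformers.__version__)  # expected: 4.36.2
assert transformers.__version__.startswith("4.36"), "the example targets transformers 4.36.x"

# The example imports LLaVA's model class directly, so this should also resolve:
from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
print(LlavaLlamaForCausalLM.__name__)
```
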
### 2. Run


@@ -39,6 +39,7 @@ import time
from transformers import AutoModelForCausalLM
from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
from transformers import AutoTokenizer
from transformers import TextStreamer
from llava.constants import (
DEFAULT_IMAGE_PATCH_TOKEN,
@@ -311,11 +312,14 @@ if __name__ == '__main__':
print("exit...")
break
print(f"{roles[1]}: ", end="")
prompt = get_prompt(model.config.mm_use_im_start_end, first_round, conv, user_input)
first_round = False
input_ids = tokenizer_image_token(
prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0)
stopping_criteria = get_stopping_criteria(conv, tokenizer, input_ids)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Generate predicted tokens
with torch.inference_mode():
@@ -325,13 +329,11 @@ if __name__ == '__main__':
images=image_tensor,
do_sample=True,
max_new_tokens=args.n_predict,
streamer=streamer,
use_cache=True,
stopping_criteria=[stopping_criteria])
end = time.time()
#print(f'Inference time: {end-st} s')
outputs = tokenizer.decode(
output_ids[0, input_ids.shape[1]:], skip_special_tokens=True).strip()
outputs = tokenizer.decode(output_ids[0, :], skip_special_tokens=True).strip()
conv.messages[-1][-1] = outputs
print(f"{roles[1]}: ", end="")
print(outputs)
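
To make the intent of the two generate.py changes above clearer — a `TextStreamer` is now passed to `generate()` so tokens print as they are produced, and the decode no longer slices off the prompt because LLaVA v1.2.x's `generate()` appears to return only the newly generated ids — here is a minimal, self-contained sketch of the same streaming pattern with a plain Hugging Face model (`gpt2` is just a stand-in, not part of the example):

```python
# Streaming-generation sketch; `gpt2` is a stand-in model, not LLaVA.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

input_ids = tokenizer("USER: describe the image\nASSISTANT:", return_tensors="pt").input_ids

# skip_prompt=True makes the streamer print only newly generated tokens as they arrive,
# mirroring how the example prints the assistant's reply incrementally.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

with torch.inference_mode():
    output_ids = model.generate(input_ids,
                                do_sample=True,
                                max_new_tokens=32,
                                streamer=streamer,
                                use_cache=True)

# A vanilla causal LM returns prompt + new tokens, so the prompt is sliced off here.
# LLaVA v1.2.x returns only the new tokens, which is why the example decodes output_ids[0, :].
print(tokenizer.decode(output_ids[0, input_ids.shape[1]:], skip_special_tokens=True).strip())
```
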


@@ -17,11 +17,13 @@ conda activate llm
# below command will install intel_extension_for_pytorch==2.1.10+xpu as default
pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
git clone -b v1.1.1 --depth=1 https://github.com/haotian-liu/LLaVA.git # clone the llava library
pip install einops # install dependencies required by llava
pip install transformers==4.36.2
git clone https://github.com/haotian-liu/LLaVA.git # clone the llava library
cp generate.py ./LLaVA/ # copy our example to the LLaVA folder
cd LLaVA # change the working directory to the LLaVA folder
git checkout tags/v1.2.0 -b 1.2.0 # Get the branch which is compatible with transformers 4.36
```
#### 1.2 Installation on Windows
@@ -31,11 +33,14 @@ conda create -n llm python=3.11 libuv
conda activate llm
# below command will install intel_extension_for_pytorch==2.1.10+xpu as default
pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
git clone -b v1.1.1 --depth=1 https://github.com/haotian-liu/LLaVA.git # clone the llava library
pip install einops # install dependencies required by llava
pip install transformers==4.36.2
git clone https://github.com/haotian-liu/LLaVA.git # clone the llava library
cp generate.py ./LLaVA/ # copy our example to the LLaVA folder
cd LLaVA # change the working directory to the LLaVA folder
git checkout tags/v1.2.0 -b 1.2.0 # Get the branch which is compatible with transformers 4.36
```
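
Once the OneAPI environment variables from the next section are set, a short check (a sketch, assuming `ipex-llm[xpu]` was installed as above) can confirm the pinned `transformers` version and that the Intel GPU is visible:

```python
# XPU availability check; assumes ipex-llm[xpu] and its intel_extension_for_pytorch
# dependency are installed, and the OneAPI environment has been sourced.
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  (registers the 'xpu' device)
import transformers

print(transformers.__version__)   # expected: 4.36.2
print(torch.xpu.is_available())   # True when an Intel GPU and its drivers are visible
print(torch.xpu.device_count())
```
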
### 2. Configure OneAPI environment variables


@@ -39,6 +39,7 @@ import time
from transformers import AutoModelForCausalLM
from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
from transformers import AutoTokenizer
from transformers import TextStreamer
from llava.constants import (
DEFAULT_IMAGE_PATCH_TOKEN,
@@ -312,11 +313,14 @@ if __name__ == '__main__':
print("exit...")
break
print(f"{roles[1]}: ", end="")
prompt = get_prompt(model.config.mm_use_im_start_end, first_round, conv, user_input)
first_round = False
input_ids = tokenizer_image_token(
prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to('xpu')
stopping_criteria = get_stopping_criteria(conv, tokenizer, input_ids)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Generate predicted tokens
with torch.inference_mode():
@@ -326,13 +330,11 @@ if __name__ == '__main__':
images=image_tensor,
do_sample=True,
max_new_tokens=args.n_predict,
streamer=streamer,
use_cache=True,
stopping_criteria=[stopping_criteria])
end = time.time()
#print(f'Inference time: {end-st} s')
outputs = tokenizer.decode(
output_ids[0, input_ids.shape[1]:].cpu(), skip_special_tokens=True).strip()
outputs = tokenizer.decode(output_ids[0, :].cpu(), skip_special_tokens=True).strip()
conv.messages[-1][-1] = outputs
print(f"{roles[1]}: ", end="")
print(outputs)
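
The XPU variant of generate.py differs from the CPU one mainly in tensor placement: the tokenized prompt is moved to `'xpu'` before `generate()`, and the resulting ids are copied back with `.cpu()` before decoding. A minimal device-placement sketch (placeholder tensors, not the LLaVA model itself):

```python
# Device-placement sketch for the XPU path; assumes intel_extension_for_pytorch is installed.
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  (enables the 'xpu' device)

# Stand-in for the prompt ids produced by tokenizer_image_token(...).unsqueeze(0)
input_ids = torch.tensor([[1, 2, 3, 4]])

# Generation runs on the Intel GPU, so the ids are moved to 'xpu' first.
input_ids = input_ids.to('xpu')
print(input_ids.device)

# Stand-in for the ids returned by model.generate(...)
output_ids = input_ids

# Copy back to the host before tokenizer.decode(), matching the `.cpu()` call above.
print(output_ids[0, :].cpu())
```
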


@@ -114,7 +114,8 @@ def llama_model_forward_4_36(
) -> Union[Tuple, BaseModelOutputWithPast]:
from ipex_llm.transformers.kv import DynamicFp8Cache
use_cache = use_cache if use_cache is not None else self.config.use_cache
if use_cache and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids):
input = input_ids if input_ids is not None else inputs_embeds
if use_cache and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input):
if not isinstance(past_key_values, DynamicFp8Cache):
past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
return llama_model_forward_4_36_internal(
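
The llama.py guard above exists because LLaVA calls the language model with the merged text-and-image embeddings as `inputs_embeds`, leaving `input_ids` as `None`, so the quantized-KV-cache check has to probe whichever tensor is actually present. A minimal sketch of the pattern (hypothetical names; the stand-in check is not the real `use_quantize_kv_cache`):

```python
# Sketch of the None-safe probe added above; `looks_like_single_prompt` is a stand-in
# for the real use_quantize_kv_cache(...) decision, not its actual implementation.
from typing import Optional
import torch


def looks_like_single_prompt(probe: torch.Tensor) -> bool:
    return probe.shape[0] == 1  # placeholder heuristic


def maybe_quantize_kv(input_ids: Optional[torch.Tensor],
                      inputs_embeds: Optional[torch.Tensor],
                      use_cache: bool = True) -> bool:
    # Multimodal callers (e.g. LLaVA) pass inputs_embeds and set input_ids to None,
    # so probe whichever of the two tensors is present.
    probe = input_ids if input_ids is not None else inputs_embeds
    return bool(use_cache and probe is not None and looks_like_single_prompt(probe))


# Text-only call: input_ids is present.
print(maybe_quantize_kv(torch.ones(1, 8, dtype=torch.long), None))
# LLaVA-style call: only the merged text+image embeddings are passed.
print(maybe_quantize_kv(None, torch.randn(1, 64, 4096)))
```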