diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llava/README.md b/python/llm/example/CPU/PyTorch-Models/Model/llava/README.md
index db7cec5b..0dde00f1 100644
--- a/python/llm/example/CPU/PyTorch-Models/Model/llava/README.md
+++ b/python/llm/example/CPU/PyTorch-Models/Model/llava/README.md
@@ -16,11 +16,13 @@ conda create -n llm python=3.11 # recommend to use Python 3.11
 conda activate llm
 
 pip install --pre --upgrade ipex-llm[all] # install the latest ipex-llm nightly build with 'all' option
-
-git clone -b v1.1.1 --depth=1 https://github.com/haotian-liu/LLaVA.git # clone the llava libary
 pip install einops # install dependencies required by llava
+pip install transformers==4.36.2
+
+git clone https://github.com/haotian-liu/LLaVA.git # clone the llava library
 cp generate.py ./LLaVA/ # copy our example to the LLaVA folder
 cd LLaVA # change the working directory to the LLaVA folder
+git checkout tags/v1.2.0 -b 1.2.0 # Get the branch which is compatible with transformers 4.36
 ```
 
 ### 2. Run
diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py
index 5f3316e3..780ba963 100644
--- a/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py
+++ b/python/llm/example/CPU/PyTorch-Models/Model/llava/generate.py
@@ -39,6 +39,7 @@ import time
 from transformers import AutoModelForCausalLM
 from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
 from transformers import AutoTokenizer
+from transformers import TextStreamer
 
 from llava.constants import (
     DEFAULT_IMAGE_PATCH_TOKEN,
@@ -311,11 +312,14 @@ if __name__ == '__main__':
             print("exit...")
             break
 
+        print(f"{roles[1]}: ", end="")
+
         prompt = get_prompt(model.config.mm_use_im_start_end, first_round, conv, user_input)
         first_round = False
         input_ids = tokenizer_image_token(
             prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0)
         stopping_criteria = get_stopping_criteria(conv, tokenizer, input_ids)
+        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
         # Generate predicted tokens
         with torch.inference_mode():
@@ -325,13 +329,11 @@
                 images=image_tensor,
                 do_sample=True,
                 max_new_tokens=args.n_predict,
+                streamer=streamer,
                 use_cache=True,
                 stopping_criteria=[stopping_criteria])
             end = time.time()
             #print(f'Inference time: {end-st} s')
-        outputs = tokenizer.decode(
-            output_ids[0, input_ids.shape[1]:], skip_special_tokens=True).strip()
+        outputs = tokenizer.decode(output_ids[0, :], skip_special_tokens=True).strip()
         conv.messages[-1][-1] = outputs
 
-        print(f"{roles[1]}: ", end="")
-        print(outputs)
diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llava/README.md b/python/llm/example/GPU/PyTorch-Models/Model/llava/README.md
index 4eefd714..668c63a8 100644
--- a/python/llm/example/GPU/PyTorch-Models/Model/llava/README.md
+++ b/python/llm/example/GPU/PyTorch-Models/Model/llava/README.md
@@ -17,11 +17,13 @@ conda activate llm
 
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-
-git clone -b v1.1.1 --depth=1 https://github.com/haotian-liu/LLaVA.git # clone the llava libary
 pip install einops # install dependencies required by llava
+pip install transformers==4.36.2
+
+git clone https://github.com/haotian-liu/LLaVA.git # clone the llava library
 cp generate.py ./LLaVA/ # copy our example to the LLaVA folder
 cd LLaVA # change the working directory to the LLaVA folder
+git checkout tags/v1.2.0 -b 1.2.0 # Get the branch which is compatible with transformers 4.36
 ```
 
 #### 1.2 Installation on Windows
@@ -31,11 +33,14 @@ conda create -n llm python=3.11 libuv
 conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-
-git clone -b v1.1.1 --depth=1 https://github.com/haotian-liu/LLaVA.git # clone the llava libary
 pip install einops # install dependencies required by llava
+pip install transformers==4.36.2
+
+git clone https://github.com/haotian-liu/LLaVA.git # clone the llava library
 cp generate.py ./LLaVA/ # copy our example to the LLaVA folder
 cd LLaVA # change the working directory to the LLaVA folder
+git checkout tags/v1.2.0 -b 1.2.0 # Get the branch which is compatible with transformers 4.36
+
 ```
 
 ### 2. Configures OneAPI environment variables
diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py
index ce3275df..84c8b726 100644
--- a/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py
+++ b/python/llm/example/GPU/PyTorch-Models/Model/llava/generate.py
@@ -39,6 +39,7 @@ import time
 from transformers import AutoModelForCausalLM
 from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
 from transformers import AutoTokenizer
+from transformers import TextStreamer
 
 from llava.constants import (
     DEFAULT_IMAGE_PATCH_TOKEN,
@@ -312,11 +313,14 @@ if __name__ == '__main__':
             print("exit...")
             break
 
+        print(f"{roles[1]}: ", end="")
+
         prompt = get_prompt(model.config.mm_use_im_start_end, first_round, conv, user_input)
         first_round = False
         input_ids = tokenizer_image_token(
             prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to('xpu')
         stopping_criteria = get_stopping_criteria(conv, tokenizer, input_ids)
+        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
         # Generate predicted tokens
         with torch.inference_mode():
@@ -326,13 +330,11 @@
                 images=image_tensor,
                 do_sample=True,
                 max_new_tokens=args.n_predict,
+                streamer=streamer,
                 use_cache=True,
                 stopping_criteria=[stopping_criteria])
             end = time.time()
             #print(f'Inference time: {end-st} s')
-        outputs = tokenizer.decode(
-            output_ids[0, input_ids.shape[1]:].cpu(), skip_special_tokens=True).strip()
+        outputs = tokenizer.decode(output_ids[0, :].cpu(), skip_special_tokens=True).strip()
         conv.messages[-1][-1] = outputs
 
-        print(f"{roles[1]}: ", end="")
-        print(outputs)
diff --git a/python/llm/src/ipex_llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py
index 57763d8f..47f5aec7 100644
--- a/python/llm/src/ipex_llm/transformers/models/llama.py
+++ b/python/llm/src/ipex_llm/transformers/models/llama.py
@@ -114,7 +114,8 @@ def llama_model_forward_4_36(
 ) -> Union[Tuple, BaseModelOutputWithPast]:
     from ipex_llm.transformers.kv import DynamicFp8Cache
     use_cache = use_cache if use_cache is not None else self.config.use_cache
-    if use_cache and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids):
+    input = input_ids if input_ids is not None else inputs_embeds
+    if use_cache and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input):
         if not isinstance(past_key_values, DynamicFp8Cache):
             past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
         return llama_model_forward_4_36_internal(
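
Note: the `generate.py` changes above switch from decoding the whole response after generation to streaming it token by token with transformers' `TextStreamer`. Below is a minimal sketch of that pattern, assuming a plain text-only causal LM (`"gpt2"` is only an illustrative stand-in); LLaVA's image tensors, conversation template, and stopping criteria are omitted.

```python
# Minimal streaming sketch (illustrative only; not the LLaVA example itself).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

model_path = "gpt2"  # stand-in checkpoint; the real example loads a LLaVA model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# skip_prompt=True suppresses the echoed prompt; skip_special_tokens=True drops
# special tokens from the printed stream, matching the flags used in the diff.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

input_ids = tokenizer("Describe a sunny beach.", return_tensors="pt").input_ids
with torch.inference_mode():
    # Passing streamer= makes generate() print tokens as they are produced;
    # the returned ids can still be decoded afterwards to update chat history.
    output_ids = model.generate(input_ids, max_new_tokens=32, streamer=streamer)
print()  # newline after the streamed text
```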
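
The `llama.py` change guards against `input_ids` being `None`: multimodal front ends such as LLaVA inject image features by calling the language model with `inputs_embeds` only. A tiny sketch of the fallback, using a hypothetical helper name and made-up tensor shapes:

```python
import torch

def pick_reference_tensor(input_ids, inputs_embeds):
    # Mirrors the added line in llama_model_forward_4_36: prefer token ids when
    # present, otherwise fall back to the embeddings, so the quantized-KV-cache
    # check always receives a real tensor.
    return input_ids if input_ids is not None else inputs_embeds

# Text-only call: token ids are available.
assert pick_reference_tensor(torch.tensor([[1, 2, 3]]), None) is not None

# LLaVA-style call: only embeddings are passed (shape here is illustrative).
embeds = torch.randn(1, 8, 32)
assert pick_reference_tensor(None, embeds) is embeds
```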