Fix no_attr error caused by short prompts in IPEX_CPU speculative decoding (#10783)
commit 0646e2c062
parent 7ec82c6042

8 changed files with 79 additions and 25 deletions
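The changed files are speculative-decoding example scripts that all receive the same fix: record the prompt length right after tokenization, and only read the benchmark attributes model.n_token_generated and model.first_token_time when they can exist. Below is a minimal sketch that assembles this guard in one place; the helper name report_generation is hypothetical and only get_enable_ipex is the real import the commit uses.

# A minimal sketch of the guard these examples add; report_generation is an
# illustrative helper, not part of ipex-llm.
from ipex_llm.transformers.convert import get_enable_ipex


def report_generation(model, actual_in_len, st, end, output_str):
    # Always report the end-to-end time and the decoded text.
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # With the IPEX_CPU optimization enabled, prompts shorter than 256 tokens
    # fall back to the plain generate() path, which does not set
    # n_token_generated or first_token_time on the model. Only read those
    # attributes when IPEX is disabled or the prompt is long enough.
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

In the scripts themselves this logic is inlined after the timing block rather than factored into a helper, with actual_in_len taken as input_ids.shape[1] immediately after tokenization, as the hunks below show.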
Changed file 1:

@@ -70,6 +70,8 @@ if __name__ == '__main__':
         prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt)
         inputs = tokenizer(prompt, return_tensors='pt', padding=True)
         input_ids = inputs.input_ids.to(model.device)
+        actual_in_len = input_ids.shape[1]
+        print("actual input_ids length:" + str(actual_in_len))
         attention_mask = inputs.attention_mask.to(model.device)
 
         # warmup

@@ -90,7 +92,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they will use the normal generate() and will not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
Changed file 2:

@@ -87,7 +87,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they will use the normal generate() and will not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
Changed file 3:

@@ -16,6 +16,7 @@
 
 import torch
 from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
+
 from transformers import LlamaTokenizer, AutoTokenizer
 import argparse
 import time

@@ -104,7 +105,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they will use the normal generate() and will not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
Changed file 4:

@@ -97,7 +97,14 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they will use the normal generate() and will not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
+
Changed file 5:

@@ -101,7 +101,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they will use the normal generate() and will not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
Changed file 6:

@@ -81,7 +81,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they will use the normal generate() and will not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
Changed file 7:

@@ -79,6 +79,8 @@ if __name__ == '__main__':
         prompt = Vicuna_PROMPT_FORMAT.format(prompt=args.prompt)
         inputs = tokenizer(prompt, return_tensors='pt', padding=True)
         input_ids = inputs.input_ids.to(model.device)
+        actual_in_len = input_ids.shape[1]
+        print("actual input_ids length:" + str(actual_in_len))
         attention_mask = inputs.attention_mask.to(model.device)
 
         # warmup

@@ -97,7 +99,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they will use the normal generate() and will not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
Changed file 8:

@@ -81,7 +81,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they will use the normal generate() and will not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")