Fix no_attr error caused by short prompts with IPEX_CPU speculative decoding (#10783)
This commit is contained in:

parent 7ec82c6042
commit 0646e2c062

8 changed files with 79 additions and 25 deletions
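Background for the change: with IPEX_CPU optimizations enabled, the speculative-decoding path only runs for sufficiently long prompts. As the added comment in the diff states, a prompt shorter than 256 tokens falls back to the normal generate() call, so model.n_token_generated and model.first_token_time are never set and the examples' unconditional print statements fail with an attribute error. The fix records the tokenized prompt length (actual_in_len) and guards those prints with get_enable_ipex(). Below is a minimal, self-contained sketch of that guard; DummyModel, report() and the metric values are made up for illustration, while the condition itself mirrors the one added in the diff.

    # Hypothetical stand-in for the IPEX-optimized model: the metrics only
    # exist when the speculative-decoding path actually ran.
    class DummyModel:
        def __init__(self, speculative: bool):
            if speculative:
                self.n_token_generated = 32        # placeholder value
                self.first_token_time = 0.0123     # placeholder value

    def report(model, enable_ipex: bool, actual_in_len: int) -> None:
        # Same condition as the diff: the attributes are expected unless the
        # IPEX_CPU path received a short prompt (< 256 tokens) and fell back
        # to the normal generate().
        if not enable_ipex or actual_in_len >= 256:
            print(f"Tokens generated {model.n_token_generated}")
            print(f"First token latency {model.first_token_time:.4f}s")
        else:
            print("Short prompt on IPEX_CPU: speculative-decoding metrics unavailable")

    report(DummyModel(speculative=True), enable_ipex=True, actual_in_len=512)   # prints the metrics
    report(DummyModel(speculative=False), enable_ipex=True, actual_in_len=64)   # skips them safely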
				
			
@@ -70,6 +70,8 @@ if __name__ == '__main__':
        prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt)
        inputs = tokenizer(prompt, return_tensors='pt', padding=True)
        input_ids = inputs.input_ids.to(model.device)
        actual_in_len = input_ids.shape[1]
        print("actual input_ids length:" + str(actual_in_len))
        attention_mask = inputs.attention_mask.to(model.device)

        # warmup

@@ -89,8 +91,14 @@ if __name__ == '__main__':
                                do_sample=False)
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        end = time.perf_counter()

        print(output_str)
        print(f"Tokens generated {model.n_token_generated}")

        print(f"E2E Generation time {(end - st):.4f}s")
        print(f"First token latency {model.first_token_time:.4f}s")
        print(output_str)

        # When the IPEX_CPU optimized models receive short prompts (length < 256),
        # they will use the normal generate() and do not have these attributes
        from ipex_llm.transformers.convert import get_enable_ipex
        _enable_ipex = get_enable_ipex()
        if not _enable_ipex or actual_in_len >= 256:
            print(f"Tokens generated {model.n_token_generated}")
            print(f"First token latency {model.first_token_time:.4f}s")

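The same guard is repeated verbatim in each of the remaining example scripts below. As a hedged aside (not what this commit does), an equivalent defensive check could probe for the attributes directly instead of re-deriving the condition from get_enable_ipex() and the prompt length; the fragment reuses the DummyModel stand-in from the sketch above.

    # Alternative sketch, not used by the commit: check for the attributes
    # themselves rather than reconstructing why they might be missing.
    model = DummyModel(speculative=False)
    if hasattr(model, "n_token_generated") and hasattr(model, "first_token_time"):
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")
    else:
        print("Metrics not set; skipping the speculative-decoding report")

The commit's approach instead makes the 256-token threshold explicit in each script, matching the behaviour described in the added comment.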
				
			
@@ -87,7 +87,13 @@ if __name__ == '__main__':
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        end = time.perf_counter()

        print(output_str)
        print(f"Tokens generated {model.n_token_generated}")
        print(f"E2E Generation time {(end - st):.4f}s")
        print(f"First token latency {model.first_token_time:.4f}s")
        print(output_str)

        # When the IPEX_CPU optimized models receive short prompts (length < 256),
        # they will use the normal generate() and do not have these attributes
        from ipex_llm.transformers.convert import get_enable_ipex
        _enable_ipex = get_enable_ipex()
        if not _enable_ipex or actual_in_len >= 256:
            print(f"Tokens generated {model.n_token_generated}")
            print(f"First token latency {model.first_token_time:.4f}s")

@@ -16,6 +16,7 @@

import torch
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM

from transformers import LlamaTokenizer, AutoTokenizer
import argparse
import time

@@ -104,7 +105,13 @@ if __name__ == '__main__':
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        end = time.perf_counter()

        print(output_str)
        print(f"Tokens generated {model.n_token_generated}")
        print(f"E2E Generation time {(end - st):.4f}s")
        print(f"First token latency {model.first_token_time:.4f}s")
        print(output_str)

        # When the IPEX_CPU optimized models receive short prompts (length < 256),
        # they will use the normal generate() and do not have these attributes
        from ipex_llm.transformers.convert import get_enable_ipex
        _enable_ipex = get_enable_ipex()
        if not _enable_ipex or actual_in_len >= 256:
            print(f"Tokens generated {model.n_token_generated}")
            print(f"First token latency {model.first_token_time:.4f}s")

@@ -97,7 +97,14 @@ if __name__ == '__main__':
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        end = time.perf_counter()

        print(output_str)
        print(f"Tokens generated {model.n_token_generated}")
        print(f"E2E Generation time {(end - st):.4f}s")
        print(f"First token latency {model.first_token_time:.4f}s")
        print(output_str)

        # When the IPEX_CPU optimized models receive short prompts (length < 256),
        # they will use the normal generate() and do not have these attributes
        from ipex_llm.transformers.convert import get_enable_ipex
        _enable_ipex = get_enable_ipex()
        if not _enable_ipex or actual_in_len >= 256:
            print(f"Tokens generated {model.n_token_generated}")
            print(f"First token latency {model.first_token_time:.4f}s")

@@ -101,7 +101,13 @@ if __name__ == '__main__':
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        end = time.perf_counter()

        print(output_str)
        print(f"Tokens generated {model.n_token_generated}")
        print(f"E2E Generation time {(end - st):.4f}s")
        print(f"First token latency {model.first_token_time:.4f}s")
        print(output_str)

        # When the IPEX_CPU optimized models receive short prompts (length < 256),
        # they will use the normal generate() and do not have these attributes
        from ipex_llm.transformers.convert import get_enable_ipex
        _enable_ipex = get_enable_ipex()
        if not _enable_ipex or actual_in_len >= 256:
            print(f"Tokens generated {model.n_token_generated}")
            print(f"First token latency {model.first_token_time:.4f}s")

@@ -81,7 +81,13 @@ if __name__ == '__main__':
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        end = time.perf_counter()

        print(output_str)
        print(f"Tokens generated {model.n_token_generated}")
        print(f"E2E Generation time {(end - st):.4f}s")
        print(f"First token latency {model.first_token_time:.4f}s")
        print(output_str)

        # When the IPEX_CPU optimized models receive short prompts (length < 256),
        # they will use the normal generate() and do not have these attributes
        from ipex_llm.transformers.convert import get_enable_ipex
        _enable_ipex = get_enable_ipex()
        if not _enable_ipex or actual_in_len >= 256:
            print(f"Tokens generated {model.n_token_generated}")
            print(f"First token latency {model.first_token_time:.4f}s")

@@ -79,6 +79,8 @@ if __name__ == '__main__':
        prompt = Vicuna_PROMPT_FORMAT.format(prompt=args.prompt)
        inputs = tokenizer(prompt, return_tensors='pt', padding=True)
        input_ids = inputs.input_ids.to(model.device)
        actual_in_len = input_ids.shape[1]
        print("actual input_ids length:" + str(actual_in_len))
        attention_mask = inputs.attention_mask.to(model.device)

        # warmup

@@ -97,7 +99,13 @@ if __name__ == '__main__':
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        end = time.perf_counter()

        print(output_str)
        print(f"Tokens generated {model.n_token_generated}")
        print(f"E2E Generation time {(end - st):.4f}s")
        print(f"First token latency {model.first_token_time:.4f}s")
        print(output_str)

        # When the IPEX_CPU optimized models receive short prompts (length < 256),
        # they will use the normal generate() and do not have these attributes
        from ipex_llm.transformers.convert import get_enable_ipex
        _enable_ipex = get_enable_ipex()
        if not _enable_ipex or actual_in_len >= 256:
            print(f"Tokens generated {model.n_token_generated}")
            print(f"First token latency {model.first_token_time:.4f}s")

@@ -81,7 +81,13 @@ if __name__ == '__main__':
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        end = time.perf_counter()

        print(output_str)
        print(f"Tokens generated {model.n_token_generated}")
        print(f"E2E Generation time {(end - st):.4f}s")
        print(f"First token latency {model.first_token_time:.4f}s")
        print(output_str)

        # When the IPEX_CPU optimized models receive short prompts (length < 256),
        # they will use the normal generate() and do not have these attributes
        from ipex_llm.transformers.convert import get_enable_ipex
        _enable_ipex = get_enable_ipex()
        if not _enable_ipex or actual_in_len >= 256:
            print(f"Tokens generated {model.n_token_generated}")
            print(f"First token latency {model.first_token_time:.4f}s")