Fix short prompts causing a no_attr error in IPEX_CPU speculative decoding (#10783)

ZehuaCao 2024-04-17 16:19:57 +08:00 committed by GitHub
parent 7ec82c6042
commit 0646e2c062
8 changed files with 79 additions and 25 deletions
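The same fix is applied in each example script below: when an IPEX_CPU optimized model receives a short prompt (fewer than 256 tokens), it falls back to the normal generate() path instead of speculative decoding, so the model never gains the n_token_generated and first_token_time attributes and printing them raised an attribute error. Each script now records the tokenized prompt length up front and prints those two stats only when they are guaranteed to exist. A minimal sketch of the guarded output block, assuming the surrounding example has already produced input_ids, output, st, end, tokenizer, and model:

    from ipex_llm.transformers.convert import get_enable_ipex

    # Length of the tokenized prompt, measured before generation
    actual_in_len = input_ids.shape[1]

    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # n_token_generated and first_token_time are only set when speculative
    # decoding actually ran: either IPEX is not enabled at all, or the prompt
    # is long enough (>= 256 tokens) for the IPEX_CPU optimized path
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")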

View file

@@ -70,6 +70,8 @@ if __name__ == '__main__':
prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt)
inputs = tokenizer(prompt, return_tensors='pt', padding=True)
input_ids = inputs.input_ids.to(model.device)
actual_in_len = input_ids.shape[1]
print("actual input_ids length:" + str(actual_in_len))
attention_mask = inputs.attention_mask.to(model.device)
# warmup
@@ -89,8 +91,14 @@ if __name__ == '__main__':
do_sample=False)
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
end = time.perf_counter()
print(output_str)
print(f"Tokens generated {model.n_token_generated}")
print(f"E2E Generation time {(end - st):.4f}s")
print(f"First token latency {model.first_token_time:.4f}s")
print(output_str)
# When the IPEX_CPU optimized model receives a short prompt (length < 256),
# it falls back to the normal generate() and these attributes are not set
from ipex_llm.transformers.convert import get_enable_ipex
_enable_ipex = get_enable_ipex()
if not _enable_ipex or actual_in_len >= 256:
    print(f"Tokens generated {model.n_token_generated}")
    print(f"First token latency {model.first_token_time:.4f}s")

View file

@@ -87,7 +87,13 @@ if __name__ == '__main__':
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
end = time.perf_counter()
print(output_str)
print(f"Tokens generated {model.n_token_generated}")
print(f"E2E Generation time {(end - st):.4f}s")
print(f"First token latency {model.first_token_time:.4f}s")
print(output_str)
# When the IPEX_CPU optimized model receives a short prompt (length < 256),
# it falls back to the normal generate() and these attributes are not set
from ipex_llm.transformers.convert import get_enable_ipex
_enable_ipex = get_enable_ipex()
if not _enable_ipex or actual_in_len >= 256:
    print(f"Tokens generated {model.n_token_generated}")
    print(f"First token latency {model.first_token_time:.4f}s")

View file

@@ -16,6 +16,7 @@
import torch
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
from transformers import LlamaTokenizer, AutoTokenizer
import argparse
import time
@@ -104,7 +105,13 @@ if __name__ == '__main__':
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
end = time.perf_counter()
print(output_str)
print(f"Tokens generated {model.n_token_generated}")
print(f"E2E Generation time {(end - st):.4f}s")
print(f"First token latency {model.first_token_time:.4f}s")
print(output_str)
# When the IPEX_CPU optimized model receives a short prompt (length < 256),
# it falls back to the normal generate() and these attributes are not set
from ipex_llm.transformers.convert import get_enable_ipex
_enable_ipex = get_enable_ipex()
if not _enable_ipex or actual_in_len >= 256:
    print(f"Tokens generated {model.n_token_generated}")
    print(f"First token latency {model.first_token_time:.4f}s")

View file

@@ -97,7 +97,14 @@ if __name__ == '__main__':
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
end = time.perf_counter()
print(output_str)
print(f"Tokens generated {model.n_token_generated}")
print(f"E2E Generation time {(end - st):.4f}s")
print(f"First token latency {model.first_token_time:.4f}s")
print(output_str)
# When the IPEX_CPU optimized model receives a short prompt (length < 256),
# it falls back to the normal generate() and these attributes are not set
from ipex_llm.transformers.convert import get_enable_ipex
_enable_ipex = get_enable_ipex()
if not _enable_ipex or actual_in_len >= 256:
    print(f"Tokens generated {model.n_token_generated}")
    print(f"First token latency {model.first_token_time:.4f}s")

View file

@@ -101,7 +101,13 @@ if __name__ == '__main__':
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
end = time.perf_counter()
print(output_str)
print(f"Tokens generated {model.n_token_generated}")
print(f"E2E Generation time {(end - st):.4f}s")
print(f"First token latency {model.first_token_time:.4f}s")
print(output_str)
# When the IPEX_CPU optimized model receives a short prompt (length < 256),
# it falls back to the normal generate() and these attributes are not set
from ipex_llm.transformers.convert import get_enable_ipex
_enable_ipex = get_enable_ipex()
if not _enable_ipex or actual_in_len >= 256:
    print(f"Tokens generated {model.n_token_generated}")
    print(f"First token latency {model.first_token_time:.4f}s")

View file

@@ -81,7 +81,13 @@ if __name__ == '__main__':
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
end = time.perf_counter()
print(output_str)
print(f"Tokens generated {model.n_token_generated}")
print(f"E2E Generation time {(end - st):.4f}s")
print(f"First token latency {model.first_token_time:.4f}s")
print(output_str)
# When the IPEX_CPU optimized model receives a short prompt (length < 256),
# it falls back to the normal generate() and these attributes are not set
from ipex_llm.transformers.convert import get_enable_ipex
_enable_ipex = get_enable_ipex()
if not _enable_ipex or actual_in_len >= 256:
    print(f"Tokens generated {model.n_token_generated}")
    print(f"First token latency {model.first_token_time:.4f}s")

View file

@@ -79,6 +79,8 @@ if __name__ == '__main__':
prompt = Vicuna_PROMPT_FORMAT.format(prompt=args.prompt)
inputs = tokenizer(prompt, return_tensors='pt', padding=True)
input_ids = inputs.input_ids.to(model.device)
actual_in_len = input_ids.shape[1]
print("actual input_ids length:" + str(actual_in_len))
attention_mask = inputs.attention_mask.to(model.device)
# warmup
@@ -97,7 +99,13 @@ if __name__ == '__main__':
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
end = time.perf_counter()
print(output_str)
print(f"Tokens generated {model.n_token_generated}")
print(f"E2E Generation time {(end - st):.4f}s")
print(f"First token latency {model.first_token_time:.4f}s")
print(output_str)
# When the IPEX_CPU optimized model receives a short prompt (length < 256),
# it falls back to the normal generate() and these attributes are not set
from ipex_llm.transformers.convert import get_enable_ipex
_enable_ipex = get_enable_ipex()
if not _enable_ipex or actual_in_len >= 256:
    print(f"Tokens generated {model.n_token_generated}")
    print(f"First token latency {model.first_token_time:.4f}s")

View file

@@ -81,7 +81,13 @@ if __name__ == '__main__':
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
end = time.perf_counter()
print(output_str)
print(f"Tokens generated {model.n_token_generated}")
print(f"E2E Generation time {(end - st):.4f}s")
print(f"First token latency {model.first_token_time:.4f}s")
print(output_str)
# When the IPEX_CPU optimized model receives a short prompt (length < 256),
# it falls back to the normal generate() and these attributes are not set
from ipex_llm.transformers.convert import get_enable_ipex
_enable_ipex = get_enable_ipex()
if not _enable_ipex or actual_in_len >= 256:
    print(f"Tokens generated {model.n_token_generated}")
    print(f"First token latency {model.first_token_time:.4f}s")