Fix no_attr error caused by short prompts in IPEX_CPU speculative decoding (#10783)
commit 0646e2c062
parent 7ec82c6042
8 changed files with 79 additions and 25 deletions
@@ -70,6 +70,8 @@ if __name__ == '__main__':
     prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt)
     inputs = tokenizer(prompt, return_tensors='pt', padding=True)
     input_ids = inputs.input_ids.to(model.device)
+    actual_in_len = input_ids.shape[1]
+    print("actual input_ids length:" + str(actual_in_len))
     attention_mask = inputs.attention_mask.to(model.device)

     # warmup
@@ -89,8 +91,14 @@ if __name__ == '__main__':
                             do_sample=False)
     output_str = tokenizer.decode(output[0], skip_special_tokens=True)
     end = time.perf_counter()
-    print(output_str)
-    print(f"Tokens generated {model.n_token_generated}")
     print(f"E2E Generation time {(end - st):.4f}s")
-    print(f"First token latency {model.first_token_time:.4f}s")
+    print(output_str)
+    # When the IPEX_CPU optimized models receive short prompts (length < 256),
+    # they fall back to the normal generate() and do not have these attributes
+    from ipex_llm.transformers.convert import get_enable_ipex
+    _enable_ipex = get_enable_ipex()
+    if not _enable_ipex or actual_in_len >= 256:
+        print(f"Tokens generated {model.n_token_generated}")
+        print(f"First token latency {model.first_token_time:.4f}s")
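Assembled for readability, the pattern every example script follows after this commit looks roughly like the sketch below. It is a minimal sketch, not a verbatim excerpt: the model and tokenizer setup, the warmup, and the full generate() arguments (e.g. max_new_tokens) are elided, while the variable names and the guard itself come from the hunks above.

import time

from ipex_llm.transformers.convert import get_enable_ipex

# `model`, `tokenizer`, and `prompt` are assumed to be prepared as in the
# example scripts (AutoModelForCausalLM / AutoTokenizer with speculative
# decoding enabled).
inputs = tokenizer(prompt, return_tensors='pt', padding=True)
input_ids = inputs.input_ids.to(model.device)
actual_in_len = input_ids.shape[1]              # prompt length in tokens
print("actual input_ids length:" + str(actual_in_len))
attention_mask = inputs.attention_mask.to(model.device)

st = time.perf_counter()
output = model.generate(input_ids,              # generate() arguments abbreviated
                        attention_mask=attention_mask,
                        do_sample=False)
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
end = time.perf_counter()

print(f"E2E Generation time {(end - st):.4f}s")
print(output_str)

# model.n_token_generated and model.first_token_time are set only when the
# speculative path runs. With IPEX_CPU optimization enabled, prompts shorter
# than 256 tokens fall back to the normal generate(), so printing the stats
# unconditionally raised the attribute error this commit fixes.
_enable_ipex = get_enable_ipex()
if not _enable_ipex or actual_in_len >= 256:
    print(f"Tokens generated {model.n_token_generated}")
    print(f"First token latency {model.first_token_time:.4f}s")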
@@ -87,7 +87,13 @@ if __name__ == '__main__':
     output_str = tokenizer.decode(output[0], skip_special_tokens=True)
     end = time.perf_counter()
-    print(output_str)
-    print(f"Tokens generated {model.n_token_generated}")
     print(f"E2E Generation time {(end - st):.4f}s")
-    print(f"First token latency {model.first_token_time:.4f}s")
+    print(output_str)
+    # When the IPEX_CPU optimized models receive short prompts (length < 256),
+    # they fall back to the normal generate() and do not have these attributes
+    from ipex_llm.transformers.convert import get_enable_ipex
+    _enable_ipex = get_enable_ipex()
+    if not _enable_ipex or actual_in_len >= 256:
+        print(f"Tokens generated {model.n_token_generated}")
+        print(f"First token latency {model.first_token_time:.4f}s")
@@ -16,6 +16,7 @@
 import torch
 from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
+
 from transformers import LlamaTokenizer, AutoTokenizer
 import argparse
 import time
@@ -104,7 +105,13 @@ if __name__ == '__main__':
     output_str = tokenizer.decode(output[0], skip_special_tokens=True)
     end = time.perf_counter()
-    print(output_str)
-    print(f"Tokens generated {model.n_token_generated}")
     print(f"E2E Generation time {(end - st):.4f}s")
-    print(f"First token latency {model.first_token_time:.4f}s")
+    print(output_str)
+    # When the IPEX_CPU optimized models receive short prompts (length < 256),
+    # they fall back to the normal generate() and do not have these attributes
+    from ipex_llm.transformers.convert import get_enable_ipex
+    _enable_ipex = get_enable_ipex()
+    if not _enable_ipex or actual_in_len >= 256:
+        print(f"Tokens generated {model.n_token_generated}")
+        print(f"First token latency {model.first_token_time:.4f}s")
@@ -97,7 +97,14 @@ if __name__ == '__main__':
     output_str = tokenizer.decode(output[0], skip_special_tokens=True)
     end = time.perf_counter()
-    print(output_str)
-    print(f"Tokens generated {model.n_token_generated}")
     print(f"E2E Generation time {(end - st):.4f}s")
-    print(f"First token latency {model.first_token_time:.4f}s")
+    print(output_str)
+    # When the IPEX_CPU optimized models receive short prompts (length < 256),
+    # they fall back to the normal generate() and do not have these attributes
+    from ipex_llm.transformers.convert import get_enable_ipex
+    _enable_ipex = get_enable_ipex()
+    if not _enable_ipex or actual_in_len >= 256:
+        print(f"Tokens generated {model.n_token_generated}")
+        print(f"First token latency {model.first_token_time:.4f}s")
@@ -101,7 +101,13 @@ if __name__ == '__main__':
     output_str = tokenizer.decode(output[0], skip_special_tokens=True)
     end = time.perf_counter()
-    print(output_str)
-    print(f"Tokens generated {model.n_token_generated}")
     print(f"E2E Generation time {(end - st):.4f}s")
-    print(f"First token latency {model.first_token_time:.4f}s")
+    print(output_str)
+    # When the IPEX_CPU optimized models receive short prompts (length < 256),
+    # they fall back to the normal generate() and do not have these attributes
+    from ipex_llm.transformers.convert import get_enable_ipex
+    _enable_ipex = get_enable_ipex()
+    if not _enable_ipex or actual_in_len >= 256:
+        print(f"Tokens generated {model.n_token_generated}")
+        print(f"First token latency {model.first_token_time:.4f}s")
@@ -81,7 +81,13 @@ if __name__ == '__main__':
     output_str = tokenizer.decode(output[0], skip_special_tokens=True)
     end = time.perf_counter()
-    print(output_str)
-    print(f"Tokens generated {model.n_token_generated}")
     print(f"E2E Generation time {(end - st):.4f}s")
-    print(f"First token latency {model.first_token_time:.4f}s")
+    print(output_str)
+    # When the IPEX_CPU optimized models receive short prompts (length < 256),
+    # they fall back to the normal generate() and do not have these attributes
+    from ipex_llm.transformers.convert import get_enable_ipex
+    _enable_ipex = get_enable_ipex()
+    if not _enable_ipex or actual_in_len >= 256:
+        print(f"Tokens generated {model.n_token_generated}")
+        print(f"First token latency {model.first_token_time:.4f}s")
@@ -79,6 +79,8 @@ if __name__ == '__main__':
     prompt = Vicuna_PROMPT_FORMAT.format(prompt=args.prompt)
     inputs = tokenizer(prompt, return_tensors='pt', padding=True)
     input_ids = inputs.input_ids.to(model.device)
+    actual_in_len = input_ids.shape[1]
+    print("actual input_ids length:" + str(actual_in_len))
     attention_mask = inputs.attention_mask.to(model.device)

     # warmup
@@ -97,7 +99,13 @@ if __name__ == '__main__':
     output_str = tokenizer.decode(output[0], skip_special_tokens=True)
     end = time.perf_counter()
-    print(output_str)
-    print(f"Tokens generated {model.n_token_generated}")
     print(f"E2E Generation time {(end - st):.4f}s")
-    print(f"First token latency {model.first_token_time:.4f}s")
+    print(output_str)
+    # When the IPEX_CPU optimized models receive short prompts (length < 256),
+    # they fall back to the normal generate() and do not have these attributes
+    from ipex_llm.transformers.convert import get_enable_ipex
+    _enable_ipex = get_enable_ipex()
+    if not _enable_ipex or actual_in_len >= 256:
+        print(f"Tokens generated {model.n_token_generated}")
+        print(f"First token latency {model.first_token_time:.4f}s")
@@ -81,7 +81,13 @@ if __name__ == '__main__':
     output_str = tokenizer.decode(output[0], skip_special_tokens=True)
     end = time.perf_counter()
-    print(output_str)
-    print(f"Tokens generated {model.n_token_generated}")
     print(f"E2E Generation time {(end - st):.4f}s")
-    print(f"First token latency {model.first_token_time:.4f}s")
+    print(output_str)
+    # When the IPEX_CPU optimized models receive short prompts (length < 256),
+    # they fall back to the normal generate() and do not have these attributes
+    from ipex_llm.transformers.convert import get_enable_ipex
+    _enable_ipex = get_enable_ipex()
+    if not _enable_ipex or actual_in_len >= 256:
+        print(f"Tokens generated {model.n_token_generated}")
+        print(f"First token latency {model.first_token_time:.4f}s")
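The commit repeats, in each example, the condition under which ipex-llm takes the IPEX_CPU speculative path (per the added comments: only when IPEX optimization is enabled and the prompt is at least 256 tokens can the fallback to the normal generate() be ruled out). A more defensive variant, shown here only as a hypothetical alternative and not what the commit does, would test for the attributes directly:

# Hypothetical alternative guard: print the speculative-decoding stats only if
# the attributes were actually set, without re-deriving the IPEX/length check.
if hasattr(model, "n_token_generated") and hasattr(model, "first_token_time"):
    print(f"Tokens generated {model.n_token_generated}")
    print(f"First token latency {model.first_token_time:.4f}s")

The explicit check used in the commit has the advantage of documenting, in each example script, exactly when and why the fallback happens.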