Fix no_attr error caused by short prompts in IPEX_CPU speculative decoding (#10783)
This commit is contained in: parent 7ec82c6042, commit 0646e2c062
8 changed files with 79 additions and 25 deletions
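
The change is the same in each of the eight CPU speculative-decoding example scripts: record the tokenized prompt length as actual_in_len, and only read model.n_token_generated and model.first_token_time when they are guaranteed to exist. Below is a minimal sketch of that guard for reference; report_speculative_metrics and min_spec_len are illustrative names introduced here (not part of ipex-llm), and the model object is assumed to come from one of these example scripts.

# Minimal sketch of the guard added in this commit (illustrative helper name).
from ipex_llm.transformers.convert import get_enable_ipex

def report_speculative_metrics(model, actual_in_len, min_spec_len=256):
    # The speculative-decoding attributes are only missing when IPEX_CPU
    # optimization is active AND the prompt is shorter than min_spec_len
    # tokens, because that combination falls back to the plain generate()
    # path; in every other case the metrics can be printed safely.
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= min_spec_len:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")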

@@ -70,6 +70,8 @@ if __name__ == '__main__':
    prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt)
    inputs = tokenizer(prompt, return_tensors='pt', padding=True)
    input_ids = inputs.input_ids.to(model.device)
    actual_in_len = input_ids.shape[1]
    print("actual input_ids length:" + str(actual_in_len))
    attention_mask = inputs.attention_mask.to(model.device)

    # warmup

@@ -90,7 +92,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -87,7 +87,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -16,6 +16,7 @@

import torch
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM

from transformers import LlamaTokenizer, AutoTokenizer
import argparse
import time

@@ -104,7 +105,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -97,7 +97,14 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -101,7 +101,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -81,7 +81,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -79,6 +79,8 @@ if __name__ == '__main__':
    prompt = Vicuna_PROMPT_FORMAT.format(prompt=args.prompt)
    inputs = tokenizer(prompt, return_tensors='pt', padding=True)
    input_ids = inputs.input_ids.to(model.device)
    actual_in_len = input_ids.shape[1]
    print("actual input_ids length:" + str(actual_in_len))
    attention_mask = inputs.attention_mask.to(model.device)

    # warmup

@@ -97,7 +99,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -81,7 +81,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")