From 0646e2c0622121bc7563bcfca453066fdd90d502 Mon Sep 17 00:00:00 2001
From: ZehuaCao <47251317+Romanticoseu@users.noreply.github.com>
Date: Wed, 17 Apr 2024 16:19:57 +0800
Subject: [PATCH] Fix no_attr error caused by short prompts in IPEX_CPU
 speculative decoding (#10783)

---
 .../baichuan2/speculative.py                      | 16 ++++++++++++----
 .../Speculative-Decoding/chatglm3/speculative.py  | 12 +++++++++---
 .../Speculative-Decoding/llama2/speculative.py    | 13 ++++++++++---
 .../Speculative-Decoding/mistral/speculative.py   | 13 ++++++++++---
 .../CPU/Speculative-Decoding/qwen/speculative.py  | 12 +++++++++---
 .../starcoder/speculative.py                      | 12 +++++++++---
 .../Speculative-Decoding/vicuna/speculative.py    | 14 +++++++++++---
 .../CPU/Speculative-Decoding/ziya/speculative.py  | 12 +++++++++---
 8 files changed, 79 insertions(+), 25 deletions(-)

diff --git a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
index 1010618c..84cb9112 100644
--- a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
@@ -70,6 +70,8 @@ if __name__ == '__main__':
         prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt)
         inputs = tokenizer(prompt, return_tensors='pt', padding=True)
         input_ids = inputs.input_ids.to(model.device)
+        actual_in_len = input_ids.shape[1]
+        print("actual input_ids length:" + str(actual_in_len))
         attention_mask = inputs.attention_mask.to(model.device)
 
         # warmup
@@ -89,8 +91,14 @@ if __name__ == '__main__':
                                do_sample=False)
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
-
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
+
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
index 971e60e6..5ec9a67c 100644
--- a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
@@ -87,7 +87,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
index 5e3c5f8b..f870a094 100644
--- a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
@@ -16,6 +16,7 @@
 import torch
 from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
+
 from transformers import LlamaTokenizer, AutoTokenizer
 import argparse
 import time
@@ -104,7 +105,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
index 714eb430..1968ccaa 100644
--- a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
@@ -97,7 +97,14 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
+
diff --git a/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py b/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py
index c92b8512..81205fbd 100644
--- a/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py
@@ -101,7 +101,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
index 0bcd026e..c35b0b65 100644
--- a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
@@ -81,7 +81,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
index 279f3550..73970f67 100644
--- a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
@@ -79,6 +79,8 @@ if __name__ == '__main__':
         prompt = Vicuna_PROMPT_FORMAT.format(prompt=args.prompt)
         inputs = tokenizer(prompt, return_tensors='pt', padding=True)
         input_ids = inputs.input_ids.to(model.device)
+        actual_in_len = input_ids.shape[1]
+        print("actual input_ids length:" + str(actual_in_len))
         attention_mask = inputs.attention_mask.to(model.device)
 
         # warmup
@@ -97,7 +99,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
index 6db383c4..a8a82474 100644
--- a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
@@ -81,7 +81,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
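
The guard added above is repeated verbatim in all eight examples. Below is a minimal sketch of the same idea factored into a reusable helper; it is illustrative only, not part of the patch. It assumes `model` is an ipex-llm speculative-decoding model loaded as in these examples and that `actual_in_len` is the tokenized prompt length. `get_enable_ipex`, the 256-token threshold, and the `n_token_generated`/`first_token_time` attributes come from the patch itself, while the helper name `print_speculative_stats` and the extra `hasattr` checks are assumptions.

    # Minimal sketch (not part of the patch): the guard from the patch, as a helper.
    from ipex_llm.transformers.convert import get_enable_ipex


    def print_speculative_stats(model, actual_in_len, min_prompt_len=256):
        """Print speculative-decoding stats only when they are expected to exist.

        With the IPEX_CPU optimization enabled, prompts shorter than
        `min_prompt_len` fall back to the plain generate() path, which never
        records `n_token_generated` or `first_token_time`.
        """
        if get_enable_ipex() and actual_in_len < min_prompt_len:
            return  # short-prompt fallback: the stats were never recorded
        # Mirrors `if not _enable_ipex or actual_in_len >= 256` in the patch;
        # hasattr() is an extra, assumed safety net rather than patch behaviour.
        if hasattr(model, "n_token_generated"):
            print(f"Tokens generated {model.n_token_generated}")
        if hasattr(model, "first_token_time"):
            print(f"First token latency {model.first_token_time:.4f}s")


    # Example usage after generation, e.g. in one of the examples above:
    # print_speculative_stats(model, input_ids.shape[1])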