diff --git a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
index 1010618c..84cb9112 100644
--- a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
@@ -70,6 +70,8 @@ if __name__ == '__main__':
         prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt)
         inputs = tokenizer(prompt, return_tensors='pt', padding=True)
         input_ids = inputs.input_ids.to(model.device)
+        actual_in_len = input_ids.shape[1]
+        print("actual input_ids length:" + str(actual_in_len))
         attention_mask = inputs.attention_mask.to(model.device)
 
         # warmup
@@ -89,8 +91,14 @@ if __name__ == '__main__':
                                 do_sample=False)
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
-
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
+        print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU-optimized models receive short prompts (length < 256),
+        # they use the normal generate() and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
index 971e60e6..5ec9a67c 100644
--- a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
@@ -87,7 +87,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU-optimized models receive short prompts (length < 256),
+        # they use the normal generate() and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
index 5e3c5f8b..f870a094 100644
--- a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
@@ -16,6 +16,7 @@
 import torch
 from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
+
 from transformers import LlamaTokenizer, AutoTokenizer
 import argparse
 import time
@@ -104,7 +105,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU-optimized models receive short prompts (length < 256),
+        # they use the normal generate() and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
index 714eb430..1968ccaa 100644
--- a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
@@ -97,7 +97,14 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU-optimized models receive short prompts (length < 256),
+        # they use the normal generate() and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
+
diff --git a/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py b/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py
index c92b8512..81205fbd 100644
--- a/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py
@@ -101,7 +101,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU-optimized models receive short prompts (length < 256),
+        # they use the normal generate() and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
index 0bcd026e..c35b0b65 100644
--- a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
@@ -81,7 +81,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU-optimized models receive short prompts (length < 256),
+        # they use the normal generate() and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
index 279f3550..73970f67 100644
--- a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
@@ -79,6 +79,8 @@ if __name__ == '__main__':
         prompt = Vicuna_PROMPT_FORMAT.format(prompt=args.prompt)
         inputs = tokenizer(prompt, return_tensors='pt', padding=True)
         input_ids = inputs.input_ids.to(model.device)
+        actual_in_len = input_ids.shape[1]
+        print("actual input_ids length:" + str(actual_in_len))
         attention_mask = inputs.attention_mask.to(model.device)
 
         # warmup
@@ -97,7 +99,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU-optimized models receive short prompts (length < 256),
+        # they use the normal generate() and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
index 6db383c4..a8a82474 100644
--- a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
@@ -81,7 +81,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU-optimized models receive short prompts (length < 256),
+        # they use the normal generate() and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
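
Every file above applies the same change, so a single consolidated sketch may be easier to review than the eight hunks. The snippet below illustrates the guarded-metrics pattern the diff introduces; the checkpoint path, prompt, and from_pretrained flags are assumptions modeled on these example scripts, not values taken from the diff.

# Minimal sketch (assumed setup, not part of the diff): run generation the way
# the example scripts do, then only read the speculative-only attributes when
# they are guaranteed to exist.
import time
import torch
from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers.convert import get_enable_ipex
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             optimize_model=True,
                                             torch_dtype=torch.bfloat16,
                                             load_in_low_bit="bf16",
                                             speculative=True,
                                             trust_remote_code=True,
                                             use_cache=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

with torch.inference_mode():
    inputs = tokenizer("What is AI?", return_tensors="pt")
    input_ids = inputs.input_ids.to(model.device)
    actual_in_len = input_ids.shape[1]
    print("actual input_ids length:" + str(actual_in_len))

    st = time.perf_counter()
    output = model.generate(input_ids,
                            max_new_tokens=128,
                            do_sample=False)
    end = time.perf_counter()

    print(f"E2E Generation time {(end - st):.4f}s")
    print(tokenizer.decode(output[0], skip_special_tokens=True))

    # n_token_generated and first_token_time are only populated by the
    # speculative path; with IPEX enabled and a short prompt (< 256 tokens)
    # the model falls back to plain generate(), so skip these prints then.
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")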