From 0646e2c0622121bc7563bcfca453066fdd90d502 Mon Sep 17 00:00:00 2001
From: ZehuaCao <47251317+Romanticoseu@users.noreply.github.com>
Date: Wed, 17 Apr 2024 16:19:57 +0800
Subject: [PATCH] Fix no_attr error caused by short prompts in IPEX_CPU
 speculative decoding (#10783)

---
 .../baichuan2/speculative.py                      | 16 ++++++++++++----
 .../Speculative-Decoding/chatglm3/speculative.py  | 12 +++++++++---
 .../Speculative-Decoding/llama2/speculative.py    | 13 ++++++++++---
 .../Speculative-Decoding/mistral/speculative.py   | 13 ++++++++++---
 .../CPU/Speculative-Decoding/qwen/speculative.py  | 12 +++++++++---
 .../starcoder/speculative.py                      | 12 +++++++++---
 .../Speculative-Decoding/vicuna/speculative.py    | 14 +++++++++++---
 .../CPU/Speculative-Decoding/ziya/speculative.py  | 12 +++++++++---
 8 files changed, 79 insertions(+), 25 deletions(-)

diff --git a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
index 1010618c..84cb9112 100644
--- a/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/baichuan2/speculative.py
@@ -70,6 +70,8 @@ if __name__ == '__main__':
         prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt)
         inputs = tokenizer(prompt, return_tensors='pt', padding=True)
         input_ids = inputs.input_ids.to(model.device)
+        actual_in_len = input_ids.shape[1]
+        print("actual input_ids length:" + str(actual_in_len))
         attention_mask = inputs.attention_mask.to(model.device)
 
         # warmup
@@ -89,8 +91,14 @@ if __name__ == '__main__':
                                do_sample=False)
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
-
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
+
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
index 971e60e6..5ec9a67c 100644
--- a/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/chatglm3/speculative.py
@@ -87,7 +87,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
index 5e3c5f8b..f870a094 100644
--- a/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/llama2/speculative.py
@@ -16,6 +16,7 @@
 import torch
 from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
+
 from transformers import LlamaTokenizer, AutoTokenizer
 import argparse
 import time
@@ -104,7 +105,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
index 714eb430..1968ccaa 100644
--- a/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/mistral/speculative.py
@@ -97,7 +97,14 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
+
diff --git a/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py b/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py
index c92b8512..81205fbd 100644
--- a/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/qwen/speculative.py
@@ -101,7 +101,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
index 0bcd026e..c35b0b65 100644
--- a/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/starcoder/speculative.py
@@ -81,7 +81,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
index 279f3550..73970f67 100644
--- a/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/vicuna/speculative.py
@@ -79,6 +79,8 @@ if __name__ == '__main__':
         prompt = Vicuna_PROMPT_FORMAT.format(prompt=args.prompt)
         inputs = tokenizer(prompt, return_tensors='pt', padding=True)
         input_ids = inputs.input_ids.to(model.device)
+        actual_in_len = input_ids.shape[1]
+        print("actual input_ids length:" + str(actual_in_len))
         attention_mask = inputs.attention_mask.to(model.device)
 
         # warmup
@@ -97,7 +99,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
diff --git a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
index 6db383c4..a8a82474 100644
--- a/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
+++ b/python/llm/example/CPU/Speculative-Decoding/ziya/speculative.py
@@ -81,7 +81,13 @@ if __name__ == '__main__':
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         end = time.perf_counter()
 
-        print(output_str)
-        print(f"Tokens generated {model.n_token_generated}")
         print(f"E2E Generation time {(end - st):.4f}s")
-        print(f"First token latency {model.first_token_time:.4f}s")
+        print(output_str)
+
+        # When the IPEX_CPU optimized models receive short prompts (length < 256),
+        # they use the normal generate() path and do not have these attributes
+        from ipex_llm.transformers.convert import get_enable_ipex
+        _enable_ipex = get_enable_ipex()
+        if not _enable_ipex or actual_in_len >= 256:
+            print(f"Tokens generated {model.n_token_generated}")
+            print(f"First token latency {model.first_token_time:.4f}s")
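
The guard added above is repeated verbatim in all eight examples. Below is a minimal sketch of the same idea factored into a reusable helper; it is illustrative only, not part of the patch. It assumes `model` is an ipex-llm speculative-decoding model loaded as in these examples and that `actual_in_len` is the tokenized prompt length. `get_enable_ipex`, the 256-token threshold, and the `n_token_generated`/`first_token_time` attributes come from the patch itself, while the helper name `print_speculative_stats` and the extra `hasattr` checks are assumptions.

    # Minimal sketch (not part of the patch): the guard from the patch, as a helper.
    from ipex_llm.transformers.convert import get_enable_ipex


    def print_speculative_stats(model, actual_in_len, min_prompt_len=256):
        """Print speculative-decoding stats only when they are expected to exist.

        With the IPEX_CPU optimization enabled, prompts shorter than
        `min_prompt_len` fall back to the plain generate() path, which never
        records `n_token_generated` or `first_token_time`.
        """
        if get_enable_ipex() and actual_in_len < min_prompt_len:
            return  # short-prompt fallback: the stats were never recorded
        # Mirrors `if not _enable_ipex or actual_in_len >= 256` in the patch;
        # hasattr() is an extra, assumed safety net rather than patch behaviour.
        if hasattr(model, "n_token_generated"):
            print(f"Tokens generated {model.n_token_generated}")
        if hasattr(model, "first_token_time"):
            print(f"First token latency {model.first_token_time:.4f}s")


    # Example usage after generation, e.g. in one of the examples above:
    # print_speculative_stats(model, input_ids.shape[1])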