Fix no_attr error caused by short prompts in IPEX_CPU speculative decoding (#10783)
This commit is contained in: parent 7ec82c6042, commit 0646e2c062
8 changed files with 79 additions and 25 deletions
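
The change is the same in each of the eight CPU speculative-decoding example scripts: record the tokenized prompt length as actual_in_len, and only read model.n_token_generated and model.first_token_time when they are guaranteed to exist. Below is a minimal sketch of that guard for reference; report_speculative_metrics and min_spec_len are illustrative names introduced here (not part of ipex-llm), and the model object is assumed to come from one of these example scripts.

# Minimal sketch of the guard added in this commit (illustrative helper name).
from ipex_llm.transformers.convert import get_enable_ipex

def report_speculative_metrics(model, actual_in_len, min_spec_len=256):
    # The speculative-decoding attributes are only missing when IPEX_CPU
    # optimization is active AND the prompt is shorter than min_spec_len
    # tokens, because that combination falls back to the plain generate()
    # path; in every other case the metrics can be printed safely.
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= min_spec_len:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")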

@@ -70,6 +70,8 @@ if __name__ == '__main__':
    prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=args.prompt)
    inputs = tokenizer(prompt, return_tensors='pt', padding=True)
    input_ids = inputs.input_ids.to(model.device)
    actual_in_len = input_ids.shape[1]
    print("actual input_ids length:" + str(actual_in_len))
    attention_mask = inputs.attention_mask.to(model.device)

    # warmup

@@ -90,7 +92,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -87,7 +87,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -16,6 +16,7 @@

import torch
from ipex_llm.transformers import AutoModel, AutoModelForCausalLM

from transformers import LlamaTokenizer, AutoTokenizer
import argparse
import time

@@ -104,7 +105,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -97,7 +97,14 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -101,7 +101,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -81,7 +81,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -79,6 +79,8 @@ if __name__ == '__main__':
    prompt = Vicuna_PROMPT_FORMAT.format(prompt=args.prompt)
    inputs = tokenizer(prompt, return_tensors='pt', padding=True)
    input_ids = inputs.input_ids.to(model.device)
    actual_in_len = input_ids.shape[1]
    print("actual input_ids length:" + str(actual_in_len))
    attention_mask = inputs.attention_mask.to(model.device)

    # warmup

@@ -97,7 +99,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")

@@ -81,7 +81,13 @@ if __name__ == '__main__':
    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
    end = time.perf_counter()

    print(output_str)
    print(f"Tokens generated {model.n_token_generated}")
    print(f"E2E Generation time {(end - st):.4f}s")
    print(output_str)

    # When the IPEX_CPU optimized models receive short prompts (length < 256),
    # they use the normal generate() path and do not have these attributes
    from ipex_llm.transformers.convert import get_enable_ipex
    _enable_ipex = get_enable_ipex()
    if not _enable_ipex or actual_in_len >= 256:
        print(f"Tokens generated {model.n_token_generated}")
        print(f"First token latency {model.first_token_time:.4f}s")