From 056bb5b0fab54fcf7ab2c1c6aa6562a1d79dd124 Mon Sep 17 00:00:00 2001
From: YaoyaoChang <cyy574006791@qq.com>
Date: Tue, 26 Aug 2025 19:44:34 -0700
Subject: [PATCH] Add --use_eager flag to select eager attention instead of flash_attention_2

---
 demo/inference_from_file.py     | 8 +++++++-
 demo/text_examples/2p_short.txt | 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)
 create mode 100644 demo/text_examples/2p_short.txt

diff --git a/demo/inference_from_file.py b/demo/inference_from_file.py
index 078b53a..21f3407 100644
--- a/demo/inference_from_file.py
+++ b/demo/inference_from_file.py
@@ -175,6 +175,11 @@ def parse_args():
         default=1.3,
         help="CFG (Classifier-Free Guidance) scale for generation (default: 1.3)",
     )
+    parser.add_argument(
+        "--use_eager",
+        action="store_true",
+        help="Use eager attention mode instead of flash_attention_2",
+    )
     
     return parser.parse_args()
 
@@ -244,11 +249,12 @@ def main():
     processor = VibeVoiceProcessor.from_pretrained(args.model_path)
 
     # Load model
+    attn_implementation = "flash_attention_2" if not args.use_eager else "eager"
     model = VibeVoiceForConditionalGenerationInference.from_pretrained(
         args.model_path,
         torch_dtype=torch.bfloat16,
         device_map='cuda',
-        attn_implementation="flash_attention_2" # we only test flash_attention_2
+        attn_implementation=attn_implementation # flash_attention_2 is recommended, eager may lead to lower audio quality
     )
 
     model.eval()
diff --git a/demo/text_examples/2p_short.txt b/demo/text_examples/2p_short.txt
new file mode 100644
index 0000000..0f9c0e4
--- /dev/null
+++ b/demo/text_examples/2p_short.txt
@@ -0,0 +1,2 @@
+Speaker 1: I heard there’s big news in TTS lately?
+Speaker 2: Yes! Microsoft Research just open-sourced VibeVoice. The model can generate speech up to 90 minutes long, with smooth delivery and rich emotion — it’s absolutely amazing.
\ No newline at end of file