From 056bb5b0fab54fcf7ab2c1c6aa6562a1d79dd124 Mon Sep 17 00:00:00 2001 From: YaoyaoChang Date: Tue, 26 Aug 2025 19:44:34 -0700 Subject: [PATCH] add args to use_eager --- demo/inference_from_file.py | 8 +++++++- demo/text_examples/2p_short.txt | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 demo/text_examples/2p_short.txt diff --git a/demo/inference_from_file.py b/demo/inference_from_file.py index 078b53a..21f3407 100644 --- a/demo/inference_from_file.py +++ b/demo/inference_from_file.py @@ -175,6 +175,11 @@ def parse_args(): default=1.3, help="CFG (Classifier-Free Guidance) scale for generation (default: 1.3)", ) + parser.add_argument( + "--use_eager", + action="store_true", + help="Use eager attention mode instead of flash_attention_2", + ) return parser.parse_args() @@ -244,11 +249,12 @@ def main(): processor = VibeVoiceProcessor.from_pretrained(args.model_path) # Load model + attn_implementation = "flash_attention_2" if not args.use_eager else "eager" model = VibeVoiceForConditionalGenerationInference.from_pretrained( args.model_path, torch_dtype=torch.bfloat16, device_map='cuda', - attn_implementation="flash_attention_2" # we only test flash_attention_2 + attn_implementation=attn_implementation # flash_attention_2 is recommended, eager may lead to lower audio quality ) model.eval() diff --git a/demo/text_examples/2p_short.txt b/demo/text_examples/2p_short.txt new file mode 100644 index 0000000..0f9c0e4 --- /dev/null +++ b/demo/text_examples/2p_short.txt @@ -0,0 +1,2 @@ +Speaker 1: I heard there’s big news in TTS lately? +Speaker 2: Yes! Microsoft Research just open-sourced VibeVoice. The model can generate speech up to 90 minutes long, with smooth delivery and rich emotion — it’s absolutely amazing. \ No newline at end of file