From 4b8b6f77001e91a5d1f6f9c4860dfb910e29beeb Mon Sep 17 00:00:00 2001 From: YaoyaoChang Date: Thu, 28 Aug 2025 01:28:27 -0700 Subject: [PATCH] update --- demo/VibeVoice_colab.ipynb | 134 ++++++++++++++++++------------------ demo/inference_from_file.py | 2 +- 2 files changed, 68 insertions(+), 68 deletions(-) diff --git a/demo/VibeVoice_colab.ipynb b/demo/VibeVoice_colab.ipynb index 8fc5902..70835f8 100644 --- a/demo/VibeVoice_colab.ipynb +++ b/demo/VibeVoice_colab.ipynb @@ -2,64 +2,70 @@ "cells": [ { "cell_type": "markdown", + "id": "AHLptWHtQmw-", + "metadata": { + "id": "AHLptWHtQmw-" + }, "source": [ "# VibeVoice Colab — T4 Quickstart (1.5B)\n", "This page provides a quickstart guide to run VibeVoice on Colab with T4.\n", "\n", - "T4 only support 1.5B model due to GPU memory. For the real WOW TTS experience, please try the 7B model on a stronger GPU.\n" - ], - "metadata": { - "id": "AHLptWHtQmw-" - }, - "id": "AHLptWHtQmw-" + "The T4 GPU can only support the 1.5B model due to memory limitations. Please note that T4 can only use SDPA instead of flash_attention_2, which may result in unstable and lower audio quality. For the best TTS experience, we recommend trying the 7B model on a more powerful GPU." + ] }, { "cell_type": "markdown", - "source": [ - "## Step 1: Use T4\n", - "\n" - ], + "id": "vzwhx5AtQ37g", "metadata": { "id": "vzwhx5AtQ37g" }, - "id": "vzwhx5AtQ37g" + "source": [ + "## Step 1: Use T4\n", + "\n" + ] }, { "cell_type": "markdown", - "source": [ - "Use T4 in Colab: go to Runtime → Change runtime type → Hardware accelerator: GPU → T4." - ], + "id": "ryxffqxlVbbP", "metadata": { "id": "ryxffqxlVbbP" }, - "id": "ryxffqxlVbbP" + "source": [ + "Use T4 in Colab: go to Runtime → Change runtime type → Hardware accelerator: GPU → T4." + ] }, { "cell_type": "code", + "execution_count": null, + "id": "Hek0yZKdVot_", + "metadata": { + "id": "Hek0yZKdVot_" + }, + "outputs": [], "source": [ "import torch\n", "print(torch.cuda.is_available())\n", "!nvidia-smi" - ], - "metadata": { - "id": "Hek0yZKdVot_" - }, - "id": "Hek0yZKdVot_", - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "## Step 2: Env Install" - ], + "id": "S8D9WNSvWFwy", "metadata": { "id": "S8D9WNSvWFwy" }, - "id": "S8D9WNSvWFwy" + "source": [ + "## Step 2: Env Install" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "2xGbc7gKMD7A", + "metadata": { + "id": "2xGbc7gKMD7A" + }, + "outputs": [], "source": [ "!git clone https://github.com/microsoft/VibeVoice.git\n", "\n", @@ -68,83 +74,78 @@ "\n", "!apt update && apt install ffmpeg -y\n", "!pip install -e ." - ], - "metadata": { - "id": "2xGbc7gKMD7A" - }, - "id": "2xGbc7gKMD7A", - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "## Step 3: Run VibeVoice" - ], + "id": "YmxjRFSFW4aE", "metadata": { "id": "YmxjRFSFW4aE" }, - "id": "YmxjRFSFW4aE" + "source": [ + "## Step 3: Run VibeVoice" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "MfQ0geOJQNS5", + "metadata": { + "id": "MfQ0geOJQNS5" + }, + "outputs": [], "source": [ "# First download checkpoint takes ~3 minutes\n", "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/2p_short.txt --speaker_names Alice Frank\n", "\n", "from IPython.display import Audio\n", "Audio(\"./outputs/2p_short_generated.wav\")" - ], - "metadata": { - "id": "MfQ0geOJQNS5" - }, - "id": "MfQ0geOJQNS5", - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Create your own example" - ], + "id": "Pd6-KX2Hdswx", "metadata": { "id": "Pd6-KX2Hdswx" }, - "id": "Pd6-KX2Hdswx" + "source": [ + "### TTS from your text" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "ZB482MvXbg8M", + "metadata": { + "id": "ZB482MvXbg8M" + }, + "outputs": [], "source": [ "text = \"\"\"Speaker 1: Can I try VibeVoice with my own example?\n", "Speaker 2: Of course! VibeVoice is open-source, built to benefit everyone — you’re welcome to try it out.\"\"\"\n", "with open(\"demo/text_examples/my_example.txt\", \"w\", encoding=\"utf-8\") as f:\n", " f.write(text)" - ], - "metadata": { - "id": "ZB482MvXbg8M" - }, - "id": "ZB482MvXbg8M", - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/my_example.txt --speaker_names Alice Frank\n", - "Audio(\"./outputs/my_example_generated.wav\")\n" - ], + "execution_count": null, + "id": "heoxL08yM-gf", "metadata": { "id": "heoxL08yM-gf" }, - "id": "heoxL08yM-gf", - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/my_example.txt --speaker_names Alice Frank\n", + "Audio(\"./outputs/my_example_generated.wav\")\n" + ] } ], "metadata": { + "accelerator": "GPU", "colab": { - "provenance": [], - "gpuType": "T4" + "gpuType": "T4", + "provenance": [] }, "kernelspec": { "display_name": "Python 3", @@ -161,9 +162,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" - }, - "accelerator": "GPU" + } }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/demo/inference_from_file.py b/demo/inference_from_file.py index 2938a42..73fbce8 100644 --- a/demo/inference_from_file.py +++ b/demo/inference_from_file.py @@ -256,7 +256,7 @@ def main(): except Exception as e: print(f"[ERROR] : {type(e).__name__}: {e}") print(traceback.format_exc()) - print("Error loading model, try sdpa.") + print("Error loading the model. Trying to use SDPA. However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.") model = VibeVoiceForConditionalGenerationInference.from_pretrained( args.model_path, torch_dtype=torch.bfloat16,