From 4b8b6f77001e91a5d1f6f9c4860dfb910e29beeb Mon Sep 17 00:00:00 2001
From: YaoyaoChang <cyy574006791@qq.com>
Date: Thu, 28 Aug 2025 01:28:27 -0700
Subject: [PATCH] Clarify SDPA fallback audio-quality caveat in Colab demo and inference script

---
 demo/VibeVoice_colab.ipynb  | 134 ++++++++++++++++++------------------
 demo/inference_from_file.py |   2 +-
 2 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/demo/VibeVoice_colab.ipynb b/demo/VibeVoice_colab.ipynb
index 8fc5902..70835f8 100644
--- a/demo/VibeVoice_colab.ipynb
+++ b/demo/VibeVoice_colab.ipynb
@@ -2,64 +2,70 @@
   "cells": [
     {
       "cell_type": "markdown",
+      "id": "AHLptWHtQmw-",
+      "metadata": {
+        "id": "AHLptWHtQmw-"
+      },
       "source": [
         "# VibeVoice Colab — T4 Quickstart (1.5B)\n",
         "This page provides a quickstart guide to run VibeVoice on Colab with T4.\n",
         "\n",
-        "T4 only support 1.5B model due to GPU memory. For the real WOW TTS experience, please try the 7B model on a stronger GPU.\n"
-      ],
-      "metadata": {
-        "id": "AHLptWHtQmw-"
-      },
-      "id": "AHLptWHtQmw-"
+        "The T4 GPU can only support the 1.5B model due to memory limitations. Please note that the T4 can only use SDPA rather than flash_attention_2, which may result in unstable or lower-quality audio. For the best TTS experience, we recommend trying the 7B model on a more powerful GPU."
+      ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "## Step 1: Use T4\n",
-        "\n"
-      ],
+      "id": "vzwhx5AtQ37g",
       "metadata": {
         "id": "vzwhx5AtQ37g"
       },
-      "id": "vzwhx5AtQ37g"
+      "source": [
+        "## Step 1: Use T4\n",
+        "\n"
+      ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "Use T4 in Colab: go to Runtime → Change runtime type → Hardware accelerator: GPU → T4."
-      ],
+      "id": "ryxffqxlVbbP",
       "metadata": {
         "id": "ryxffqxlVbbP"
       },
-      "id": "ryxffqxlVbbP"
+      "source": [
+        "Use T4 in Colab: go to Runtime → Change runtime type → Hardware accelerator: GPU → T4."
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "id": "Hek0yZKdVot_",
+      "metadata": {
+        "id": "Hek0yZKdVot_"
+      },
+      "outputs": [],
       "source": [
         "import torch\n",
         "print(torch.cuda.is_available())\n",
         "!nvidia-smi"
-      ],
-      "metadata": {
-        "id": "Hek0yZKdVot_"
-      },
-      "id": "Hek0yZKdVot_",
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "## Step 2: Env Install"
-      ],
+      "id": "S8D9WNSvWFwy",
       "metadata": {
         "id": "S8D9WNSvWFwy"
       },
-      "id": "S8D9WNSvWFwy"
+      "source": [
+        "## Step 2: Env Install"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "id": "2xGbc7gKMD7A",
+      "metadata": {
+        "id": "2xGbc7gKMD7A"
+      },
+      "outputs": [],
       "source": [
         "!git clone https://github.com/microsoft/VibeVoice.git\n",
         "\n",
@@ -68,83 +74,78 @@
         "\n",
         "!apt update && apt install ffmpeg -y\n",
         "!pip install -e ."
-      ],
-      "metadata": {
-        "id": "2xGbc7gKMD7A"
-      },
-      "id": "2xGbc7gKMD7A",
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "## Step 3: Run VibeVoice"
-      ],
+      "id": "YmxjRFSFW4aE",
       "metadata": {
         "id": "YmxjRFSFW4aE"
       },
-      "id": "YmxjRFSFW4aE"
+      "source": [
+        "## Step 3: Run VibeVoice"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "id": "MfQ0geOJQNS5",
+      "metadata": {
+        "id": "MfQ0geOJQNS5"
+      },
+      "outputs": [],
       "source": [
         "# First download checkpoint takes ~3 minutes\n",
         "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/2p_short.txt --speaker_names Alice Frank\n",
         "\n",
         "from IPython.display import Audio\n",
         "Audio(\"./outputs/2p_short_generated.wav\")"
-      ],
-      "metadata": {
-        "id": "MfQ0geOJQNS5"
-      },
-      "id": "MfQ0geOJQNS5",
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "### Create your own example"
-      ],
+      "id": "Pd6-KX2Hdswx",
       "metadata": {
         "id": "Pd6-KX2Hdswx"
       },
-      "id": "Pd6-KX2Hdswx"
+      "source": [
+        "### TTS from your text"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "id": "ZB482MvXbg8M",
+      "metadata": {
+        "id": "ZB482MvXbg8M"
+      },
+      "outputs": [],
       "source": [
         "text = \"\"\"Speaker 1: Can I try VibeVoice with my own example?\n",
         "Speaker 2: Of course! VibeVoice is open-source, built to benefit everyone — you’re welcome to try it out.\"\"\"\n",
         "with open(\"demo/text_examples/my_example.txt\", \"w\", encoding=\"utf-8\") as f:\n",
         "    f.write(text)"
-      ],
-      "metadata": {
-        "id": "ZB482MvXbg8M"
-      },
-      "id": "ZB482MvXbg8M",
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
-      "source": [
-        "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/my_example.txt --speaker_names Alice Frank\n",
-        "Audio(\"./outputs/my_example_generated.wav\")\n"
-      ],
+      "execution_count": null,
+      "id": "heoxL08yM-gf",
       "metadata": {
         "id": "heoxL08yM-gf"
       },
-      "id": "heoxL08yM-gf",
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/my_example.txt --speaker_names Alice Frank\n",
+        "Audio(\"./outputs/my_example_generated.wav\")\n"
+      ]
     }
   ],
   "metadata": {
+    "accelerator": "GPU",
     "colab": {
-      "provenance": [],
-      "gpuType": "T4"
+      "gpuType": "T4",
+      "provenance": []
     },
     "kernelspec": {
       "display_name": "Python 3",
@@ -161,9 +162,8 @@
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
       "version": "3.10.11"
-    },
-    "accelerator": "GPU"
+    }
   },
   "nbformat": 4,
   "nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/demo/inference_from_file.py b/demo/inference_from_file.py
index 2938a42..73fbce8 100644
--- a/demo/inference_from_file.py
+++ b/demo/inference_from_file.py
@@ -256,7 +256,7 @@ def main():
     except Exception as e:
         print(f"[ERROR] : {type(e).__name__}: {e}")
         print(traceback.format_exc())
-        print("Error loading model, try sdpa.")
+        print("Error loading the model. Trying to use SDPA. However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.")
         model = VibeVoiceForConditionalGenerationInference.from_pretrained(
             args.model_path,
             torch_dtype=torch.bfloat16,