update

2025-08-28 01:28:27 -07:00 · 2025-08-28 01:28:27 -07:00 · 4b8b6f7700
commit 4b8b6f7700
parent 9537bd96a6
2 changed files with 68 additions and 68 deletions
--- a/demo/VibeVoice_colab.ipynb
+++ b/demo/VibeVoice_colab.ipynb
@ -2,64 +2,70 @@
  "cells": [
    {
      "cell_type": "markdown",
      "id": "AHLptWHtQmw-",
      "metadata": {
        "id": "AHLptWHtQmw-"
      },
      "source": [
        "# VibeVoice Colab — T4 Quickstart (1.5B)\n",
        "This page provides a quickstart guide for running VibeVoice on Colab with a T4 GPU.\n",
        "\n",
-        "T4 only support 1.5B model due to GPU memory. For the real WOW TTS experience, please try the 7B model on a stronger GPU.\n"
+        "The T4 GPU can only support the 1.5B model due to memory limitations. Please note that on a T4 only SDPA can be used in place of flash_attention_2, which may result in unstable, lower-quality audio. For the best TTS experience, we recommend trying the 7B model on a more powerful GPU."
-      ],
+      ]
      "metadata": {
        "id": "AHLptWHtQmw-"
      },
      "id": "AHLptWHtQmw-"
    },
    {
      "cell_type": "markdown",
-      "source": [
+      "id": "vzwhx5AtQ37g",
        "## Step 1: Use T4\n",
        "\n"
      ],
      "metadata": {
        "id": "vzwhx5AtQ37g"
      },
-      "id": "vzwhx5AtQ37g"
+      "source": [
        "## Step 1: Use T4\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
-      "source": [
+      "id": "ryxffqxlVbbP",
        "Use T4 in Colab: go to Runtime → Change runtime type → Hardware accelerator: GPU → T4."
      ],
      "metadata": {
        "id": "ryxffqxlVbbP"
      },
-      "id": "ryxffqxlVbbP"
+      "source": [
        "Use T4 in Colab: go to Runtime → Change runtime type → Hardware accelerator: GPU → T4."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "Hek0yZKdVot_",
      "metadata": {
        "id": "Hek0yZKdVot_"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "print(torch.cuda.is_available())\n",
        "!nvidia-smi"
-      ],
+      ]
      "metadata": {
        "id": "Hek0yZKdVot_"
      },
      "id": "Hek0yZKdVot_",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
-      "source": [
+      "id": "S8D9WNSvWFwy",
        "## Step 2: Env Install"
      ],
      "metadata": {
        "id": "S8D9WNSvWFwy"
      },
-      "id": "S8D9WNSvWFwy"
+      "source": [
        "## Step 2: Env Install"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2xGbc7gKMD7A",
      "metadata": {
        "id": "2xGbc7gKMD7A"
      },
      "outputs": [],
      "source": [
        "!git clone https://github.com/microsoft/VibeVoice.git\n",
        "\n",
@ -68,83 +74,78 @@
        "\n",
        "!apt update && apt install ffmpeg -y\n",
        "!pip install -e ."
-      ],
+      ]
      "metadata": {
        "id": "2xGbc7gKMD7A"
      },
      "id": "2xGbc7gKMD7A",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
-      "source": [
+      "id": "YmxjRFSFW4aE",
        "## Step 3: Run VibeVoice"
      ],
      "metadata": {
        "id": "YmxjRFSFW4aE"
      },
-      "id": "YmxjRFSFW4aE"
+      "source": [
        "## Step 3: Run VibeVoice"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "MfQ0geOJQNS5",
      "metadata": {
        "id": "MfQ0geOJQNS5"
      },
      "outputs": [],
      "source": [
        "# The first run downloads the checkpoint, which takes ~3 minutes\n",
        "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/2p_short.txt --speaker_names Alice Frank\n",
        "\n",
        "from IPython.display import Audio\n",
        "Audio(\"./outputs/2p_short_generated.wav\")"
-      ],
+      ]
      "metadata": {
        "id": "MfQ0geOJQNS5"
      },
      "id": "MfQ0geOJQNS5",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
-      "source": [
+      "id": "Pd6-KX2Hdswx",
        "### Create your own example"
      ],
      "metadata": {
        "id": "Pd6-KX2Hdswx"
      },
-      "id": "Pd6-KX2Hdswx"
+      "source": [
        "### TTS from your text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ZB482MvXbg8M",
      "metadata": {
        "id": "ZB482MvXbg8M"
      },
      "outputs": [],
      "source": [
        "text = \"\"\"Speaker 1: Can I try VibeVoice with my own example?\n",
        "Speaker 2: Of course! VibeVoice is open-source, built to benefit everyone — you’re welcome to try it out.\"\"\"\n",
        "with open(\"demo/text_examples/my_example.txt\", \"w\", encoding=\"utf-8\") as f:\n",
        "    f.write(text)"
-      ],
+      ]
      "metadata": {
        "id": "ZB482MvXbg8M"
      },
      "id": "ZB482MvXbg8M",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
-      "source": [
+      "execution_count": null,
-        "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/my_example.txt --speaker_names Alice Frank\n",
+      "id": "heoxL08yM-gf",
        "Audio(\"./outputs/my_example_generated.wav\")\n"
      ],
      "metadata": {
        "id": "heoxL08yM-gf"
      },
-      "id": "heoxL08yM-gf",
+      "outputs": [],
-      "execution_count": null,
+      "source": [
-      "outputs": []
+        "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/my_example.txt --speaker_names Alice Frank\n",
        "Audio(\"./outputs/my_example_generated.wav\")\n"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
-      "provenance": [],
+      "gpuType": "T4",
-      "gpuType": "T4"
+      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
@ -161,8 +162,7 @@
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.11"
-    },
+    }
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 5
--- a/demo/inference_from_file.py
+++ b/demo/inference_from_file.py
@ -256,7 +256,7 @@ def main():
    except Exception as e:
        print(f"[ERROR] : {type(e).__name__}: {e}")
        print(traceback.format_exc())
-        print("Error loading model, try sdpa.")
+        print("Error loading the model. Trying to use SDPA. However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.")
        model = VibeVoiceForConditionalGenerationInference.from_pretrained(
            args.model_path,
            torch_dtype=torch.bfloat16,