This commit is contained in:
YaoyaoChang 2025-08-28 01:28:27 -07:00
parent 9537bd96a6
commit 4b8b6f7700
2 changed files with 68 additions and 68 deletions

View file

@ -2,64 +2,70 @@
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "AHLptWHtQmw-",
"metadata": {
"id": "AHLptWHtQmw-"
},
"source": [ "source": [
"# VibeVoice Colab — T4 Quickstart (1.5B)\n", "# VibeVoice Colab — T4 Quickstart (1.5B)\n",
"This page provides a quickstart guide to run VibeVoice on Colab with T4.\n", "This page provides a quickstart guide to run VibeVoice on Colab with T4.\n",
"\n", "\n",
"T4 only support 1.5B model due to GPU memory. For the real WOW TTS experience, please try the 7B model on a stronger GPU.\n" "The T4 GPU can only support the 1.5B model due to memory limitations. Note that the T4 cannot use flash_attention_2 and falls back to SDPA, which may result in unstable, lower-quality audio. For the best TTS experience, we recommend trying the 7B model on a more powerful GPU."
], ]
"metadata": {
"id": "AHLptWHtQmw-"
},
"id": "AHLptWHtQmw-"
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"source": [ "id": "vzwhx5AtQ37g",
"## Step 1: Use T4\n",
"\n"
],
"metadata": { "metadata": {
"id": "vzwhx5AtQ37g" "id": "vzwhx5AtQ37g"
}, },
"id": "vzwhx5AtQ37g" "source": [
"## Step 1: Use T4\n",
"\n"
]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"source": [ "id": "ryxffqxlVbbP",
"Use T4 in Colab: go to Runtime → Change runtime type → Hardware accelerator: GPU → T4."
],
"metadata": { "metadata": {
"id": "ryxffqxlVbbP" "id": "ryxffqxlVbbP"
}, },
"id": "ryxffqxlVbbP" "source": [
"Use T4 in Colab: go to Runtime → Change runtime type → Hardware accelerator: GPU → T4."
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"id": "Hek0yZKdVot_",
"metadata": {
"id": "Hek0yZKdVot_"
},
"outputs": [],
"source": [ "source": [
"import torch\n", "import torch\n",
"print(torch.cuda.is_available())\n", "print(torch.cuda.is_available())\n",
"!nvidia-smi" "!nvidia-smi"
], ]
"metadata": {
"id": "Hek0yZKdVot_"
},
"id": "Hek0yZKdVot_",
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"source": [ "id": "S8D9WNSvWFwy",
"## Step 2: Env Install"
],
"metadata": { "metadata": {
"id": "S8D9WNSvWFwy" "id": "S8D9WNSvWFwy"
}, },
"id": "S8D9WNSvWFwy" "source": [
"## Step 2: Env Install"
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"id": "2xGbc7gKMD7A",
"metadata": {
"id": "2xGbc7gKMD7A"
},
"outputs": [],
"source": [ "source": [
"!git clone https://github.com/microsoft/VibeVoice.git\n", "!git clone https://github.com/microsoft/VibeVoice.git\n",
"\n", "\n",
@ -68,83 +74,78 @@
"\n", "\n",
"!apt update && apt install ffmpeg -y\n", "!apt update && apt install ffmpeg -y\n",
"!pip install -e ." "!pip install -e ."
], ]
"metadata": {
"id": "2xGbc7gKMD7A"
},
"id": "2xGbc7gKMD7A",
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"source": [ "id": "YmxjRFSFW4aE",
"## Step 3: Run VibeVoice"
],
"metadata": { "metadata": {
"id": "YmxjRFSFW4aE" "id": "YmxjRFSFW4aE"
}, },
"id": "YmxjRFSFW4aE" "source": [
"## Step 3: Run VibeVoice"
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"id": "MfQ0geOJQNS5",
"metadata": {
"id": "MfQ0geOJQNS5"
},
"outputs": [],
"source": [ "source": [
"# The first run downloads the checkpoint, which takes ~3 minutes\n", "# The first run downloads the checkpoint, which takes ~3 minutes\n",
"!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/2p_short.txt --speaker_names Alice Frank\n", "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/2p_short.txt --speaker_names Alice Frank\n",
"\n", "\n",
"from IPython.display import Audio\n", "from IPython.display import Audio\n",
"Audio(\"./outputs/2p_short_generated.wav\")" "Audio(\"./outputs/2p_short_generated.wav\")"
], ]
"metadata": {
"id": "MfQ0geOJQNS5"
},
"id": "MfQ0geOJQNS5",
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"source": [ "id": "Pd6-KX2Hdswx",
"### Create your own example"
],
"metadata": { "metadata": {
"id": "Pd6-KX2Hdswx" "id": "Pd6-KX2Hdswx"
}, },
"id": "Pd6-KX2Hdswx" "source": [
"### TTS from your text"
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"id": "ZB482MvXbg8M",
"metadata": {
"id": "ZB482MvXbg8M"
},
"outputs": [],
"source": [ "source": [
"text = \"\"\"Speaker 1: Can I try VibeVoice with my own example?\n", "text = \"\"\"Speaker 1: Can I try VibeVoice with my own example?\n",
"Speaker 2: Of course! VibeVoice is open-source, built to benefit everyone — youre welcome to try it out.\"\"\"\n", "Speaker 2: Of course! VibeVoice is open-source, built to benefit everyone — youre welcome to try it out.\"\"\"\n",
"with open(\"demo/text_examples/my_example.txt\", \"w\", encoding=\"utf-8\") as f:\n", "with open(\"demo/text_examples/my_example.txt\", \"w\", encoding=\"utf-8\") as f:\n",
" f.write(text)" " f.write(text)"
], ]
"metadata": {
"id": "ZB482MvXbg8M"
},
"id": "ZB482MvXbg8M",
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"source": [ "execution_count": null,
"!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/my_example.txt --speaker_names Alice Frank\n", "id": "heoxL08yM-gf",
"Audio(\"./outputs/my_example_generated.wav\")\n"
],
"metadata": { "metadata": {
"id": "heoxL08yM-gf" "id": "heoxL08yM-gf"
}, },
"id": "heoxL08yM-gf", "outputs": [],
"execution_count": null, "source": [
"outputs": [] "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/my_example.txt --speaker_names Alice Frank\n",
"Audio(\"./outputs/my_example_generated.wav\")\n"
]
} }
], ],
"metadata": { "metadata": {
"accelerator": "GPU",
"colab": { "colab": {
"provenance": [], "gpuType": "T4",
"gpuType": "T4" "provenance": []
}, },
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "Python 3",
@ -161,8 +162,7 @@
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.11" "version": "3.10.11"
}, }
"accelerator": "GPU"
}, },
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 5 "nbformat_minor": 5

View file

@ -256,7 +256,7 @@ def main():
except Exception as e: except Exception as e:
print(f"[ERROR] : {type(e).__name__}: {e}") print(f"[ERROR] : {type(e).__name__}: {e}")
print(traceback.format_exc()) print(traceback.format_exc())
print("Error loading model, try sdpa.") print("Error loading the model. Trying to use SDPA. However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.")
model = VibeVoiceForConditionalGenerationInference.from_pretrained( model = VibeVoiceForConditionalGenerationInference.from_pretrained(
args.model_path, args.model_path,
torch_dtype=torch.bfloat16, torch_dtype=torch.bfloat16,