From 54fbbce40ee0ff536d54e3f230d15b7cc9dbb5d2 Mon Sep 17 00:00:00 2001 From: YaoyaoChang Date: Wed, 27 Aug 2025 18:57:33 -0700 Subject: [PATCH] add colab --- README.md | 4 +- demo/VibeVoice_colab.ipynb | 169 +++++++++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+), 1 deletion(-) create mode 100644 demo/VibeVoice_colab.ipynb diff --git a/README.md b/README.md index 6db4d46..f74d164 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![Project Page](https://img.shields.io/badge/Project-Page-blue?logo=microsoft)](https://microsoft.github.io/VibeVoice) [![Hugging Face](https://img.shields.io/badge/HuggingFace-Collection-orange?logo=huggingface)](https://huggingface.co/collections/microsoft/vibevoice-68a2ef24a875c44be47b034f) [![Technical Report](https://img.shields.io/badge/Technical-Report-red?logo=adobeacrobatreader)](https://arxiv.org/pdf/2508.19205) +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/microsoft/VibeVoice/blob/main/demo/VibeVoice_colab.ipynb) [![Live Playground](https://img.shields.io/badge/Live-Playground-green?logo=gradio)](https://aka.ms/VibeVoice-Demo) @@ -82,7 +83,8 @@ https://github.com/user-attachments/assets/a357c4b6-9768-495c-a576-1618f6275727 For more examples, see the [Project Page](https://microsoft.github.io/VibeVoice). -Try your own samples at [Demo](https://aka.ms/VibeVoice-Demo). +Try your own samples at [Colab](https://colab.research.google.com/github/microsoft/VibeVoice/blob/main/demo/VibeVoice_colab.ipynb) or [Demo](https://aka.ms/VibeVoice-Demo). 
+ ## Models diff --git a/demo/VibeVoice_colab.ipynb b/demo/VibeVoice_colab.ipynb new file mode 100644 index 0000000..8fc5902 --- /dev/null +++ b/demo/VibeVoice_colab.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# VibeVoice Colab — T4 Quickstart (1.5B)\n", + "This page provides a quickstart guide to run VibeVoice on Colab with a T4 GPU.\n", + "\n", + "The T4 only supports the 1.5B model due to its limited GPU memory. For the real WOW TTS experience, please try the 7B model on a stronger GPU.\n" + ], + "metadata": { + "id": "AHLptWHtQmw-" + }, + "id": "AHLptWHtQmw-" + }, + { + "cell_type": "markdown", + "source": [ + "## Step 1: Use T4\n", + "\n" + ], + "metadata": { + "id": "vzwhx5AtQ37g" + }, + "id": "vzwhx5AtQ37g" + }, + { + "cell_type": "markdown", + "source": [ + "Use T4 in Colab: go to Runtime → Change runtime type → Hardware accelerator: GPU → T4." + ], + "metadata": { + "id": "ryxffqxlVbbP" + }, + "id": "ryxffqxlVbbP" + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "print(torch.cuda.is_available())\n", + "!nvidia-smi" + ], + "metadata": { + "id": "Hek0yZKdVot_" + }, + "id": "Hek0yZKdVot_", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Step 2: Env Install" + ], + "metadata": { + "id": "S8D9WNSvWFwy" + }, + "id": "S8D9WNSvWFwy" + }, + { + "cell_type": "code", + "source": [ + "!git clone https://github.com/microsoft/VibeVoice.git\n", + "\n", + "import os\n", + "os.chdir(\"./VibeVoice\")\n", + "\n", + "!apt update && apt install ffmpeg -y\n", + "!pip install -e ." 
+ ], + "metadata": { + "id": "2xGbc7gKMD7A" + }, + "id": "2xGbc7gKMD7A", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Step 3: Run VibeVoice" + ], + "metadata": { + "id": "YmxjRFSFW4aE" + }, + "id": "YmxjRFSFW4aE" + }, + { + "cell_type": "code", + "source": [ + "# The first checkpoint download takes ~3 minutes\n", + "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/2p_short.txt --speaker_names Alice Frank\n", + "\n", + "from IPython.display import Audio\n", + "Audio(\"./outputs/2p_short_generated.wav\")" + ], + "metadata": { + "id": "MfQ0geOJQNS5" + }, + "id": "MfQ0geOJQNS5", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Create your own example" + ], + "metadata": { + "id": "Pd6-KX2Hdswx" + }, + "id": "Pd6-KX2Hdswx" + }, + { + "cell_type": "code", + "source": [ + "text = \"\"\"Speaker 1: Can I try VibeVoice with my own example?\n", + "Speaker 2: Of course! 
VibeVoice is open-source, built to benefit everyone — you’re welcome to try it out.\"\"\"\n", + "with open(\"demo/text_examples/my_example.txt\", \"w\", encoding=\"utf-8\") as f:\n", + " f.write(text)" + ], + "metadata": { + "id": "ZB482MvXbg8M" + }, + "id": "ZB482MvXbg8M", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/my_example.txt --speaker_names Alice Frank\n", + "Audio(\"./outputs/my_example_generated.wav\")\n" + ], + "metadata": { + "id": "heoxL08yM-gf" + }, + "id": "heoxL08yM-gf", + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + }, + "accelerator": "GPU" + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file