update
This commit is contained in:
parent
9537bd96a6
commit
4b8b6f7700
2 changed files with 68 additions and 68 deletions
|
@ -2,64 +2,70 @@
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"id": "AHLptWHtQmw-",
|
||||||
|
"metadata": {
|
||||||
|
"id": "AHLptWHtQmw-"
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# VibeVoice Colab — T4 Quickstart (1.5B)\n",
|
"# VibeVoice Colab — T4 Quickstart (1.5B)\n",
|
||||||
"This page provides a quickstart guide to run VibeVoice on Colab with T4.\n",
|
"This page provides a quickstart guide to run VibeVoice on Colab with T4.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"T4 only support 1.5B model due to GPU memory. For the real WOW TTS experience, please try the 7B model on a stronger GPU.\n"
|
"The T4 GPU can only support the 1.5B model due to memory limitations. Please note that T4 can only use SDPA instead of flash_attention_2, which may result in unstable and lower audio quality. For the best TTS experience, we recommend trying the 7B model on a more powerful GPU."
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "AHLptWHtQmw-"
|
|
||||||
},
|
|
||||||
"id": "AHLptWHtQmw-"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
"id": "vzwhx5AtQ37g",
|
||||||
"## Step 1: Use T4\n",
|
|
||||||
"\n"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "vzwhx5AtQ37g"
|
"id": "vzwhx5AtQ37g"
|
||||||
},
|
},
|
||||||
"id": "vzwhx5AtQ37g"
|
"source": [
|
||||||
|
"## Step 1: Use T4\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
"id": "ryxffqxlVbbP",
|
||||||
"Use T4 in Colab: go to Runtime → Change runtime type → Hardware accelerator: GPU → T4."
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "ryxffqxlVbbP"
|
"id": "ryxffqxlVbbP"
|
||||||
},
|
},
|
||||||
"id": "ryxffqxlVbbP"
|
"source": [
|
||||||
|
"Use T4 in Colab: go to Runtime → Change runtime type → Hardware accelerator: GPU → T4."
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "Hek0yZKdVot_",
|
||||||
|
"metadata": {
|
||||||
|
"id": "Hek0yZKdVot_"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import torch\n",
|
"import torch\n",
|
||||||
"print(torch.cuda.is_available())\n",
|
"print(torch.cuda.is_available())\n",
|
||||||
"!nvidia-smi"
|
"!nvidia-smi"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "Hek0yZKdVot_"
|
|
||||||
},
|
|
||||||
"id": "Hek0yZKdVot_",
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
"id": "S8D9WNSvWFwy",
|
||||||
"## Step 2: Env Install"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "S8D9WNSvWFwy"
|
"id": "S8D9WNSvWFwy"
|
||||||
},
|
},
|
||||||
"id": "S8D9WNSvWFwy"
|
"source": [
|
||||||
|
"## Step 2: Env Install"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "2xGbc7gKMD7A",
|
||||||
|
"metadata": {
|
||||||
|
"id": "2xGbc7gKMD7A"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"!git clone https://github.com/microsoft/VibeVoice.git\n",
|
"!git clone https://github.com/microsoft/VibeVoice.git\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -68,83 +74,78 @@
|
||||||
"\n",
|
"\n",
|
||||||
"!apt update && apt install ffmpeg -y\n",
|
"!apt update && apt install ffmpeg -y\n",
|
||||||
"!pip install -e ."
|
"!pip install -e ."
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "2xGbc7gKMD7A"
|
|
||||||
},
|
|
||||||
"id": "2xGbc7gKMD7A",
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
"id": "YmxjRFSFW4aE",
|
||||||
"## Step 3: Run VibeVoice"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "YmxjRFSFW4aE"
|
"id": "YmxjRFSFW4aE"
|
||||||
},
|
},
|
||||||
"id": "YmxjRFSFW4aE"
|
"source": [
|
||||||
|
"## Step 3: Run VibeVoice"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "MfQ0geOJQNS5",
|
||||||
|
"metadata": {
|
||||||
|
"id": "MfQ0geOJQNS5"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# First download checkpoint takes ~3 minutes\n",
|
"# First download checkpoint takes ~3 minutes\n",
|
||||||
"!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/2p_short.txt --speaker_names Alice Frank\n",
|
"!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/2p_short.txt --speaker_names Alice Frank\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from IPython.display import Audio\n",
|
"from IPython.display import Audio\n",
|
||||||
"Audio(\"./outputs/2p_short_generated.wav\")"
|
"Audio(\"./outputs/2p_short_generated.wav\")"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "MfQ0geOJQNS5"
|
|
||||||
},
|
|
||||||
"id": "MfQ0geOJQNS5",
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
"id": "Pd6-KX2Hdswx",
|
||||||
"### Create your own example"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "Pd6-KX2Hdswx"
|
"id": "Pd6-KX2Hdswx"
|
||||||
},
|
},
|
||||||
"id": "Pd6-KX2Hdswx"
|
"source": [
|
||||||
|
"### TTS from your text"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "ZB482MvXbg8M",
|
||||||
|
"metadata": {
|
||||||
|
"id": "ZB482MvXbg8M"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"text = \"\"\"Speaker 1: Can I try VibeVoice with my own example?\n",
|
"text = \"\"\"Speaker 1: Can I try VibeVoice with my own example?\n",
|
||||||
"Speaker 2: Of course! VibeVoice is open-source, built to benefit everyone — you’re welcome to try it out.\"\"\"\n",
|
"Speaker 2: Of course! VibeVoice is open-source, built to benefit everyone — you’re welcome to try it out.\"\"\"\n",
|
||||||
"with open(\"demo/text_examples/my_example.txt\", \"w\", encoding=\"utf-8\") as f:\n",
|
"with open(\"demo/text_examples/my_example.txt\", \"w\", encoding=\"utf-8\") as f:\n",
|
||||||
" f.write(text)"
|
" f.write(text)"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "ZB482MvXbg8M"
|
|
||||||
},
|
|
||||||
"id": "ZB482MvXbg8M",
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": null,
|
||||||
"!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/my_example.txt --speaker_names Alice Frank\n",
|
"id": "heoxL08yM-gf",
|
||||||
"Audio(\"./outputs/my_example_generated.wav\")\n"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "heoxL08yM-gf"
|
"id": "heoxL08yM-gf"
|
||||||
},
|
},
|
||||||
"id": "heoxL08yM-gf",
|
"outputs": [],
|
||||||
"execution_count": null,
|
"source": [
|
||||||
"outputs": []
|
"!python demo/inference_from_file.py --model_path microsoft/VibeVoice-1.5B --txt_path demo/text_examples/my_example.txt --speaker_names Alice Frank\n",
|
||||||
|
"Audio(\"./outputs/my_example_generated.wav\")\n"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
"accelerator": "GPU",
|
||||||
"colab": {
|
"colab": {
|
||||||
"provenance": [],
|
"gpuType": "T4",
|
||||||
"gpuType": "T4"
|
"provenance": []
|
||||||
},
|
},
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "Python 3",
|
||||||
|
@ -161,9 +162,8 @@
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.11"
|
"version": "3.10.11"
|
||||||
},
|
}
|
||||||
"accelerator": "GPU"
|
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 5
|
"nbformat_minor": 5
|
||||||
}
|
}
|
||||||
|
|
|
@ -256,7 +256,7 @@ def main():
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[ERROR] : {type(e).__name__}: {e}")
|
print(f"[ERROR] : {type(e).__name__}: {e}")
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
print("Error loading model, try sdpa.")
|
print("Error loading the model. Trying to use SDPA. However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.")
|
||||||
model = VibeVoiceForConditionalGenerationInference.from_pretrained(
|
model = VibeVoiceForConditionalGenerationInference.from_pretrained(
|
||||||
args.model_path,
|
args.model_path,
|
||||||
torch_dtype=torch.bfloat16,
|
torch_dtype=torch.bfloat16,
|
||||||
|
|
Loading…
Reference in a new issue