210 lines
6.2 KiB
Python
210 lines
6.2 KiB
Python
import os
|
|
from time import sleep, time
|
|
import warnings
|
|
import importlib
|
|
import random
|
|
|
|
import torch
|
|
import argparse
|
|
from kokoro import KPipeline
|
|
import soundfile as sf
|
|
from tqdm import tqdm
|
|
import pyperclip
|
|
from yaspin import yaspin
|
|
|
|
# Short command-line aliases mapped to the voice ids handed to the pipeline.
# See voices: https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
voices = {
    "pro": "af_heart",
    "hot": "af_bella",
    "asmr": "af_nicole",
    "brit": "bf_emma",
}
|
|
|
|
# Candidate captions for the progress spinner; one is picked at random per run.
prep_texts = [
    "Check mic, 1-2-3...",
    "*Tap* *tap* ... Is this thing on?",
    "Ready, set... *Ahem!*",
    "Mic's on, lights are set, I'm ready to roll.",
    "All set? Let's make it a good one.",
    "Ready, set, go—now that's the real countdown.",
    "Checking the mic, one, two, three.",
    "Lights, mic, action—now let's do this.",
    "Hold tight—this is about to get interesting.",
    "If the mic works, we're good to go.",
    "All systems green—let's make this a good one.",
]
|
|
|
|
def _default_device():
    """Return the name of the best available torch device.

    Preference order is cuda, mps, xpu, then cpu.
    """
    if torch.cuda.is_available():
        return "cuda"
    # Not every torch build ships the mps / xpu namespaces, so look them up
    # defensively instead of risking an AttributeError at start-up.
    candidates = (
        ("mps", getattr(torch.backends, "mps", None)),
        ("xpu", getattr(torch, "xpu", None)),
    )
    for device_name, backend in candidates:
        if backend is not None and backend.is_available():
            return device_name
    return "cpu"


def parse_args(argv=None):
    """Parse the command-line options of the TTS tool.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``input_text``, ``title``,
        ``voice``, ``input_file``, ``verbose``, ``clipboard``, ``device`` and
        ``skip_play``.
    """
    parser = argparse.ArgumentParser(description="Simple TTS", allow_abbrev=False)
    parser.add_argument(
        "input_text",
        type=str,
        nargs="?",
        default="",
        help="Text to read",
    )
    parser.add_argument(
        "--title",
        "-t",
        type=str,
        help="Title to use as label to the generated outputs",
    )
    parser.add_argument(
        "--voice",
        "-v",
        type=str,
        default="pro",
        help="Voice to use (pro, hot, asmr, brit)",
    )
    parser.add_argument(
        "--input_file",
        "-i",
        type=str,
        default="demo/tongue-twister.txt",
        help="Path to the input text file",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Show verbose reports",
    )
    parser.add_argument(
        "--clipboard",
        "-c",
        action="store_true",
        help="Use text from the clipboard (i.e., copied text)",
    )
    parser.add_argument(
        "--device",
        "-d",
        type=str,
        default=_default_device(),
        help="Device for inference: cuda | mps | xpu | cpu",
    )
    parser.add_argument(
        "--skip_play",
        "-s",
        action="store_true",
        help="Prevent playing the generated audio",
    )
    return parser.parse_args(argv)
|
|
|
|
def generate_audio(generator, name, voice, sample_rate=24000, output_root="outputs"):
    """Write every audio chunk produced by ``generator`` to its own WAV file.

    Files are written to ``{output_root}/{name}/{name}-{voice}-{index}.wav``,
    where ``index`` counts the chunks from zero.

    Args:
        generator: Iterable whose items unpack into three values; only the
            third one (the audio data) is used.
        name: Label used for both the output sub-directory and the file prefix.
        voice: Voice id embedded in each file name.
        sample_rate: Sample rate passed to ``sf.write`` (default 24000).
        output_root: Directory under which the ``name`` folder is created.

    Returns:
        List of the written file paths, in generation order. Empty when the
        generator yields nothing (no directory is created in that case).
    """
    target_dir = f"{output_root}/{name}"
    output_files = []

    for index, (_gs, _ps, audio) in enumerate(generator):
        if index == 0:
            # Create the folder once, and only when there is something to write.
            os.makedirs(target_dir, exist_ok=True)
        output_file_name = f"{target_dir}/{name}-{voice}-{index}.wav"
        sf.write(output_file_name, audio, sample_rate)
        output_files.append(output_file_name)

    return output_files
|
|
|
|
def play_audio(output_files):
    """Play the given audio files one after another with a progress bar.

    The ``vlc`` module is imported at call time, so it is only needed when
    there is something to play.

    Args:
        output_files: Sequence of audio file paths, played in order.

    Raises:
        ImportError: If the ``vlc`` module cannot be imported.
    """
    if not output_files:
        return

    from pathlib import Path

    vlc_module = importlib.import_module("vlc")
    total = len(output_files)

    for position, output in enumerate(output_files, start=1):
        # as_uri() yields a well-formed, percent-encoded file URI on every OS.
        uri = Path(os.path.abspath(output)).as_uri()
        player = vlc_module.MediaPlayer(uri)
        player.play()

        # The length may not be known right after play() (get_length() can
        # still report 0 or -1), so poll briefly instead of reading it once.
        length_ms = player.get_length()
        waited = 0.0
        while length_ms <= 0 and waited < 1.0:
            sleep(0.01)
            waited += 0.01
            length_ms = player.get_length()
        # Clamp so an unknown length can never turn into a negative sleep().
        duration = max(length_ms, 0) / 1000

        chunk = f"{position}/{total} " if total > 1 else ""
        description = f"\u25B6 {chunk}"
        for _ in tqdm(range(100),
                      desc=description,
                      bar_format='{l_bar} {elapsed} {bar} {remaining}',
                      colour='yellow'):
            sleep(duration / 100)
|
|
|
|
def _resolve_voice(requested):
    """Translate a CLI alias into a voice id; unknown values pass through as-is."""
    if not requested:
        # Guard against an empty --voice, which would break the voice[0] lookup.
        return voices["pro"]
    return voices.get(requested, requested)


def _load_input(args):
    """Return ``(text, name)`` from positional text, the clipboard, or the input file."""
    if args.input_text != "":
        return args.input_text, "chat"
    if args.clipboard:
        # use copied text
        return pyperclip.paste(), "copied"
    file_path = args.input_file
    # splitext keeps a usable name even when the file has no extension.
    name = os.path.splitext(os.path.basename(file_path))[0]
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read(), name


def _safe_name(name):
    """Replace spaces and path separators so ``name`` is safe in file names."""
    for unsafe in (" ", "\\", "/"):
        name = name.replace(unsafe, "_")
    return name


def main():
    """Run the TTS command-line tool.

    Parses the arguments, picks the input text (positional text, clipboard or
    file), generates one WAV file per text chunk and, unless ``--skip_play``
    is given, plays the result.
    """
    # Parse before starting the spinner so --help and usage errors print cleanly.
    args = parse_args()

    if not args.verbose:
        # Disable all warnings
        warnings.filterwarnings("ignore")

    voice = _resolve_voice(args.voice)
    text, name = _load_input(args)
    if args.title:
        name = args.title
    name = _safe_name(name)

    if args.verbose:
        print(f"[TTS] Using device: \"{args.device}\", voice: \"{voice}\", output label: \"{name}\"")
        if args.clipboard and args.input_text == "":
            print('[TTS] Using copied text as input.')

    # Generate audio, showing a random "preparing" text while working
    with yaspin() as spinner:
        spinner.text = random.choice(prep_texts)

        # The first character of the voice id is passed as the language code.
        lang_code = voice[0]
        pipeline = KPipeline(lang_code=lang_code, device=args.device, repo_id='hexgrad/Kokoro-82M')

        # Split patterns:
        # - only multiple consecutive new lines (to handle wrapped statements)
        # - statements ending in punctuation (:.?!;)
        # - list items starting with '-' or '*'
        # - numbered items starting with digits followed by a dot '.'
        generator = pipeline(
            text,
            voice=voice,
            split_pattern=r'\n{2,}|[:.?!;]\n+|\n(?:[*\-]|\d+\.)'
        )

        start_time = time()
        output_files = generate_audio(generator, name, voice)
        generation_time = time() - start_time

    if not output_files:
        print("[TTS] No audio was generated; the input text appears to be empty.")
        return

    directory = os.path.dirname(output_files[0])

    if args.verbose:
        print(f"[TTS] {len(output_files)} chunks generated in {generation_time:.2f} seconds")
        print(f"[TTS] Output files are in: {directory}/*")

    # Play audio
    if args.skip_play:
        print(f"[TTS] Audio player disabled: {directory}/*")
        return

    try:
        play_audio(output_files)
    except Exception as error:
        # Playback is best-effort; Ctrl+C is deliberately not caught here.
        print(f"[TTS] Something went wrong when trying to play the audio. Play the output files manually: {directory}/*")
        if args.verbose:
            print(f"[TTS] Playback error: {error!r}")


if __name__ == "__main__":
    main()
|