From b212a108df9b3e8d64d65dc30489607b8bfad0e9 Mon Sep 17 00:00:00 2001 From: Ayo Date: Wed, 3 Sep 2025 22:41:40 +0200 Subject: [PATCH] feat: implement --voice & --input --- README.md | 4 +-- main.py | 49 ------------------------------ requirements.txt | 1 + tts.py | 79 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 51 deletions(-) delete mode 100644 main.py create mode 100644 tts.py diff --git a/README.md b/README.md index 4957174..96b0a86 100644 --- a/README.md +++ b/README.md @@ -51,8 +51,8 @@ $ . env.sh ## Usage -To run the program it needs an input file. For example, using `input.txt` +To run the program it needs an input file using the flag `--input`. Optionally, you can indicate a [voice](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md) you want to use with `--voice`. ```bash -$ python main.py input.txt +$ python tts.py --input demo/tongue-twister.txt --voice asmr ``` diff --git a/main.py b/main.py deleted file mode 100644 index 9b2f1e9..0000000 --- a/main.py +++ /dev/null @@ -1,49 +0,0 @@ -import sys -import os -from time import sleep - -from kokoro import KPipeline -import soundfile as sf -import vlc -from tqdm import tqdm - - -# See voices: https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md -voices = { - 'pro': 'af_heart', - 'hot': 'af_bella', - 'asmr':'af_nicole', - 'brit': 'bf_emma' -} -pipeline = KPipeline(lang_code='a', device='xpu', repo_id='hexgrad/Kokoro-82M') - -# filename argument -file_path = sys.argv[1] -directory, file_name = os.path.split(file_path) - -name = '.'.join(file_name.split('.')[:-1]) - -file = open(file_path, "r") -text = file.read() -generator = pipeline(text, voice=voices['pro']) - -output_files = [] -length = 0 - -for i, (gs, ps, audio) in enumerate(generator): - output_file_name=f'outputs/{name}-{i}.wav' - os.makedirs(os.path.dirname(output_file_name), exist_ok=True) - output_files.append(output_file_name) - sf.write(output_file_name, audio, 24000) - print(u'\u2713', 
output_file_name)
-    length = length + 1
-
-for i, output in enumerate(output_files):
-    full_path = os.path.abspath(output)
-    media = vlc.MediaPlayer(f"file://{full_path}")
-    media.play()
-    sleep(0.1)
-    duration=media.get_length() / 1000
-    description = f"\u25B6 {i+1}/{length} ({'{0:0>5.2f}'.format(duration)}s)"
-    for i in tqdm(range(100), desc=description):
-        sleep(duration / 100)
diff --git a/requirements.txt b/requirements.txt
index 1e9120c..630820c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ kokoro
 soundfile
 python-vlc
 tqdm
+argparse
diff --git a/tts.py b/tts.py
new file mode 100644
index 0000000..aeac894
--- /dev/null
+++ b/tts.py
@@ -0,0 +1,79 @@
+import sys
+import os
+from time import sleep
+
+import argparse
+from kokoro import KPipeline
+import soundfile as sf
+import vlc
+from tqdm import tqdm
+
+
+# See voices: https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
+voices = {
+    'pro': 'af_heart',
+    'hot': 'af_bella',
+    'asmr':'af_nicole',
+    'brit': 'bf_emma'
+}
+SAMPLE_RATE = 24000  # Hz; rate at which the generated audio is written
+
+
+def parse_args():
+    """Parse the command-line flags.
+
+    --voice selects the speaker (default "pro"); --input is the path of the
+    text file to synthesize (default: the bundled tongue-twister demo).
+    """
+    parser = argparse.ArgumentParser(description="Simple TTS")
+    parser.add_argument(
+        "--voice",
+        type=str,
+        default="pro",
+        help="Voice alias (pro, hot, asmr, brit) or a raw Kokoro voice id",
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        default="demo/tongue-twister.txt",
+        help="Path of the text file to read aloud",
+    )
+    return parser.parse_args()
+
+
+def main():
+    """Synthesize --input into outputs/<name>-<i>.wav, then play each clip."""
+    args = parse_args()
+    pipeline = KPipeline(lang_code='a', device='xpu', repo_id='hexgrad/Kokoro-82M')
+    # A short alias is looked up in `voices`; any other value is passed through
+    # unchanged as a raw Kokoro voice id instead of raising KeyError.
+    voice = voices.get(args.voice, args.voice)
+
+    # Output files are named after the input file, minus its extension.
+    name = os.path.splitext(os.path.basename(args.input))[0]
+    with open(args.input, "r", encoding="utf-8") as file:
+        text = file.read()
+
+    os.makedirs('outputs', exist_ok=True)
+    clips = []  # (path, duration in seconds) for every generated segment
+    for i, (gs, ps, audio) in enumerate(pipeline(text, voice=voice)):
+        output_file_name = f'outputs/{name}-{i}.wav'
+        sf.write(output_file_name, audio, SAMPLE_RATE)
+        # Frames / rate is the clip's exact length; vlc's get_length() may
+        # still report -1 right after play(), which would break sleep().
+        clips.append((output_file_name, len(audio) / SAMPLE_RATE))
+        print(u'\u2713', output_file_name)
+
+    for i, (output, duration) in enumerate(clips):
+        full_path = os.path.abspath(output)
+        media = vlc.MediaPlayer(f"file://{full_path}")
+        media.play()
+        description = f"\u25B6 {i+1}/{len(clips)} ({duration:0>5.2f}s)"
+        # Playback runs in the background, so wait out the clip while the
+        # progress bar advances.
+        for _ in tqdm(range(100), desc=description):
+            sleep(duration / 100)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file