feat: implement --voice & --input

2025-09-03 22:41:40 +02:00 · 2025-09-03 22:41:40 +02:00 · b212a108df
commit b212a108df
parent da6faa61fe
4 changed files with 82 additions and 51 deletions
--- a/README.md
+++ b/README.md
@ -51,8 +51,8 @@ $ . env.sh
 ## Usage
-To run the program it needs an input file. For example, using `input.txt`
+To run the program, pass an input file with the `--input` flag. Optionally, choose a [voice](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md) with `--voice`.
 ```bash
-$ python main.py input.txt
+$ python tts.py --input demo/tongue-twister.txt --voice asmr
 ```
--- a/main.py
+++ b/main.py
@ -1,49 +0,0 @@
 import sys
 import os
 from time import sleep
 from kokoro import KPipeline
 import soundfile as sf
 import vlc
 from tqdm import tqdm
 # Short aliases mapped to the voice ids handed to the Kokoro pipeline.
 # See voices: https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
 voices = {
     'pro': 'af_heart',
     'hot': 'af_bella',
     'asmr': 'af_nicole',
     'brit': 'bf_emma',
 }
 pipeline = KPipeline(lang_code='a', device='xpu', repo_id='hexgrad/Kokoro-82M')
 # The first command-line argument is the path of the text file to read.
 file_path = sys.argv[1]
 # Base name without its final extension; used to name the output chunks.
 name = os.path.splitext(os.path.basename(file_path))[0]
 # Context manager so the input handle is closed as soon as it has been read.
 with open(file_path, "r") as file:
     text = file.read()
 generator = pipeline(text, voice=voices['pro'])
 output_dir = 'outputs'
 os.makedirs(output_dir, exist_ok=True)
 output_files = []
 # Each item yielded by the pipeline is a 3-tuple; only the audio is used here.
 for i, (_, _, audio) in enumerate(generator):
     output_file_name = f'{output_dir}/{name}-{i}.wav'
     # 24000 is passed as soundfile's samplerate argument.
     sf.write(output_file_name, audio, 24000)
     output_files.append(output_file_name)
     print(u'\u2713', output_file_name)
 length = len(output_files)
 for i, output in enumerate(output_files):
     full_path = os.path.abspath(output)
     media = vlc.MediaPlayer(f"file://{full_path}")
     media.play()
     # Presumably gives VLC time to open the file before its length is queried.
     sleep(0.1)
     # get_length() is in milliseconds and negative when unknown; clamp it so
     # sleep() below never receives a negative value.
     duration = max(media.get_length(), 0) / 1000
     description = f"\u25B6 {i+1}/{length} ({duration:0>5.2f}s)"
     # Block for roughly the clip duration while drawing a progress bar.
     for _ in tqdm(range(100), desc=description):
         sleep(duration / 100)
--- a/requirements.txt
+++ b/requirements.txt
@ -2,3 +2,4 @@ kokoro
 soundfile
 python-vlc
 tqdm
 argparse
--- a/tts.py
+++ b/tts.py
@ -0,0 +1,79 @@
 import sys
 import os
 from time import sleep
 import argparse
 from kokoro import KPipeline
 import soundfile as sf
 import vlc
 from tqdm import tqdm
 # Short aliases accepted by --voice, mapped to the voice ids given to the pipeline.
 # Full voice list: https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
 voices = {
     'pro': 'af_heart',
     'hot': 'af_bella',
     'asmr': 'af_nicole',
     'brit': 'bf_emma',
 }
 def parse_args():
     """Parse the --voice and --input command-line options and return the namespace."""
     parser = argparse.ArgumentParser(description="Simple TTS")
     parser.add_argument(
         "--voice",
         type=str,
         default="pro",
         help="Voice to use (pro, hot, asmr, brit)",
     )
     parser.add_argument(
         "--input",
         type=str,
         default="demo/tongue-twister.txt",
         # This help text used to be a copy of the --voice description.
         help="Path to the text file to read aloud",
     )
     return parser.parse_args()
 def main():
     """Synthesize the --input text file into WAV chunks under outputs/ and play them in order."""
     args = parse_args()
     pipeline = KPipeline(lang_code='a', device='xpu', repo_id='hexgrad/Kokoro-82M')
     # Resolve a short alias (pro, hot, ...) to its voice id. Any other value is
     # passed through unchanged so a raw voice name can be given as well.
     # Indexing voices[...] directly raised KeyError before that fallback could run.
     if args.voice is None:
         voice = voices['pro']
     else:
         voice = voices.get(args.voice, args.voice)
     file_path = args.input
     # Base name without its final extension; used to name the output chunks.
     name = os.path.splitext(os.path.basename(file_path))[0]
     # Context manager so the input handle is closed as soon as it has been read.
     with open(file_path, "r") as file:
         text = file.read()
     generator = pipeline(text, voice=voice)
     output_dir = 'outputs'
     os.makedirs(output_dir, exist_ok=True)
     output_files = []
     # Each item yielded by the pipeline is a 3-tuple; only the audio is used here.
     for i, (_, _, audio) in enumerate(generator):
         output_file_name = f'{output_dir}/{name}-{i}.wav'
         # 24000 is passed as soundfile's samplerate argument.
         sf.write(output_file_name, audio, 24000)
         output_files.append(output_file_name)
         print(u'\u2713', output_file_name)
     length = len(output_files)
     for i, output in enumerate(output_files):
         full_path = os.path.abspath(output)
         media = vlc.MediaPlayer(f"file://{full_path}")
         media.play()
         # Presumably gives VLC time to open the file before its length is queried.
         sleep(0.1)
         # get_length() is in milliseconds and negative when unknown; clamp it so
         # sleep() below never receives a negative value.
         duration = max(media.get_length(), 0) / 1000
         description = f"\u25B6 {i+1}/{length} ({duration:0>5.2f}s)"
         # Block for roughly the clip duration while drawing a progress bar.
         for _ in tqdm(range(100), desc=description):
             sleep(duration / 100)
 # Run the CLI only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
     main()