feat: implement --voice & --input

2025-09-03 22:41:40 +02:00 · 2025-09-03 22:41:40 +02:00 · b212a108df
commit b212a108df
parent da6faa61fe
4 changed files with 82 additions and 51 deletions
--- a/README.md
+++ b/README.md
@ -51,8 +51,8 @@ $ . env.sh

 ## Usage

-To run the program it needs an input file. For example, using `input.txt`
+To run the program, pass the text file to read with the `--input` flag. Optionally, choose a [voice](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md) with `--voice`.

 ```bash
-$ python main.py input.txt
+$ python tts.py --input demo/tongue-twister.txt --voice asmr
 ```
--- a/main.py
+++ b/main.py
@ -1,49 +0,0 @@
-import sys
-import os
-from time import sleep
-
-from kokoro import KPipeline
-import soundfile as sf
-import vlc
-from tqdm import tqdm
-
-
-# See voices: https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
-voices = {
-    'pro': 'af_heart',
-    'hot': 'af_bella',
-    'asmr':'af_nicole',
-    'brit': 'bf_emma'
-}
-pipeline = KPipeline(lang_code='a', device='xpu', repo_id='hexgrad/Kokoro-82M')
-
-# filename argument
-file_path = sys.argv[1]
-directory, file_name = os.path.split(file_path)
-
-name = '.'.join(file_name.split('.')[:-1])
-
-file = open(file_path, "r")
-text = file.read()
-generator = pipeline(text, voice=voices['pro'])
-
-output_files = []
-length = 0
-
-for i, (gs, ps, audio) in enumerate(generator):
-    output_file_name=f'outputs/{name}-{i}.wav'
-    os.makedirs(os.path.dirname(output_file_name), exist_ok=True)
-    output_files.append(output_file_name)
-    sf.write(output_file_name, audio, 24000)
-    print(u'\u2713', output_file_name)
-    length = length + 1
-
-for i, output in enumerate(output_files):
-    full_path = os.path.abspath(output)
-    media = vlc.MediaPlayer(f"file://{full_path}")
-    media.play()
-    sleep(0.1)
-    duration=media.get_length() / 1000
-    description = f"\u25B6 {i+1}/{length} ({'{0:0>5.2f}'.format(duration)}s)"
-    for i in tqdm(range(100), desc=description):
-        sleep(duration / 100)
--- a/requirements.txt
+++ b/requirements.txt
@ -2,3 +2,4 @@ kokoro
 soundfile
 python-vlc
 tqdm
+argparse
--- a/tts.py
+++ b/tts.py
@ -0,0 +1,79 @@
+import sys
+import os
+from time import sleep
+
+import argparse
+from kokoro import KPipeline
+import soundfile as sf
+import vlc
+from tqdm import tqdm
+
+
+# Short aliases accepted by --voice, mapped to the voice ids handed to the
+# Kokoro pipeline.
+# See voices: https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
+voices = {
+    'pro': 'af_heart',
+    'hot': 'af_bella',
+    'asmr':'af_nicole',
+    'brit': 'bf_emma'
+}
+
+
+def parse_args():
+    """Parse the command-line options of the TTS script.
+
+    Returns:
+        argparse.Namespace with two string attributes: ``voice``
+        (default ``"pro"``) and ``input`` (default
+        ``"demo/tongue-twister.txt"``).
+    """
+    parser = argparse.ArgumentParser(description="Simple TTS")
+    parser.add_argument(
+        "--voice",
+        type=str,
+        default="pro",
+        help="Voice to use (pro, hot, asmr, brit)",
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        default="demo/tongue-twister.txt",
+        help="Path to the text file to read aloud (default: %(default)s)",
+    )
+    return parser.parse_args()
+
+def main():
+    """Synthesize the ``--input`` text file to WAV chunks, then play them.
+
+    Each chunk yielded by the Kokoro pipeline is written to
+    ``outputs/<input-name>-<index>.wav``; afterwards the chunks are played
+    in order through VLC with one progress bar per chunk.
+
+    Raises:
+        OSError: if the input file cannot be read or the output directory
+            cannot be created.
+    """
+    # Local import: only needed to build a well-formed file:// URI below.
+    from pathlib import Path
+
+    args = parse_args()
+    pipeline = KPipeline(lang_code='a', device='xpu', repo_id='hexgrad/Kokoro-82M')
+
+    # Map a short alias (pro, hot, asmr, brit) to its voice id. Anything that
+    # is not an alias is passed through unchanged so a full voice id can be
+    # given directly; a plain voices[...] lookup would raise KeyError instead.
+    requested = args.voice or 'pro'
+    voice = voices.get(requested, requested)
+
+    # Base name of the input without its final extension; unlike splitting on
+    # '.', this keeps the name of extension-less files instead of yielding ''.
+    file_path = args.input
+    name = os.path.splitext(os.path.basename(file_path))[0]
+
+    # Context manager so the handle is closed even if reading fails.
+    with open(file_path, "r", encoding="utf-8") as file:
+        text = file.read()
+    generator = pipeline(text, voice=voice)
+
+    # Create the output directory once instead of on every chunk.
+    output_dir = 'outputs'
+    os.makedirs(output_dir, exist_ok=True)
+
+    output_files = []
+    for i, (gs, ps, audio) in enumerate(generator):
+        output_file_name = f'{output_dir}/{name}-{i}.wav'
+        sf.write(output_file_name, audio, 24000)
+        output_files.append(output_file_name)
+        print(u'\u2713', output_file_name)
+
+    length = len(output_files)
+    for i, output in enumerate(output_files):
+        # as_uri() percent-encodes spaces and other special characters that a
+        # hand-built "file://" string would leave raw.
+        media = vlc.MediaPlayer(Path(os.path.abspath(output)).as_uri())
+        media.play()
+        # Give VLC a moment to open the file before asking for its length.
+        sleep(0.1)
+        # get_length() is in milliseconds and reports -1 when the length is
+        # not available; clamp so sleep() never receives a negative value.
+        duration = max(media.get_length(), 0) / 1000
+        description = f"\u25B6 {i+1}/{length} ({duration:0>5.2f}s)"
+        # Advance the bar in 100 even steps over the clip's duration.
+        for _ in tqdm(range(100), desc=description):
+            sleep(duration / 100)
+
+if __name__ == "__main__":
+    main()