diff --git a/tts.py b/tts.py
index e307e73..b557675 100644
--- a/tts.py
+++ b/tts.py
@@ -1,4 +1,4 @@
-import sys
 import os
+from pathlib import Path
 from time import sleep, time
 import warnings
@@ -53,6 +53,49 @@ def parse_args():
     )
     return parser.parse_args()
 
+def generate_audio(generator, name, voice):
+    """Write each audio chunk produced by ``generator`` to a WAV file.
+
+    Files are named ``outputs/{name}-{voice}-{i}.wav``, where ``i`` is the
+    chunk index. Returns the list of written paths in generation order.
+    """
+    start_time = time()
+    output_files = []
+    print("Generating...")
+    for i, (_gs, _ps, audio) in enumerate(generator):
+        output_file_name = f'outputs/{name}-{voice}-{i}.wav'
+        os.makedirs(os.path.dirname(output_file_name), exist_ok=True)
+        output_files.append(output_file_name)
+        sf.write(output_file_name, audio, 24000)
+    generation_time = time() - start_time
+    print(f"Generated {len(output_files)} chunks in {generation_time:.2f} seconds")
+    return output_files
+
+def play_audio(output_files):
+    """Play each file in ``output_files`` in order, showing a progress bar.
+
+    Blocks for the reported length of every file before starting the next.
+    """
+    length = len(output_files)
+    for i, output in enumerate(output_files):
+        # as_uri() percent-encodes spaces and other reserved characters that
+        # a hand-built "file://" string would pass through unescaped.
+        player = vlc.MediaPlayer(Path(os.path.abspath(output)).as_uri())
+        player.play()
+        # get_length() is not positive until VLC has opened the media, so
+        # poll for up to 2 seconds instead of trusting one fixed delay.
+        for _ in range(20):
+            sleep(0.1)
+            length_ms = player.get_length()
+            if length_ms > 0:
+                break
+        # Clamp: a negative length would make sleep() raise ValueError.
+        duration = max(length_ms, 0) / 1000
+        chunk = f"{i+1}/{length} " if length > 1 else ""
+        description = f"\u25B6 {chunk}({duration:0>5.2f}s)"
+        for _ in tqdm(range(100), desc=description):
+            sleep(duration / 100)
+
 def main():
     args=parse_args()
     pipeline = KPipeline(lang_code='a', device=args.device, repo_id='hexgrad/Kokoro-82M')
@@ -73,29 +116,8 @@ def main():
         text = args.input_text
 
     generator = pipeline(text, voice=voice)
-    output_files = []
-    length = 0
-
-    start_time = time()
-    print("Generating...")
-    for i, (gs, ps, audio) in enumerate(generator):
-        output_file_name=f'outputs/{name}-{voice}-{i}.wav'
-        os.makedirs(os.path.dirname(output_file_name), exist_ok=True)
-        output_files.append(output_file_name)
-        sf.write(output_file_name, audio, 24000)
-        length = length + 1
-    generation_time = time() - start_time
-    print(f"Done in {generation_time:.2f} seconds")
-
-    for i, output in enumerate(output_files):
-        full_path = os.path.abspath(output)
-        media = vlc.MediaPlayer(f"file://{full_path}")
-        media.play()
-        sleep(0.1)
-        duration=media.get_length() / 1000
-        description = f"\u25B6 {i+1}/{length} ({'{0:0>5.2f}'.format(duration)}s)"
-        for i in tqdm(range(100), desc=description):
-            sleep(duration / 100)
+    output_files = generate_audio(generator, name, voice)
+    play_audio(output_files)
 
 if __name__ == "__main__":
     main()