"""Simple command-line text-to-speech front end for the Kokoro pipeline.

Reads text from a positional argument, the clipboard, or a file, synthesizes
it into one WAV file per chunk under ``outputs/<name>/``, and optionally plays
the chunks back through VLC with a progress bar.
"""

import os
from pathlib import Path
from time import sleep, time
import warnings
import importlib
import torch
import argparse
from kokoro import KPipeline
import soundfile as sf
from tqdm import tqdm
import pyperclip
from yaspin import yaspin

# See voices: https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
voices = {
    'pro': 'af_heart',
    'hot': 'af_bella',
    'asmr': 'af_nicole',
    'brit': 'bf_emma',
}

# Sample rate (Hz) passed to soundfile when writing the generated audio.
SAMPLE_RATE = 24000

# Split patterns:
# - only multiple consecutive new lines (to handle wrapped statements)
# - statements ending in punctuation (:.?!;) followed by a new line
# - list items starting with '-' or '*'
# - numbered items starting with digits followed by a dot '.'
# The list-item alternatives use a non-capturing group; a character class
# here would also split on '(', ')', '+', '.' and bare digits.
SPLIT_PATTERN = r'\n{2,}|[:.?!;]\n+|\n(?:[*\-]|\d+\.)'


def _default_device():
    """Return the first available torch device name: cuda, mps, xpu, or cpu."""
    if torch.cuda.is_available():
        return "cuda"
    # Not every torch build exposes the mps/xpu backends; probe defensively.
    mps = getattr(torch.backends, "mps", None)
    if mps is not None and mps.is_available():
        return "mps"
    xpu = getattr(torch, "xpu", None)
    if xpu is not None and xpu.is_available():
        return "xpu"
    return "cpu"


def parse_args(argv=None):
    """Parse command-line arguments.

    Args:
        argv: Optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(description="Simple TTS", allow_abbrev=False)
    parser.add_argument(
        "input_text",
        type=str,
        nargs='?',
        default="",
        help="Text to read",
    )
    parser.add_argument(
        "--title",
        "-t",
        required=False,
        type=str,
        help="Title to use as label to the generated outputs",
    )
    parser.add_argument(
        "--voice",
        "-v",
        required=False,
        type=str,
        default="pro",
        help="Voice to use (pro, hot, asmr, brit)",
    )
    parser.add_argument(
        "--input_file",
        "-i",
        required=False,
        type=str,
        default="demo/tongue-twister.txt",
        help="Path to the input text file",
    )
    parser.add_argument(
        "--verbose",
        default=False,
        action="store_true",
        help="Show verbose reports",
    )
    parser.add_argument(
        "--clipboard",
        "-c",
        required=False,
        action="store_true",
        help="Use text from the clipboard (i.e., copied text)",
    )
    parser.add_argument(
        "--device",
        "-d",
        required=False,
        type=str,
        default=_default_device(),
        help="Device for inference: cuda | mps | xpu | cpu",
    )
    parser.add_argument(
        "--skip_play",
        "-s",
        required=False,
        action="store_true",
        help="Prevent playing the generated audio",
    )
    return parser.parse_args(argv)


def generate_audio(generator, name, voice):
    """Write every audio chunk produced by ``generator`` to a WAV file.

    Args:
        generator: Iterable yielding 3-tuples whose last element is the audio
            data handed to ``soundfile.write``.
        name: Label used for the output directory and the file name prefix.
        voice: Voice identifier embedded in each file name.

    Returns:
        List of written file paths, in generation order (empty when the
        generator yields nothing).
    """
    output_dir = f'outputs/{name}'
    output_files = []
    with yaspin():
        for index, (_graphemes, _phonemes, audio) in enumerate(generator):
            # Created lazily so an empty generator leaves no empty directory.
            os.makedirs(output_dir, exist_ok=True)
            output_file_name = f'{output_dir}/{name}-{voice}-{index}.wav'
            sf.write(output_file_name, audio, SAMPLE_RATE)
            output_files.append(output_file_name)
    return output_files


def _media_duration_seconds(player, path, timeout=1.0, poll_interval=0.01):
    """Return the playing media's duration in seconds.

    VLC reports a non-positive length until it has parsed the media, so poll
    for up to ``timeout`` seconds and fall back to the duration recorded in
    the audio file itself.
    """
    deadline = time() + timeout
    while True:
        length_ms = player.get_length()
        if length_ms > 0:
            return length_ms / 1000
        if time() >= deadline:
            break
        sleep(poll_interval)
    return sf.info(path).duration


def play_audio(output_files):
    """Play each file in ``output_files`` in order, showing a progress bar.

    The ``vlc`` module is imported lazily so it is only required when
    playback is actually requested.

    Args:
        output_files: Paths of the audio files to play.
    """
    vlc_module = importlib.import_module("vlc")
    total = len(output_files)
    for index, output in enumerate(output_files):
        full_path = os.path.abspath(output)
        # as_uri() yields a well-formed, percent-encoded file URI on every OS.
        player = vlc_module.MediaPlayer(Path(full_path).as_uri())
        player.play()
        duration = _media_duration_seconds(player, full_path)
        chunk = f"{index + 1}/{total} " if total > 1 else ""
        description = f"\u25B6 {chunk}"
        progress = tqdm(
            range(100),
            desc=description,
            bar_format='{l_bar} {elapsed} {bar} {remaining}',
            colour='yellow',
        )
        # Advance the bar in 100 equal steps spanning the clip's duration.
        for _ in progress:
            sleep(duration / 100)


def _resolve_voice(requested):
    """Map a voice alias to its Kokoro voice id; pass unknown names through."""
    if requested is None:
        return voices['pro']
    return voices.get(requested, requested)


def _load_text(args):
    """Return ``(text, name)`` for the input source selected by ``args``.

    Positional text wins, then the clipboard, then the input file.
    """
    if args.input_text != "":
        return args.input_text, "chat"
    if args.clipboard:
        # use copied text
        return pyperclip.paste(), 'copied'
    file_name = os.path.basename(args.input_file)
    # splitext keeps extension-less names such as "README" intact.
    name = os.path.splitext(file_name)[0]
    # NOTE(review): input files are assumed to be UTF-8 encoded.
    with open(args.input_file, "r", encoding="utf-8") as file:
        return file.read(), name


def _safe_name(name):
    """Replace spaces and path separators so ``name`` is a safe file label."""
    for unsafe in (" ", "\\", "/"):
        name = name.replace(unsafe, "_")
    return name


def main():
    """Run the TTS command-line workflow: parse, synthesize, then play."""
    args = parse_args()

    if not args.verbose:
        # Disable all warnings
        warnings.filterwarnings("ignore")

    pipeline = KPipeline(lang_code='a', device=args.device, repo_id='hexgrad/Kokoro-82M')

    voice = _resolve_voice(args.voice)
    text, name = _load_text(args)
    if args.title:
        name = args.title
    # make safe for filenames
    name = _safe_name(name)

    generator = pipeline(text, voice=voice, split_pattern=SPLIT_PATTERN)

    if args.verbose:
        print(f'[TTS] Using device: "{args.device}", voice: "{voice}", output label: "{name}"')
        if args.clipboard:
            print('[TTS] Using copied text as input.')

    start_time = time()
    output_files = generate_audio(generator, name, voice)
    generation_time = time() - start_time

    if not output_files:
        # Nothing was synthesized (e.g. empty input); there is nothing to play.
        print("[TTS] No audio was generated from the given input.")
        return

    directory = os.path.dirname(output_files[0])
    if args.verbose:
        print(f"[TTS] {len(output_files)} chunks generated in {generation_time:.2f} seconds")
        print(f"[TTS] Output files are in: {directory}/*")

    if args.skip_play:
        print(f"[TTS] Audio player disabled: {directory}/*")
        return

    try:
        play_audio(output_files)
    except Exception as exc:
        # Playback is best-effort: the files are already on disk.
        print(f"[TTS] Something went wrong when trying to play the audio. Play the output files manually: {directory}/*")
        if args.verbose:
            print(f"[TTS] Playback error: {exc!r}")


if __name__ == "__main__":
    main()