import numpy as np
import onnxruntime
import utils
from text import text_to_sequence, sequence_to_text
import torch
import gradio as gr
import soundfile as sf
import tempfile
import yaml


def intersperse(lst, item):
    # Insert `item` between every element of `lst` (and at both ends),
    # e.g. [a, b] -> [item, a, item, b, item].
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result


def process_text(i: int, text: str, device: torch.device):
    # Convert raw text to an interspersed phoneme-id sequence and its length,
    # returned as numpy arrays ready for ONNX Runtime.
    print(f"[{i}] - Input text: {text}")
    x = torch.tensor(
        intersperse(text_to_sequence(text, ["catalan_cleaners"]), 0),
        dtype=torch.long,
        device=device,
    )[None]
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
    x_phones = sequence_to_text(x.squeeze(0).tolist())
    print(x_phones)
    return x.numpy(), x_lengths.numpy()


MODEL_PATH_MATCHA_MEL = "matcha_multispeaker_cat_opset_15.onnx"
MODEL_PATH_MATCHA = "matcha_hifigan_multispeaker_cat.onnx"
MODEL_PATH_VOCOS = "mel_spec_22khz.onnx"
CONFIG_PATH = "config_22khz.yaml"

sess_options = onnxruntime.SessionOptions()
# Acoustic model (text -> mel), Vocos vocoder (mel -> spectral output) and the
# end-to-end Matcha + HiFi-GAN model (text -> waveform), all run on CPU.
model_matcha_mel = onnxruntime.InferenceSession(
    str(MODEL_PATH_MATCHA_MEL),
    sess_options=sess_options,
    providers=["CPUExecutionProvider"],
)
model_vocos = onnxruntime.InferenceSession(
    str(MODEL_PATH_VOCOS),
    sess_options=sess_options,
    providers=["CPUExecutionProvider"],
)
model_matcha = onnxruntime.InferenceSession(
    str(MODEL_PATH_MATCHA),
    sess_options=sess_options,
    providers=["CPUExecutionProvider"],
)


def vocos_inference(mel):
    with open(CONFIG_PATH, "r") as f:
        config = yaml.safe_load(f)
    params = config["feature_extractor"]["init_args"]
    sample_rate = params["sample_rate"]
    n_fft = params["n_fft"]
    hop_length = params["hop_length"]
    win_length = n_fft

    # ONNX inference
    mag, x, y = model_vocos.run(
        None,
        {"mels": mel},
    )

    # Complex spectrogram from Vocos output (magnitude plus phase components)
    spectrogram = mag * (x + 1j * y)
    window = torch.hann_window(win_length)

    # Inverse STFT
    pad = (win_length - hop_length) // 2
    spectrogram = torch.tensor(spectrogram)
    B, N, T = spectrogram.shape
    print("Spectrogram synthesized shape", spectrogram.shape)

    # Inverse FFT
    ifft = torch.fft.irfft(spectrogram, n_fft, dim=1, norm="backward")
    ifft = ifft * window[None, :, None]

    # Overlap and add
    output_size = (T - 1) * hop_length + win_length
    y = torch.nn.functional.fold(
        ifft,
        output_size=(1, output_size),
        kernel_size=(1, win_length),
        stride=(1, hop_length),
    )[:, 0, 0, pad:-pad]

    # Window envelope
    window_sq = window.square().expand(1, T, -1).transpose(1, 2)
    window_envelope = torch.nn.functional.fold(
        window_sq,
        output_size=(1, output_size),
        kernel_size=(1, win_length),
        stride=(1, hop_length),
    ).squeeze()[pad:-pad]

    # Normalize
    assert (window_envelope > 1e-11).all()
    y = y / window_envelope

    return y


def tts(text: str, spk_id: int):
    sid = np.array([int(spk_id)]) if spk_id is not None else None
    text_matcha, text_lengths = process_text(0, text, "cpu")

    # Matcha + Vocos: the acoustic model produces a mel spectrogram,
    # then Vocos converts it to a waveform.
    inputs = {
        "x": text_matcha,
        "x_lengths": text_lengths,
        "scales": np.array([0.667, 1.0], dtype=np.float32),
        "spks": sid,
    }
    mel, mel_lengths = model_matcha_mel.run(None, inputs)

    # Vocos inference
    wavs_vocos = vocos_inference(mel)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha_vocos:
        sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")

    # Matcha + HiFi-GAN: the end-to-end model outputs the waveform directly.
    inputs = {
        "x": text_matcha,
        "x_lengths": text_lengths,
        "scales": np.array([0.667, 1.0], dtype=np.float32),
        "spks": sid,
    }
    wavs, wav_lengths = model_matcha.run(None, inputs)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha:
        sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")

    return fp_matcha_vocos.name, fp_matcha.name


## GUI space

title = """

# TTS Vocoder Comparison

""" description = """ 🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis For vocoders we use Hifigan universal version and Vocos trained in a catalan set of ~28 hours. Matcha was trained using openslr69 and festcat datasets """ article = "Training and demo by BSC." vits2_inference = gr.Interface( fn=tts, inputs=[ gr.Textbox( value="m'ha costat desenvolupar molt una veu, i ara que la tinc no estaré en silenci.", max_lines=1, label="Input text", ), gr.Slider( 1, 47, value=10, step=1, label="Speaker id", info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.", ), ], outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"), gr.Audio(label="Matcha", interactive=False, type="filepath")] ) demo = gr.Blocks() with demo: gr.Markdown(title) gr.Markdown(description) gr.TabbedInterface([vits2_inference], ["Multispeaker"]) gr.Markdown(article) demo.queue(max_size=10) demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)