TTS Catalan Comparison

import numpy as np
import onnxruntime

import utils
from text import text_to_sequence, sequence_to_text
import torch
import gradio as gr
import soundfile as sf
import tempfile
import yaml

def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result


def process_text(i: int, text: str, device: torch.device):
    print(f"[{i}] - Input text: {text}")
    x = torch.tensor(
        intersperse(text_to_sequence(text, ["catalan_cleaners"]), 0),
        dtype=torch.long,
        device=device,
    )[None]
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
    x_phones = sequence_to_text(x.squeeze(0).tolist())
    print(x_phones)
    return x.numpy(), x_lengths.numpy()

MODEL_PATH_MATCHA_MEL="matcha_multispeaker_cat_opset_15.onnx"
MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
MODEL_PATH_VOCOS="mel_spec_22khz.onnx"
CONFIG_PATH="config_22khz.yaml"

sess_options = onnxruntime.SessionOptions()
model_matcha_mel= onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL), sess_options=sess_options, providers=["CPUExecutionProvider"])
model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
model_matcha = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA), sess_options=sess_options, providers=["CPUExecutionProvider"])

def vocos_inference(mel):

    with open(CONFIG_PATH, "r") as f:
        config = yaml.safe_load(f)

    params = config["feature_extractor"]["init_args"]
    sample_rate = params["sample_rate"]
    n_fft= params["n_fft"]
    hop_length= params["hop_length"]
    win_length = n_fft

    # ONNX inference
    mag, x, y = model_vocos.run(
        None,
        {
            "mels": mel
        },
    )

    # complex spectrogram from vocos output
    spectrogram = mag * (x + 1j * y)
    window = torch.hann_window(win_length)

    # Inverse stft
    pad = (win_length - hop_length) // 2
    spectrogram = torch.tensor(spectrogram)
    B, N, T = spectrogram.shape

    print("Spectrogram synthesized shape", spectrogram.shape)
    # Inverse FFT
    ifft = torch.fft.irfft(spectrogram, n_fft, dim=1, norm="backward")
    ifft = ifft * window[None, :, None]

    # Overlap and Add
    output_size = (T - 1) * hop_length + win_length
    y = torch.nn.functional.fold(
        ifft, output_size=(1, output_size), kernel_size=(1, win_length), stride=(1, hop_length),
    )[:, 0, 0, pad:-pad]

    # Window envelope
    window_sq = window.square().expand(1, T, -1).transpose(1, 2)
    window_envelope = torch.nn.functional.fold(
        window_sq, output_size=(1, output_size), kernel_size=(1, win_length), stride=(1, hop_length),
    ).squeeze()[pad:-pad]

    # Normalize
    assert (window_envelope > 1e-11).all()
    y = y / window_envelope
    
    return y

def tts(text:str, spk_id:int):
    sid = np.array([int(spk_id)]) if spk_id is not None else None
    text_matcha , text_lengths = process_text(0,text,"cpu") 

    # MATCHA VOCOS
    inputs = {
        "x": text_matcha,
        "x_lengths": text_lengths,
        "scales": np.array([0.667, 1.0], dtype=np.float32),
        "spks": sid
    }

    mel, mel_lengths = model_matcha_mel.run(None, inputs)
    # vocos inference
    wavs_vocos = vocos_inference(mel)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha_vocos:
        sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")

    #MATCHA HIFIGAN

    inputs = {
        "x": text_matcha,
        "x_lengths": text_lengths,
        "scales": np.array([0.667, 1.0], dtype=np.float32),
        "spks": sid
    }
    wavs, wav_lengths = model_matcha.run(None, inputs)
    
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha:
        sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")

    return fp_matcha_vocos.name, fp_matcha.name

## GUI space

title = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
    <div
        style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
    > <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
        TTS Catalan Comparison
    </h1> </div>
</div>
 """

description = """
VITS2 is an end-to-end speech synthesis model that predicts a speech waveform conditional on an input text sequence. VITS2 improved the
training and inference efficiency and naturalness by introducing adversarial learning into the duration predictor. The transformer
block was added to the normalizing flows to capture the long-term dependency when transforming the distribution.
The synthesis quality was improved by incorporating Gaussian noise into the alignment search. 

🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis

Models are being trained in openslr69 and festcat datasets
"""

article = "Training and demo by BSC."

vits2_inference = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            value="m'ha costat desenvolupar molt una veu, i ara que la tinc no estaré en silenci.",
            max_lines=1,
            label="Input text",
        ),
        gr.Slider(
            1,
            47,
            value=10,
            step=1,
            label="Speaker id",
            info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
        ),
    ],
    outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
             gr.Audio(label="Matcha", interactive=False, type="filepath")]
)

demo = gr.Blocks()

with demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.TabbedInterface([vits2_inference], ["Multispeaker"])
    gr.Markdown(article)

demo.queue(max_size=10)
demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)