File size: 6,045 Bytes
92df4f5
 
 
 
 
 
 
 
 
 
 
158b03a
 
92df4f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158b03a
92df4f5
 
35cd28c
92df4f5
 
 
 
 
 
bbb1375
d3127d4
92df4f5
 
 
 
 
 
 
 
 
 
 
 
 
 
d3127d4
92df4f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bbb1375
92df4f5
 
 
 
 
 
 
 
 
 
 
158b03a
 
92df4f5
158b03a
 
 
 
92df4f5
35cd28c
158b03a
 
92df4f5
 
 
 
 
 
 
 
 
 
 
 
158b03a
 
92df4f5
158b03a
 
 
92df4f5
 
 
 
 
 
 
 
 
 
 
 
bbb1375
92df4f5
 
 
 
 
bbb1375
92df4f5
 
bbb1375
 
 
92df4f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158b03a
92df4f5
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import numpy as np
import onnxruntime

import utils
from text import text_to_sequence, sequence_to_text
import torch
import gradio as gr
import soundfile as sf
import tempfile
import yaml

from time import perf_counter

def intersperse(lst, item):
    """Return a new list with *item* before, between and after the elements of *lst*.

    For an input of length ``n`` the result has length ``2 * n + 1``; an empty
    input yields ``[item]``.
    """
    padded = [item]
    for element in lst:
        # Each original element is followed by one separator.
        padded.extend((element, item))
    return padded


def process_text(i: int, text: str, device: torch.device):
    """Turn raw text into the token-id arrays fed to the ONNX acoustic model.

    The text is converted with ``text_to_sequence`` using the
    ``catalan_cleaners`` cleaner, the id ``0`` is interspersed around every
    token, and the decoded symbol string is printed for debugging.

    Args:
        i: Index used only to tag the log line.
        text: Input text to synthesise.
        device: Device on which the intermediate torch tensors are created.

    Returns:
        A tuple ``(x, x_lengths)`` of NumPy arrays: ``x`` holds the token ids
        with a leading batch axis of size 1, ``x_lengths`` holds the single
        sequence length.
    """
    print(f"[{i}] - Input text: {text}")
    token_ids = intersperse(text_to_sequence(text, ["catalan_cleaners"]), 0)
    x = torch.tensor(token_ids, dtype=torch.long, device=device)[None]
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
    x_phones = sequence_to_text(x.squeeze(0).tolist())
    print(x_phones)
    # Tensor.numpy() only works on CPU tensors; move to host first so that a
    # non-CPU `device` does not raise. `.cpu()` is a no-op for CPU tensors.
    return x.cpu().numpy(), x_lengths.cpu().numpy()

# Model/config files, given as paths relative to the working directory.
MODEL_PATH_MATCHA_MEL = "matcha_multispeaker_cat_opset_15_10_steps.onnx"
MODEL_PATH_MATCHA = "matcha_hifigan_multispeaker_cat.onnx"
MODEL_PATH_VOCOS = "mel_spec_22khz.onnx"
CONFIG_PATH = "config_22khz.yaml"

# All three ONNX sessions share one options object and run on the CPU provider.
sess_options = onnxruntime.SessionOptions()
model_matcha_mel = onnxruntime.InferenceSession(
    str(MODEL_PATH_MATCHA_MEL),
    sess_options=sess_options,
    providers=["CPUExecutionProvider"],
)
model_vocos = onnxruntime.InferenceSession(
    str(MODEL_PATH_VOCOS),
    sess_options=sess_options,
    providers=["CPUExecutionProvider"],
)
model_matcha = onnxruntime.InferenceSession(
    str(MODEL_PATH_MATCHA),
    sess_options=sess_options,
    providers=["CPUExecutionProvider"],
)


def vocos_inference(mel):
    """Run the Vocos ONNX model on a mel spectrogram and rebuild the waveform.

    The ONNX session returns a magnitude and the real/imaginary parts of the
    phase; the inverse STFT (inverse FFT, windowing, overlap-add and
    window-envelope normalisation) is computed here with torch using the
    ``n_fft`` and ``hop_length`` read from the YAML config at ``CONFIG_PATH``.

    Args:
        mel: Array passed as the ``"mels"`` input of the Vocos session
            (presumably ``(batch, n_mels, frames)`` — confirm against the
            exported model).

    Returns:
        A torch tensor of shape ``(batch, samples)`` with the synthesised audio.

    Raises:
        RuntimeError: If the overlap-added squared window is (near) zero at any
            retained sample, which would make the normalisation unstable.
    """
    with open(CONFIG_PATH, "r") as f:
        config = yaml.safe_load(f)

    params = config["feature_extractor"]["init_args"]
    n_fft = params["n_fft"]
    hop_length = params["hop_length"]
    win_length = n_fft

    # ONNX inference: magnitude plus real and imaginary phase components.
    mag, real, imag = model_vocos.run(None, {"mels": mel})

    # Complex spectrogram from the Vocos output.
    spectrogram = torch.tensor(mag * (real + 1j * imag))
    window = torch.hann_window(win_length)
    _, _, num_frames = spectrogram.shape

    print("Spectrogram synthesized shape", spectrogram.shape)

    # Inverse FFT of every frame, then apply the synthesis window.
    frames = torch.fft.irfft(spectrogram, n_fft, dim=1, norm="backward")
    frames = frames * window[None, :, None]

    # Overlap-add geometry. The trim uses an explicit end index because a
    # `pad:-pad` slice would be empty when pad == 0 (hop_length == win_length).
    output_size = (num_frames - 1) * hop_length + win_length
    pad = (win_length - hop_length) // 2
    end = output_size - pad

    # Overlap and add.
    audio = torch.nn.functional.fold(
        frames,
        output_size=(1, output_size),
        kernel_size=(1, win_length),
        stride=(1, hop_length),
    )[:, 0, 0, pad:end]

    # Window envelope: overlap-add of the squared window over all frames.
    window_sq = window.square().expand(1, num_frames, -1).transpose(1, 2)
    window_envelope = torch.nn.functional.fold(
        window_sq,
        output_size=(1, output_size),
        kernel_size=(1, win_length),
        stride=(1, hop_length),
    ).squeeze()[pad:end]

    # Normalize. An explicit check is used instead of `assert`, which is
    # stripped when Python runs with -O.
    if not (window_envelope > 1e-11).all():
        raise RuntimeError(
            "window envelope is too close to zero; cannot normalise the overlap-add output"
        )
    return audio / window_envelope


def _write_temp_wav(audio, sample_rate=22050):
    """Write *audio* to a new temporary 24-bit PCM WAV file and return its path."""
    # Reserve a unique path and close the handle before soundfile opens the
    # path itself: reopening a still-open NamedTemporaryFile by name is not
    # portable across platforms. The file is kept (delete=False) so the caller
    # can serve it afterwards.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        wav_path = fp.name
    sf.write(wav_path, audio, sample_rate, "PCM_24")
    return wav_path


def tts(text:str, spk_id:int):
    """Synthesise *text* with both vocoder pipelines and return the two WAV paths.

    The same model inputs are run through (1) the Matcha mel model followed by
    Vocos and (2) the combined Matcha + HiFi-GAN model; inference times are
    printed for each stage.

    Args:
        text: Input text to synthesise.
        spk_id: Speaker id; converted with ``int()``. When ``None`` the
            ``"spks"`` input is omitted from the model inputs.

    Returns:
        A tuple ``(matcha_vocos_wav_path, matcha_hifigan_wav_path)`` of
        temporary file paths. The files are not deleted by this function.
    """
    text_matcha, text_lengths = process_text(0, text, "cpu")

    # Both pipelines consume identical inputs, so build them once.
    inputs = {
        "x": text_matcha,
        "x_lengths": text_lengths,
        "scales": np.array([0.667, 1.0], dtype=np.float32),
    }
    if spk_id is not None:
        # Pin the dtype so it does not depend on the platform's default int
        # (assumes the exported models take int64 speaker ids, matching the
        # torch.long token ids — confirm against the model).
        inputs["spks"] = np.array([int(spk_id)], dtype=np.int64)

    # MATCHA VOCOS
    mel_t0 = perf_counter()
    # matcha mel inference
    mel, mel_lengths = model_matcha_mel.run(None, inputs)
    mel_infer_secs = perf_counter() - mel_t0
    print("Matcha Mel inference time", mel_infer_secs)

    vocos_t0 = perf_counter()
    # vocos inference
    wavs_vocos = vocos_inference(mel)
    vocos_infer_secs = perf_counter() - vocos_t0
    print("Vocos inference time", vocos_infer_secs)

    matcha_vocos_path = _write_temp_wav(wavs_vocos.squeeze(0))

    # MATCHA HIFIGAN
    hifigan_t0 = perf_counter()
    # matcha hifigan inference
    wavs, wav_lengths = model_matcha.run(None, inputs)
    hifigan_infer_secs = perf_counter() - hifigan_t0
    print("Matcha + Hifigan",hifigan_infer_secs)

    matcha_hifigan_path = _write_temp_wav(wavs.squeeze(0))

    return matcha_vocos_path, matcha_hifigan_path

## GUI space

# HTML header block; rendered at the top of the page through gr.Markdown.
title = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
    <div
        style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
    > <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
        TTS Vocoder Comparison
    </h1> </div>
</div>
 """

# Introductory text shown below the title through gr.Markdown.
description = """
 
🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis

For vocoders we use Hifigan universal version and Vocos trained in a catalan set of ~28 hours.

Matcha was trained using openslr69 and festcat datasets
"""

# Credit line shown at the bottom of the page through gr.Markdown.
article = "Training and demo by BSC."

# Interface wrapping `tts`: a text box and a speaker-id slider as inputs, and
# one audio player per vocoder pipeline as outputs (in the order `tts` returns
# the file paths).
vits2_inference = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            label="Input text",
            value="m'ha costat desenvolupar molt una veu, i ara que la tinc no estaré en silenci.",
            max_lines=1,
        ),
        gr.Slider(
            minimum=1,
            maximum=47,
            value=10,
            step=1,
            label="Speaker id",
            info="Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
        ),
    ],
    outputs=[
        gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
        gr.Audio(label="Matcha hifigan", interactive=False, type="filepath"),
    ],
)

# Page layout: title, description, the tabbed inference interface, then the
# credit line.
with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.TabbedInterface([vits2_inference], ["Multispeaker"])
    gr.Markdown(article)

# Cap the request queue at 10 entries, then serve on all interfaces at port
# 7860 with the API page hidden.
demo.queue(max_size=10)
demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)