File size: 9,490 Bytes
92df4f5
 
 
 
 
 
 
 
 
40b17fc
fc52d83
92df4f5
158b03a
2dd2041
158b03a
2dd2041
 
fc52d83
92df4f5
 
 
 
 
 
2dd2041
92df4f5
 
2dd2041
92df4f5
 
 
 
 
 
 
 
2dd2041
 
35c9c3f
2dd2041
 
 
 
 
92df4f5
2dd2041
92df4f5
2dd2041
35c9c3f
2dd2041
 
92df4f5
bea1338
40b17fc
2dd2041
 
 
35c9c3f
 
78e7ff4
2dd2041
 
35c9c3f
 
 
2dd2041
 
 
bbb1375
1b8b2e7
92df4f5
 
 
 
 
 
 
 
 
 
 
 
 
 
d3127d4
92df4f5
 
 
 
 
 
 
1b8b2e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bea1338
1b8b2e7
 
 
 
 
 
 
92df4f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bbb1375
2dd2041
 
92df4f5
2dd2041
 
92df4f5
 
 
 
 
6b0bcdf
92df4f5
 
158b03a
 
92df4f5
158b03a
 
 
 
92df4f5
1b8b2e7
158b03a
 
92df4f5
6b0bcdf
92df4f5
 
40b17fc
 
92df4f5
 
 
 
 
 
 
 
ba1a8f9
92df4f5
 
 
 
 
bbb1375
92df4f5
 
211b582
 
35c9c3f
bbb1375
92df4f5
 
5b51c67
 
fc52d83
211b582
92df4f5
2dd2041
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92df4f5
 
 
62f951d
92df4f5
 
 
2dd2041
 
6b0bcdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b8b2e7
40b17fc
92df4f5
40b17fc
92df4f5
 
fc52d83
 
92df4f5
 
 
 
 
2dd2041
 
92df4f5
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import numpy as np
import onnxruntime

from text import text_to_sequence, sequence_to_text
import torch
import gradio as gr
import soundfile as sf
import tempfile
import yaml
import json
import os

from time import perf_counter
import random

# Initial UI selections. Each one can be overridden through the environment
# variable of the same name; otherwise the literal fallback is used.
DEFAULT_SPEAKER_ID = os.getenv("DEFAULT_SPEAKER_ID", "quim")
DEFAULT_ACCENT = os.getenv("DEFAULT_ACCENT", "balear")

def intersperse(lst, item):
    """Return a new list with ``item`` before, between and after the elements of ``lst``.

    For an input of length ``n`` the result has ``2 * n + 1`` entries; an
    empty input yields ``[item]``.
    """
    padded = [item]
    for element in lst:
        padded.append(element)
        padded.append(item)
    return padded


def process_text(i: int, text: str, device: "torch.device | str", cleaner: str):
    """Convert raw text into the id arrays fed to the Matcha ONNX model.

    The text is passed through ``text_to_sequence`` with the given cleaner,
    the resulting ids are interleaved with ``0`` (presumably the blank/pad
    id -- confirm against the ``text`` package), and a leading batch axis of
    size 1 is added.

    Args:
        i: Index used only to tag the log line.
        text: Text to synthesize.
        device: Device on which the intermediate tensors are created; either a
            ``torch.device`` or a device string such as ``"cpu"``.
        cleaner: Name of the cleaner handed to ``text_to_sequence``.

    Returns:
        A tuple ``(x, x_lengths)`` of NumPy arrays: ``x`` holds the id
        sequence with a leading batch axis, ``x_lengths`` holds the length of
        its last axis.
    """
    print(f"[{i}] - Input text: {text}")
    token_ids = intersperse(text_to_sequence(text, [cleaner]), 0)
    x = torch.tensor(token_ids, dtype=torch.long, device=device)[None]
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)

    # Logged for debugging only: the symbols the ids map back to.
    x_phones = sequence_to_text(x.squeeze(0).tolist())
    print(x_phones)

    # ``.numpy()`` only works on CPU tensors; ``.cpu()`` is a no-op when the
    # tensors already live there and makes any other device work as well.
    return x.cpu().numpy(), x_lengths.cpu().numpy()

# Paths to the per-accent Matcha ONNX models, the Vocos vocoder, its config
# and the accent -> speaker -> id mapping (all relative to the working dir).
MODEL_PATH_MATCHA_MEL_BAL = "matcha_multispeaker_cat_bal_opset_15_10_steps.onnx"
MODEL_PATH_MATCHA_MEL_CAT = "matcha_multispeaker_cat_cen_opset_15_10_steps.onnx"
MODEL_PATH_MATCHA_MEL_OCC = "matcha_multispeaker_cat_occ_opset_15_10_steps.onnx"
MODEL_PATH_MATCHA_MEL_VAL = "matcha_multispeaker_cat_val_opset_15_10_steps.onnx"
MODEL_PATH_VOCOS = "mel_spec_22khz_cat.onnx"
CONFIG_PATH = "config.yaml"
SPEAKER_ID_DICT = "spk_to_id_2.json"

# Load models: one CPU inference session per model, all sharing the same
# session options.
sess_options = onnxruntime.SessionOptions()
model_matcha_mel_bal = onnxruntime.InferenceSession(MODEL_PATH_MATCHA_MEL_BAL, sess_options=sess_options, providers=["CPUExecutionProvider"])
model_matcha_mel_cat = onnxruntime.InferenceSession(MODEL_PATH_MATCHA_MEL_CAT, sess_options=sess_options, providers=["CPUExecutionProvider"])
model_matcha_mel_occ = onnxruntime.InferenceSession(MODEL_PATH_MATCHA_MEL_OCC, sess_options=sess_options, providers=["CPUExecutionProvider"])
model_matcha_mel_val = onnxruntime.InferenceSession(MODEL_PATH_MATCHA_MEL_VAL, sess_options=sess_options, providers=["CPUExecutionProvider"])
model_vocos = onnxruntime.InferenceSession(MODEL_PATH_VOCOS, sess_options=sess_options, providers=["CPUExecutionProvider"])

# Read the speaker table through a context manager so the file handle is
# closed deterministically, and decode it explicitly as UTF-8 instead of
# relying on the platform's locale encoding.
with open(SPEAKER_ID_DICT, "r", encoding="utf-8") as speaker_file:
    speaker_id_dict = json.load(speaker_file)

# Top-level keys of the speaker table are the accent names.
accents = list(speaker_id_dict)

# Accent name -> acoustic model.
models = {
    "balear": model_matcha_mel_bal,
    "nord-occidental": model_matcha_mel_occ,
    "valencia": model_matcha_mel_val,
    "central": model_matcha_mel_cat,
}

# Accent name -> text cleaner name passed to text_to_sequence.
cleaners = {
    "balear": "catalan_balear_cleaners",
    "nord-occidental": "catalan_occidental_cleaners",
    "valencia": "catalan_valencia_cleaners",
    "central": "catalan_cleaners",
}


# Speaker names available for the default accent.
speakers = list(speaker_id_dict[DEFAULT_ACCENT])

def vocos_inference(mel, denoise):
    """Synthesize a waveform from a mel spectrogram with the Vocos ONNX model.

    The vocoder session is run on ``mel`` and its three outputs are combined
    into a complex spectrogram as ``mag * (x + 1j * y)``. The waveform is then
    reconstructed by hand: inverse real FFT per frame, Hann windowing,
    overlap-add via ``fold`` and normalisation by the summed squared window.
    ``n_fft`` and ``hop_length`` are read from ``CONFIG_PATH`` on every call;
    the window length equals ``n_fft``.

    Args:
        mel: Array fed to the vocoder session as its ``"mels"`` input.
        denoise: When true, the vocoder is additionally run on an all-zero
            input of the same shape, and a small fraction of that output's
            magnitude is subtracted from the synthesized magnitude (clamped at
            zero) while the original phase is kept.

    Returns:
        A 2-D ``torch.Tensor`` whose first axis is the batch axis of the
        spectrogram and whose second axis holds the audio samples.

    Raises:
        AssertionError: If the window envelope is not strictly positive on
            the kept samples, which would make the normalisation unstable.
    """
    with open(CONFIG_PATH, "r") as f:
        config = yaml.safe_load(f)

    params = config["feature_extractor"]["init_args"]
    n_fft = params["n_fft"]
    hop_length = params["hop_length"]
    win_length = n_fft

    # ONNX inference
    mag, x, y = model_vocos.run(None, {"mels": mel})

    # Complex spectrogram from the vocos output. Converting once here keeps
    # both branches on a tensor and avoids re-wrapping an existing tensor
    # with torch.tensor() later (which copies and emits a UserWarning).
    spectrogram = torch.as_tensor(mag * (x + 1j * y))
    window = torch.hann_window(win_length)

    if denoise:
        # Vocoder bias: what the vocoder produces for an all-zero input.
        mel_zeros = torch.zeros_like(torch.tensor(mel))
        mag_bias, x_bias, y_bias = model_vocos.run(
            None,
            {"mels": mel_zeros.float().numpy()},
        )
        spectrogram_bias = torch.as_tensor(mag_bias * (x_bias + 1j * y_bias))

        # Magnitude of the synthesized spectrogram (last axis = real, imag).
        spec = torch.view_as_real(spectrogram)
        mag_spec = torch.sqrt(spec.pow(2).sum(-1))

        # Magnitude of the bias spectrogram.
        spec_bias = torch.view_as_real(spectrogram_bias)
        mag_spec_bias = torch.sqrt(spec_bias.pow(2).sum(-1))

        # Subtract a scaled copy of the bias and floor the result at zero.
        strength = 0.0025
        mag_spec_denoised = torch.clamp(mag_spec - mag_spec_bias * strength, 0.0)

        # Rebuild the complex spectrogram from the new magnitude and the
        # original phase (atan2 of imaginary over real part).
        angle = torch.atan2(spec[..., -1], spec[..., 0])
        spectrogram = torch.complex(
            mag_spec_denoised * torch.cos(angle),
            mag_spec_denoised * torch.sin(angle),
        )

    # Inverse STFT
    pad = (win_length - hop_length) // 2
    num_frames = spectrogram.shape[-1]

    print("Spectrogram synthesized shape", spectrogram.shape)
    # Inverse FFT of every frame, followed by windowing.
    ifft = torch.fft.irfft(spectrogram, n_fft, dim=1, norm="backward")
    ifft = ifft * window[None, :, None]

    # Overlap and add. The slice end is written as ``output_size - pad``
    # rather than ``-pad`` so that ``pad == 0`` keeps the whole signal instead
    # of producing an empty slice.
    output_size = (num_frames - 1) * hop_length + win_length
    end = output_size - pad
    audio = torch.nn.functional.fold(
        ifft, output_size=(1, output_size), kernel_size=(1, win_length), stride=(1, hop_length),
    )[:, 0, 0, pad:end]

    # Window envelope: overlap-added squared window, same trimming as above.
    window_sq = window.square().expand(1, num_frames, -1).transpose(1, 2)
    window_envelope = torch.nn.functional.fold(
        window_sq, output_size=(1, output_size), kernel_size=(1, win_length), stride=(1, hop_length),
    ).squeeze()[pad:end]

    # Normalize
    assert (window_envelope > 1e-11).all(), "window envelope too small to normalize overlap-add output"
    return audio / window_envelope


def tts(text: str, accent: str, spk_name: str, temperature: float, length_scale: float, denoise: bool):
    """Synthesize ``text`` and return the path of the generated WAV file.

    The text is converted to ids with the cleaner of the chosen accent, the
    accent's Matcha model produces a mel spectrogram, and ``vocos_inference``
    turns it into audio. The audio is written as 24-bit PCM at 22050 Hz to a
    temporary ``.wav`` file that is NOT deleted here, because the caller (the
    Gradio audio output) needs the file to exist after this function returns.

    Args:
        text: Text to synthesize.
        accent: Key into ``speaker_id_dict``, ``models`` and ``cleaners``.
        spk_name: Speaker name; must be a key of ``speaker_id_dict[accent]``.
        temperature: First entry of the ``"scales"`` model input.
        length_scale: Second entry of the ``"scales"`` model input.
        denoise: Forwarded to ``vocos_inference``.

    Returns:
        Filesystem path of the written WAV file.

    Raises:
        gr.Error: If ``accent`` / ``spk_name`` is not a known combination.
    """
    sample_rate = 22050

    # A stale speaker selection (e.g. accent changed but the speaker list not
    # yet refreshed) would otherwise surface as a bare KeyError in the UI.
    try:
        spk_id = speaker_id_dict[accent][spk_name]
    except KeyError as err:
        raise gr.Error(f"Unknown speaker '{spk_name}' for accent '{accent}'.") from err

    sid = np.array([int(spk_id)]) if spk_id is not None else None
    text_matcha, text_lengths = process_text(0, text, "cpu", cleaner=cleaners[accent])
    model_matcha_mel = models[accent]

    # MATCHA VOCOS
    inputs = {
        "x": text_matcha,
        "x_lengths": text_lengths,
        "scales": np.array([temperature, length_scale], dtype=np.float32),
        "spks": sid,
    }
    mel_t0 = perf_counter()
    # matcha mel inference
    mel, mel_lengths = model_matcha_mel.run(None, inputs)
    mel_infer_secs = perf_counter() - mel_t0
    print("Matcha Mel inference time", mel_infer_secs)

    vocos_t0 = perf_counter()
    # vocos inference
    wavs_vocos = vocos_inference(mel, denoise)
    vocos_infer_secs = perf_counter() - vocos_t0
    print("Vocos inference time", vocos_infer_secs)

    # Keep using the Space's app directory when it exists; elsewhere fall back
    # to the system temp dir instead of failing with FileNotFoundError.
    app_dir = "/home/user/app"
    out_dir = app_dir if os.path.isdir(app_dir) else None
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir=out_dir) as fp_matcha_vocos:
        wav_path = fp_matcha_vocos.name
    # Write only after the placeholder handle is closed, so the file is not
    # opened twice at the same time.
    sf.write(wav_path, wavs_vocos.squeeze(0).numpy(), sample_rate, "PCM_24")

    # Real-time factor; skipped for empty audio to avoid dividing by zero.
    audio_secs = wavs_vocos.shape[1] / sample_rate
    if audio_secs > 0:
        print(f"RTF matcha + vocos { (mel_infer_secs + vocos_infer_secs) / audio_secs }")
    return wav_path

## GUI space

# HTML heading for the page; rendered at the top of the Blocks layout via
# gr.Markdown(title).
title = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
    <div
        style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
    > <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
        Natural and efficient TTS in Catalan
    </h1> </div>
</div>
 """

# Markdown blurb with links to the vocoder and acoustic model; rendered right
# below the title via gr.Markdown(description).
description = """
 
🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis

For vocoders we use [Vocos](https://huggingface.co/BSC-LT/vocos-mel-22khz-cat) trained in a catalan set of ~28 hours.

[Matcha](https://huggingface.co/BSC-LT/matcha-tts-cat-multispeaker) was trained using openslr69 and festcat datasets

"""

# Contents of the "About" tab, loaded once at import time from about.md.
with open("about.md", mode="r", encoding="utf-8") as about_file:
    about = about_file.read()

# Footer text rendered at the bottom of the page.
article = (
    "Training and demo by The Language Technologies Unit from Barcelona Supercomputing Center."
)


def rs_change(accent):
    """Build the speaker dropdown update for a newly selected accent.

    The dropdown is repopulated with the speakers of ``accent`` and one of
    them is preselected at random. The pick is made over however many
    speakers the accent has, rather than assuming exactly two.

    Args:
        accent: Key into ``speaker_id_dict``.

    Returns:
        A ``gr.Dropdown`` carrying the new choices and selected value.
    """
    speaker_names = list(speaker_id_dict[accent])
    return gr.Dropdown(
        choices=speaker_names,
        interactive=True,
        value=random.choice(speaker_names),
    )

# Accent selector; its choices are the top-level keys of the speaker table.
accent_dropdown = gr.Dropdown(
    label="Accent",
    info="Models are trained on 4 accents",
    choices=accents,
    value=DEFAULT_ACCENT,
)

# Speaker selector, initially filled with the speakers of the default accent.
speaker_dropdown = gr.Dropdown(
    label="Speaker id",
    info="Models are trained on 2 speakers. You can prompt the model using one of these speaker ids.",
    choices=speaker_id_dict[DEFAULT_ACCENT],
    value=DEFAULT_SPEAKER_ID,
    interactive=True,
)

# Input and output components of the demo, created in the order in which
# they are passed to tts().
text_input = gr.Textbox(
    label="Input text",
    value="m'ha costat molt desenvolupar una veu, i ara que la tinc no estaré en silenci.",
    max_lines=1,
)
temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=2.0,
    value=0.667,
    step=0.01,
    label="Temperature",
    info="Temperature",
)
length_scale_slider = gr.Slider(
    minimum=0.5,
    maximum=2.0,
    value=1.0,
    step=0.01,
    label="Length scale",
    info="Controls speech pace, larger values for slower pace and smaller values for faster pace",
)
denoise_checkbox = gr.Checkbox(label="Denoise", info="Removes model bias from vocos", value=True)
audio_output = gr.Audio(label="Matcha vocos", interactive=False, type="filepath")

# Interface wiring the components to tts(); the input order matches the
# positional parameters of tts().
matcha_inference = gr.Interface(
    fn=tts,
    inputs=[
        text_input,
        accent_dropdown,
        speaker_dropdown,
        temperature_slider,
        length_scale_slider,
        denoise_checkbox,
    ],
    outputs=[audio_output],
)

# Markdown component holding the "About" tab content.
about_article = gr.Markdown(about)

# Page layout: heading, description, the Demo/About tabs and the footer.
with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.TabbedInterface(
        [matcha_inference, about_article],
        ["Demo", "About"],
    )
    # Refresh the speaker list whenever another accent is selected.
    accent_dropdown.select(
        fn=rs_change,
        inputs=accent_dropdown,
        outputs=speaker_dropdown,
    )
    gr.Markdown(article)

# Queue at most 10 pending requests, then serve on all interfaces, port 7860.
demo.queue(max_size=10)
demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)