add 10 timesteps model
Browse files
infer_onnx.py
CHANGED
@@ -9,6 +9,8 @@ import soundfile as sf
|
|
9 |
import tempfile
|
10 |
import yaml
|
11 |
|
|
|
|
|
12 |
def intersperse(lst, item):
|
13 |
result = [item] * (len(lst) * 2 + 1)
|
14 |
result[1::2] = lst
|
@@ -27,7 +29,7 @@ def process_text(i: int, text: str, device: torch.device):
|
|
27 |
print(x_phones)
|
28 |
return x.numpy(), x_lengths.numpy()
|
29 |
|
30 |
-
MODEL_PATH_MATCHA_MEL="
|
31 |
MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
|
32 |
MODEL_PATH_VOCOS="mel_spec_22khz.onnx"
|
33 |
CONFIG_PATH="config_22khz.yaml"
|
@@ -101,10 +103,17 @@ def tts(text:str, spk_id:int):
|
|
101 |
"scales": np.array([0.667, 1.0], dtype=np.float32),
|
102 |
"spks": sid
|
103 |
}
|
104 |
-
|
|
|
105 |
mel, mel_lengths = model_matcha_mel.run(None, inputs)
|
|
|
|
|
|
|
|
|
106 |
# vocos inference
|
107 |
wavs_vocos = vocos_inference(mel)
|
|
|
|
|
108 |
|
109 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha_vocos:
|
110 |
sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
|
@@ -117,8 +126,12 @@ def tts(text:str, spk_id:int):
|
|
117 |
"scales": np.array([0.667, 1.0], dtype=np.float32),
|
118 |
"spks": sid
|
119 |
}
|
|
|
|
|
120 |
wavs, wav_lengths = model_matcha.run(None, inputs)
|
121 |
-
|
|
|
|
|
122 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha:
|
123 |
sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
|
124 |
|
@@ -165,7 +178,7 @@ vits2_inference = gr.Interface(
|
|
165 |
),
|
166 |
],
|
167 |
outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
|
168 |
-
gr.Audio(label="Matcha", interactive=False, type="filepath")]
|
169 |
)
|
170 |
|
171 |
demo = gr.Blocks()
|
|
|
9 |
import tempfile
|
10 |
import yaml
|
11 |
|
12 |
+
from time import perf_counter
|
13 |
+
|
14 |
def intersperse(lst, item):
|
15 |
result = [item] * (len(lst) * 2 + 1)
|
16 |
result[1::2] = lst
|
|
|
29 |
print(x_phones)
|
30 |
return x.numpy(), x_lengths.numpy()
|
31 |
|
32 |
+
MODEL_PATH_MATCHA_MEL="matcha_multispeaker_cat_opset_15_10_steps.onnx"
|
33 |
MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
|
34 |
MODEL_PATH_VOCOS="mel_spec_22khz.onnx"
|
35 |
CONFIG_PATH="config_22khz.yaml"
|
|
|
103 |
"scales": np.array([0.667, 1.0], dtype=np.float32),
|
104 |
"spks": sid
|
105 |
}
|
106 |
+
mel_t0 = perf_counter()
|
107 |
+
# matcha mel inference
|
108 |
mel, mel_lengths = model_matcha_mel.run(None, inputs)
|
109 |
+
mel_infer_secs = perf_counter() - mel_t0
|
110 |
+
print("Matcha Mel inference time", mel_infer_secs)
|
111 |
+
|
112 |
+
vocos_t0 = perf_counter()
|
113 |
# vocos inference
|
114 |
wavs_vocos = vocos_inference(mel)
|
115 |
+
vocos_infer_secs = perf_counter() - vocos_t0
|
116 |
+
print("Vocos inference time", vocos_infer_secs)
|
117 |
|
118 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha_vocos:
|
119 |
sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
|
|
|
126 |
"scales": np.array([0.667, 1.0], dtype=np.float32),
|
127 |
"spks": sid
|
128 |
}
|
129 |
+
hifigan_t0 = perf_counter()
|
130 |
+
# matcha hifigan inference
|
131 |
wavs, wav_lengths = model_matcha.run(None, inputs)
|
132 |
+
hifigan_infer_secs = perf_counter() - hifigan_t0
|
133 |
+
print("Matcha + Hifigan",hifigan_infer_secs)
|
134 |
+
|
135 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha:
|
136 |
sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
|
137 |
|
|
|
178 |
),
|
179 |
],
|
180 |
outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
|
181 |
+
gr.Audio(label="Matcha hifigan", interactive=False, type="filepath")]
|
182 |
)
|
183 |
|
184 |
demo = gr.Blocks()
|
matcha_multispeaker_cat_opset_15_10_steps.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f7ab5fb2e8d590d5cb610912d3c4e6480b32322cc4fa4bedf94eb0f8b8ce7570
|
3 |
+
size 86049399
|