Spaces:

projecte-aina
/

matxa-alvocat-tts-ca

Running

App Files Files Community

wetdog committed on Mar 6

Commit

158b03a

•

1 Parent(s): bbb1375

Add 10-timestep Matcha mel model

Browse files

Files changed (2) hide show

infer_onnx.py +17 -4
matcha_multispeaker_cat_opset_15_10_steps.onnx +3 -0

infer_onnx.py CHANGED Viewed

@@ -9,6 +9,8 @@ import soundfile as sf
 import tempfile
 import yaml
 def intersperse(lst, item):
     result = [item] * (len(lst) * 2 + 1)
     result[1::2] = lst
@@ -27,7 +29,7 @@ def process_text(i: int, text: str, device: torch.device):
     print(x_phones)
     return x.numpy(), x_lengths.numpy()
-MODEL_PATH_MATCHA_MEL="matcha_multispeaker_cat_opset_15.onnx"
 MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
 MODEL_PATH_VOCOS="mel_spec_22khz.onnx"
 CONFIG_PATH="config_22khz.yaml"
@@ -101,10 +103,17 @@ def tts(text:str, spk_id:int):
         "scales": np.array([0.667, 1.0], dtype=np.float32),
         "spks": sid
     }
     mel, mel_lengths = model_matcha_mel.run(None, inputs)
     # vocos inference
     wavs_vocos = vocos_inference(mel)
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha_vocos:
         sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
@@ -117,8 +126,12 @@ def tts(text:str, spk_id:int):
         "scales": np.array([0.667, 1.0], dtype=np.float32),
         "spks": sid
     }
     wavs, wav_lengths = model_matcha.run(None, inputs)
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha:
         sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
@@ -165,7 +178,7 @@ vits2_inference = gr.Interface(
         ),
     ],
     outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
-             gr.Audio(label="Matcha", interactive=False, type="filepath")]
 )
 demo = gr.Blocks()

 import tempfile
 import yaml
+from time import perf_counter
 def intersperse(lst, item):
     result = [item] * (len(lst) * 2 + 1)
     result[1::2] = lst
     print(x_phones)
     return x.numpy(), x_lengths.numpy()
+MODEL_PATH_MATCHA_MEL="matcha_multispeaker_cat_opset_15_10_steps.onnx"
 MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
 MODEL_PATH_VOCOS="mel_spec_22khz.onnx"
 CONFIG_PATH="config_22khz.yaml"
         "scales": np.array([0.667, 1.0], dtype=np.float32),
         "spks": sid
     }
+    mel_t0 = perf_counter()
+    # matcha mel inference
     mel, mel_lengths = model_matcha_mel.run(None, inputs)
+    mel_infer_secs = perf_counter() - mel_t0
+    print("Matcha Mel inference time", mel_infer_secs)
+    vocos_t0 = perf_counter()
     # vocos inference
     wavs_vocos = vocos_inference(mel)
+    vocos_infer_secs = perf_counter() - vocos_t0
+    print("Vocos inference time", vocos_infer_secs)
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha_vocos:
         sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
         "scales": np.array([0.667, 1.0], dtype=np.float32),
         "spks": sid
     }
+    hifigan_t0 = perf_counter()
+    # matcha hifigan inference
     wavs, wav_lengths = model_matcha.run(None, inputs)
+    hifigan_infer_secs = perf_counter() - hifigan_t0
+    print("Matcha + Hifigan",hifigan_infer_secs)
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha:
         sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
         ),
     ],
     outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
+             gr.Audio(label="Matcha hifigan", interactive=False, type="filepath")]
 )
 demo = gr.Blocks()

matcha_multispeaker_cat_opset_15_10_steps.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7ab5fb2e8d590d5cb610912d3c4e6480b32322cc4fa4bedf94eb0f8b8ce7570
+size 86049399