Baybars committed on
Commit
40b17fc
1 Parent(s): 1b8b2e7

hifigan removed, frontend changed

Browse files
Files changed (2) hide show
  1. infer_onnx.py +15 -22
  2. spk_to_id.json +49 -0
infer_onnx.py CHANGED
@@ -8,6 +8,7 @@ import gradio as gr
8
  import soundfile as sf
9
  import tempfile
10
  import yaml
 
11
 
12
  from time import perf_counter
13
 
@@ -33,12 +34,15 @@ MODEL_PATH_MATCHA_MEL="matcha_multispeaker_cat_opset_15_10_steps.onnx"
33
  MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
34
  MODEL_PATH_VOCOS="mel_spec_22khz_v2.onnx"
35
  CONFIG_PATH="config_22khz.yaml"
 
36
 
37
  sess_options = onnxruntime.SessionOptions()
38
  model_matcha_mel= onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL), sess_options=sess_options, providers=["CPUExecutionProvider"])
39
  model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
40
  model_matcha = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA), sess_options=sess_options, providers=["CPUExecutionProvider"])
41
-
 
 
42
 
43
  def vocos_inference(mel,denoise):
44
 
@@ -123,7 +127,8 @@ def vocos_inference(mel,denoise):
123
  return y
124
 
125
 
126
- def tts(text:str, spk_id:int, temperature:float, length_scale:float, denoise:bool):
 
127
  sid = np.array([int(spk_id)]) if spk_id is not None else None
128
  text_matcha , text_lengths = process_text(0,text,"cpu")
129
 
@@ -158,17 +163,8 @@ def tts(text:str, spk_id:int, temperature:float, length_scale:float, denoise:boo
158
  "spks": sid
159
  }
160
  hifigan_t0 = perf_counter()
161
- # matcha hifigan inference
162
- wavs, wav_lengths = model_matcha.run(None, inputs)
163
- hifigan_infer_secs = perf_counter() - hifigan_t0
164
- print("Matcha + Hifigan",hifigan_infer_secs)
165
-
166
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha:
167
- sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
168
-
169
- print(f"RTF matcha + hifigan { hifigan_infer_secs/ (wavs.shape[1]/22050) }")
170
- print(f"RTF matcha + vocos { (mel_infer_secs + vocos_infer_secs) / (wavs.shape[1]/22050) }")
171
- return fp_matcha_vocos.name, fp_matcha.name
172
 
173
  ## GUI space
174
 
@@ -201,13 +197,11 @@ vits2_inference = gr.Interface(
201
  max_lines=1,
202
  label="Input text",
203
  ),
204
- gr.Slider(
205
- 1,
206
- 47,
207
- value=10,
208
- step=1,
209
  label="Speaker id",
210
- info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
 
211
  ),
212
  gr.Slider(
213
  0.1,
@@ -225,10 +219,9 @@ vits2_inference = gr.Interface(
225
  label="Length scale",
226
  info=f"Controls speech pace, larger values for slower pace and smaller values for faster pace",
227
  ),
228
- gr.Checkbox(label="Denoise", info="Removes model bias from vocos"),
229
  ],
230
- outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
231
- gr.Audio(label="Matcha hifigan", interactive=False, type="filepath")]
232
  )
233
 
234
  demo = gr.Blocks()
 
8
  import soundfile as sf
9
  import tempfile
10
  import yaml
11
+ import json
12
 
13
  from time import perf_counter
14
 
 
34
  MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
35
  MODEL_PATH_VOCOS="mel_spec_22khz_v2.onnx"
36
  CONFIG_PATH="config_22khz.yaml"
37
+ SPEAKER_ID_DICT="spk_to_id.json"
38
 
39
  sess_options = onnxruntime.SessionOptions()
40
  model_matcha_mel= onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL), sess_options=sess_options, providers=["CPUExecutionProvider"])
41
  model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
42
  model_matcha = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA), sess_options=sess_options, providers=["CPUExecutionProvider"])
43
+ speaker_id_dict = json.load(open(SPEAKER_ID_DICT))
44
+ speakers = [sp for sp in speaker_id_dict.keys()]
45
+ speakers.sort()
46
 
47
  def vocos_inference(mel,denoise):
48
 
 
127
  return y
128
 
129
 
130
+ def tts(text:str, spk_name:str, temperature:float, length_scale:float, denoise:bool):
131
+ spk_id = speaker_id_dict[spk_name]
132
  sid = np.array([int(spk_id)]) if spk_id is not None else None
133
  text_matcha , text_lengths = process_text(0,text,"cpu")
134
 
 
163
  "spks": sid
164
  }
165
  hifigan_t0 = perf_counter()
166
+ print(f"RTF matcha + vocos { (mel_infer_secs + vocos_infer_secs) / (wavs_vocos.shape[1]/22050) }")
167
+ return fp_matcha_vocos.name
 
 
 
 
 
 
 
 
 
168
 
169
  ## GUI space
170
 
 
197
  max_lines=1,
198
  label="Input text",
199
  ),
200
+ gr.Dropdown(
201
+ choices=speakers,
 
 
 
202
  label="Speaker id",
203
+ value='caf_09204',
204
+ info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids."
205
  ),
206
  gr.Slider(
207
  0.1,
 
219
  label="Length scale",
220
  info=f"Controls speech pace, larger values for slower pace and smaller values for faster pace",
221
  ),
222
+ gr.Checkbox(label="Denoise", info="Removes model bias from vocos", value=True),
223
  ],
224
+ outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath")]
 
225
  )
226
 
227
  demo = gr.Blocks()
spk_to_id.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cam_03115": 0,
3
+ "caf_04247": 1,
4
+ "caf_05450": 2,
5
+ "cam_08935": 3,
6
+ "caf_09901": 4,
7
+ "ona": 5,
8
+ "pol": 6,
9
+ "cam_02689": 7,
10
+ "caf_06042": 8,
11
+ "jan": 9,
12
+ "caf_08106": 10,
13
+ "cam_04910": 11,
14
+ "cam_08664": 12,
15
+ "caf_07803": 13,
16
+ "cam_06582": 14,
17
+ "caf_06311": 15,
18
+ "caf_07245": 16,
19
+ "cam_06279": 17,
20
+ "caf_09598": 18,
21
+ "caf_09796": 19,
22
+ "eva": 20,
23
+ "cam_00762": 21,
24
+ "caf_09204": 22,
25
+ "caf_03944": 23,
26
+ "caf_05147": 24,
27
+ "uri": 25,
28
+ "mar": 26,
29
+ "cam_00459": 27,
30
+ "teo": 28,
31
+ "caf_03655": 29,
32
+ "bet": 30,
33
+ "cam_06705": 31,
34
+ "caf_05739": 32,
35
+ "caf_06008": 33,
36
+ "cam_04484": 34,
37
+ "cam_03386": 35,
38
+ "cam_08967": 36,
39
+ "caf_06942": 37,
40
+ "cam_07140": 38,
41
+ "pau": 39,
42
+ "caf_08001": 40,
43
+ "pep": 41,
44
+ "cam_04787": 42,
45
+ "eli": 43,
46
+ "caf_01591": 44,
47
+ "caf_02452": 45,
48
+ "cam_02992": 46
49
+ }