wetdog committed on
Commit
847328b
1 Parent(s): 9778fab

max input length

Browse files
Files changed (1) hide show
  1. infer_onnx.py +38 -32
infer_onnx.py CHANGED
@@ -22,7 +22,8 @@ def intersperse(lst, item):
22
  result = [item] * (len(lst) * 2 + 1)
23
  result[1::2] = lst
24
  return result
25
-
 
26
  def process_text(i: int, text: str, device: torch.device, cleaner:str):
27
  print(f"[{i}] - Input text: {text}")
28
  x = torch.tensor(
@@ -152,36 +153,40 @@ def vocos_inference(mel,denoise):
152
 
153
 
154
  def tts(text:str, accent:str, spk_name:str, temperature:float, length_scale:float):
155
- denoise=True
156
- spk_id = speaker_id_dict[accent][spk_name]
157
- sid = np.array([int(spk_id)]) if spk_id is not None else None
158
- text_matcha , text_lengths = process_text(0,text,"cpu",cleaner=cleaners[accent])
159
- model_matcha_mel = models[accent]
160
-
161
- # MATCHA VOCOS
162
- inputs = {
163
- "x": text_matcha,
164
- "x_lengths": text_lengths,
165
- "scales": np.array([temperature, length_scale], dtype=np.float32),
166
- "spks": sid
167
- }
168
- mel_t0 = perf_counter()
169
- # matcha mel inference
170
- mel, mel_lengths = model_matcha_mel.run(None, inputs)
171
- mel_infer_secs = perf_counter() - mel_t0
172
- print("Matcha Mel inference time", mel_infer_secs)
173
-
174
- vocos_t0 = perf_counter()
175
- # vocos inference
176
- wavs_vocos = vocos_inference(mel,denoise)
177
- vocos_infer_secs = perf_counter() - vocos_t0
178
- print("Vocos inference time", vocos_infer_secs)
179
-
180
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha_vocos:
181
- sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
182
-
183
- print(f"RTF matcha + vocos { (mel_infer_secs + vocos_infer_secs) / (wavs_vocos.shape[1]/22050) }")
184
- return fp_matcha_vocos.name
 
 
 
 
185
 
186
 
187
  ## GUI space
@@ -244,7 +249,8 @@ matcha_inference = gr.Interface(
244
  gr.Textbox(
245
  value="m'ha costat molt desenvolupar una veu, i ara que la tinc no estaré en silenci.",
246
  max_lines=1,
247
- label="Input text (max 500 characters)",
 
248
  ),
249
  accent_dropdown,
250
  speaker_dropdown,
 
22
  result = [item] * (len(lst) * 2 + 1)
23
  result[1::2] = lst
24
  return result
25
+
26
+
27
  def process_text(i: int, text: str, device: torch.device, cleaner:str):
28
  print(f"[{i}] - Input text: {text}")
29
  x = torch.tensor(
 
153
 
154
 
155
  def tts(text:str, accent:str, spk_name:str, temperature:float, length_scale:float):
156
+ if len(text) > 500:
157
+ gr.Info("The maximum input allowed is 500 characters.")
158
+
159
+ else:
160
+ denoise=True
161
+ spk_id = speaker_id_dict[accent][spk_name]
162
+ sid = np.array([int(spk_id)]) if spk_id is not None else None
163
+ text_matcha , text_lengths = process_text(0,text,"cpu",cleaner=cleaners[accent])
164
+ model_matcha_mel = models[accent]
165
+
166
+ # MATCHA VOCOS
167
+ inputs = {
168
+ "x": text_matcha,
169
+ "x_lengths": text_lengths,
170
+ "scales": np.array([temperature, length_scale], dtype=np.float32),
171
+ "spks": sid
172
+ }
173
+ mel_t0 = perf_counter()
174
+ # matcha mel inference
175
+ mel, mel_lengths = model_matcha_mel.run(None, inputs)
176
+ mel_infer_secs = perf_counter() - mel_t0
177
+ print("Matcha Mel inference time", mel_infer_secs)
178
+
179
+ vocos_t0 = perf_counter()
180
+ # vocos inference
181
+ wavs_vocos = vocos_inference(mel,denoise)
182
+ vocos_infer_secs = perf_counter() - vocos_t0
183
+ print("Vocos inference time", vocos_infer_secs)
184
+
185
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha_vocos:
186
+ sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
187
+
188
+ print(f"RTF matcha + vocos { (mel_infer_secs + vocos_infer_secs) / (wavs_vocos.shape[1]/22050) }")
189
+ return fp_matcha_vocos.name
190
 
191
 
192
  ## GUI space
 
249
  gr.Textbox(
250
  value="m'ha costat molt desenvolupar una veu, i ara que la tinc no estaré en silenci.",
251
  max_lines=1,
252
+ label="Input text ",
253
+ info="max 500 characters",
254
  ),
255
  accent_dropdown,
256
  speaker_dropdown,