khof312 commited on
Commit
593cb11
1 Parent(s): 1eb5aca

Add support for IMS Toucan.

Browse files
Files changed (2) hide show
  1. app.py +21 -3
  2. src/synthesize.py +30 -0
app.py CHANGED
@@ -50,6 +50,7 @@ type=['wav'])
50
  base_mms = synth_mms(tts_text, models[tts_lang]['mms'])
51
  base_coqui= synth_coqui(tts_text, models[tts_lang]['coqui'])
52
  base_espeakng= synth_espeakng(tts_text, models[tts_lang]['espeakng'])
 
53
 
54
  if tts_lang=="swh":
55
  finetuned_mms1 = synth_mms(tts_text, "khof312/mms-tts-swh-female-1")
@@ -68,6 +69,7 @@ type=['wav'])
68
  row2 = st.columns([1,1,2])
69
  row3 = st.columns([1,1,2])
70
  row4 = st.columns([1,1,2])
 
71
 
72
  row1[0].write("**Model**")
73
  row1[1].write("**Configuration**")
@@ -84,10 +86,15 @@ type=['wav'])
84
  row3[2].audio(base_coqui[0], sample_rate = base_coqui[1])
85
 
86
  if base_espeakng is not None:
87
-
88
  row4[0].write(f"[Espeak-ng](https://github.com/espeak-ng/espeak-ng)")
89
  row4[1].write("default")
90
  row4[2].audio(base_espeakng[0], sample_rate = base_espeakng[1])
 
 
 
 
 
 
91
 
92
  #################################################################
93
  if tts_lang == "swh":
@@ -156,9 +163,13 @@ type=['wav'])
156
  scipy.io.wavfile.write("source_speaker_espeakng.wav", rate=base_espeakng[1], data=base_espeakng[0].T)
157
  converted_espeakng = convert_coqui('source_speaker_espeakng.wav', target_speaker)
158
 
 
 
 
159
  row1 = st.columns([1,1,2])
160
  row2 = st.columns([1,1,2])
161
  row3 = st.columns([1,1,2])
 
162
 
163
  row1[0].write("**Model**")
164
  row1[1].write("**Configuration**")
@@ -178,6 +189,11 @@ type=['wav'])
178
  row3[0].write(f"Espeak-ng")
179
  row3[1].write(f"converted")
180
  row3[2].audio(converted_espeakng[0], sample_rate = converted_espeakng[1])
 
 
 
 
 
181
 
182
 
183
  #row3[0].write("MMS-TTS-SWH")
@@ -197,12 +213,13 @@ type=['wav'])
197
  with about:
198
  #st.header("How it works")
199
  st.markdown('''# Mockingbird TTS Demo
200
- This page is a demo of the openly available Text to Speech models for various languages of interest. Currently, 3 synthesizers are supported:
201
  - [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1000 languages.[^1]
202
  - [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package;[^2] while no longer supported, Coqui acted as a hub for TTS model hosting and these models are still available.
203
  - [**ESpeak-NG**](https://github.com/espeak-ng/espeak-ng/tree/master)'s synthetic voices[^3]
 
204
 
205
- Voice conversion is achieved through Coqui.
206
 
207
  Notes:
208
  1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
@@ -219,5 +236,6 @@ Notes:
219
 
220
  [^2]: [Available models](https://github.com/coqui-ai/TTS/blob/dev/TTS/.models.json)
221
  [^3]: [Language list](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)
 
222
  ''')
223
 
 
50
  base_mms = synth_mms(tts_text, models[tts_lang]['mms'])
51
  base_coqui= synth_coqui(tts_text, models[tts_lang]['coqui'])
52
  base_espeakng= synth_espeakng(tts_text, models[tts_lang]['espeakng'])
53
+ base_toucan= synth_toucan(tts_text, models[tts_lang]['toucan'])
54
 
55
  if tts_lang=="swh":
56
  finetuned_mms1 = synth_mms(tts_text, "khof312/mms-tts-swh-female-1")
 
69
  row2 = st.columns([1,1,2])
70
  row3 = st.columns([1,1,2])
71
  row4 = st.columns([1,1,2])
72
+ row5 = st.columns([1,1,2])
73
 
74
  row1[0].write("**Model**")
75
  row1[1].write("**Configuration**")
 
86
  row3[2].audio(base_coqui[0], sample_rate = base_coqui[1])
87
 
88
  if base_espeakng is not None:
 
89
  row4[0].write(f"[Espeak-ng](https://github.com/espeak-ng/espeak-ng)")
90
  row4[1].write("default")
91
  row4[2].audio(base_espeakng[0], sample_rate = base_espeakng[1])
92
+
93
+
94
+ row5[0].write(f"[IMS-Toucan](https://github.com/DigitalPhonetics/IMS-Toucan)")
95
+ row5[1].write("default")
96
+ row5[2].audio(base_toucan[0], sample_rate = base_toucan[1])
97
+
98
 
99
  #################################################################
100
  if tts_lang == "swh":
 
163
  scipy.io.wavfile.write("source_speaker_espeakng.wav", rate=base_espeakng[1], data=base_espeakng[0].T)
164
  converted_espeakng = convert_coqui('source_speaker_espeakng.wav', target_speaker)
165
 
166
+ scipy.io.wavfile.write("source_speaker_toucan.wav", rate=base_toucan[1], data=base_toucan[0].T)
167
+ converted_toucan = convert_coqui('source_speaker_toucan.wav', target_speaker)
168
+
169
  row1 = st.columns([1,1,2])
170
  row2 = st.columns([1,1,2])
171
  row3 = st.columns([1,1,2])
172
+ row4 = st.columns([1,1,2])
173
 
174
  row1[0].write("**Model**")
175
  row1[1].write("**Configuration**")
 
189
  row3[0].write(f"Espeak-ng")
190
  row3[1].write(f"converted")
191
  row3[2].audio(converted_espeakng[0], sample_rate = converted_espeakng[1])
192
+
193
+
194
+ row4[0].write(f"IMS Toucan")
195
+ row4[1].write(f"converted")
196
+ row4[2].audio(converted_toucan[0], sample_rate = converted_toucan[1])
197
 
198
 
199
  #row3[0].write("MMS-TTS-SWH")
 
213
  with about:
214
  #st.header("How it works")
215
  st.markdown('''# Mockingbird TTS Demo
216
+ This page is a demo of the openly available Text to Speech models for various languages of interest. Currently, 4 synthesizers are supported:
217
  - [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1000 languages.[^1]
218
  - [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package;[^2] while no longer supported, Coqui acted as a hub for TTS model hosting and these models are still available.
219
  - [**ESpeak-NG**](https://github.com/espeak-ng/espeak-ng/tree/master)'s synthetic voices[^3]
220
+ - [**IMS Toucan**](https://github.com/DigitalPhonetics/IMS-Toucan), which supports 7000 languages.[^4]
221
 
222
+ Voice conversion is currently achieved through Coqui.
223
 
224
  Notes:
225
  1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
 
236
 
237
  [^2]: [Available models](https://github.com/coqui-ai/TTS/blob/dev/TTS/.models.json)
238
  [^3]: [Language list](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)
239
+ [^4]: Language list is available in the Gradio API documentation [here](https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS).
240
  ''')
241
 
src/synthesize.py CHANGED
@@ -9,6 +9,8 @@ from scipy.io import wavfile
9
  from transformers import pipeline
10
  import os
11
  import numpy as np
 
 
12
 
13
  def synth_mms(text:str, model:str):
14
  '''
@@ -86,3 +88,31 @@ def synth_espeakng(text:str, model:str):
86
  return wav, sampling_rate
87
  else:
88
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  from transformers import pipeline
10
  import os
11
  import numpy as np
12
+ from gradio_client import Client, file
13
+
14
 
15
  def synth_mms(text:str, model:str):
16
  '''
 
88
  return wav, sampling_rate
89
  else:
90
  return None
91
+
92
+
93
def synth_toucan(text:str, model:str, voice_seed:int=123, prosody_creativity:float=0.1):
    '''
    Use IMS Toucan (via the hosted Gradio Space) to synthesize text.

    Inputs:
        text: Text to synthesize.
        model: Language code passed to the Toucan API's `language` parameter.
        voice_seed: Seed controlling the synthetic voice identity
            (default 123, the value previously hard-coded).
        prosody_creativity: Amount of prosodic variation, 0-1
            (default 0.1, the value previously hard-coded).
    Returns:
        Tuple of (wav audio array, sampling rate), matching the other
        synth_* helpers in this module.

    NOTE: This wrapper does not expose the full range of options possible
    with the API. The API should allow you to generate female voices;
    however, that does not seem to be working at the moment.
    NOTE(review): performs a network call to the public Space on every
    invocation; failures surface as exceptions from gradio_client.
    '''
    # Connect to the hosted inference Space each call; the client is cheap
    # relative to the synthesis request itself.
    client = Client("Flux9665/MassivelyMultilingualTTS")
    result = client.predict(
        prompt=text,
        language=model,
        # Fixed reference audio: the API requires one, but the voice is
        # primarily driven by voice_seed/emb values below.
        reference_audio=file('https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav'),
        voice_seed=voice_seed,
        prosody_creativity=prosody_creativity,
        duration_scaling_factor=1,
        emb1=0,
        emb2=0,
        api_name="/predict"
    )
    # result[0] is a path to the generated wav file on disk.
    sampling_rate, wav = wavfile.read(result[0])
    return wav, sampling_rate