khof312 commited on
Commit
593cb11
1 Parent(s): 1eb5aca

Add support for IMS Toucan.

Browse files
Files changed (2) hide show
  1. app.py +21 -3
  2. src/synthesize.py +30 -0
app.py CHANGED
@@ -50,6 +50,7 @@ type=['wav'])
50
  base_mms = synth_mms(tts_text, models[tts_lang]['mms'])
51
  base_coqui= synth_coqui(tts_text, models[tts_lang]['coqui'])
52
  base_espeakng= synth_espeakng(tts_text, models[tts_lang]['espeakng'])
 
53
 
54
  if tts_lang=="swh":
55
  finetuned_mms1 = synth_mms(tts_text, "khof312/mms-tts-swh-female-1")
@@ -68,6 +69,7 @@ type=['wav'])
68
  row2 = st.columns([1,1,2])
69
  row3 = st.columns([1,1,2])
70
  row4 = st.columns([1,1,2])
 
71
 
72
  row1[0].write("**Model**")
73
  row1[1].write("**Configuration**")
@@ -84,10 +86,15 @@ type=['wav'])
84
  row3[2].audio(base_coqui[0], sample_rate = base_coqui[1])
85
 
86
  if base_espeakng is not None:
87
-
88
  row4[0].write(f"[Espeak-ng](https://github.com/espeak-ng/espeak-ng)")
89
  row4[1].write("default")
90
  row4[2].audio(base_espeakng[0], sample_rate = base_espeakng[1])
 
 
 
 
 
 
91
 
92
  #################################################################
93
  if tts_lang == "swh":
@@ -156,9 +163,13 @@ type=['wav'])
156
  scipy.io.wavfile.write("source_speaker_espeakng.wav", rate=base_espeakng[1], data=base_espeakng[0].T)
157
  converted_espeakng = convert_coqui('source_speaker_espeakng.wav', target_speaker)
158
 
 
 
 
159
  row1 = st.columns([1,1,2])
160
  row2 = st.columns([1,1,2])
161
  row3 = st.columns([1,1,2])
 
162
 
163
  row1[0].write("**Model**")
164
  row1[1].write("**Configuration**")
@@ -178,6 +189,11 @@ type=['wav'])
178
  row3[0].write(f"Espeak-ng")
179
  row3[1].write(f"converted")
180
  row3[2].audio(converted_espeakng[0], sample_rate = converted_espeakng[1])
 
 
 
 
 
181
 
182
 
183
  #row3[0].write("MMS-TTS-SWH")
@@ -197,12 +213,13 @@ type=['wav'])
197
  with about:
198
  #st.header("How it works")
199
  st.markdown('''# Mockingbird TTS Demo
200
- This page is a demo of the openly available Text to Speech models for various languages of interest. Currently, 3 synthesizers are supported:
201
  - [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1000 languages.[^1]
202
  - [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package;[^2] while no longer supported, Coqui acted as a hub for TTS model hosting and these models are still available.
203
  - [**ESpeak-NG**](https://github.com/espeak-ng/espeak-ng/tree/master)'s synthetic voices[^3]
 
204
 
205
- Voice conversion is achieved through Coqui.
206
 
207
  Notes:
208
  1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
@@ -219,5 +236,6 @@ Notes:
219
 
220
  [^2]: [Available models](https://github.com/coqui-ai/TTS/blob/dev/TTS/.models.json)
221
  [^3]: [Language list](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)
 
222
  ''')
223
 
 
50
  base_mms = synth_mms(tts_text, models[tts_lang]['mms'])
51
  base_coqui= synth_coqui(tts_text, models[tts_lang]['coqui'])
52
  base_espeakng= synth_espeakng(tts_text, models[tts_lang]['espeakng'])
53
+ base_toucan= synth_toucan(tts_text, models[tts_lang]['toucan'])
54
 
55
  if tts_lang=="swh":
56
  finetuned_mms1 = synth_mms(tts_text, "khof312/mms-tts-swh-female-1")
 
69
  row2 = st.columns([1,1,2])
70
  row3 = st.columns([1,1,2])
71
  row4 = st.columns([1,1,2])
72
+ row5 = st.columns([1,1,2])
73
 
74
  row1[0].write("**Model**")
75
  row1[1].write("**Configuration**")
 
86
  row3[2].audio(base_coqui[0], sample_rate = base_coqui[1])
87
 
88
  if base_espeakng is not None:
 
89
  row4[0].write(f"[Espeak-ng](https://github.com/espeak-ng/espeak-ng)")
90
  row4[1].write("default")
91
  row4[2].audio(base_espeakng[0], sample_rate = base_espeakng[1])
92
+
93
+
94
+ row5[0].write(f"[IMS-Toucan](https://github.com/DigitalPhonetics/IMS-Toucan)")
95
+ row5[1].write("default")
96
+ row5[2].audio(base_toucan[0], sample_rate = base_toucan[1])
97
+
98
 
99
  #################################################################
100
  if tts_lang == "swh":
 
163
  scipy.io.wavfile.write("source_speaker_espeakng.wav", rate=base_espeakng[1], data=base_espeakng[0].T)
164
  converted_espeakng = convert_coqui('source_speaker_espeakng.wav', target_speaker)
165
 
166
+ scipy.io.wavfile.write("source_speaker_toucan.wav", rate=base_toucan[1], data=base_toucan[0].T)
167
+ converted_toucan = convert_coqui('source_speaker_toucan.wav', target_speaker)
168
+
169
  row1 = st.columns([1,1,2])
170
  row2 = st.columns([1,1,2])
171
  row3 = st.columns([1,1,2])
172
+ row4 = st.columns([1,1,2])
173
 
174
  row1[0].write("**Model**")
175
  row1[1].write("**Configuration**")
 
189
  row3[0].write(f"Espeak-ng")
190
  row3[1].write(f"converted")
191
  row3[2].audio(converted_espeakng[0], sample_rate = converted_espeakng[1])
192
+
193
+
194
+ row4[0].write(f"IMS Toucan")
195
+ row4[1].write(f"converted")
196
+ row4[2].audio(converted_toucan[0], sample_rate = converted_toucan[1])
197
 
198
 
199
  #row3[0].write("MMS-TTS-SWH")
 
213
  with about:
214
  #st.header("How it works")
215
  st.markdown('''# Mockingbird TTS Demo
216
+ This page is a demo of the openly available Text to Speech models for various languages of interest. Currently, 4 synthesizers are supported:
217
  - [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1000 languages.[^1]
218
  - [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package;[^2] while no longer supported, Coqui acted as a hub for TTS model hosting and these models are still available.
219
  - [**ESpeak-NG**](https://github.com/espeak-ng/espeak-ng/tree/master)'s synthetic voices[^3]
220
+ - [**IMS Toucan**](https://github.com/DigitalPhonetics/IMS-Toucan), which supports 7000 languages.[^4]
221
 
222
+ Voice conversion is currently achieved through Coqui.
223
 
224
  Notes:
225
  1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
 
236
 
237
  [^2]: [Available models](https://github.com/coqui-ai/TTS/blob/dev/TTS/.models.json)
238
  [^3]: [Language list](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)
239
+ [^4]: Language list is available in the Gradio API documentation [here](https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS).
240
  ''')
241
 
src/synthesize.py CHANGED
@@ -9,6 +9,8 @@ from scipy.io import wavfile
9
  from transformers import pipeline
10
  import os
11
  import numpy as np
 
 
12
 
13
  def synth_mms(text:str, model:str):
14
  '''
@@ -86,3 +88,31 @@ def synth_espeakng(text:str, model:str):
86
  return wav, sampling_rate
87
  else:
88
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  from transformers import pipeline
10
  import os
11
  import numpy as np
12
+ from gradio_client import Client, file
13
+
14
 
15
  def synth_mms(text:str, model:str):
16
  '''
 
88
  return wav, sampling_rate
89
  else:
90
  return None
91
+
92
+
93
def synth_toucan(text:str, model:str, voice_seed:int=123, prosody_creativity:float=0.1):
    '''
    Use IMS Toucan (via the hosted Gradio Space) to synthesize text.

    Inputs:
        text: Text to synthesize.
        model: Language code passed to the Toucan API's `language` parameter.
        voice_seed: Seed controlling the synthetic voice identity
            (default 123, the value previously hard-coded).
        prosody_creativity: Amount of prosodic variation, 0-1
            (default 0.1, the value previously hard-coded).
    Returns:
        Tuple of (wav audio array, sampling rate), matching the other
        synth_* helpers in this module.

    NOTE: This wrapper does not expose the full range of options possible
    with the API. The API should allow you to generate female voices;
    however, that does not seem to be working at the moment.
    NOTE(review): performs a network call to the public Space on every
    invocation; failures surface as exceptions from gradio_client.
    '''
    # Connect to the hosted inference Space each call; the client is cheap
    # relative to the synthesis request itself.
    client = Client("Flux9665/MassivelyMultilingualTTS")
    result = client.predict(
        prompt=text,
        language=model,
        # Fixed reference audio: the API requires one, but the voice is
        # primarily driven by voice_seed/emb values below.
        reference_audio=file('https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav'),
        voice_seed=voice_seed,
        prosody_creativity=prosody_creativity,
        duration_scaling_factor=1,
        emb1=0,
        emb2=0,
        api_name="/predict"
    )
    # result[0] is a path to the generated wav file on disk.
    sampling_rate, wav = wavfile.read(result[0])
    return wav, sampling_rate