Flux9665 committed
Commit ff32d8e
Parent: 558df2f

adapt the version number to the release

Files changed (2)
  1. app.py +1 -6
  2. app_future.py +193 -0
app.py CHANGED
@@ -2,7 +2,7 @@ import os
 
 import spaces
 
-os.system("git clone --branch v2.5 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
+os.system("git clone --branch v3.0 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
 os.system("mv toucan_codebase/* .")
 
 from run_model_downloader import download_models
@@ -39,7 +39,6 @@ class ControllableInterface(torch.nn.Module):
              language,
              accent,
              voice_seed,
-             prosody_creativity,
              duration_scaling_factor,
              pause_duration_scaling_factor,
              pitch_variance_scale,
@@ -114,7 +113,6 @@ class ControllableInterface(torch.nn.Module):
                                   energy_variance_scale=energy_variance_scale,
                                   pause_duration_scaling_factor=pause_duration_scaling_factor,
                                   return_plot_as_filepath=True,
-                                  prosody_creativity=prosody_creativity,
                                   loudness_in_db=loudness_in_db)
         return sr, wav, fig
 
@@ -132,7 +130,6 @@ controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)
 def read(prompt,
          language,
          voice_seed,
-         prosody_creativity,
          duration_scaling_factor,
          pitch_variance_scale,
         energy_variance_scale,
@@ -147,7 +144,6 @@ def read(prompt,
                                             language.split(" ")[-1].split("(")[1].split(")")[0],
                                             language.split(" ")[-1].split("(")[1].split(")")[0],
                                             voice_seed,
-                                            prosody_creativity,
                                             duration_scaling_factor,
                                             1.,
                                             pitch_variance_scale,
@@ -177,7 +173,6 @@ iface = gr.Interface(fn=read,
                              gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
                                        value=279,
                                        label="Random Seed for the artificial Voice"),
-                             gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.7, label="Prosody Creativity"),
                              gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
                              gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
                              gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
app_future.py ADDED
@@ -0,0 +1,193 @@
+import os
+
+import spaces
+
+os.system("git clone --branch v3.1 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
+os.system("mv toucan_codebase/* .")
+
+from run_model_downloader import download_models
+
+download_models()
+
+import gradio as gr
+import torch.cuda
+from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
+from Utility.utils import float2pcm
+
+import os
+
+import torch
+
+from Architectures.ControllabilityGAN.GAN import GanWrapper
+from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
+from Utility.storage_config import MODELS_DIR
+
+
+class ControllableInterface(torch.nn.Module):
+
+    def __init__(self, available_artificial_voices=1000):
+        super().__init__()
+        self.model = ToucanTTSInterface(device="cpu", tts_model_path="Meta")
+        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cpu")
+        self.generated_speaker_embeds = list()
+        self.available_artificial_voices = available_artificial_voices
+        self.current_language = ""
+        self.current_accent = ""
+
+    def read(self,
+             prompt,
+             language,
+             accent,
+             voice_seed,
+             prosody_creativity,
+             duration_scaling_factor,
+             pause_duration_scaling_factor,
+             pitch_variance_scale,
+             energy_variance_scale,
+             emb_slider_1,
+             emb_slider_2,
+             emb_slider_3,
+             emb_slider_4,
+             emb_slider_5,
+             emb_slider_6,
+             loudness_in_db
+             ):
+        if self.current_language != language:
+            self.model.set_phonemizer_language(language)
+            self.current_language = language
+        if self.current_accent != accent:
+            self.model.set_accent_language(accent)
+            self.current_accent = accent
+
+        self.wgan.set_latent(voice_seed)
+        controllability_vector = torch.tensor([emb_slider_1,
+                                               emb_slider_2,
+                                               emb_slider_3,
+                                               emb_slider_4,
+                                               emb_slider_5,
+                                               emb_slider_6], dtype=torch.float32)
+        embedding = self.wgan.modify_embed(controllability_vector)
+        self.model.set_utterance_embedding(embedding=embedding)
+
+        phones = self.model.text2phone.get_phone_string(prompt)
+        if len(phones) > 1800:
+            if language == "deu":
+                prompt = "Deine Eingabe war zu lang. Bitte versuche es entweder mit einem kürzeren Text oder teile ihn in mehrere Teile auf."
+            elif language == "ell":
+                prompt = "Η εισήγησή σας ήταν πολύ μεγάλη. Παρακαλώ δοκιμάστε είτε ένα μικρότερο κείμενο είτε χωρίστε το σε διάφορα μέρη."
+            elif language == "spa":
+                prompt = "Su entrada es demasiado larga. Por favor, intente un texto más corto o divídalo en varias partes."
+            elif language == "fin":
+                prompt = "Vastauksesi oli liian pitkä. Kokeile joko lyhyempää tekstiä tai jaa se useampaan osaan."
+            elif language == "rus":
+                prompt = "Ваш текст слишком длинный. Пожалуйста, попробуйте либо сократить текст, либо разделить его на несколько частей."
+            elif language == "hun":
+                prompt = "Túl hosszú volt a bevitele. Kérjük, próbáljon meg rövidebb szöveget írni, vagy ossza több részre."
+            elif language == "nld":
+                prompt = "Uw input was te lang. Probeer een kortere tekst of splits het in verschillende delen."
+            elif language == "fra":
+                prompt = "Votre saisie était trop longue. Veuillez essayer un texte plus court ou le diviser en plusieurs parties."
+            elif language == 'pol':
+                prompt = "Twój wpis był zbyt długi. Spróbuj skrócić tekst lub podzielić go na kilka części."
+            elif language == 'por':
+                prompt = "O seu contributo foi demasiado longo. Por favor, tente um texto mais curto ou divida-o em várias partes."
+            elif language == 'ita':
+                prompt = "Il tuo input era troppo lungo. Per favore, prova un testo più corto o dividilo in più parti."
+            elif language == 'cmn':
+                prompt = "你的输入太长了。请尝试使用较短的文本或将其拆分为多个部分。"
+            elif language == 'vie':
+                prompt = "Đầu vào của bạn quá dài. Vui lòng thử một văn bản ngắn hơn hoặc chia nó thành nhiều phần."
+            else:
+                prompt = "Your input was too long. Please try either a shorter text or split it into several parts."
+                if self.current_language != "eng":
+                    self.model.set_phonemizer_language("eng")
+                    self.current_language = "eng"
+                if self.current_accent != "eng":
+                    self.model.set_accent_language("eng")
+                    self.current_accent = "eng"
+
+        print(prompt)
+        wav, sr, fig = self.model(prompt,
+                                  input_is_phones=False,
+                                  duration_scaling_factor=duration_scaling_factor,
+                                  pitch_variance_scale=pitch_variance_scale,
+                                  energy_variance_scale=energy_variance_scale,
+                                  pause_duration_scaling_factor=pause_duration_scaling_factor,
+                                  return_plot_as_filepath=True,
+                                  prosody_creativity=prosody_creativity,
+                                  loudness_in_db=loudness_in_db)
+        return sr, wav, fig
+
+
+title = "Controllable Text-to-Speech for over 7000 Languages"
+article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"
+available_artificial_voices = 1000
+path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
+iso_to_name = load_json_from_path(path_to_iso_list)
+text_selection = [f"{iso_to_name[iso_code]} Text ({iso_code})" for iso_code in iso_to_name]
+controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)
+
+
+@spaces.GPU
+def read(prompt,
+         language,
+         voice_seed,
+         prosody_creativity,
+         duration_scaling_factor,
+         pitch_variance_scale,
+         energy_variance_scale,
+         emb1,
+         emb2
+         ):
+    if torch.cuda.is_available():
+        controllable_ui.to("cuda")
+        controllable_ui.device = "cuda"
+    try:
+        sr, wav, fig = controllable_ui.read(prompt,
+                                            language.split(" ")[-1].split("(")[1].split(")")[0],
+                                            language.split(" ")[-1].split("(")[1].split(")")[0],
+                                            voice_seed,
+                                            prosody_creativity,
+                                            duration_scaling_factor,
+                                            1.,
+                                            pitch_variance_scale,
+                                            energy_variance_scale,
+                                            emb1,
+                                            emb2,
+                                            0.,
+                                            0.,
+                                            0.,
+                                            0.,
+                                            -24.)
+    finally:
+        controllable_ui.to("cpu")
+        controllable_ui.device = "cpu"
+    return (sr, float2pcm(wav)), fig
+
+
+iface = gr.Interface(fn=read,
+                     inputs=[gr.Textbox(lines=2,
+                                        placeholder="write what you want the synthesis to read here...",
+                                        value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
+                                        label="Text input"),
+                             gr.Dropdown(text_selection,
+                                         type="value",
+                                         value='English Text (eng)',
+                                         label="Select the Language of the Text (type on your keyboard to find it quickly)"),
+                             gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
+                                       value=279,
+                                       label="Random Seed for the artificial Voice"),
+                             gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.7, label="Prosody Creativity"),
+                             gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
+                             gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
+                             gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
+                             gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
+                             gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
+                             ],
+                     outputs=[gr.Audio(type="numpy", label="Speech"),
+                              gr.Image(label="Visualization")],
+                     title=title,
+                     theme="default",
+                     allow_flagging="never",
+                     article=article)
+iface.launch()
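
Note on the language dropdown: both app.py and app_future.py recover the ISO code from the selected dropdown label, which text_selection builds as f"{iso_to_name[iso_code]} Text ({iso_code})". The snippet below is a minimal, hypothetical sketch (the standalone helper extract_iso_code is not part of the commit) of what the nested split chain inside read() does.

    def extract_iso_code(label: str) -> str:
        # "English Text (eng)" -> last whitespace-separated token is "(eng)",
        # then keep whatever sits between the parentheses.
        # Works because the ISO code itself never contains a space.
        return label.split(" ")[-1].split("(")[1].split(")")[0]

    print(extract_iso_code("English Text (eng)"))  # prints: eng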