Flux9665 committed on
Commit
6cd09aa
1 Parent(s): 185fc75

make read function device agnostic

Browse files
Files changed (1) hide show
  1. app.py +168 -61
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
- import spaces
3
 
 
4
 
5
  os.system("git clone --branch v2.5 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
6
  os.system("mv toucan_codebase/* .")
@@ -11,76 +11,183 @@ download_models()
11
 
12
  import gradio as gr
13
  import torch.cuda
14
- from InferenceInterfaces.ControllableInterface import ControllableInterface
15
  from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
16
  from Utility.utils import float2pcm
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
class TTSWebUI:
    """Gradio front end that feeds user input to a ControllableInterface.

    Constructing an instance loads the language list, creates the synthesis
    backend, builds the Gradio interface and launches it immediately.
    """

    def __init__(self, gpu_id="cpu", title="Controllable Text-to-Speech for over 7000 Languages", article="", available_artificial_voices=1000, path_to_iso_list="Preprocessing/multilinguality/iso_to_fullname.json"):
        """Build and launch the web UI.

        Args:
            gpu_id: Device identifier handed to ControllableInterface.
            title: Heading shown on the page.
            article: Text passed to Gradio as the interface's ``article``.
            available_artificial_voices: Upper bound of the voice-seed slider,
                also forwarded to ControllableInterface.
            path_to_iso_list: JSON file whose keys are used as language codes
                and whose values are used as the displayed language names.
        """
        iso_to_name = load_json_from_path(path_to_iso_list)
        # Dropdown entries look like "<name> Text (<code>)"; read() parses the
        # code back out of the trailing parentheses.
        text_selection = [f"{full_name} Text ({code})" for code, full_name in iso_to_name.items()]

        self.controllable_ui = ControllableInterface(gpu_id=gpu_id,
                                                     available_artificial_voices=available_artificial_voices)

        # Order of the widgets must match the parameter order of read().
        input_widgets = [
            gr.Textbox(lines=2,
                       placeholder="write what you want the synthesis to read here...",
                       value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
                       label="Text input"),
            gr.Dropdown(text_selection,
                        type="value",
                        value='English Text (eng)',
                        label="Select the Language of the Text (type on your keyboard to find it quickly)"),
            gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
                      value=279,
                      label="Random Seed for the artificial Voice"),
            gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.7, label="Prosody Creativity"),
            gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
            gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
            gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
            gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
            gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth"),
        ]
        output_widgets = [
            gr.Audio(type="numpy", label="Speech"),
            gr.Image(label="Visualization"),
        ]

        self.iface = gr.Interface(fn=self.read,
                                  inputs=input_widgets,
                                  outputs=output_widgets,
                                  title=title,
                                  theme="default",
                                  allow_flagging="never",
                                  article=article)
        self.iface.launch()

    @spaces.GPU
    def read(self,
             prompt,
             language,
             voice_seed,
             prosody_creativity,
             duration_scaling_factor,
             pitch_variance_scale,
             energy_variance_scale,
             emb1,
             emb2
             ):
        """Synthesize ``prompt`` and return ``((sample_rate, pcm_audio), figure)``.

        ``language`` is the dropdown label; the code inside its trailing
        parentheses is passed to the backend twice (second and third
        positional argument).
        """
        iso_code = language.split(" ")[-1].split("(")[1].split(")")[0]
        # NOTE(review): the fixed positional values (1., four 0. and -24.) are
        # presumably pause scale, unused embedding sliders and loudness in dB;
        # confirm against ControllableInterface.read.
        sr, wav, fig = self.controllable_ui.read(prompt,
                                                 iso_code,
                                                 iso_code,
                                                 voice_seed,
                                                 prosody_creativity,
                                                 duration_scaling_factor,
                                                 1.,
                                                 pitch_variance_scale,
                                                 energy_variance_scale,
                                                 emb1,
                                                 emb2,
                                                 0.,
                                                 0.,
                                                 0.,
                                                 0.,
                                                 -24.)
        return (sr, float2pcm(wav)), fig
84
-
85
-
86
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_device = "cuda" if torch.cuda.is_available() else "cpu"
TTSWebUI(gpu_id=_device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
2
 
3
+ import spaces
4
 
5
  os.system("git clone --branch v2.5 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
6
  os.system("mv toucan_codebase/* .")
 
11
 
12
  import gradio as gr
13
  import torch.cuda
 
14
  from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
15
  from Utility.utils import float2pcm
16
 
17
+ import os
18
+
19
+ import torch
20
+
21
+ from Architectures.ControllabilityGAN.GAN import GanWrapper
22
+ from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
23
+ from Utility.storage_config import MODELS_DIR
24
+
25
+
26
class ControllableInterface(torch.nn.Module):
    """Bundle the TTS model and the embedding GAN behind a single ``read`` call.

    Both components are created on the CPU. The instance remembers the last
    language and accent it configured so that the model is only reconfigured
    when one of them changes.
    """

    # Prompts longer than this many phone symbols are replaced by a notice.
    _MAX_PHONES = 1800

    # Replacement prompts for over-long inputs, keyed by language code.
    _TOO_LONG_NOTICES = {
        "deu": "Deine Eingabe war zu lang. Bitte versuche es entweder mit einem kürzeren Text oder teile ihn in mehrere Teile auf.",
        "ell": "Η εισήγησή σας ήταν πολύ μεγάλη. Παρακαλώ δοκιμάστε είτε ένα μικρότερο κείμενο είτε χωρίστε το σε διάφορα μέρη.",
        "spa": "Su entrada es demasiado larga. Por favor, intente un texto más corto o divídalo en varias partes.",
        "fin": "Vastauksesi oli liian pitkä. Kokeile joko lyhyempää tekstiä tai jaa se useampaan osaan.",
        "rus": "Ваш текст слишком длинный. Пожалуйста, попробуйте либо сократить текст, либо разделить его на несколько частей.",
        "hun": "Túl hosszú volt a bevitele. Kérjük, próbáljon meg rövidebb szöveget írni, vagy ossza több részre.",
        "nld": "Uw input was te lang. Probeer een kortere tekst of splits het in verschillende delen.",
        "fra": "Votre saisie était trop longue. Veuillez essayer un texte plus court ou le diviser en plusieurs parties.",
        "pol": "Twój wpis był zbyt długi. Spróbuj skrócić tekst lub podzielić go na kilka części.",
        "por": "O seu contributo foi demasiado longo. Por favor, tente um texto mais curto ou divida-o em várias partes.",
        "ita": "Il tuo input era troppo lungo. Per favore, prova un testo più corto o dividilo in più parti.",
        "cmn": "你的输入太长了。请尝试使用较短的文本或将其拆分为多个部分。",
        "vie": "Đầu vào của bạn quá dài. Vui lòng thử một văn bản ngắn hơn hoặc chia nó thành nhiều phần.",
    }

    # Used when no notice exists for the requested language.
    _TOO_LONG_FALLBACK = "Your input was too long. Please try either a shorter text or split it into several parts."

    def __init__(self, available_artificial_voices=1000):
        """Load the TTS model and the embedding GAN on the CPU.

        Args:
            available_artificial_voices: Stored on the instance; not otherwise
                used inside this class.
        """
        super().__init__()
        self.model = ToucanTTSInterface(device="cpu", tts_model_path="Meta")
        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cpu")
        self.generated_speaker_embeds = list()
        self.available_artificial_voices = available_artificial_voices
        # Empty strings guarantee the first read() configures the model.
        self.current_language = ""
        self.current_accent = ""

    def _switch_language(self, language, accent):
        """Reconfigure phonemizer language and accent only if they changed."""
        if self.current_language != language:
            self.model.set_phonemizer_language(language)
            self.current_language = language
        if self.current_accent != accent:
            self.model.set_accent_language(accent)
            self.current_accent = accent

    def read(self,
             prompt,
             language,
             accent,
             voice_seed,
             prosody_creativity,
             duration_scaling_factor,
             pause_duration_scaling_factor,
             pitch_variance_scale,
             energy_variance_scale,
             emb_slider_1,
             emb_slider_2,
             emb_slider_3,
             emb_slider_4,
             emb_slider_5,
             emb_slider_6,
             loudness_in_db
             ):
        """Synthesize ``prompt`` with the requested voice and prosody settings.

        Args:
            prompt: Text to be spoken.
            language: Language code handed to the phonemizer.
            accent: Language code handed to the model as accent.
            voice_seed: Value passed to the GAN's ``set_latent``.
            prosody_creativity, duration_scaling_factor,
            pause_duration_scaling_factor, pitch_variance_scale,
            energy_variance_scale, loudness_in_db: Forwarded unchanged to the
                TTS model call.
            emb_slider_1 .. emb_slider_6: Packed into one float32 tensor that
                the GAN uses to modify the speaker embedding.

        Returns:
            Tuple ``(sr, wav, fig)`` in that order, re-ordered from the
            model's ``(wav, sr, fig)``. The model is called with
            ``return_plot_as_filepath=True``.
        """
        self._switch_language(language, accent)

        # Derive the speaker embedding from the seed plus the six sliders.
        self.wgan.set_latent(voice_seed)
        slider_values = [emb_slider_1,
                         emb_slider_2,
                         emb_slider_3,
                         emb_slider_4,
                         emb_slider_5,
                         emb_slider_6]
        control = torch.tensor(slider_values, dtype=torch.float32)
        speaker_embedding = self.wgan.modify_embed(control)
        self.model.set_utterance_embedding(embedding=speaker_embedding)

        # Guard against over-long inputs: speak a short notice instead.
        phone_string = self.model.text2phone.get_phone_string(prompt)
        if len(phone_string) > self._MAX_PHONES:
            notice = self._TOO_LONG_NOTICES.get(language)
            if notice is None:
                # No notice in this language: speak the English one, which
                # requires switching the model to English first.
                notice = self._TOO_LONG_FALLBACK
                self._switch_language("eng", "eng")
            prompt = notice

        print(prompt)
        wav, sr, fig = self.model(prompt,
                                  input_is_phones=False,
                                  duration_scaling_factor=duration_scaling_factor,
                                  pitch_variance_scale=pitch_variance_scale,
                                  energy_variance_scale=energy_variance_scale,
                                  pause_duration_scaling_factor=pause_duration_scaling_factor,
                                  return_plot_as_filepath=True,
                                  prosody_creativity=prosody_creativity,
                                  loudness_in_db=loudness_in_db)
        return sr, wav, fig
120
+
121
+
122
# Static texts and limits of the demo page.
title = "Controllable Text-to-Speech for over 7000 Languages"
article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"
available_artificial_voices = 1000
path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"

# Language dropdown entries, formatted as "<name> Text (<code>)".
iso_to_name = load_json_from_path(path_to_iso_list)
text_selection = [f"{full_name} Text ({code})" for code, full_name in iso_to_name.items()]

# Created once at import time; the class builds its components on the CPU.
controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)
129
+
130
+
131
@spaces.GPU
def read(prompt,
         language,
         voice_seed,
         prosody_creativity,
         duration_scaling_factor,
         pitch_variance_scale,
         energy_variance_scale,
         emb1,
         emb2
         ):
    """Gradio callback: synthesize ``prompt`` and return audio plus a figure.

    The shared ``controllable_ui`` is moved to CUDA for the call when CUDA is
    available and is always moved back to the CPU afterwards, even if the
    synthesis raises.

    Args:
        prompt: Text to be spoken.
        language: Dropdown label; the code inside its trailing parentheses is
            used as both language and accent.
        voice_seed, prosody_creativity, duration_scaling_factor,
        pitch_variance_scale, energy_variance_scale: Forwarded unchanged.
        emb1, emb2: Values for the first two embedding sliders.

    Returns:
        ``((sample_rate, pcm_audio), figure)``.
    """
    if torch.cuda.is_available():
        controllable_ui.to("cuda")
        controllable_ui.device = "cuda"
    try:
        iso_code = language.split(" ")[-1].split("(")[1].split(")")[0]
        sr, wav, fig = controllable_ui.read(prompt=prompt,
                                            language=iso_code,
                                            accent=iso_code,
                                            voice_seed=voice_seed,
                                            prosody_creativity=prosody_creativity,
                                            duration_scaling_factor=duration_scaling_factor,
                                            pause_duration_scaling_factor=1.,
                                            pitch_variance_scale=pitch_variance_scale,
                                            energy_variance_scale=energy_variance_scale,
                                            emb_slider_1=emb1,
                                            emb_slider_2=emb2,
                                            # Sliders 3-6 are not exposed in the UI.
                                            emb_slider_3=0.,
                                            emb_slider_4=0.,
                                            emb_slider_5=0.,
                                            emb_slider_6=0.,
                                            loudness_in_db=-24.)
    finally:
        controllable_ui.to("cpu")
        controllable_ui.device = "cpu"
    return (sr, float2pcm(wav)), fig
166
+
167
+
168
# Input widgets, in the same order as the parameters of read().
demo_inputs = [
    gr.Textbox(lines=2,
               placeholder="write what you want the synthesis to read here...",
               value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
               label="Text input"),
    gr.Dropdown(text_selection,
                type="value",
                value='English Text (eng)',
                label="Select the Language of the Text (type on your keyboard to find it quickly)"),
    gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
              value=279,
              label="Random Seed for the artificial Voice"),
    gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.7, label="Prosody Creativity"),
    gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
    gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
    gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
    gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
    gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth"),
]

# Output widgets, matching the two values returned by read().
demo_outputs = [
    gr.Audio(type="numpy", label="Speech"),
    gr.Image(label="Visualization"),
]

iface = gr.Interface(fn=read,
                     inputs=demo_inputs,
                     outputs=demo_outputs,
                     title=title,
                     theme="default",
                     allow_flagging="never",
                     article=article)
iface.launch()