latuan committed
Commit 3bcafb5
1 Parent(s): 40e4f0c
Files changed (4)
  1. __pycache__/app.cpython-38.pyc +0 -0
  2. app.py +197 -4
  3. flagged/log.csv +4 -0
  4. requirements.txt +11 -0
__pycache__/app.cpython-38.pyc ADDED
Binary file (610 Bytes).
 
app.py CHANGED
@@ -1,7 +1,200 @@
+ import os
+ import re
+ import io
+ import torch
+ import requests
+ import torchaudio
+ import numpy as np
  import gradio as gr
+ from uroman import uroman
+ from pydub import AudioSegment
+ from datasets import load_dataset
+ from IPython.display import Audio
+ from scipy.signal import butter, lfilter
+ from speechbrain.pretrained import EncoderClassifier
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 
- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ # Variables
+ spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
+ dataset_name = "truong-xuan-linh/vi-xvector-speechbrain"
+ cache_dir = "temp/"
+ default_model_name = "truong-xuan-linh/speecht5-vietnamese-voiceclone-lsvsc"
+ speaker_id = "speech_dataset_denoised"
+
+ # Active device
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Load models and datasets
+ speaker_model = EncoderClassifier.from_hparams(
+     source=spk_model_name,
+     run_opts={"device": device},
+     savedir=os.path.join("/tmp", spk_model_name),
+ )
+ dataset = load_dataset(
+     dataset_name,
+     download_mode="force_redownload",
+     verification_mode="no_checks",
+     cache_dir=cache_dir,
+     revision="5ea5e4345258333cbc6d1dd2544f6c658e66a634"
+ )
+ dataset = dataset["train"].to_list()
+ # Map each speaker_id to its precomputed x-vector embedding
+ dataset_dict = {}
+ for rc in dataset:
+     dataset_dict[rc["speaker_id"]] = rc["embedding"]
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
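+ # The HiFi-GAN vocoder turns the model's mel-spectrogram output into a waveform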
+
+ # Model utility functions
+ def remove_special_characters(sentence):
+     # Keep only Latin letters, whitespace, commas, periods, and the Vietnamese
+     # character range (U+00C0 to U+1EF9); everything else becomes " ,"
+     sentence_after_removal = re.sub(r'[^a-zA-Z\s,.\u00C0-\u1EF9]', ' ,', sentence)
+     return sentence_after_removal
+
+ def create_speaker_embedding(waveform):
+     with torch.no_grad():
+         speaker_embeddings = speaker_model.encode_batch(waveform)
+         speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=-1)
+     return speaker_embeddings
+
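+ # Butterworth band-pass helpers; defined but not called anywhere in this app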
+ def butter_bandpass(lowcut, highcut, fs, order=5):
+     # Normalize the cutoff frequencies by the Nyquist frequency (fs / 2)
+     nyq = 0.5 * fs
+     low = lowcut / nyq
+     high = highcut / nyq
+     b, a = butter(order, [low, high], btype='band')
+     return b, a
+
+ def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
+     b, a = butter_bandpass(lowcut, highcut, fs, order=order)
+     y = lfilter(b, a, data)
+     return y
+
+ def korean_splitter(string):
+     # Find all runs of Hangul syllables
+     pattern = re.compile('[가-힣]+')
+     matches = pattern.findall(string)
+     return matches
+
+ def uroman_normalization(string):
+     # Romanize Korean substrings so the tokenizer can process them
+     korean_inputs = korean_splitter(string)
+     for korean_input in korean_inputs:
+         korean_roman = uroman(korean_input)
+         string = string.replace(korean_input, korean_roman)
+     return string
+
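+ # Model wraps a SpeechT5 checkpoint and its processor; speaker identity comes
+ # from a 512-dim x-vector embedding (downloaded, precomputed, or zeroed)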
+ # Model class
+ class Model():
+     def __init__(self, model_name, speaker_url=""):
+         self.model_name = model_name
+         self.processor = SpeechT5Processor.from_pretrained(model_name)
+         self.model = SpeechT5ForTextToSpeech.from_pretrained(model_name)
+         self.model.eval()
+
+         self.speaker_url = speaker_url
+         if speaker_url:
+             # Download the reference audio and convert it to 16 kHz, 16-bit mono
+             print(f"Downloading speaker audio from {speaker_url}")
+             response = requests.get(speaker_url)
+             audio_stream = io.BytesIO(response.content)
+             audio_segment = AudioSegment.from_file(audio_stream, format="wav")
+             audio_segment = audio_segment.set_channels(1)
+             audio_segment = audio_segment.set_frame_rate(16000)
+             audio_segment = audio_segment.set_sample_width(2)
+             waveform, _ = torchaudio.load(audio_segment.export())
+             self.speaker_embeddings = create_speaker_embedding(waveform)[0]
+         else:
+             self.speaker_embeddings = None
+
+         if model_name in ("truong-xuan-linh/speecht5-vietnamese-commonvoice", "truong-xuan-linh/speecht5-irmvivoice"):
+             self.speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file
+
+     def inference(self, text, speaker_id=None):
+         # For voice-clone checkpoints without a reference URL, look the speaker
+         # up in the precomputed embedding dataset
+         if "voiceclone" in self.model_name:
+             if not self.speaker_url:
+                 self.speaker_embeddings = torch.tensor(dataset_dict[speaker_id])
+
+         with torch.no_grad():
+             full_speech = []
+             separators = r";|\.|!|\?|\n"
+             text = uroman_normalization(text)
+             text = remove_special_characters(text)
+             text = text.replace(" ", "▁")
+             split_texts = re.split(separators, text)
+
+             # Synthesize each sentence-like chunk separately, then concatenate
+             for split_text in split_texts:
+                 if split_text != "▁":
+                     split_text = split_text.lower() + "▁"
+                     print(split_text)
+                     inputs = self.processor.tokenizer(text=split_text, return_tensors="pt")
+                     speech = self.model.generate_speech(inputs["input_ids"], threshold=0.5, speaker_embeddings=self.speaker_embeddings, vocoder=vocoder)
+                     full_speech.append(speech.numpy())
+             return np.concatenate(full_speech)
+
+     @staticmethod
+     def moving_average(data, window_size):
+         # Simple smoothing helper (not called in this app)
+         return np.convolve(data, np.ones(window_size)/window_size, mode='same')
+
+ # Initialize the single global model instance used by the app
+ model = Model(
+     model_name=default_model_name,
+     speaker_url=""
+ )
+
+ # Audio processing functions
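+ # read_srt expects 4-line cues: index, "start --> end" timestamps, one text
+ # line, and a blank separator (multi-line cues are not handled)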
+ def read_srt(file_path):
+     subtitles = []
+     with open(file_path, 'r', encoding='utf-8') as file:
+         lines = file.readlines()
+
+     for i in range(0, len(lines), 4):
+         if i+2 < len(lines):
+             start_time, end_time = lines[i+1].strip().split(' --> ')
+             text = lines[i+2].strip()
+             subtitles.append((start_time, end_time, text))
+
+     return subtitles
+
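+ # SRT timestamps look like "00:01:02,345"; the comma separates the milliseconds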
+ def time_to_seconds(time_str):
+     h, m, s = time_str.split(':')
+     seconds = int(h) * 3600 + int(m) * 60 + float(s.replace(',', '.'))
+     return seconds
+
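+ # The pause between cues is the gap from one cue's end to the next cue's start,
+ # rendered as zero samples at the 16 kHz sample rate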
+ def generate_audio_with_pause(srt_file_path):
+     subtitles = read_srt(srt_file_path)
+     audio_clips = []
+
+     for i, (start_time, end_time, text) in enumerate(subtitles):
+         # Synthesize the cue and peak-normalize it
+         audio_data = model.inference(text=text, speaker_id=speaker_id)
+         audio_data = audio_data / np.max(np.abs(audio_data))
+         audio_clips.append(audio_data)
+
+         if i < len(subtitles) - 1:
+             next_start_time = subtitles[i + 1][0]
+             pause_duration = time_to_seconds(next_start_time) - time_to_seconds(end_time)
+             if pause_duration > 0:
+                 pause_samples = int(pause_duration * 16000)
+                 audio_clips.append(np.zeros(pause_samples))
+
+     final_audio = np.concatenate(audio_clips)
+
+     return final_audio
+
+ def srt_to_audio(srt_file):
+     audio_data = generate_audio_with_pause(srt_file.name)
+     output_path = os.path.join(cache_dir, 'output.wav')
+     torchaudio.save(output_path, torch.tensor(audio_data).unsqueeze(0), 16000)
+     return output_path
+
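+ # Uploading a file fires inp.change, which runs srt_to_audio and fills the player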
+ # UI display
+ css = '''
+ #title{text-align: center}
+ '''
+ with gr.Blocks(css=css) as demo:
+     title = gr.HTML(
+         """<h1>SRT to Audio Tool</h1>""",
+         elem_id="title",
+     )
+     # NOTE: type="file" is Gradio 3.x syntax; Gradio 4 expects type="filepath"
+     inp = gr.File(label="Upload SRT file", file_count="single", type="file")
+     out = gr.Audio(label="Generated Audio", type="filepath")
+
+     inp.change(fn=srt_to_audio, inputs=inp, outputs=out)
+
+ if __name__ == "__main__":
+     demo.launch()
flagged/log.csv ADDED
@@ -0,0 +1,4 @@
+ name,output,flag,username,timestamp
+ asdasdasdasdasd,Hello asdasdasdasdasd!!,,,2024-08-21 09:52:15.746931
+ asdasdasdasdasd,Hello asdasdasdasdasd!!,,,2024-08-21 09:52:18.666674
+ asdasdasdasdasd,Hello asdasdasdasdasd!!,,,2024-08-21 09:52:27.597313
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ torch==2.1.2
+ numpy==1.23.5
+ transformers==4.38.2
+ uroman-python==1.2.8.1
+ datasets==2.16.1
+ deepfilternet==0.5.6
+ torchaudio==2.1.2
+ librosa==0.10.0
+ pydub==0.25.1
+ speechbrain==0.5.16
+ moviepy