Spaces:

latuan
/

SRT-to-Audio

Running

App Files Community

latuan committed on Aug 23

Commit

c4c4b24

•

1 Parent(s): 36a27a4

ver 2.0.0

Browse files

Files changed (1) hide show

app.py +40 -7

app.py CHANGED Viewed

@@ -175,7 +175,7 @@ def time_to_seconds(time_str):
 def closest_speedup_factor(factor, allowed_factors):
     """Return the allowed factor nearest to *factor*, increased by 0.1.

     The nearest entry is the one with the smallest absolute difference
     from ``factor``; when several entries are equally close, the one that
     appears first in ``allowed_factors`` is used. ``min`` raises
     ``ValueError`` if ``allowed_factors`` is empty.
     """
     def distance(candidate):
         return abs(candidate - factor)

     nearest = min(allowed_factors, key=distance)
     # The fixed 0.1 offset is applied on top of the selected factor.
     return nearest + 0.1
-def generate_audio_with_pause(srt_file_path):
     subtitles = read_srt(srt_file_path)
     audio_clips = []
     # allowed_factors = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
@@ -210,7 +210,18 @@ def generate_audio_with_pause(srt_file_path):
             audio_data = audio_data / np.max(np.abs(audio_data))
             audio_data = audio_data * 1.2
-        if current_duration < desired_duration:
             padding = int((desired_duration - current_duration) * 16000)
             audio_data = np.concatenate([np.zeros(padding), audio_data])
@@ -242,7 +253,7 @@ def check_input_files(srt_files):
     if invalid_files:
         raise gr.Warning(f"Invalid SRT files: {', '.join(invalid_files)}")
-def srt_to_audio_multi(srt_files):
     output_paths = []
     invalid_files = []
@@ -250,7 +261,7 @@ def srt_to_audio_multi(srt_files):
         if not is_valid_srt(srt_file.name):
             invalid_files.append(srt_file.name)
             return None
-        audio_data = generate_audio_with_pause(srt_file.name)
         output_path = os.path.join(cache_dir, f'output_{os.path.basename(srt_file.name)}.wav')
         torchaudio.save(output_path, torch.tensor(audio_data).unsqueeze(0), 16000)
         return output_path
@@ -288,6 +299,7 @@ model = Model(
 css = '''
 #title{text-align: center}
 #container{display: flex; justify-content: space-between; align-items: center;}
 '''
 with gr.Blocks(css=css) as demo:
@@ -295,8 +307,25 @@ with gr.Blocks(css=css) as demo:
         """<h1>SRT to Audio Tool</h1>""",
         elem_id="title",
     )
     with gr.Row(elem_id="container"):
-        inp = gr.File(
             label="Upload SRT files",
             file_count="multiple",
             type="filepath",
@@ -319,8 +348,12 @@ with gr.Blocks(css=css) as demo:
         height=100
     )
-    inp.change(check_input_files, inputs=inp)
-    btn.click(fn=srt_to_audio_multi, inputs=inp, outputs=out)
     download_btn.click(fn=download_all, inputs=out, outputs=download_out)
 if __name__ == "__main__":

 def closest_speedup_factor(factor, allowed_factors):
     """Return the allowed factor nearest to *factor*, increased by 0.1.

     The nearest entry is the one with the smallest absolute difference
     from ``factor``; when several entries are equally close, the one that
     appears first in ``allowed_factors`` is used. ``min`` raises
     ``ValueError`` if ``allowed_factors`` is empty.
     """
     def distance(candidate):
         return abs(candidate - factor)

     nearest = min(allowed_factors, key=distance)
     # The fixed 0.1 offset is applied on top of the selected factor.
     return nearest + 0.1
+def generate_audio_with_pause(srt_file_path, speaker_id, speed_of_non_edit_speech):
     subtitles = read_srt(srt_file_path)
     audio_clips = []
     # allowed_factors = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
             audio_data = audio_data / np.max(np.abs(audio_data))
             audio_data = audio_data * 1.2
+        if current_duration < desired_duration:
+            if speed_of_non_edit_speech != 1:
+                audio_data = librosa.effects.time_stretch(
+                    y=audio_data,
+                    rate=speed_of_non_edit_speech,
+                    n_fft=1024,
+                    hop_length=256
+                )
+                audio_data = audio_data / np.max(np.abs(audio_data))
+                audio_data = audio_data * 1.2
+            current_duration = len(audio_data) / 16000
             padding = int((desired_duration - current_duration) * 16000)
             audio_data = np.concatenate([np.zeros(padding), audio_data])
     if invalid_files:
         raise gr.Warning(f"Invalid SRT files: {', '.join(invalid_files)}")
+def srt_to_audio_multi(srt_files, speaker_id, speed_of_non_edit_speech):
     output_paths = []
     invalid_files = []
         if not is_valid_srt(srt_file.name):
             invalid_files.append(srt_file.name)
             return None
+        audio_data = generate_audio_with_pause(srt_file.name, speaker_id, speed_of_non_edit_speech)
         output_path = os.path.join(cache_dir, f'output_{os.path.basename(srt_file.name)}.wav')
         torchaudio.save(output_path, torch.tensor(audio_data).unsqueeze(0), 16000)
         return output_path
 css = '''
 #title{text-align: center}
 #container{display: flex; justify-content: space-between; align-items: center;}
+#setting-heading{margin-bottom: 10px; text-align: center;}
 '''
 with gr.Blocks(css=css) as demo:
         """<h1>SRT to Audio Tool</h1>""",
         elem_id="title",
     )
+    with gr.Column(elem_id="setting-box"):
+        heading = gr.HTML("<h2>Settings</h2>", elem_id="setting-heading")
+        with gr.Row():
+            speaker_id = gr.Dropdown(
+                label="Speaker ID",
+                choices=list(dataset_dict.keys()),
+                default=speaker_id
+            )
+            speed_of_non_edit_speech = gr.Slider(
+                label="Speed of non-edit speech",
+                minimum=1,
+                maximum=2.0,
+                step=0.1,
+                value=1.2
+            )
     with gr.Row(elem_id="container"):
+        inp_srt = gr.File(
             label="Upload SRT files",
             file_count="multiple",
             type="filepath",
         height=100
     )
+    inp_srt.change(check_input_files, inputs=inp_srt)
+    btn.click(
+        fn=srt_to_audio_multi,
+        inputs=[inp_srt, speaker_id, speed_of_non_edit_speech],
+        outputs=out
+    )
     download_btn.click(fn=download_all, inputs=out, outputs=download_out)
 if __name__ == "__main__":