latuan committed on
Commit
0358132
1 Parent(s): 0b2b657
Files changed (1) hide show
  1. app.py +24 -9
app.py CHANGED
@@ -173,14 +173,18 @@ def time_to_seconds(time_str):
173
  return seconds
174
 
175
  def closest_speedup_factor(factor, allowed_factors):
176
- return min(allowed_factors, key=lambda x: abs(x - factor))
177
 
178
  def generate_audio_with_pause(srt_file_path):
179
  subtitles = read_srt(srt_file_path)
180
  audio_clips = []
181
- allowed_factors = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
182
 
183
  for i, (start_time, end_time, text) in enumerate(subtitles):
 
 
 
 
184
  # Generate initial audio
185
  audio_data = model.inference(text=text, speaker_id=speaker_id)
186
  audio_data = audio_data / np.max(np.abs(audio_data))
@@ -189,10 +193,14 @@ def generate_audio_with_pause(srt_file_path):
189
  desired_duration = time_to_seconds(end_time) - time_to_seconds(start_time)
190
  current_duration = len(audio_data) / 16000
191
 
 
 
 
192
  # Adjust audio speed by speedup
193
  if current_duration > desired_duration:
194
  raw_speedup_factor = current_duration / desired_duration
195
- speedup_factor = closest_speedup_factor(raw_speedup_factor, allowed_factors)
 
196
  audio_data = librosa.effects.time_stretch(
197
  y=audio_data,
198
  rate=speedup_factor,
@@ -201,16 +209,23 @@ def generate_audio_with_pause(srt_file_path):
201
  )
202
  audio_data = audio_data / np.max(np.abs(audio_data))
203
  audio_data = audio_data * 1.2
 
 
 
 
 
 
 
204
 
205
  audio_clips.append(audio_data)
206
 
207
  # Add pause
208
- if i < len(subtitles) - 1:
209
- next_start_time = subtitles[i + 1][0]
210
- pause_duration = time_to_seconds(next_start_time) - time_to_seconds(end_time)
211
- if pause_duration > 0.2:
212
- pause_samples = int(pause_duration * 16000)
213
- audio_clips.append(np.zeros(pause_samples))
214
 
215
  final_audio = np.concatenate(audio_clips)
216
 
 
173
  return seconds
174
 
175
  def closest_speedup_factor(factor, allowed_factors):
176
+ return min(allowed_factors, key=lambda x: abs(x - factor)) + 0.1
177
 
178
  def generate_audio_with_pause(srt_file_path):
179
  subtitles = read_srt(srt_file_path)
180
  audio_clips = []
181
+ # allowed_factors = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
182
 
183
  for i, (start_time, end_time, text) in enumerate(subtitles):
184
+ # print("=====================================")
185
+ # print("Text number:", i)
186
+ # print(f"Start: {start_time}, End: {end_time}, Text: {text}")
187
+
188
  # Generate initial audio
189
  audio_data = model.inference(text=text, speaker_id=speaker_id)
190
  audio_data = audio_data / np.max(np.abs(audio_data))
 
193
  desired_duration = time_to_seconds(end_time) - time_to_seconds(start_time)
194
  current_duration = len(audio_data) / 16000
195
 
196
+ # print(f"Time to seconds: {time_to_seconds(start_time)}, {time_to_seconds(end_time)}")
197
+ # print(f"Desired duration: {desired_duration}, Current duration: {current_duration}")
198
+
199
  # Adjust audio speed by speedup
200
  if current_duration > desired_duration:
201
  raw_speedup_factor = current_duration / desired_duration
202
+ # speedup_factor = closest_speedup_factor(raw_speedup_factor, allowed_factors)
203
+ speedup_factor = raw_speedup_factor
204
  audio_data = librosa.effects.time_stretch(
205
  y=audio_data,
206
  rate=speedup_factor,
 
209
  )
210
  audio_data = audio_data / np.max(np.abs(audio_data))
211
  audio_data = audio_data * 1.2
212
+
213
+ if current_duration < desired_duration:
214
+ padding = int((desired_duration - current_duration) * 16000)
215
+ audio_data = np.concatenate([np.zeros(padding), audio_data])
216
+
217
+ # print(f"Final audio duration: {len(audio_data) / 16000}")
218
+ # print("=====================================")
219
 
220
  audio_clips.append(audio_data)
221
 
222
  # Add pause
223
+ # if i < len(subtitles) - 1:
224
+ # next_start_time = subtitles[i + 1][0]
225
+ # pause_duration = time_to_seconds(next_start_time) - time_to_seconds(end_time)
226
+ # if pause_duration > 0.2:
227
+ # pause_samples = int(pause_duration * 16000)
228
+ # audio_clips.append(np.zeros(pause_samples))
229
 
230
  final_audio = np.concatenate(audio_clips)
231