latuan committed on
Commit
0358132
1 Parent(s): 0b2b657
Files changed (1) hide show
  1. app.py +24 -9
app.py CHANGED
@@ -173,14 +173,18 @@ def time_to_seconds(time_str):
173
  return seconds
174
 
175
  def closest_speedup_factor(factor, allowed_factors):
176
- return min(allowed_factors, key=lambda x: abs(x - factor))
177
 
178
  def generate_audio_with_pause(srt_file_path):
179
  subtitles = read_srt(srt_file_path)
180
  audio_clips = []
181
- allowed_factors = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
182
 
183
  for i, (start_time, end_time, text) in enumerate(subtitles):
 
 
 
 
184
  # Generate initial audio
185
  audio_data = model.inference(text=text, speaker_id=speaker_id)
186
  audio_data = audio_data / np.max(np.abs(audio_data))
@@ -189,10 +193,14 @@ def generate_audio_with_pause(srt_file_path):
189
  desired_duration = time_to_seconds(end_time) - time_to_seconds(start_time)
190
  current_duration = len(audio_data) / 16000
191
 
 
 
 
192
  # Adjust audio speed by speedup
193
  if current_duration > desired_duration:
194
  raw_speedup_factor = current_duration / desired_duration
195
- speedup_factor = closest_speedup_factor(raw_speedup_factor, allowed_factors)
 
196
  audio_data = librosa.effects.time_stretch(
197
  y=audio_data,
198
  rate=speedup_factor,
@@ -201,16 +209,23 @@ def generate_audio_with_pause(srt_file_path):
201
  )
202
  audio_data = audio_data / np.max(np.abs(audio_data))
203
  audio_data = audio_data * 1.2
 
 
 
 
 
 
 
204
 
205
  audio_clips.append(audio_data)
206
 
207
  # Add pause
208
- if i < len(subtitles) - 1:
209
- next_start_time = subtitles[i + 1][0]
210
- pause_duration = time_to_seconds(next_start_time) - time_to_seconds(end_time)
211
- if pause_duration > 0.2:
212
- pause_samples = int(pause_duration * 16000)
213
- audio_clips.append(np.zeros(pause_samples))
214
 
215
  final_audio = np.concatenate(audio_clips)
216
 
 
173
  return seconds
174
 
175
  def closest_speedup_factor(factor, allowed_factors):
176
+ return min(allowed_factors, key=lambda x: abs(x - factor)) + 0.1
177
 
178
  def generate_audio_with_pause(srt_file_path):
179
  subtitles = read_srt(srt_file_path)
180
  audio_clips = []
181
+ # allowed_factors = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
182
 
183
  for i, (start_time, end_time, text) in enumerate(subtitles):
184
+ # print("=====================================")
185
+ # print("Text number:", i)
186
+ # print(f"Start: {start_time}, End: {end_time}, Text: {text}")
187
+
188
  # Generate initial audio
189
  audio_data = model.inference(text=text, speaker_id=speaker_id)
190
  audio_data = audio_data / np.max(np.abs(audio_data))
 
193
  desired_duration = time_to_seconds(end_time) - time_to_seconds(start_time)
194
  current_duration = len(audio_data) / 16000
195
 
196
+ # print(f"Time to seconds: {time_to_seconds(start_time)}, {time_to_seconds(end_time)}")
197
+ # print(f"Desired duration: {desired_duration}, Current duration: {current_duration}")
198
+
199
  # Adjust audio speed by speedup
200
  if current_duration > desired_duration:
201
  raw_speedup_factor = current_duration / desired_duration
202
+ # speedup_factor = closest_speedup_factor(raw_speedup_factor, allowed_factors)
203
+ speedup_factor = raw_speedup_factor
204
  audio_data = librosa.effects.time_stretch(
205
  y=audio_data,
206
  rate=speedup_factor,
 
209
  )
210
  audio_data = audio_data / np.max(np.abs(audio_data))
211
  audio_data = audio_data * 1.2
212
+
213
+ if current_duration < desired_duration:
214
+ padding = int((desired_duration - current_duration) * 16000)
215
+ audio_data = np.concatenate([np.zeros(padding), audio_data])
216
+
217
+ # print(f"Final audio duration: {len(audio_data) / 16000}")
218
+ # print("=====================================")
219
 
220
  audio_clips.append(audio_data)
221
 
222
  # Add pause
223
+ # if i < len(subtitles) - 1:
224
+ # next_start_time = subtitles[i + 1][0]
225
+ # pause_duration = time_to_seconds(next_start_time) - time_to_seconds(end_time)
226
+ # if pause_duration > 0.2:
227
+ # pause_samples = int(pause_duration * 16000)
228
+ # audio_clips.append(np.zeros(pause_samples))
229
 
230
  final_audio = np.concatenate(audio_clips)
231