"""Round-trip audio through a mel-spectrogram image.

The module encodes an audio file to a mel-spectrogram, stores it as an image,
reads the spectrogram back from that image and reconstructs audio from it
(Griffin-Lim or a MelGAN vocoder). A small Gradio interface wraps the pipeline.
"""

import io
import os
import tempfile
from typing import Sequence

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pydub
import soundfile as sf
import torch
import torchaudio
from PIL import Image
from scipy.io import wavfile


def _waveform_to_mel_db(y, sample_rate, n_mels=128):
    """Return the dB-scaled mel-spectrogram (relative to its peak) of waveform *y*."""
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sample_rate, n_mels=n_mels)
    return librosa.power_to_db(mel_spectrogram, ref=np.max)


# Step 1: Encode Audio to Mel-Spectrogram
def encode_audio_to_mel_spectrogram(audio_file, n_mels=128):
    """
    Encode an audio file to a mel-spectrogram.

    Parameters:
    - audio_file: Path to the audio file.
    - n_mels: Number of mel bands (default: 128).

    Returns:
    - mel_spectrogram_db: Mel-spectrogram in dB scale (0 dB is the peak).
    - sample_rate: Sample rate of the audio file.
    """
    # sr=None keeps the file's native sample rate instead of resampling.
    y, sample_rate = librosa.load(audio_file, sr=None)
    return _waveform_to_mel_db(y, sample_rate, n_mels), sample_rate


# Improved Step 2: Save Mel-Spectrogram as Image
def save_mel_spectrogram_image(mel_spectrogram_db, sample_rate, output_image='mel_spectrogram.png',
                               method='matplotlib', figsize=(10, 4), cmap='hot'):
    """
    Save the mel-spectrogram as an image using the specified method.

    Parameters:
    - mel_spectrogram_db: Mel-spectrogram in dB scale.
    - sample_rate: Sample rate of the audio file.
    - output_image: Path to save the image.
    - method: Method for saving ('matplotlib' or 'custom').
      'matplotlib' renders an annotated figure (axes, colorbar, title);
      'custom' writes one pixel per spectrogram bin via image_from_spectrogram.
    - figsize: Size of the figure for matplotlib (default: (10, 4)).
    - cmap: Colormap for the spectrogram (default: 'hot').

    Raises:
    - ValueError: If method is neither 'matplotlib' nor 'custom'.
    """
    if method == 'matplotlib':
        fig, ax = plt.subplots(figsize=figsize)
        try:
            mesh = librosa.display.specshow(mel_spectrogram_db, sr=sample_rate,
                                            x_axis='time', y_axis='mel', cmap=cmap, ax=ax)
            fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
            ax.set_title('Mel-Spectrogram')
            fig.savefig(output_image)
        finally:
            # Always release the figure, even if rendering or saving fails.
            plt.close(fig)
        print(f"Mel-spectrogram image saved using matplotlib as '{output_image}'")
    elif method == 'custom':
        # Convert dB scale to linear scale for image generation
        mel_spectrogram_linear = librosa.db_to_power(mel_spectrogram_db)
        # Create an image from the mel-spectrogram (leading axis is the channel)
        image = image_from_spectrogram(mel_spectrogram_linear[np.newaxis, ...])
        image.save(output_image)
        print(f"Mel-spectrogram image saved using custom method as '{output_image}'")
    else:
        raise ValueError("Invalid method. Choose 'matplotlib' or 'custom'.")


# Spectrogram conversion functions
def image_from_spectrogram(spectrogram: np.ndarray, power: float = 0.25) -> Image.Image:
    """
    Compute a spectrogram image from a spectrogram magnitude array.

    Args:
        spectrogram: (channels, frequency, time)
        power: A power curve to apply to the spectrogram to preserve contrast

    Returns:
        image: (frequency, time, channels)

    Raises:
        NotImplementedError: If the number of channels is not 1 or 2.
    """
    # Rescale to 0-1; an all-zero spectrogram stays zero instead of dividing by 0.
    max_value = np.max(spectrogram)
    if max_value > 0:
        data = spectrogram / max_value
    else:
        data = np.zeros_like(spectrogram, dtype=float)

    # Apply the power curve
    data = np.power(data, power)

    # Rescale to 0-255 and invert (loud bins become dark pixels)
    data = 255 - (data * 255).astype(np.uint8)

    # Convert to a PIL image
    if data.shape[0] == 1:
        image = Image.fromarray(data[0], mode="L").convert("RGB")
    elif data.shape[0] == 2:
        # Two channels are stored in the green and blue planes; red is left empty.
        data = np.array([np.zeros_like(data[0]), data[0], data[1]]).transpose(1, 2, 0)
        image = Image.fromarray(data, mode="RGB")
    else:
        raise NotImplementedError(f"Unsupported number of channels: {data.shape[0]}")

    # Flip Y so that low frequencies end up at the bottom of the picture
    image = image.transpose(Image.FLIP_TOP_BOTTOM)

    return image


# Step 3: Extract Mel-Spectrogram from Image (Direct Pixel Manipulation)
def extract_mel_spectrogram_from_image(image_path, power=0.25, top_db=80.0):
    """
    Extract a mel-spectrogram from a saved image using pixel manipulation.

    This inverts the single-channel encoding of image_from_spectrogram
    (vertical flip, 255-inversion, power curve), so it is only meaningful for
    images written with method='custom'. An annotated matplotlib figure
    contains axes and a colorbar and cannot be decoded this way.

    Parameters:
    - image_path: Path to the spectrogram image file.
    - power: Power curve that was applied when the image was created (default: 0.25).
    - top_db: Dynamic range, in dB, kept below the peak (default: 80).

    Returns:
    - mel_spectrogram_db: The extracted mel-spectrogram in dB scale.
    """
    with Image.open(image_path) as img:
        gray = np.asarray(img.convert('L'), dtype=np.float32)

    # Undo the vertical flip; copy so downstream code never sees negative strides.
    gray = np.ascontiguousarray(gray[::-1])
    # Undo the inversion, then the power curve, to get power relative to the peak.
    normalized = 1.0 - gray / 255.0
    mel_power = np.power(normalized, 1.0 / power)
    return librosa.power_to_db(mel_power, ref=1.0, top_db=top_db)


# Alternative Spectrogram Extraction (IFFT Method)
def extract_spectrogram_with_ifft(mel_spectrogram_db, sample_rate=22050):
    """
    Reconstruct an audio signal from a mel-spectrogram by inverting the mel transform.

    Parameters:
    - mel_spectrogram_db: The mel-spectrogram in dB scale.
    - sample_rate: Sample rate the spectrogram was computed at
      (default: 22050, librosa's own default).

    Returns:
    - audio: The reconstructed audio signal.
    """
    # Convert dB mel-spectrogram back to linear (power) scale
    mel_spectrogram = librosa.db_to_power(mel_spectrogram_db)
    # No phase information is stored, so librosa estimates it while inverting.
    audio = librosa.feature.inverse.mel_to_audio(mel_spectrogram, sr=sample_rate)
    return audio


# Step 4: Decode Mel-Spectrogram with Griffin-Lim
def decode_mel_spectrogram_to_audio(mel_spectrogram_db, sample_rate,
                                    output_audio='griffin_reconstructed_audio.wav'):
    """
    Decode a mel-spectrogram into audio using Griffin-Lim algorithm.

    Parameters:
    - mel_spectrogram_db: The mel-spectrogram in dB scale.
    - sample_rate: The sample rate for the audio file.
    - output_audio: Path to save the reconstructed audio file.

    Returns:
    - audio: The reconstructed audio signal.
    """
    # Convert dB mel-spectrogram back to linear (power) scale
    mel_spectrogram = librosa.db_to_power(mel_spectrogram_db)

    # griffinlim() expects a linear-frequency STFT magnitude, so the mel bands
    # must first be mapped back to STFT bins; mel_to_audio does both steps.
    audio = librosa.feature.inverse.mel_to_audio(mel_spectrogram, sr=sample_rate)

    # Save the generated audio
    sf.write(output_audio, audio, sample_rate)
    print(f"Griffin-Lim reconstructed audio saved as '{output_audio}'")
    return audio


# Step 5: Load MelGAN Vocoder
def load_melgan_vocoder():
    """
    Load a MelGAN vocoder for decoding mel-spectrograms.

    Returns:
    - model: A torch MelGAN vocoder model in evaluation mode.

    Raises:
    - RuntimeError: If the installed torchaudio does not expose a MelGAN model.
    """
    # NOTE(review): released torchaudio versions do not appear to ship a
    # `MelGAN` class, and no pre-trained weights are loaded here. Fail with an
    # explicit message rather than an opaque AttributeError.
    melgan_cls = getattr(torchaudio.models, "MelGAN", None)
    if melgan_cls is None:
        raise RuntimeError(
            "torchaudio.models has no 'MelGAN' model in this installation; "
            "use decoding_method='griffin' or plug in an external MelGAN vocoder."
        )
    model = melgan_cls()
    model.eval()  # Ensure the model is in evaluation mode
    return model


# Step 6: Decode Mel-Spectrogram with MelGAN
def decode_mel_spectrogram_with_melgan(mel_spectrogram_db, sample_rate,
                                       output_audio='melgan_reconstructed_audio.wav'):
    """
    Decode a mel-spectrogram into audio using MelGAN vocoder.

    Parameters:
    - mel_spectrogram_db: The mel-spectrogram in dB scale.
    - sample_rate: The sample rate for the audio file.
    - output_audio: Path to save the reconstructed audio file.

    Returns:
    - audio: The reconstructed audio signal.
    """
    # Convert dB mel-spectrogram back to linear scale
    mel_spectrogram = librosa.db_to_power(mel_spectrogram_db)

    # float32 matches torch's default parameter dtype; add the batch dimension.
    mel_spectrogram_tensor = torch.as_tensor(mel_spectrogram, dtype=torch.float32).unsqueeze(0)

    # Load the MelGAN vocoder model
    melgan = load_melgan_vocoder()

    # Pass the mel-spectrogram through MelGAN to generate audio
    with torch.no_grad():
        audio = melgan(mel_spectrogram_tensor).squeeze().numpy()

    # Save the generated audio
    sf.write(output_audio, audio, sample_rate)
    print(f"MelGAN reconstructed audio saved as '{output_audio}'")
    return audio


def audio_from_waveform(samples: np.ndarray, sample_rate: int,
                        normalize: bool = False) -> pydub.AudioSegment:
    """
    Convert a numpy array of samples of a waveform to an audio segment.

    The input array is never modified.

    Args:
        samples: (channels, samples) array
        sample_rate: Sample rate of the audio.
        normalize: Flag to normalize volume.

    Returns:
        pydub.AudioSegment
    """
    # Normalize volume to fit in int16. Work on a copy so the caller's array
    # is left untouched, and skip scaling for an all-zero (silent) signal.
    if normalize:
        peak = np.max(np.abs(samples))
        if peak > 0:
            samples = samples * (np.iinfo(np.int16).max / peak)

    # Transpose to (samples, channels) and convert to int16
    samples = samples.transpose(1, 0).astype(np.int16)

    # Write to the bytes of a WAV file
    wav_bytes = io.BytesIO()
    wavfile.write(wav_bytes, sample_rate, samples)
    wav_bytes.seek(0)

    # Read into pydub
    return pydub.AudioSegment.from_wav(wav_bytes)


def apply_filters(segment: pydub.AudioSegment, compression: bool = False) -> pydub.AudioSegment:
    """
    Apply post-processing filters to the audio segment.

    With compression enabled the segment is normalized, brought to -10 dBFS
    and run through a dynamic range compressor. In all cases the result is
    then gained to -12 dBFS and normalized with 0.1 dB of headroom.

    Args:
        segment: The audio segment to filter.
        compression: Flag to apply dynamic range compression.

    Returns:
        pydub.AudioSegment
    """
    if compression:
        segment = pydub.effects.normalize(segment, headroom=0.1)
        segment = segment.apply_gain(-10 - segment.dBFS)
        segment = pydub.effects.compress_dynamic_range(
            segment,
            threshold=-20.0,
            ratio=4.0,
            attack=5.0,
            release=50.0,
        )

    # Apply gain to desired dB level and normalize again
    desired_db = -12
    segment = segment.apply_gain(desired_db - segment.dBFS)
    return pydub.effects.normalize(segment, headroom=0.1)


def stitch_segments(segments: Sequence[pydub.AudioSegment], crossfade_s: float) -> pydub.AudioSegment:
    """
    Stitch together a sequence of audio segments with a crossfade between each segment.

    Args:
        segments: Sequence of audio segments to stitch.
        crossfade_s: Duration of crossfade in seconds.

    Returns:
        pydub.AudioSegment

    Raises:
        ValueError: If segments is empty.
    """
    if not segments:
        raise ValueError("segments must contain at least one audio segment")

    crossfade_ms = int(crossfade_s * 1000)
    combined_segment = segments[0]
    for segment in segments[1:]:
        combined_segment = combined_segment.append(segment, crossfade=crossfade_ms)
    return combined_segment


def overlay_segments(segments: Sequence[pydub.AudioSegment]) -> pydub.AudioSegment:
    """
    Overlay a sequence of audio segments on top of each other.

    Args:
        segments: Sequence of audio segments to overlay.

    Returns:
        pydub.AudioSegment

    Raises:
        ValueError: If segments is empty.
    """
    # An explicit check instead of `assert`, which is stripped under `python -O`.
    if not segments:
        raise ValueError("segments must contain at least one audio segment")

    output: pydub.AudioSegment = segments[0]
    for segment in segments[1:]:
        output = output.overlay(segment)
    return output


# Step 7: Full Pipeline for Audio Processing with Customization
def mel_spectrogram_pipeline(audio_file,
                             output_image='mel_spectrogram.png',
                             output_audio_griffin='griffin_reconstructed_audio.wav',
                             output_audio_melgan='melgan_reconstructed_audio.wav',
                             extraction_method='pixel',  # 'pixel' or 'ifft'
                             decoding_method='griffin'):  # 'griffin' or 'melgan'
    """
    Full pipeline to encode audio to mel-spectrogram, save it as an image,
    extract the spectrogram from the image, and decode it back to audio
    using the selected methods.

    Parameters:
    - audio_file: Path to the audio file to be processed.
    - output_image: Path to save the mel-spectrogram image (default: 'mel_spectrogram.png').
    - output_audio_griffin: Path to save the Griffin-Lim reconstructed audio.
    - output_audio_melgan: Path to save the MelGAN reconstructed audio.
    - extraction_method: Method for extraction ('pixel' or 'ifft').
    - decoding_method: Method for decoding ('griffin' or 'melgan').

    Raises:
    - ValueError: If extraction_method or decoding_method is not recognised.
    """
    # Validate both choices up front so nothing is written for a bad request.
    if extraction_method not in ('pixel', 'ifft'):
        raise ValueError("Invalid extraction method. Choose 'pixel' or 'ifft'.")
    if decoding_method not in ('griffin', 'melgan'):
        raise ValueError("Invalid decoding method. Choose 'griffin' or 'melgan'.")

    # Step 1: Encode (Audio -> Mel-Spectrogram)
    mel_spectrogram_db, sample_rate = encode_audio_to_mel_spectrogram(audio_file)

    # Step 2: Convert Mel-Spectrogram to Image and save it.
    # Pixel extraction needs the raw one-pixel-per-bin image; the annotated
    # matplotlib figure (axes, colorbar, title) cannot be read back.
    image_method = 'custom' if extraction_method == 'pixel' else 'matplotlib'
    save_mel_spectrogram_image(mel_spectrogram_db, sample_rate, output_image, method=image_method)

    # Step 3: Extract Mel-Spectrogram based on chosen method
    if extraction_method == 'pixel':
        extracted_mel_spectrogram_db = extract_mel_spectrogram_from_image(output_image)
    else:
        # extract_spectrogram_with_ifft returns a waveform, not a spectrogram,
        # so re-encode it before handing it to the decoders.
        reconstructed = extract_spectrogram_with_ifft(mel_spectrogram_db, sample_rate)
        extracted_mel_spectrogram_db = _waveform_to_mel_db(
            reconstructed, sample_rate, n_mels=mel_spectrogram_db.shape[0]
        )

    # Step 4: Decode based on the chosen decoding method
    if decoding_method == 'griffin':
        decode_mel_spectrogram_to_audio(extracted_mel_spectrogram_db, sample_rate, output_audio_griffin)
    else:
        decode_mel_spectrogram_with_melgan(extracted_mel_spectrogram_db, sample_rate, output_audio_melgan)


def process_audio(audio_file, extraction_method, decoding_method):
    """
    Gradio callback: run the pipeline on an uploaded file.

    Parameters:
    - audio_file: Path to the uploaded audio file.
    - extraction_method: 'pixel' or 'ifft'.
    - decoding_method: 'griffin' or 'melgan'.

    Returns:
    - (image_path, audio_path): Paths of the spectrogram image and of the
      audio reconstructed with the selected decoding method.
    """
    # The outputs must still exist after this function returns so the caller
    # can read them; NamedTemporaryFile would delete them on context exit.
    # The directory is therefore left in place.
    work_dir = tempfile.mkdtemp(prefix="mel_pipeline_")
    image_path = os.path.join(work_dir, "mel_spectrogram.png")
    griffin_path = os.path.join(work_dir, "griffin_reconstructed_audio.wav")
    melgan_path = os.path.join(work_dir, "melgan_reconstructed_audio.wav")

    # Only the selected decoder runs; the other output file is not created.
    mel_spectrogram_pipeline(
        audio_file,
        output_image=image_path,
        output_audio_griffin=griffin_path,
        output_audio_melgan=melgan_path,
        extraction_method=extraction_method,
        decoding_method=decoding_method,
    )

    # Return results
    return (image_path, griffin_path if decoding_method == 'griffin' else melgan_path)


# Create Gradio interface
iface = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Radio(["pixel", "ifft"], label="Extraction Method", value="pixel"),
        gr.Radio(["griffin", "melgan"], label="Decoding Method", value="griffin")
    ],
    outputs=[
        gr.Image(type="filepath", label="Mel-Spectrogram"),
        gr.Audio(type="filepath", label="Reconstructed Audio")
    ],
    title="Audio Encoder-Decoder",
    description="Upload an audio file to encode it to a mel-spectrogram and then decode it back to audio."
)


if __name__ == "__main__":
    # Launch the app only when run as a script, never as an import side effect.
    iface.launch()

    # Example usage (TEST)
    audio_file_path = 'your_audio_file.wav'  # Specify the path to your audio file here
    if os.path.isfile(audio_file_path):
        mel_spectrogram_pipeline(
            audio_file_path,
            output_image='mel_spectrogram.png',
            output_audio_griffin='griffin_reconstructed_audio.wav',
            output_audio_melgan='melgan_reconstructed_audio.wav',
            extraction_method='pixel',  # Choose 'pixel' or 'ifft'
            decoding_method='griffin'  # Choose 'griffin' or 'melgan'
        )
    else:
        print(f"Example audio file '{audio_file_path}' not found; skipping example run.")