import gc
import hashlib
import os
import queue
import shlex
import subprocess
import threading
import warnings

import librosa
import numpy as np
import onnxruntime as ort
import soundfile as sf
import torch
from tqdm import tqdm

warnings.filterwarnings("ignore")

# Maps a model's primary stem name to the name given to the complementary
# ("inverted") output that run_mdx writes.
stem_naming = {
    'Vocals': 'Instrumental',
    'Other': 'Instruments',
    'Instrumental': 'Vocals',
    'Drums': 'Drumless',
    'Bass': 'Bassless',
}


class MDXModel:
    """STFT/iSTFT configuration and helpers used around an MDX ONNX model."""

    def __init__(self, device, dim_f, dim_t, n_fft, hop=1024, stem_name=None, compensation=1.000):
        """Store the spectrogram geometry and pre-build the window and padding.

        Args:
            device: torch device that holds the analysis window and the
                frequency padding tensor.
            dim_f: number of frequency bins kept by ``stft`` (the rest is
                cropped and later re-added as zeros by ``istft``).
            dim_t: number of STFT frames per chunk.
            n_fft: FFT size.
            hop: hop length in samples.
            stem_name: name of the stem the model produces.
            compensation: gain applied by ``run_mdx`` to the separated stem
                when computing the inverted output.
        """
        print("[~] Initializing MDXModel...")
        self.dim_f = dim_f
        self.dim_t = dim_t
        self.dim_c = 4
        self.n_fft = n_fft
        self.hop = hop
        self.stem_name = stem_name
        self.compensation = compensation

        self.n_bins = self.n_fft // 2 + 1
        # With center=True a chunk of hop * (dim_t - 1) samples yields dim_t frames.
        self.chunk_size = hop * (self.dim_t - 1)
        self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(device)

        # Zero block that restores the bins cropped by stft() before inversion.
        self.freq_pad = torch.zeros([1, self.dim_c, self.n_bins - self.dim_f, self.dim_t]).to(device)
        print("[+] MDXModel initialized")

    def stft(self, x):
        """Return the spectrogram of ``x`` as a real tensor.

        ``x`` is reshaped to rows of ``chunk_size`` samples, transformed, and
        the real/imaginary parts are folded into the channel axis, giving a
        tensor of shape ``[-1, 4, dim_f, dim_t]``.
        """
        print("[~] Performing STFT...")
        x = x.reshape([-1, self.chunk_size])
        x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True,
                       return_complex=True)
        x = torch.view_as_real(x)
        x = x.permute([0, 3, 1, 2])
        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, 4, self.n_bins, self.dim_t])
        print("[+] STFT completed")
        return x[:, :, :self.dim_f]

    def istft(self, x, freq_pad=None):
        """Invert a spectrogram produced by ``stft``.

        Args:
            x: tensor whose frequency axis (``-2``) holds ``dim_f`` bins.
            freq_pad: optional tensor concatenated on the frequency axis to
                restore ``n_bins`` bins; defaults to zeros.

        Returns:
            Tensor of shape ``[-1, 2, chunk_size]``.
        """
        print("[~] Performing inverse STFT...")
        if freq_pad is None:
            freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1])
        x = torch.cat([x, freq_pad], -2)
        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, 2, self.n_bins, self.dim_t])
        x = x.permute([0, 2, 3, 1])
        # view_as_complex requires a contiguous tensor whose last dim is 2.
        x = x.contiguous()
        x = torch.view_as_complex(x)
        x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True)
        print("[+] Inverse STFT completed")
        return x.reshape([-1, 2, self.chunk_size])


class MDX:
    """Runs an MDX ONNX model over a waveform, chunk by chunk."""

    DEFAULT_SR = 44100
    # Unit of measurement is in seconds
    DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
    DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR

    DEFAULT_PROCESSOR = 0

    def __init__(self, model_path: str, params: MDXModel, processor=DEFAULT_PROCESSOR):
        """Create the ONNX session and run one warm-up inference.

        Args:
            model_path: path of the ONNX model file.
            params: STFT configuration matching the model.
            processor: CUDA device index, or a negative value to run on CPU.
        """
        print("[~] Initializing MDX...")
        use_cuda = processor >= 0
        self.device = torch.device(f'cuda:{processor}') if use_cuda else torch.device('cpu')
        self.provider = ['CUDAExecutionProvider'] if use_cuda else ['CPUExecutionProvider']

        self.model = params

        print(f"[~] Loading ONNX model from {model_path}...")
        self.ort = ort.InferenceSession(model_path, providers=self.provider)
        print("[~] Preloading model...")
        # Warm-up run with random input of the expected shape.
        self.ort.run(None, {'input': torch.rand(1, 4, params.dim_f, params.dim_t).numpy()})
        self.process = lambda spec: self.ort.run(None, {'input': spec.cpu().numpy()})[0]

        self.prog = None
        print("[+] MDX initialized")

    @staticmethod
    def get_hash(model_path):
        """Return the MD5 hex digest identifying the model file.

        Only the last ``10000 * 1024`` bytes are hashed; when the file is
        too small for that seek, the whole file is hashed instead.
        """
        print(f"[~] Calculating hash for model: {model_path}")
        with open(model_path, 'rb') as f:
            try:
                f.seek(-10000 * 1024, os.SEEK_END)
            except OSError:
                # File shorter than the tail window: hash it entirely.
                f.seek(0)
            model_hash = hashlib.md5(f.read()).hexdigest()
        print(f"[+] Model hash: {model_hash}")
        return model_hash

    @staticmethod
    def segment(wave, combine=True, chunk_size=DEFAULT_CHUNK_SIZE, margin_size=DEFAULT_MARGIN_SIZE):
        """Split a wave into overlapping segments, or join such segments back.

        Args:
            wave: when ``combine`` is False, a 2-D array indexed as
                ``wave[:, samples]``; when ``combine`` is True, a sequence of
                such arrays.
            combine: True to concatenate segments (dropping the margins),
                False to split.
            chunk_size: samples per segment when splitting; values ``<= 0``
                or larger than the wave mean "the whole wave".
            margin_size: overlap in samples added on each side of a segment
                when splitting and removed again when combining. Both calls
                must use the same value for a lossless round trip.

        Returns:
            The combined array (``None`` for an empty sequence) when
            ``combine`` is True, otherwise a list of array copies.
        """
        print("[~] Segmenting wave...")
        if combine:
            last_index = len(wave) - 1
            pieces = []
            for segment_count, segment in enumerate(wave):
                start = 0 if segment_count == 0 else margin_size
                keep_tail = segment_count == last_index or margin_size == 0
                end = None if keep_tail else -margin_size
                pieces.append(segment[:, start:end])
            processed_wave = np.concatenate(pieces, axis=-1) if pieces else None
        else:
            processed_wave = []
            sample_count = wave.shape[-1]

            if chunk_size <= 0 or chunk_size > sample_count:
                chunk_size = sample_count

            if margin_size > chunk_size:
                margin_size = chunk_size

            for segment_count, skip in enumerate(range(0, sample_count, chunk_size)):
                margin = 0 if segment_count == 0 else margin_size
                end = min(skip + chunk_size + margin_size, sample_count)
                start = skip - margin

                processed_wave.append(wave[:, start:end].copy())

                if end == sample_count:
                    break

        print("[+] Wave segmentation completed")
        return processed_wave

    def pad_wave(self, wave):
        """Pad ``wave`` and cut it into model-sized, overlapping chunks.

        Args:
            wave: array indexed as ``wave[channel, sample]``; the padding is
                built for two channels.

        Returns:
            Tuple ``(mix_waves, pad, trim)``: a float32 tensor of chunks on
            ``self.device``, the number of zero samples appended to the end,
            and the number of samples to discard on each side of every
            processed chunk.
        """
        print("[~] Padding wave...")
        n_sample = wave.shape[1]
        trim = self.model.n_fft // 2
        gen_size = self.model.chunk_size - 2 * trim
        pad = gen_size - n_sample % gen_size

        wave_p = np.concatenate(
            (np.zeros((2, trim)), wave, np.zeros((2, pad)), np.zeros((2, trim))), 1
        )

        # n_sample + pad is a multiple of gen_size, so every chunk is full length.
        chunks = [
            wave_p[:, i:i + self.model.chunk_size]
            for i in range(0, n_sample + pad, gen_size)
        ]
        # Stack first: building a tensor from a list of ndarrays is very slow.
        mix_waves = torch.tensor(np.stack(chunks), dtype=torch.float32).to(self.device)

        print(f"[+] Wave padded. Shape: {mix_waves.shape}")
        return mix_waves, pad, trim

    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):
        """Run the model over every chunk of one batch (thread target).

        Each chunk goes through STFT, the ONNX model and inverse STFT; the
        ``trim`` samples on each side are dropped, chunks are concatenated
        and the ``pad`` trailing samples are removed.

        Args:
            mix_waves: tensor of chunks as returned by ``pad_wave``.
            trim: samples discarded on each side of every processed chunk.
            pad: trailing samples removed from the concatenated result.
            q: queue that receives ``{_id: processed_signal}``.
            _id: index of this batch, used to restore ordering.

        Returns:
            The processed signal as a numpy array.
        """
        print(f"[~] Processing wave segment {_id}...")
        mix_waves = mix_waves.split(1)
        with torch.no_grad():
            pw = []
            for mix_wave in mix_waves:
                self.prog.update()
                spec = self.model.stft(mix_wave)
                processed_spec = torch.tensor(self.process(spec))
                processed_wav = self.model.istft(processed_spec.to(self.device))
                processed_wav = processed_wav[:, :, trim:-trim].transpose(0, 1).reshape(2, -1).cpu().numpy()
                pw.append(processed_wav)
        processed_signal = np.concatenate(pw, axis=-1)[:, :-pad]
        q.put({_id: processed_signal})
        print(f"[+] Wave segment {_id} processed")
        return processed_signal

    def process_wave(self, wave: np.ndarray, mt_threads=1):
        """Separate ``wave`` using ``mt_threads`` worker threads.

        The wave is split into one overlapping batch per thread, every batch
        is processed by ``_process_wave`` and the results are joined back.

        Args:
            wave: array indexed as ``wave[channel, sample]``.
            mt_threads: number of batches/threads; values below 1 are
                treated as 1.

        Returns:
            The processed wave as a numpy array.

        Raises:
            AssertionError: if a worker did not deliver its batch.
        """
        mt_threads = max(1, int(mt_threads))
        print(f"[~] Processing wave with {mt_threads} threads...")
        self.prog = tqdm(total=0)

        sample_count = wave.shape[-1]
        chunk = sample_count // mt_threads
        # segment() clamps the margin to the chunk size when splitting; use the
        # same clamped value when recombining, otherwise short inputs lose samples.
        effective_chunk = chunk if 0 < chunk <= sample_count else sample_count
        margin = min(self.DEFAULT_MARGIN_SIZE, effective_chunk)

        waves = self.segment(wave, False, chunk, margin)

        q = queue.Queue()
        threads = []
        for c, batch in enumerate(waves):
            mix_waves, pad, trim = self.pad_wave(batch)
            self.prog.total = len(mix_waves) * mt_threads
            thread = threading.Thread(target=self._process_wave, args=(mix_waves, trim, pad, q, c))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        self.prog.close()

        results = {}
        while not q.empty():
            results.update(q.get())

        # Explicit raise (not `assert`) so the check survives `python -O`.
        if len(results) != len(waves):
            raise AssertionError('Incomplete processed batches, please reduce batch size!')

        processed_batches = [results[batch_id] for batch_id in sorted(results)]
        print("[+] Wave processing completed")
        return self.segment(processed_batches, True, chunk, margin)


def run_mdx(model_params, output_dir, model_path, filename, exclude_main=False, exclude_inversion=False,
            suffix=None, invert_suffix=None, denoise=False, keep_orig=True, m_threads=2):
    """Separate ``filename`` with an MDX ONNX model and write the stems.

    Args:
        model_params: mapping from model hash (see ``MDX.get_hash``) to a
            dict with keys ``mdx_dim_f_set``, ``mdx_dim_t_set``,
            ``mdx_n_fft_scale_set``, ``primary_stem`` and ``compensate``.
        output_dir: directory receiving the output files (created if needed).
        model_path: path of the ONNX model.
        filename: input audio file.
        exclude_main: skip writing the separated stem.
        exclude_inversion: skip writing the inverted stem (mix minus stem).
        suffix: overrides the stem name used in the main output filename.
        invert_suffix: overrides the stem name used in the inverted filename.
        denoise: average the model output on the wave and on its negation.
        keep_orig: when False the input file is deleted afterwards.
        m_threads: worker thread count; used as given on CPU, replaced by a
            VRAM-based choice (1 below 8 GB, else 2) when CUDA is available.

    Returns:
        Tuple ``(main_filepath, invert_filepath)``; an entry is ``None``
        when the corresponding output was excluded.

    Raises:
        ValueError: if ``model_params`` has no entry for the model's hash.
    """
    print(f"[~] Running MDX on file: {filename}")
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
        processor = 0
        device_properties = torch.cuda.get_device_properties(device)
        vram_gb = device_properties.total_memory / 1024**3
        m_threads = 1 if vram_gb < 8 else 2
    else:
        # No GPU: there are no device properties to query, and MDX must be
        # told to use the CPU provider (negative processor index).
        device = torch.device('cpu')
        processor = -1
        m_threads = max(1, int(m_threads))
    print(f"[~] Using {m_threads} threads for processing")

    model_hash = MDX.get_hash(model_path)
    mp = model_params.get(model_hash)
    if mp is None:
        raise ValueError(f"No model parameters found for model hash {model_hash} ({model_path})")

    model = MDXModel(
        device,
        dim_f=mp["mdx_dim_f_set"],
        dim_t=2 ** mp["mdx_dim_t_set"],
        n_fft=mp["mdx_n_fft_scale_set"],
        stem_name=mp["primary_stem"],
        compensation=mp["compensate"],
    )

    mdx_sess = MDX(model_path, model, processor=processor)

    print("[~] Loading audio file...")
    wave, sr = librosa.load(filename, mono=False, sr=44100)
    if wave.ndim == 1:
        # Mono input: the pipeline indexes the wave as [channel, sample]
        # with two channels, so duplicate the single channel.
        wave = np.stack([wave, wave])

    print("[~] Normalizing input wave...")
    peak = float(np.max(np.abs(wave)))
    if peak == 0:
        # Silent input: avoid dividing by zero and producing NaNs.
        peak = 1.0
    # Keep `wave` untouched so the inversion below subtracts at matching scale.
    normalized_wave = wave / peak

    if denoise:
        print("[~] Denoising wave...")
        wave_processed = -(mdx_sess.process_wave(-normalized_wave, m_threads)) + (
            mdx_sess.process_wave(normalized_wave, m_threads))
        wave_processed *= 0.5
    else:
        print("[~] Processing wave...")
        wave_processed = mdx_sess.process_wave(normalized_wave, m_threads)

    # Return to the original peak level.
    wave_processed *= peak

    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.basename(os.path.splitext(filename)[0])
    stem_name = model.stem_name if suffix is None else suffix

    main_filepath = None
    if not exclude_main:
        main_filepath = os.path.join(output_dir, f"{base_name}_{stem_name}.wav")
        print(f"[~] Writing main output to: {main_filepath}")
        sf.write(main_filepath, wave_processed.T, sr)

    invert_filepath = None
    if not exclude_inversion:
        diff_stem_name = stem_naming.get(stem_name) if invert_suffix is None else invert_suffix
        stem_name = f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name
        invert_filepath = os.path.join(output_dir, f"{base_name}_{stem_name}.wav")
        print(f"[~] Writing inverted output to: {invert_filepath}")
        sf.write(invert_filepath, (-wave_processed.T * model.compensation) + wave.T, sr)

    if not keep_orig:
        print(f"[~] Removing original file: {filename}")
        os.remove(filename)

    print("[~] Cleaning up...")
    del mdx_sess, wave_processed, normalized_wave, wave
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    print("[+] MDX processing completed")
    return main_filepath, invert_filepath


def run_roformer(model_params, output_dir, model_name, filename, exclude_main=False, exclude_inversion=False,
                 suffix=None, invert_suffix=None, denoise=False, keep_orig=True, m_threads=2):
    """Separate ``filename`` by invoking the ``audio-separator`` command.

    The command's ``<base>_(Vocals)_<model>.wav`` and
    ``<base>_(Instrumental)_<model>.wav`` outputs (where ``<model>`` is
    ``model_name`` without ``.9755.ckpt``) are renamed to
    ``<base>_Vocals.wav`` and ``<base>_Instrumental.wav`` when present.

    Args:
        model_params, suffix, invert_suffix, denoise, m_threads: accepted
            for signature parity with ``run_mdx``; not used here.
        output_dir: directory receiving the output files (created if needed).
        model_name: model filename passed to ``--model_filename``.
        filename: input audio file.
        exclude_main: do not rename/return the vocals file.
        exclude_inversion: do not rename/return the instrumental file.
        keep_orig: when False the input file is deleted afterwards.

    Returns:
        Tuple ``(main_filepath, invert_filepath)``; an entry is ``None``
        when excluded. The paths are returned whether or not the command
        actually produced the files.
    """
    print(f"[~] Running RoFormer on file: {filename}")
    os.makedirs(output_dir, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(filename))[0]

    roformer_output_format = 'wav'
    roformer_overlap = 4
    roformer_segment_size = 256

    print(f"[~] Output directory: {output_dir}")

    # Argument list instead of a shell string: file names containing quotes,
    # spaces or shell metacharacters are passed through verbatim.
    command = [
        'audio-separator',
        filename,
        '--model_filename', model_name,
        f'--output_dir={output_dir}',
        f'--output_format={roformer_output_format}',
        '--normalization=0.9',
        f'--mdxc_overlap={roformer_overlap}',
        f'--mdxc_segment_size={roformer_segment_size}',
    ]
    print(f"[~] Running command: {shlex.join(command)}")
    try:
        # check=False keeps the best-effort behaviour of the former os.system call.
        subprocess.run(command, check=False)
    except FileNotFoundError:
        print("[!] 'audio-separator' executable not found on PATH")

    model_tag = model_name.replace('.9755.ckpt', '')

    main_filepath = None
    invert_filepath = None

    if not exclude_main:
        main_filepath = os.path.join(output_dir, f"{base_name}_Vocals.wav")
        produced_vocals = os.path.join(output_dir, f"{base_name}_(Vocals)_{model_tag}.wav")
        if os.path.exists(produced_vocals):
            print(f"[~] Renaming vocals file to: {main_filepath}")
            os.replace(produced_vocals, main_filepath)

    if not exclude_inversion:
        invert_filepath = os.path.join(output_dir, f"{base_name}_Instrumental.wav")
        produced_instrumental = os.path.join(output_dir, f"{base_name}_(Instrumental)_{model_tag}.wav")
        if os.path.exists(produced_instrumental):
            print(f"[~] Renaming instrumental file to: {invert_filepath}")
            os.replace(produced_instrumental, invert_filepath)

    if not keep_orig:
        print(f"[~] Removing original file: {filename}")
        os.remove(filename)

    print("[+] RoFormer processing completed")
    return main_filepath, invert_filepath