Spaces:
Paused
Paused
import gradio as gr | |
import torch | |
from diffusers import AudioLDM2Pipeline | |
# Pick the execution device and matching precision: half precision on a GPU,
# full precision on CPU so that CPU-only duplicates of the Space still run.
device, torch_dtype = (
    ("cuda", torch.float16) if torch.cuda.is_available() else ("cpu", torch.float32)
)

# Load the AudioLDM2 diffusers pipeline, then move it onto the chosen device.
repo_id = "cvssp/audioldm2"
pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch_dtype)
pipe = pipe.to(device)
# Optional speed-up, left disabled:
# pipe.unet = torch.compile(pipe.unet)

# Shared random generator on the same device; it is re-seeded for every
# request so that results are reproducible.
generator = torch.Generator(device)
def text2audio(
    text,
    negative_prompt="Low quality.",
    duration=10,
    guidance_scale=3.5,
    random_seed=45,
    n_candidates=3,
):
    """Generate audio from a text prompt using the module-level ``pipe``.

    The defaults are the values that were previously hard-coded inside the
    function body as "test values"; they now act as fallbacks instead of
    silently overriding whatever the caller passed.

    Args:
        text: Prompt describing the audio to generate.
        negative_prompt: Prompt forwarded to the pipeline as ``negative_prompt``.
        duration: Forwarded to the pipeline as ``audio_length_in_s``.
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``.
        random_seed: Seed applied to the shared ``generator`` before sampling.
        n_candidates: Forwarded as ``num_waveforms_per_prompt``; a falsy value
            (``None`` or ``0``) falls back to ``1``.

    Returns:
        A list with one ``gr.make_waveform`` result per waveform returned by
        the pipeline. NOTE: the Gradio interface in this file declares three
        audio outputs, so it relies on the default ``n_candidates=3``.

    Raises:
        gr.Error: If ``text`` is missing, empty, or only whitespace.
    """
    # An empty text box arrives as "" rather than None, so reject both.
    if not text or not text.strip():
        raise gr.Error("Please provide a text input.")

    # Sample rate paired with every waveform handed to Gradio.
    sample_rate = 16000

    waveforms = pipe(
        text,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        num_inference_steps=200,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=n_candidates if n_candidates else 1,
        # Re-seed the shared generator so identical inputs give identical audio.
        generator=generator.manual_seed(int(random_seed)),
    )["audios"]

    # Build one output per generated waveform instead of indexing [0], [1], [2].
    return [gr.make_waveform((sample_rate, waveform)) for waveform in waveforms]
# Demo UI: a single text box feeds text2audio, and its results are shown in
# three audio players.
gradio_interface = gr.Interface(fn=text2audio, inputs="text", outputs=["audio"] * 3)

# Start the web app.
gradio_interface.launch()