File size: 1,450 Bytes
2de6199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e09662f
 
 
 
 
e58ab9f
e09662f
2de6199
 
 
 
ee59f9d
2de6199
 
 
 
 
f101f29
 
 
2de6199
 
fac1cff
2de6199
c222126
2de6199
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import gradio as gr
import torch
from diffusers import AudioLDM2Pipeline

# make Space compatible with CPU duplicates: use half precision on GPU,
# full precision on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# load the diffusers pipeline onto the selected device
repo_id = "cvssp/audioldm2"
pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
# pipe.unet = torch.compile(pipe.unet)

# set the generator for reproducibility
generator = torch.Generator(device)


def text2audio(text, negative_prompt="Low quality.", duration=10, guidance_scale=3.5, random_seed=45, n_candidates=3):
    """Generate candidate audio clips for a text prompt with AudioLDM2.

    Args:
        text: the text prompt to condition generation on (required).
        negative_prompt: prompt describing qualities to steer away from.
        duration: length of the generated audio in seconds.
        guidance_scale: classifier-free guidance strength.
        random_seed: seed for the torch generator (reproducibility).
        n_candidates: number of waveforms to generate (up to 3 returned).

    Defaults match the values that were previously hard-coded inside the
    function body, so calling with only `text` keeps the old behavior.

    Returns:
        A list of three entries for the three "audio" outputs: each is a
        (sample_rate, waveform) tuple, padded with None when fewer than
        three candidates were generated.

    Raises:
        gr.Error: when the prompt is missing or empty.
    """
    # Reject empty strings as well as None.
    if not text:
        raise gr.Error("Please provide a text input.")

    waveforms = pipe(
        text,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        num_inference_steps=200,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=n_candidates if n_candidates else 1,
        # manual_seed makes the sampling deterministic for a given seed
        generator=generator.manual_seed(int(random_seed)),
    )["audios"]

    # AudioLDM2 emits 16 kHz audio. Return (rate, data) tuples, which the
    # gradio "audio" output components accept directly; the previous
    # gr.make_waveform call produced a *video* file (mismatched with the
    # audio outputs) and has been removed from recent gradio releases.
    # Guard against n_candidates < 3 by padding with None instead of
    # crashing on waveforms[1] / waveforms[2].
    outputs = [(16000, w) for w in waveforms[:3]]
    outputs += [None] * (3 - len(outputs))
    return outputs

# Build the UI. text2audio takes six parameters, so six input components
# are wired up; the original passed only "text", which made gradio invoke
# the function with five missing positional arguments on every submit.
gradio_interface = gr.Interface(
    fn=text2audio,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.Textbox(label="Negative prompt", value="Low quality."),
        gr.Slider(minimum=2.5, maximum=10, value=10, step=2.5, label="Duration (seconds)"),
        gr.Slider(minimum=0, maximum=7, value=3.5, step=0.5, label="Guidance scale"),
        gr.Number(value=45, precision=0, label="Seed"),
        gr.Slider(minimum=1, maximum=3, value=3, step=1, label="Number of candidates"),
    ],
    outputs=["audio", "audio", "audio"],
)

gradio_interface.launch()