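# Music To Image: caption an audio clip with LP-Music-Caps, turn the caption into an
# image prompt with Llama 2, then render that prompt with Stable Diffusion XL.
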
import gradio as gr
import os

# token read from the environment, used to authenticate the Llama 2 Space client
hf_token = os.environ.get('HF_TOKEN')

# LP-Music-Caps demo Space loaded as a callable client: takes an audio file path and returns a caption
lpmc_client = gr.load("seungheondoh/LP-Music-Caps-demo", src="spaces")

from gradio_client import Client

# client for the Llama 2 Space that rewrites the music caption into an image prompt
client = Client("https://fffiloni-test-llama-api.hf.space/", hf_token=hf_token)

from diffusers import DiffusionPipeline
import torch

# Stable Diffusion XL base pipeline in fp16
pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
pipe.to("cuda")
#pipe.enable_model_cpu_offload()

# if using torch < 2.0
# pipe.enable_xformers_memory_efficient_attention()

from pydub import AudioSegment

def cut_audio(input_path, output_path, max_duration=30000):
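    """Trim the input audio to the first `max_duration` milliseconds (30 s by default) and export it as MP3."""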
    audio = AudioSegment.from_file(input_path)

    if len(audio) > max_duration:
        audio = audio[:max_duration]

    audio.export(output_path, format="mp3")

    return output_path

def solo_xd(prompt):
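    """Re-run Stable Diffusion XL on the stored Llama 2 prompt (used by the "Try again ?" button)."""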
    images = pipe(prompt=prompt).images[0]
    return images

def infer(audio_file):
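    """Full pipeline: trim the audio, caption it with LP-Music-Caps, ask Llama 2 for an image prompt, render it with SDXL."""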

    truncated_audio = cut_audio(audio_file, "trunc_audio.mp3")
    
    cap_result = lpmc_client(
    				truncated_audio,	# str (filepath or URL to file) in 'audio_path' Audio component
    				api_name="predict"
    )
    print(cap_result)

    #summarize_q = f"""

    #I'll give you a list of music descriptions. Create a summary reflecting the musical ambiance. 
    #Do not process each segment, but provide a summary for the whole instead.
    
    #Here's the list:

    #{cap_result}
    #"""

    #summary_result = client.predict(
    #				summarize_q,	# str in 'Message' Textbox component
    #				api_name="/chat_1"
    #)

    #print(f"SUMMARY: {summary_result}")

    llama_q = f"""
    I'll give you a music description, from which I want you to provide an illustrative image description that would fit well with the music.
    Do not process each segment or song, but provide a summary for the whole instead.
    Answer with only one image description. Never do lists. Maximum 77 tokens.

    Here's the music description:

    {cap_result}
    """
    
    result = client.predict(
    				llama_q,	# str in 'Message' Textbox component
    				api_name="/predict"
    )

    print(f"Llama2 result: {result}")

    images = pipe(prompt=result).images[0]

    print("Finished")
    
    #return cap_result, result, images
    return images, result, gr.update(visible=True)

css = """
#col-container {max-width: 510px; margin-left: auto; margin-right: auto;}
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
                <div
                style="
                    display: inline-flex;
                    align-items: center;
                    gap: 0.8rem;
                    font-size: 1.75rem;
                "
                >
                <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
                    Music To Image
                </h1>
                </div>
                <p style="margin-bottom: 10px; font-size: 94%">
                Sends your audio to <a href="https://huggingface.co/spaces/seungheondoh/LP-Music-Caps-demo" target="_blank">LP-Music-Caps</a>
                to generate an audio caption, which is then translated into an illustrative image description by Llama 2 and finally run through 
                Stable Diffusion XL to generate an image from the audio! <br /><br />
                Note: Only the first 30 seconds of your audio will be used for inference.
                </p>
            </div>""")
        audio_input = gr.Audio(label="Music input", type="filepath", source="upload")
        infer_btn = gr.Button("Generate Image from Music")
        #lpmc_cap = gr.Textbox(label="Lp Music Caps caption")
        llama_trans_cap = gr.Textbox(label="Llama translation", visible=False)
        img_result = gr.Image(label="Image Result")
        tryagain_btn = gr.Button("Try again ?", visible=False)

    #infer_btn.click(fn=infer, inputs=[audio_input], outputs=[lpmc_cap, llama_trans_cap, img_result])
    infer_btn.click(fn=infer, inputs=[audio_input], outputs=[img_result, llama_trans_cap, tryagain_btn])
    tryagain_btn.click(fn=solo_xd, inputs=[llama_trans_cap], outputs=[img_result])

demo.queue(max_size=20).launch()