File size: 1,549 Bytes
9fd51b2 01f0cbf 9fd51b2 9f29da8 9fd51b2 01f0cbf 9fd51b2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import gradio as gr
import whisper
from PIL import Image
import os
from diffusers import StableDiffusionPipeline

# Hugging Face access token, read from the environment (None when the
# variable is not set).
MY_SECRET_TOKEN = os.getenv("HF_TOKEN_SD")

# Device the Stable Diffusion pipeline is moved to.
device = "cpu"

# Both models are loaded once at import time and shared by every request.
whisper_model = whisper.load_model("small")
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    use_auth_token=MY_SECRET_TOKEN,
)
pipe.to(device)
def get_transcribe(audio):
    """Run Whisper on an audio file and return the decoded text.

    Args:
        audio: Path to the recorded audio file (the Gradio input component
            is configured with ``type="filepath"``).

    Returns:
        The ``text`` attribute of the Whisper decoding result. Decoding uses
        ``task="translate"``, which per the Whisper documentation produces
        English output regardless of the spoken language.
    """
    # Load the waveform and fit it to the fixed-length window that
    # whisper.pad_or_trim produces before computing the spectrogram.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))
    mel = whisper.log_mel_spectrogram(waveform).to(whisper_model.device)

    # No explicit language-detection step is performed here.
    # NOTE(review): fp16 is disabled, presumably because inference runs on
    # CPU -- confirm before enabling half precision.
    decoding_options = whisper.DecodingOptions(task="translate", fp16=False)
    decoded = whisper.decode(whisper_model, mel, decoding_options)

    # Echo the full result and its text to stdout for debugging.
    print(decoded)
    print(decoded.text)
    return decoded.text
def get_images(audio):
    """Generate images from a spoken prompt.

    The audio is converted to text with ``get_transcribe`` and the resulting
    prompt is passed twice to the Stable Diffusion pipeline, producing two
    candidate images. Any image the pipeline flags in
    ``"nsfw_content_detected"`` is replaced by the placeholder picture
    stored in ``unsafe.png``.

    Args:
        audio: Path to the recorded audio file.

    Returns:
        A list with one image per generated sample, in pipeline order.

    Raises:
        FileNotFoundError: If a sample is flagged and ``unsafe.png`` cannot
            be found.
    """
    prompt = get_transcribe(audio)

    # Duplicating the prompt requests two samples in a single pipeline call.
    images_list = pipe([prompt] * 2)
    samples = images_list["sample"]
    nsfw_flags = images_list["nsfw_content_detected"]

    # The placeholder is loaded lazily: most requests never need it, and
    # reading it inside a ``with`` block closes the file handle right away
    # (Image.open is lazy and otherwise keeps the file open).
    placeholder = None
    images = []
    for image, flagged in zip(samples, nsfw_flags):
        if not flagged:
            images.append(image)
            continue
        if placeholder is None:
            with Image.open(r"unsafe.png") as handle:
                placeholder = handle.copy()
        images.append(placeholder)
    return images
# Input: microphone recording, passed to the callback as a path to the
# recorded audio file (type="filepath").
audio = gr.Audio(
    label="Input Audio",
    show_label=False,
    source="microphone",
    type="filepath",
)

# Output: gallery showing the images returned by get_images.
gallery = gr.Gallery(
    label="Generated images",
    show_label=False,
    elem_id="gallery",
).style(grid=[2], height="auto")

# Build the interface, enable request queuing (queue size limited to 10)
# and start the app.
gr.Interface(
    fn=get_images,
    inputs=audio,
    outputs=gallery,
).queue(max_size=10).launch(enable_queue=True)