Spaces:

fffiloni
/

speech-to-image

Paused

File size: 2,036 Bytes

0d33acd
 
 
 
 
 
 
 
 
09780d3
 
 
0d33acd
 
 
 
 
 
 
 
 
09780d3
59ea9bc
 
0d33acd
 
4055f04
0d33acd
 
 
 
a59f564
 
fcae65f
 
a59f564
 
 
801fc0e
0d33acd
 
 
ca1c374
 
 
893b4fe
801fc0e
9768faf
893b4fe
4f7689a
0d33acd
 
 
a59f564
0d33acd

import gradio as gr
import torch

from diffusers import DiffusionPipeline
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
)

import os
# Hugging Face access token used to download the Stable Diffusion weights.
# Read from the environment; None when HF_TOKEN_SD is not set.
MY_SECRET_TOKEN = os.environ.get("HF_TOKEN_SD")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision is only selected on GPU. On the CPU fallback the pipeline is
# loaded in float32, because float16 inference is not generally supported by
# CPU kernels and would fail at generation time.
pipeline_dtype = torch.float16 if device == "cuda" else torch.float32

# Whisper model + processor that the community pipeline uses to transcribe the
# spoken prompt.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# Stable Diffusion wrapped in the "speech_to_image_diffusion" community
# pipeline, which takes audio as input instead of a text prompt.
diffuser_pipeline = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    custom_pipeline="speech_to_image_diffusion",
    speech_model=model,
    speech_processor=processor,
    use_auth_token=MY_SECRET_TOKEN,
    revision="fp16",
    torch_dtype=pipeline_dtype,
)

# Trade a little speed for a lower peak memory footprint during attention.
diffuser_pipeline.enable_attention_slicing()
diffuser_pipeline = diffuser_pipeline.to(device)

# ---------------------------------------------------------------------------
# Gradio UI: static texts and the input/output components.
title = "Speech to Diffusion • Community Pipeline"

# HTML blurb shown under the title. Built from individual lines joined with
# newlines; the leading and trailing empty entries reproduce the newline right
# after the opening and right before the closing of the original text block.
description = "\n".join(
    (
        "",
        "<p style='text-align: center;'>This demo can generate an image from an audio sample using pre-trained OpenAI whisper-small and Stable Diffusion.<br />",
        "Community examples consist of both inference and training examples that have been added by the community.<br />",
        "<a href='https://github.com/huggingface/diffusers/tree/main/examples/community#speech-to-image' target='_blank'> Click here for more information about community pipelines </a>",
        "</p>",
        "",
    )
)

# Microphone recording, delivered to the callback as numpy data.
audio_input = gr.Audio(
    source="microphone",
    type="numpy",
)
# The generated picture.
image_output = gr.Image()

def _prepare_audio(sample_rate, samples, target_rate=16_000):
    """Convert a raw recording into a mono float32 waveform at *target_rate*.

    Args:
        sample_rate: Sampling rate of ``samples`` in Hz.
        samples: Array-like audio samples. Integer PCM is rescaled to
            [-1, 1); multi-channel input is assumed to be laid out as
            (samples, channels) -- TODO confirm against the Gradio version
            in use.
        target_rate: Sampling rate of the returned waveform in Hz.

    Returns:
        A 1-D ``numpy.float32`` array sampled at ``target_rate``.

    Raises:
        ValueError: If the recording contains no samples.
    """
    # Local import: the module does not import numpy at the top level.
    import numpy as np

    wave = np.asarray(samples)
    if wave.size == 0:
        raise ValueError("The recording is empty; please record some speech first.")

    if np.issubdtype(wave.dtype, np.integer):
        # Integer PCM -> floats in [-1, 1), the range speech models expect.
        full_scale = float(np.iinfo(wave.dtype).max) + 1.0
        wave = wave.astype(np.float32) / full_scale
    else:
        wave = wave.astype(np.float32)

    if wave.ndim > 1:
        # Down-mix all channels to mono.
        wave = wave.mean(axis=1)

    if sample_rate != target_rate:
        # Linear-interpolation resampling: dependency-free and adequate for
        # speech transcription.
        n_out = max(1, int(round(wave.shape[0] * target_rate / sample_rate)))
        src_times = np.arange(wave.shape[0]) / sample_rate
        dst_times = np.arange(n_out) / target_rate
        wave = np.interp(dst_times, src_times, wave).astype(np.float32)

    return wave


def speech_to_text(audio_sample):
    """Generate an image from a spoken prompt.

    Despite its name (kept for compatibility), this function returns an
    image: the audio is passed to ``diffuser_pipeline`` and the first
    generated image is returned.

    Args:
        audio_sample: ``(sample_rate, samples)`` pair as delivered by the
            numpy-typed audio input, or ``None`` when nothing was recorded.

    Returns:
        The first image in the pipeline output.

    Raises:
        ValueError: If no audio was provided or the recording is empty.
    """
    if audio_sample is None:
        raise ValueError("No audio received; please record a sample before submitting.")

    sample_rate, samples = audio_sample
    # The pipeline's speech front end works at 16 kHz, so the microphone
    # recording is normalised and resampled before being handed over.
    waveform = _prepare_audio(sample_rate, samples, target_rate=16_000)
    output = diffuser_pipeline(waveform, sampling_rate=16_000)

    return output.images[0]

# Wire the callback and the components into a single-page interface, then
# start serving it.
demo = gr.Interface(
    fn=speech_to_text,
    inputs=audio_input,
    outputs=image_output,
    title=title,
    description=description,
)
demo.launch()