import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import librosa
# Determine the device
if torch.cuda.is_available():  # for CUDA
    device = torch.device("cuda")
elif torch.backends.mps.is_available():  # for Apple MPS
    device = torch.device("mps")
else:  # fallback for CPU
    device = torch.device("cpu")
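# Optional sanity check of the selected backend
print(f"Using device: {device}")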
# Load the audio processor and model
stt_processor = AutoProcessor.from_pretrained("openai/whisper-large", language='en')
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large"
)
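# Note: the fp32 checkpoint is several GB; on CUDA you could pass
# torch_dtype=torch.float16 to from_pretrained to roughly halve memory use.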
# Move the model to the device
stt_model.to(device)
def transcribe_audio(audio_path: str) -> str:
    """Transcribes an audio file with Whisper, returning the text or an error message."""
    try:
        # Whisper expects 16 kHz mono audio; librosa resamples on load
        audio, _ = librosa.load(audio_path, sr=16000)
        inputs = stt_processor(audio, sampling_rate=16000, return_tensors="pt")
        input_features = inputs.input_features.to(device)
        with torch.no_grad():
            # Request English at generation time; the processor call and
            # batch_decode do not take a language argument
            predicted_ids = stt_model.generate(input_features, language="en")
        return stt_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # Note: the original try/except/finally returned the empty transcript
        # from finally, which silently discarded this error message
        return f"Error during transcription: {str(e)}"
def extract_action_items(transcript: str) -> str:
"""
Extracts action items from a transcript using the Llama-3.1-8B-Instruct model.
see example code in the model card: https://huggingface.co/Qwen/Qwen/Qwen2.5-7B-Instruct
"""
model_name = "Qwen/Qwen2.5-7B-Instruct"
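    # Note: the model and tokenizer are re-loaded on every call; for a
    # long-running app, consider loading them once at module scope.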
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    messages = [
        {"role": "system", "content": "You are the perfect action item extraction system."},
        {"role": "user", "content": """Extract the action items from the voice note transcript using the following JSON format:

[
    {"title 1": "description 1"},
    {"title 2": "description 2"},
    ...
]"""},
        {"role": "user", "content": f"transcript: {transcript}"}
    ]
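    # Render the chat history into Qwen's prompt format; add_generation_prompt=True
    # appends the assistant turn marker so the model starts a fresh response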
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
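    # Keep only the newly generated tokens by slicing off the prompt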
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(response)
    return response
def transcribe_and_extract_action_items(audio_path):
    """Runs the full pipeline: speech-to-text, then action item extraction."""
    transcript = transcribe_audio(audio_path)
    action_items_text = extract_action_items(transcript)
    return transcript, action_items_text
##################################################
# Gradio Interface
##################################################
# Define the Gradio interface components
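# gr.Audio with type="filepath" hands the handler a path on disk,
# which librosa.load can read directly.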
input_audio = gr.Audio(
    type="filepath",
    label="Upload or Record Audio"
)
output_transcript = gr.Textbox(
label="Transcript",
lines=10,
placeholder="The transcribed text will appear here..."
)
output_action_items = gr.Textbox(
label="Action Items",
lines=10,
placeholder="Extracted action items will appear here..."
)
# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_and_extract_action_items,
    inputs=input_audio,
    outputs=[output_transcript, output_action_items],
    title="Audio Transcription and Action Item Extraction",
    description=(
        "Upload or record an audio clip. The system will transcribe the audio "
        "and extract actionable items from the transcript."
    ),
    theme="default"
)
# Launch the interface
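# Tip: interface.launch(share=True) also serves a temporary public URL
# if you want to test from another device.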
if __name__ == "__main__":
    interface.launch()