import streamlit as st
import whisper
import os
import torch
import nltk
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, AutoModelForSeq2SeqLM
from pydub import AudioSegment
from nltk import sent_tokenize
from langchain.text_splitter import RecursiveCharacterTextSplitter

nltk.download('punkt')

def transcribe_audio(audiofile):
    st.session_state['audio'] = audiofile
    print(f"audio_file_session_state: {st.session_state['audio']}")

    st.info("Getting size of file")
    # size of the audio file in MB
    audio_size = round(os.path.getsize(st.session_state['audio']) / (1024 * 1024), 1)
    print(f"audio file size: {audio_size}")

    # determine audio duration
    podcast = AudioSegment.from_mp3(st.session_state['audio'])
    st.session_state['audio_segment'] = podcast
    podcast_duration = podcast.duration_seconds
    print(f"Audio Duration: {podcast_duration}")

    st.info("Transcribing")
    whisper_model = whisper.load_model("small.en")
    transcription = whisper_model.transcribe(audiofile)
    st.session_state['transcription'] = transcription
    print(f"Transcription: {transcription['text']}")
    st.info('Done Transcription')

    return transcription
def chunk_and_preprocess_text(text, model_name='philschmid/flan-t5-base-samsum'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    sentences = sent_tokenize(text)

    length = 0
    chunk = ""
    chunks = []

    for sentence in sentences:
        # token count of the chunk if this sentence were added
        combined_length = len(tokenizer.tokenize(sentence)) + length

        if combined_length <= tokenizer.max_len_single_sentence:  # still fits
            chunk += sentence + " "  # add the sentence to the chunk
            length = combined_length  # update the length counter
        else:
            # save the full chunk and start a new one with the overflow sentence
            chunks.append(chunk)
            chunk = sentence + " "
            length = len(tokenizer.tokenize(sentence))

    # save the last, partially filled chunk
    if chunk:
        chunks.append(chunk)

    return chunks
def summarize_podcast(audiotranscription):
    st.info("Summarizing...")
    summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)

    st.info("Chunking text")
    text_chunks = chunk_and_preprocess_text(audiotranscription)

    # the pipeline returns one {'summary_text': ...} dict per chunk;
    # join the pieces into a single summary string
    summaries = summarizer(text_chunks)
    summarized_text = " ".join(s['summary_text'] for s in summaries)

    st.session_state['summary'] = summarized_text
    return summarized_text
def prepare_text_for_qa(audiotranscription):
    # split the raw transcript string into overlapping chunks for retrieval
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    documents = text_splitter.split_text(audiotranscription)
    return documents
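
# A minimal sketch of the question-answering step the app describes
# ("ask questions and get direct quotes from the episode"). This is an
# assumption, not the app's shipped behavior: it runs an extractive QA
# pipeline over the chunks from prepare_text_for_qa and returns the
# highest-scoring answer span. The helper name and the model choice
# ('deepset/roberta-base-squad2') are hypothetical.
def answer_question(question, audiotranscription):
    chunks = prepare_text_for_qa(audiotranscription)
    qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
    # score the question against every chunk and keep the best answer span
    results = [qa(question=question, context=chunk) for chunk in chunks]
    best = max(results, key=lambda r: r['score'])
    return best['answer']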
st.markdown("# Podcast Q&amp;A")
st.markdown(
    """
This app helps you make sense of information-dense podcast episodes. It:

- Transcribes speech to text using the open-source Whisper model
- Summarizes the episode
- Lets you ask questions and returns direct quotes from the episode

As a proof of concept, this codebase uses the Marketplace business news podcast episode from June 14, 2023. That file is the only hardcoded piece of information in the application.
    """
)
st.text("Marketplace Episode June 14 2023")
st.audio("marketplace-2023-06-14.mp3")
if st.button("Process Audio File"):
    podcast_text = transcribe_audio("marketplace-2023-06-14.mp3")

    # write the transcript out
    with st.expander("See Transcription"):
        st.caption(podcast_text['text'])

    # summarize the transcript
    podcast_summary = summarize_podcast(podcast_text['text'])
    st.markdown("## Summary of Text")
    st.text(podcast_summary)
if st.button("Summarize Podcast"):
    # summarize a previously saved transcript
    with open('transcription.txt', 'r') as file:
        podcast_text = file.read().rstrip()
    podcast_summary = summarize_podcast(podcast_text)
    st.markdown("## Summary of Text")
    st.text(podcast_summary)
# audio_file = st.file_uploader("Upload audio copy of file", key="upload", type=['.mp3'])
# if audio_file:
#     transcribe_audio(audio_file)