import os
import ast
import re

import streamlit as st
import pandas as pd
import pkg_resources
import sentence_transformers
import nltk
from youtube_transcript_api import YouTubeTranscriptApi
from symspellpy import SymSpell
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from torch import cuda
from PIL import Image, ImageDraw, ImageFont

from segmentation import SemanticTextSegmentation

nltk.download('stopwords')
if not os.path.exists('./transcripts'):
    os.mkdir('./transcripts')

device = 'cuda' if cuda.is_available() else 'cpu'
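

# Fetch, segment, and repack a YouTube transcript into chunks that fit the
# summarization model's input window. Returns a DataFrame with one row per
# chunk (column 'texts') plus the source video id.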
def clean_text(link, start, end):
    tokenizer = AutoTokenizer.from_pretrained("CareerNinja/t5_large_1e-4_on_V3dataset")
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt"
    )
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    def id_ts_grabber(link):
        # everything after the first '=' in a watch URL is taken as the video id
        youtube_video = link.split("=")
        video_id = youtube_video[1]
        return video_id
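
    # Map the requested start/end times (in seconds) onto the closest caption
    # timestamps; return the index of the first caption in the window and how
    # many captions fall inside it.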
    def seg_getter(data, ts, es):
        starts = []
        for line in data:
            ccs = ast.literal_eval(line)
            starts.append(float(ccs['start']))
        if not es:
            e_val = starts[-1]
        else:
            e_val = starts[min(range(len(starts)), key=lambda i: abs(starts[i] - float(es)))]
        t_val = starts[min(range(len(starts)), key=lambda i: abs(starts[i] - float(ts)))]
        tid = starts.index(t_val)
        eid = starts.index(e_val)
        ts_list_len = len(starts[tid:eid])
        return tid, ts_list_len
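
    # Download the English transcript for a video, preferring a manually
    # created one over auto-generated captions, and write it to
    # ./transcripts/<video_id>_cc_{manual,auto}.txt, one caption dict per line.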
    def get_cc(video_id):
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            manual = True
            try:
                # filter for manually created transcripts
                transcript = transcript_list.find_manually_created_transcript(['en', 'en-US', 'en-GB', 'en-IN'])
            except Exception:
                transcript = None
            if not transcript:
                try:
                    # or fall back to automatically generated ones
                    transcript = transcript_list.find_generated_transcript(['en'])
                    manual = False
                except Exception:
                    transcript = None
            if transcript:
                if manual:
                    file_name = os.path.join('transcripts', str(video_id) + "_cc_manual" + ".txt")
                else:
                    file_name = os.path.join('transcripts', str(video_id) + "_cc_auto" + ".txt")
                with open(file_name, 'w') as file:
                    for line in transcript.fetch():
                        file.write(str(line).replace(r'\xa0', ' ').replace(r'\n', '') + '\n')
                return file_name
            else:
                # no usable transcript found
                return None
        except Exception:
            return None
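
    # Read the saved caption file back in and return a list of caption texts,
    # restricted to the [timestamp, end_pt] window when one is given.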
    def transcript_creator(filename, timestamp, end_pt):
        with open(filename, 'r') as f:
            data = f.readlines()
        transcripts = []
        if not timestamp and not end_pt:
            # no window given: keep the whole transcript
            for line in data:
                ccs = ast.literal_eval(line)
                transcripts.append(ccs['text'])
            return transcripts
        elif not timestamp and end_pt:
            # only an end point given: start from the beginning
            timestamp = 0
            start, lenlist = seg_getter(data, timestamp, end_pt)
            for t in range(lenlist):
                ccs = ast.literal_eval(data[start + t])
                transcripts.append(ccs['text'])
            return transcripts
        else:
            start, lenlist = seg_getter(data, timestamp, end_pt)
            for t in range(lenlist):
                ccs = ast.literal_eval(data[start + t])
                transcripts.append(ccs['text'])
            return transcripts
    def transcript_collector(link, ts, es):
        vid = id_ts_grabber(link)
        print("Fetching the transcript")
        filename = get_cc(vid)
        return transcript_creator(filename, ts, es), vid

    transcript = pd.DataFrame(columns=['text', 'video_id'])
    transcript.loc[0, 'text'], transcript.loc[0, 'video_id'] = transcript_collector(link, start, end)
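
    # Split the raw caption lines into semantically coherent segments using the
    # local SemanticTextSegmentation helper.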
    def segment(corpus):
        # drop bracketed annotations (e.g. [Music]) and empty lines
        text_data = [re.sub(r'\[.*?\]', '', x).strip() for x in corpus]
        text_data = [x for x in text_data if x != '']
        df = pd.DataFrame(text_data, columns=["utterance"])
        # remove new line, tab, return
        df["utterance"] = df["utterance"].apply(lambda x: x.replace("\n", " ").replace("\r", " ").replace("\t", " "))
        # remove NaN
        df.dropna(inplace=True)
        sts = SemanticTextSegmentation(df)
        texts = sts.get_segments()
        return texts
    sf = pd.DataFrame(columns=['Segmented_Text', 'video_id'])
    text = segment(transcript.at[0, 'text'])
    for i in range(len(text)):
        sf.loc[i, 'Segmented_Text'] = text[i]
        sf.loc[i, 'video_id'] = transcript.at[0, 'video_id']
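
    # Run SymSpell word segmentation over each segment to re-insert spaces into
    # run-together words (max_edit_distance=0, so no spelling correction).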
    def word_seg(text):
        text = text.replace("\n", " ").replace("\r", " ").replace("\t", " ").replace("\xa0", " ")
        results = sym_spell.word_segmentation(text, max_edit_distance=0)
        texts = results.segmented_string
        return texts
    for i in range(len(sf)):
        sf.loc[i, 'Segmented_Text'] = word_seg(sf.at[i, 'Segmented_Text'])
        # token count of each segment, used below to pack segments up to the model limit
        sf.loc[i, 'Lengths'] = len(tokenizer(sf.at[i, 'Segmented_Text'])['input_ids'])

    texts = pd.DataFrame(columns=['texts'])
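
    # Greedily merge consecutive segments from the same video until the packed
    # chunk would exceed 512 tokens, so each row fits the summarizer's input budget.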
    def segment_loader(dataframe):
        flag = 0
        for i in range(len(dataframe)):
            if flag > 0:
                # skip rows that were already merged into an earlier chunk
                flag -= 1
                continue
            m = 512
            iter = 0
            texts.loc[i, 'texts'] = dataframe.at[i + iter, 'Segmented_Text']
            length = dataframe.at[i + iter, 'Lengths']
            texts.loc[i, 'video_id'] = dataframe.at[i, 'video_id']
            while i + iter < len(dataframe) - 1 and dataframe.at[i, 'video_id'] == dataframe.at[i + iter + 1, 'video_id']:
                if length + dataframe.at[i + iter + 1, 'Lengths'] <= m:
                    texts.loc[i, 'texts'] += " " + dataframe.at[i + iter + 1, 'Segmented_Text']
                    length += dataframe.at[i + iter + 1, 'Lengths']
                    iter += 1
                else:
                    break
            flag = iter
        return texts

    cleaned_text = segment_loader(sf)
    cleaned_text.reset_index(drop=True, inplace=True)
    return cleaned_text
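

# Summarize each packed chunk with the fine-tuned T5 model and stream the
# results to the Streamlit page; returns the list of generated summaries.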
def t5_summarizer(link, start, end):
    input_text = clean_text(link, start, end)
    lst_outputs = []
    tokenizer1 = AutoTokenizer.from_pretrained("CareerNinja/t5_large_1e-4_on_V3dataset")
    model1 = AutoModelForSeq2SeqLM.from_pretrained("CareerNinja/t5_large_1e-4_on_V3dataset")
    summarizer1 = pipeline("summarization", model=model1, tokenizer=tokenizer1)
    print("Entered summarizer!")
    st.write('Below is the summary of the given URL: ')
    for i in range(len(input_text)):
        summary = summarizer1(input_text.at[i, 'texts'], min_length=64, max_length=128)
        sumry = list(summary[0].values())
        input_text.loc[i, 'Generated Summary'] = sumry[0]
        lst_outputs.append(sumry[0])
        st.write(input_text.at[i, 'Generated Summary'])
        if i != len(input_text) - 1:
            st.write('=====================================================================================')
    return lst_outputs
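

# Render a summary onto one of the pre-designed card templates, wrapping the
# text to the image width, save it as card.png, and show it in the app.
# Note: font.getsize() was removed in Pillow 10, so this code assumes an
# older Pillow release.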
def card_creator(path, text, y_value):
    img = Image.open(path)

    def text_wrap(text, font, max_width):
        """Wrap text based on a specified width.

        This enables text wider than the image width to be displayed nicely.
        @params:
            text: str
                text to wrap
            font: obj
                font of the text
            max_width: int
                width to split the text with
        @return
            lines: list[str]
                list of sub-strings
        """
        lines = []
        # If the text width is smaller than the image width, then no need to split,
        # just add it to the line list and return
        if font.getsize(text)[0] <= max_width:
            lines.append(text)
        else:
            # split the line by spaces to get words
            words = text.split(' ')
            i = 0
            # append every word to a line while its width is shorter than the image width
            while i < len(words):
                line = ''
                while i < len(words) and font.getsize(line + words[i])[0] <= max_width:
                    line = line + words[i] + " "
                    i += 1
                if not line:
                    line = words[i]
                    i += 1
                lines.append(line)
        return lines

    font_path = 'Montserrat-Regular.ttf'
    font = ImageFont.truetype(font=font_path, size=22)
    lines = text_wrap(text, font, img.size[0] - 44)
    line_height = font.getsize('hg')[1]
    draw = ImageDraw.Draw(img)
    # Draw the wrapped text onto the image
    color = 'rgb(255,255,255)'  # white
    x = 22
    y = y_value
    for line in lines:
        draw.text((x, y), line, fill=color, font=font)
        y = y + line_height  # move down one line
    img.save("card.png")
    st.image(img, caption="Summary Card")
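

# Streamlit entry point: collect the URL and optional start/end times, run the
# summarizer, then let the user pick a summary and a template for the card.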
def main():
    if 'submitted' not in st.session_state:
        st.session_state.submitted = False
    if 'opt' not in st.session_state:
        st.session_state.opt = []

    def callback():
        st.session_state.submitted = True

    st.title('Video Summarizer')
    url = st.text_input('Enter the Video Link')
    start_pt = st.text_input('Enter the Start point in secs')
    end_pt = st.text_input('Enter the end point in secs')
    if st.button("Submit URL", on_click=callback) and url:
        opt = t5_summarizer(url, start_pt, end_pt)
        st.session_state.opt = opt
    if st.session_state.submitted and st.session_state.opt:
        text = st.selectbox('Select the summary you want to create a card of', st.session_state.opt)
        option = st.selectbox('Which color template would you like to use?', ('Elf Green', 'Dark Pastel Green'))
        if st.button("Generate Summary Card") and text and option:
            # each template has its own text-length limit and vertical offset
            if option == 'Elf Green':
                if len(text) > 380:
                    st.error('Summary is too long!')
                else:
                    card_creator('iteration5_empty.png', text, 335)
            else:
                if len(text) > 430:
                    st.error('Summary is too long!')
                else:
                    card_creator('X-93.png', text, 285)
            with open("card.png", "rb") as file:
                btn = st.download_button(
                    label="Download card",
                    data=file,
                    file_name="card.png",
                    mime="image/png"
                )
main()