Spaces:

AkashKhamkar
/

test_sum

Runtime error

App Files Files Community

test_sum / segmentation.py

AkashKhamkar

Update segmentation.py

fb101c8 about 2 years ago

raw

history blame

No virus

3.4 kB

	from functools import lru_cache
	import attr
	import pandas as pd
	import numpy as np
	import spacy
	from nltk.tokenize.texttiling import TextTilingTokenizer
	from sklearn.metrics.pairwise import cosine_similarity
	from sentence_transformers import SentenceTransformer

	@lru_cache
	def load_sentence_transformer(model_name='all-MiniLM-L6-v2'):
	"""
	all_MiniLM_L6_v2 - offline
	all-MiniLM-L6-v2 - Online
	"""
	model = SentenceTransformer(model_name)
	return model

	@lru_cache
	def load_spacy():
	return spacy.load('en_core_web_sm')


	model = load_sentence_transformer()
	nlp = load_spacy()


	@attr.s
	class SemanticTextSegmentation:

	"""
	Segment a call transcript based on topics discussed in the call using
	TextTilling with Sentence Similarity via sentence transformer.
	Paramters
	---------
	data: pd.Dataframe
	Pass the trascript in the dataframe format
	utterance: str
	pass the column name which represent utterance in transcript dataframe
	"""

	data = attr.ib()
	utterance = attr.ib(default='utterance')

	def __attrs_post_init__(self):
	columns = self.data.columns.tolist()

	def get_segments(self, threshold=0.7):
	"""
	returns the transcript segments computed with texttiling and sentence-transformer.
	Paramters
	---------
	threshold: float
	sentence similarity threshold. (used to merge the sentences into coherant segments)
	Return
	------
	new_segments: list
	list of segments
	"""
	segments = self._text_tilling()
	merge_index = self._merge_segments(segments, threshold)
	new_segments = []
	for i in merge_index:
	seg = ' '.join([segments[_] for _ in i])
	new_segments.append(seg)
	return new_segments

	def _merge_segments(self, segments, threshold):
	segment_map = [0]
	for index, (text1, text2) in enumerate(zip(segments[:-1], segments[1:])):
	sim = self._get_similarity(text1, text2)
	if sim >= threshold:
	segment_map.append(0)
	else:
	segment_map.append(1)
	return self._index_mapping(segment_map)

	def _index_mapping(self, segment_map):
	index_list = []
	temp = []
	for index, i in enumerate(segment_map):
	if i == 1:
	index_list.append(temp)
	temp = [index]
	else:
	temp.append(index)
	index_list.append(temp)
	return index_list

	def _get_similarity(self, text1, text2):
	sentence_1 = [i.text.strip()
	for i in nlp(text1).sents if len(i.text.split(' ')) > 1]
	sentence_2 = [i.text.strip()
	for i in nlp(text2).sents if len(i.text.split(' ')) > 2]
	embeding_1 = model.encode(sentence_1)
	embeding_2 = model.encode(sentence_2)
	embeding_1 = np.mean(embeding_1, axis=0).reshape(1, -1)
	embeding_2 = np.mean(embeding_2, axis=0).reshape(1, -1)
	sim = cosine_similarity(embeding_1, embeding_2)
	return sim

	def _text_tilling(self):
	tt = TextTilingTokenizer(w=15, k=10)
	text = '\n\n\t'.join(self.data[self.utterance].tolist())
	segment = tt.tokenize(text)
	segment = [i.replace("\n\n\t", ' ') for i in segment]
	return segment