Daniel Tse committed on
Commit
9bb604c
1 Parent(s): 01bea1f

Add sentence chunking

Browse files
Files changed (1) hide show
  1. app.py +36 -1
app.py CHANGED
@@ -29,12 +29,47 @@ def transcribe_audio(audiofile):
29
  st.info('Done Transcription')
30
 
31
  return transcription
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  def summarize_podcast(audiotranscription):
34
  st.info("Summarizing...")
35
  summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)
36
 
37
- summarized_text = summarizer(audiotranscription)
 
 
 
38
  st.session_state['summary'] = summarized_text
39
  return summarized_text
40
 
 
29
  st.info('Done Transcription')
30
 
31
  return transcription
32
def chunk_and_preprocess_text(text, model_name='philschmid/flan-t5-base-samsum'):
    """Split *text* into sentence-aligned chunks that fit the model's input size.

    Sentences produced by ``sent_tokenize`` are packed greedily: a sentence is
    added to the current chunk while the running token count stays within
    ``tokenizer.max_len_single_sentence``; otherwise the current chunk is
    closed and the sentence starts a new one.

    Args:
        text: The full text to split.
        model_name: Name of the pretrained model whose tokenizer is used to
            count tokens and to read the length limit.

    Returns:
        A list of chunk strings, in original order. Each chunk is its
        sentences concatenated, every sentence followed by one space. The list
        is empty when ``text`` yields no sentences. A single sentence that is
        longer than the limit is still emitted as its own (oversized) chunk.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    max_tokens = tokenizer.max_len_single_sentence

    chunks = []
    current_sentences = []
    current_length = 0

    for sentence in sent_tokenize(text):
        # Tokenize once per sentence; the count is reused for the new chunk
        # when this sentence overflows the current one.
        sentence_length = len(tokenizer.tokenize(sentence))

        # Close the current chunk only if it actually holds something, so an
        # oversized first sentence never produces an empty chunk.
        if current_sentences and current_length + sentence_length > max_tokens:
            chunks.append("".join(s + " " for s in current_sentences))
            current_sentences = []
            current_length = 0

        current_sentences.append(sentence)
        current_length += sentence_length

    # Flush whatever is left, including the case where the last sentence
    # overflowed and started a chunk of its own.
    if current_sentences:
        chunks.append("".join(s + " " for s in current_sentences))

    return chunks
64
 
65
  def summarize_podcast(audiotranscription):
66
  st.info("Summarizing...")
67
  summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)
68
 
69
+ st.info("Chunking text")
70
+ text_chunks = chunk_and_preprocess_text(audiotranscription)
71
+
72
+ summarized_text = summarizer(text_chunks, max_len=200,min_len=50)
73
  st.session_state['summary'] = summarized_text
74
  return summarized_text
75