hylee719 committed
Commit a343bb9
1 Parent(s): 585aa63
Files changed (1)
  1. handler.py +20 -139
handler.py CHANGED
@@ -3,9 +3,6 @@ from scipy.special import softmax
 import numpy as np
 import weakref
 import re
-import nltk
-from nltk.corpus import stopwords
-nltk.download('stopwords')
 
 from utils import clean_str, clean_str_nopunct
 import torch
@@ -13,7 +10,7 @@ from utils import MultiHeadModel, BertInputBuilder, get_num_words, MATH_PREFIXES
 
 import transformers
 from transformers import BertTokenizer, BertForSequenceClassification
-from transformers.utils import logging
+
 
 transformers.logging.set_verbosity_debug()
 
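Note: dropping "from transformers.utils import logging" here pairs with the removal of the bare logging.set_verbosity_info() call later in this diff; verbosity is controlled through the transformers.logging module the file already imports. A minimal sketch of the equivalent calls (illustrative, not part of the commit):

    import transformers

    transformers.logging.set_verbosity_debug()  # as in handler.py after this commit
    transformers.logging.set_verbosity_info()   # equivalent of the removed logging.set_verbosity_info()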
@@ -33,15 +30,9 @@ class Utterance:
         self.endtime = endtime
         self.transcript = weakref.ref(transcript) if transcript else None
         self.props = kwargs
-        self.role = None
-        self.word_count = self.get_num_words()
-        self.timestamp = [starttime, endtime]
-        self.unit_measure = None
-        self.aggregate_unit_measure = endtime
         self.num_math_terms = None
         self.math_terms = None
 
-        # moments
         self.uptake = None
         self.reasoning = None
         self.question = None
@@ -71,20 +62,6 @@ class Utterance:
             **self.props
         }
 
-    def to_talk_timeline_dict(self):
-        return{
-            'speaker': self.speaker,
-            'text': self.text,
-            'role': self.role,
-            'timestamp': self.timestamp,
-            'moments': {'reasoning': True if self.reasoning else False, 'questioning': True if self.question else False, 'uptake': True if self.uptake else False, 'focusingQuestion': True if self.focusing_question else False},
-            'unitMeasure': self.unit_measure,
-            'aggregateUnitMeasure': self.aggregate_unit_measure,
-            'wordCount': self.word_count,
-            'numMathTerms': self.num_math_terms,
-            'mathTerms': self.math_terms
-        }
-
     def __repr__(self):
         return f"Utterance(speaker='{self.speaker}'," \
                f"text='{self.text}', uid={self.uid}," \
@@ -114,86 +91,6 @@ class Transcript:
     def length(self):
         return len(self.utterances)
 
-    def update_utterance_roles(self, uptake_speaker):
-        for utt in self.utterances:
-            if (utt.speaker == uptake_speaker):
-                utt.role = 'teacher'
-            else:
-                utt.role = 'student'
-
-    def get_talk_distribution_and_length(self, uptake_speaker):
-        if ((uptake_speaker is None)):
-            return None
-        teacher_words = 0
-        teacher_utt_count = 0
-        student_words = 0
-        student_utt_count = 0
-        for utt in self.utterances:
-            if (utt.speaker == uptake_speaker):
-                utt.role = 'teacher'
-                teacher_words += utt.get_num_words()
-                teacher_utt_count += 1
-            else:
-                utt.role = 'student'
-                student_words += utt.get_num_words()
-                student_utt_count += 1
-        teacher_percentage = round(
-            (teacher_words / (teacher_words + student_words)) * 100)
-        student_percentage = 100 - teacher_percentage
-        avg_teacher_length = teacher_words / teacher_utt_count
-        avg_student_length = student_words / student_utt_count
-        return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
-
-    def get_word_cloud_dicts(self):
-        teacher_dict = {}
-        student_dict = {}
-        uptake_teacher_dict = {}
-        stop_words = stopwords.words('english')
-        # stopwords = nltk.corpus.stopwords.word('english')
-        # print("stopwords: ", stopwords)
-        for utt in self.utterances:
-            words = (utt.get_clean_text(remove_punct=True)).split(' ')
-            for word in words:
-                if word in stop_words: continue
-                if utt.role == 'teacher':
-                    if word not in teacher_dict:
-                        teacher_dict[word] = 0
-                    teacher_dict[word] += 1
-                    if utt.uptake == 1:
-                        if word not in uptake_teacher_dict:
-                            uptake_teacher_dict[word] = 0
-                        uptake_teacher_dict[word] += 1
-                else:
-                    if word not in student_dict:
-                        student_dict[word] = 0
-                    student_dict[word] += 1
-        dict_list = []
-        uptake_dict_list = []
-        for word in uptake_teacher_dict.keys():
-            uptake_dict_list.append({'text': word, 'value': uptake_teacher_dict[word], 'category': 'teacher'})
-        for word in teacher_dict.keys():
-            dict_list.append(
-                {'text': word, 'value': teacher_dict[word], 'category': 'teacher'})
-        for word in student_dict.keys():
-            dict_list.append(
-                {'text': word, 'value': student_dict[word], 'category': 'student'})
-        sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
-        sorted_uptake_dict_list = sorted(uptake_dict_list, key=lambda x: x['value'], reverse=True)
-        return sorted_dict_list[:50], sorted_uptake_dict_list[:50]
-
-    def get_talk_timeline(self):
-        return [utterance.to_talk_timeline_dict() for utterance in self.utterances]
-
-    def calculate_aggregate_word_count(self):
-        unit_measures = [utt.unit_measure for utt in self.utterances]
-        if None in unit_measures:
-            aggregate_word_count = 0
-            for utt in self.utterances:
-                aggregate_word_count += utt.get_num_words()
-                utt.unit_measure = utt.get_num_words()
-                utt.aggregate_unit_measure = aggregate_word_count
-
-
     def to_dict(self):
         return {
             'utterances': [utterance.to_dict() for utterance in self.utterances],
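Note: the deleted get_word_cloud_dicts above is the only stopwords user visible in this diff, which is why the nltk import and nltk.download('stopwords') disappear in the first hunk. For reference, a minimal sketch of the same counting idiom with collections.Counter (illustrative only; the stopword set is a hard-coded stand-in for stopwords.words('english')):

    from collections import Counter

    stop_words = {"the", "a", "of", "to"}  # stand-in for the nltk English stopword list
    words = "the area of the triangle".split(" ")
    counts = Counter(w for w in words if w not in stop_words)
    # same entry shape as the removed word-cloud dicts
    top_words = [{"text": w, "value": c, "category": "teacher"} for w, c in counts.most_common(50)]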
@@ -321,6 +218,8 @@ class UptakeModel:
                              return_pooler_output=False)
         return output
 
+
+
 class FocusingQuestionModel:
     def __init__(self, device, tokenizer, input_builder, max_length=128, path=FOCUSING_QUESTION_MODEL):
         print("Loading models...")
@@ -355,7 +254,8 @@ class FocusingQuestionModel:
             output = self.model(input_ids=instance["input_ids"],
                                 attention_mask=instance["attention_mask"],
                                 token_type_ids=instance["token_type_ids"])
-            return output
+        return output
+
 
 def load_math_terms():
     math_terms = []
@@ -365,29 +265,23 @@ def load_math_terms():
             math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
             math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
         else:
-            math_terms.append(term)
-            math_terms_dict[term] = term
+            math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
+            math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
     return math_terms, math_terms_dict
 
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
-    sorted_terms = sorted(math_terms, key=len, reverse=True)
-    for i, utt in enumerate(transcript.utterances):
+    for i, utt in enumerate(transcript.utterances):
+        found_math_terms = set()
         text = utt.get_clean_text(remove_punct=False)
-        num_matches = 0
-        matched_positions = set()
-        match_list = []
-        for term in sorted_terms:
-            matches = list(re.finditer(term, text, re.IGNORECASE))
-            # Filter out matches that share positions with longer terms
-            matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
-            if len(matches) > 0:
-                match_list.append(math_terms_dict[term])
-                # Update matched positions
-                matched_positions.update((match.start(), match.end()) for match in matches)
-            num_matches += len(matches)
-        utt.num_math_terms = num_matches
-        utt.math_terms = match_list
+        num_math_terms = 0
+        for term in math_terms:
+            count = len(re.findall(term, text))
+            if count > 0:
+                found_math_terms.add(math_terms_dict[term])
+                num_math_terms += count
+        utt.num_math_terms = num_math_terms
+        utt.math_terms = list(found_math_terms)
 
 class EndpointHandler():
     def __init__(self, path="."):
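Note: run_math_density now counts terms with re.findall against anchored patterns rather than tracking matched character positions. A minimal sketch of how the word-boundary pattern behaves ("angle" is only an illustrative term, not necessarily in the handler's term list):

    import re

    term = "angle"
    pattern = f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"

    print(len(re.findall(pattern, "The angles in a triangle sum to 180.")))  # 1: plural form still matches
    print(len(re.findall(pattern, "A rectangle has four sides.")))           # 0: 'rectangle' no longer counts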
@@ -419,13 +313,13 @@ class EndpointHandler():
             transcript.add_utterance(Utterance(**utt))
 
         print("Running inference on %d examples..." % transcript.length())
-        logging.set_verbosity_info()
+        uptake_speaker = params.pop("uptake_speaker", None)
         # Uptake
         uptake_model = UptakeModel(
             self.device, self.tokenizer, self.input_builder)
-        uptake_speaker = params.pop("uptake_speaker", None)
         uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
                                    uptake_speaker=uptake_speaker)
+
         # Reasoning
         reasoning_model = ReasoningModel(
             self.device, self.tokenizer, self.input_builder)
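Note: uptake_speaker is now popped from params before the uptake block instead of in the middle of it. A minimal sketch of what pop does here (the payload values are hypothetical; only the two keys shown in this diff are used):

    params = {"uptake_speaker": "Teacher", "uptake_min_num_words": 5}
    uptake_speaker = params.pop("uptake_speaker", None)  # read the value and remove the key in one step
    print(uptake_speaker)  # Teacher
    print(params)          # {'uptake_min_num_words': 5}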
@@ -443,17 +337,4 @@ class EndpointHandler():
 
         run_math_density(transcript)
 
-        transcript.update_utterance_roles(uptake_speaker)
-        transcript.calculate_aggregate_word_count()
-        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
-        talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
-        return_dict['talkDistribution'] = talk_dist
-        return_dict['talkLength'] = talk_len
-        talk_moments = transcript.get_talk_timeline()
-        return_dict['talkMoments'] = talk_moments
-        word_cloud, uptake_word_cloud = transcript.get_word_cloud_dicts()
-        return_dict['commonTopWords'] = word_cloud
-        return_dict['uptakeTopwords'] = uptake_word_cloud
-
-
-        return return_dict
+        return transcript.to_dict()
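Note: with the summary-building block removed, the handler returns the serialized transcript itself. A hedged sketch of the response shape, based only on the to_dict() context visible in this diff (per-utterance fields come from Utterance.to_dict(), which this diff does not show):

    # approximate return value of EndpointHandler.__call__ after this commit
    response = {
        "utterances": [
            # one dict per Utterance, produced by Utterance.to_dict()
        ],
    }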