hylee719 committed on
Commit
56773d6
1 Parent(s): ee3321b

merge new March changes

Browse files
Files changed (1) hide show
  1. handler.py +59 -27
handler.py CHANGED
@@ -86,7 +86,7 @@ class Utterance:
86
  'aggregateUnitMeasure': self.aggregate_unit_measure,
87
  'wordCount': self.word_count,
88
  'numMathTerms': self.num_math_terms,
89
- 'mathTerms': self.math_terms
90
  }
91
 
92
  def __repr__(self):
@@ -157,34 +157,45 @@ class Transcript:
157
  uptake_teacher_dict = {}
158
  stop_words = stopwords.words('english')
159
  for utt in self.utterances:
160
- words = (utt.get_clean_text(remove_punct=True)).split(' ')
161
  for word in words:
162
- if word in stop_words: continue
 
163
  if utt.role == 'teacher':
164
- if word not in teacher_dict:
165
- teacher_dict[word] = 0
166
- teacher_dict[word] += 1
167
  if utt.uptake == 1:
168
  if word not in uptake_teacher_dict:
169
  uptake_teacher_dict[word] = 0
170
  uptake_teacher_dict[word] += 1
 
 
 
 
 
 
 
171
  else:
172
  if word not in student_dict:
173
  student_dict[word] = 0
174
  student_dict[word] += 1
175
  dict_list = []
176
  uptake_dict_list = []
 
 
177
  for word in uptake_teacher_dict.keys():
178
  uptake_dict_list.append({'text': word, 'value': uptake_teacher_dict[word], 'category': 'teacher'})
179
  for word in teacher_dict.keys():
180
- dict_list.append(
181
- {'text': word, 'value': teacher_dict[word], 'category': 'teacher'})
 
182
  for word in student_dict.keys():
183
- dict_list.append(
184
- {'text': word, 'value': student_dict[word], 'category': 'student'})
 
185
  sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
186
  sorted_uptake_dict_list = sorted(uptake_dict_list, key=lambda x: x['value'], reverse=True)
187
- return sorted_dict_list[:50], sorted_uptake_dict_list[:50]
 
 
188
 
189
  def get_talk_timeline(self):
190
  return [utterance.to_talk_timeline_dict() for utterance in self.utterances]
@@ -377,9 +388,10 @@ def load_math_terms():
377
  def run_math_density(transcript):
378
  math_terms, math_terms_dict = load_math_terms()
379
  sorted_terms = sorted(math_terms, key=len, reverse=True)
380
- math_word_cloud = {}
 
381
  for i, utt in enumerate(transcript.utterances):
382
- text = utt.get_clean_text(remove_punct=False)
383
  num_matches = 0
384
  matched_positions = set()
385
  match_list = []
@@ -387,22 +399,41 @@ def run_math_density(transcript):
387
  matches = list(re.finditer(term, text, re.IGNORECASE))
388
  # Filter out matches that share positions with longer terms
389
  matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
 
390
  if len(matches) > 0:
391
- if math_terms_dict[term] not in math_word_cloud:
392
- math_word_cloud[math_terms_dict[term]] = 0
393
- math_word_cloud[math_terms_dict[term]] += len(matches)
394
- match_list.append(math_terms_dict[term])
 
 
 
 
 
395
  # Update matched positions
396
  matched_positions.update((match.start(), match.end()) for match in matches)
397
  num_matches += len(matches)
 
398
  utt.num_math_terms = num_matches
399
  utt.math_terms = match_list
 
 
 
 
400
  dict_list = []
401
- for word in math_word_cloud.keys():
402
- dict_list.append(
403
- {'text': word, 'value': math_word_cloud[word]})
 
 
 
 
 
404
  sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
405
- return sorted_dict_list[:50]
 
 
 
406
 
407
  class EndpointHandler():
408
  def __init__(self, path="."):
@@ -457,18 +488,19 @@ class EndpointHandler():
457
  focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
458
  del focusing_question_model
459
 
460
- math_cloud = run_math_density(transcript)
461
  transcript.update_utterance_roles(uptake_speaker)
 
462
  transcript.calculate_aggregate_word_count()
463
- return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None, 'mathTopWords': None}
464
  talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
465
  return_dict['talkDistribution'] = talk_dist
466
  return_dict['talkLength'] = talk_len
467
  talk_moments = transcript.get_talk_timeline()
468
  return_dict['talkMoments'] = talk_moments
469
- word_cloud, uptake_word_cloud = transcript.get_word_clouds()
470
- return_dict['commonTopWords'] = word_cloud
471
- return_dict['uptakeTopWords'] = uptake_word_cloud
472
- return_dict['mathTopWords'] = math_cloud
 
473
 
474
  return return_dict
 
86
  'aggregateUnitMeasure': self.aggregate_unit_measure,
87
  'wordCount': self.word_count,
88
  'numMathTerms': self.num_math_terms,
89
+ 'mathTerms': self.math_terms,
90
  }
91
 
92
  def __repr__(self):
 
157
  uptake_teacher_dict = {}
158
  stop_words = stopwords.words('english')
159
  for utt in self.utterances:
160
+ words = (utt.get_clean_text(remove_punct=True)).split(' ')
161
  for word in words:
162
+ if word in stop_words or word in ['inaudible', 'crosstalk']: continue
163
+ # handle uptake case
164
  if utt.role == 'teacher':
 
 
 
165
  if utt.uptake == 1:
166
  if word not in uptake_teacher_dict:
167
  uptake_teacher_dict[word] = 0
168
  uptake_teacher_dict[word] += 1
169
+ # ignore math words so they don't get tagged as general
170
+ if any(math_word in word for math_word in utt.math_terms): continue
171
+ if utt.role == 'teacher':
172
+ if word not in teacher_dict:
173
+ teacher_dict[word] = 0
174
+ teacher_dict[word] += 1
175
+
176
  else:
177
  if word not in student_dict:
178
  student_dict[word] = 0
179
  student_dict[word] += 1
180
  dict_list = []
181
  uptake_dict_list = []
182
+ teacher_dict_list = []
183
+ student_dict_list = []
184
  for word in uptake_teacher_dict.keys():
185
  uptake_dict_list.append({'text': word, 'value': uptake_teacher_dict[word], 'category': 'teacher'})
186
  for word in teacher_dict.keys():
187
+ teacher_dict_list.append(
188
+ {'text': word, 'value': teacher_dict[word], 'category': 'general'})
189
+ dict_list.append({'text': word, 'value': teacher_dict[word], 'category': 'general'})
190
  for word in student_dict.keys():
191
+ student_dict_list.append(
192
+ {'text': word, 'value': student_dict[word], 'category': 'general'})
193
+ dict_list.append({'text': word, 'value': student_dict[word], 'category': 'general'})
194
  sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
195
  sorted_uptake_dict_list = sorted(uptake_dict_list, key=lambda x: x['value'], reverse=True)
196
+ sorted_teacher_dict_list = sorted(teacher_dict_list, key=lambda x: x['value'], reverse=True)
197
+ sorted_student_dict_list = sorted(student_dict_list, key=lambda x: x['value'], reverse=True)
198
+ return sorted_dict_list[:50], sorted_uptake_dict_list[:50], sorted_teacher_dict_list[:50], sorted_student_dict_list[:50]
199
 
200
  def get_talk_timeline(self):
201
  return [utterance.to_talk_timeline_dict() for utterance in self.utterances]
 
388
  def run_math_density(transcript):
389
  math_terms, math_terms_dict = load_math_terms()
390
  sorted_terms = sorted(math_terms, key=len, reverse=True)
391
+ teacher_math_word_cloud = {}
392
+ student_math_word_cloud = {}
393
  for i, utt in enumerate(transcript.utterances):
394
+ text = utt.get_clean_text(remove_punct=True)
395
  num_matches = 0
396
  matched_positions = set()
397
  match_list = []
 
399
  matches = list(re.finditer(term, text, re.IGNORECASE))
400
  # Filter out matches that share positions with longer terms
401
  matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
402
+ # matched_text = [match.group(0) for match in matches]
403
  if len(matches) > 0:
404
+ if utt.role == "teacher":
405
+ if math_terms_dict[term] not in teacher_math_word_cloud:
406
+ teacher_math_word_cloud[math_terms_dict[term]] = 0
407
+ teacher_math_word_cloud[math_terms_dict[term]] += len(matches)
408
+ else:
409
+ if math_terms_dict[term] not in student_math_word_cloud:
410
+ student_math_word_cloud[math_terms_dict[term]] = 0
411
+ student_math_word_cloud[math_terms_dict[term]] += len(matches)
412
+ match_list.append(math_terms_dict[term])
413
  # Update matched positions
414
  matched_positions.update((match.start(), match.end()) for match in matches)
415
  num_matches += len(matches)
416
+ # print("match group list: ", [match.group(0) for match in matches])
417
  utt.num_math_terms = num_matches
418
  utt.math_terms = match_list
419
+ # utt.math_match_positions = list(matched_positions)
420
+ # utt.math_terms_raw = [text[start:end] for start, end in matched_positions]
421
+ teacher_dict_list = []
422
+ student_dict_list = []
423
  dict_list = []
424
+ for word in teacher_math_word_cloud.keys():
425
+ teacher_dict_list.append(
426
+ {'text': word, 'value': teacher_math_word_cloud[word], 'category': "math"})
427
+ dict_list.append({'text': word, 'value': teacher_math_word_cloud[word], 'category': "math"})
428
+ for word in student_math_word_cloud.keys():
429
+ student_dict_list.append(
430
+ {'text': word, 'value': student_math_word_cloud[word], 'category': "math"})
431
+ dict_list.append({'text': word, 'value': student_math_word_cloud[word], 'category': "math"})
432
  sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
433
+ sorted_teacher_dict_list = sorted(teacher_dict_list, key=lambda x: x['value'], reverse=True)
434
+ sorted_student_dict_list = sorted(student_dict_list, key=lambda x: x['value'], reverse=True)
435
+ # return sorted_dict_list[:50]
436
+ return sorted_dict_list[:50], sorted_teacher_dict_list[:50], sorted_student_dict_list[:50]
437
 
438
  class EndpointHandler():
439
  def __init__(self, path="."):
 
488
  focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
489
  del focusing_question_model
490
 
 
491
  transcript.update_utterance_roles(uptake_speaker)
492
+ sorted_math_cloud, teacher_math_cloud, student_math_cloud = run_math_density(transcript)
493
  transcript.calculate_aggregate_word_count()
494
+ return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'studentTopWords': None, 'teacherTopWords': None}
495
  talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
496
  return_dict['talkDistribution'] = talk_dist
497
  return_dict['talkLength'] = talk_len
498
  talk_moments = transcript.get_talk_timeline()
499
  return_dict['talkMoments'] = talk_moments
500
+ word_cloud, uptake_word_cloud, teacher_general_cloud, student_general_cloud = transcript.get_word_clouds()
501
+ teacher_cloud = teacher_math_cloud + teacher_general_cloud
502
+ student_cloud = student_math_cloud + student_general_cloud
503
+ return_dict['teacherTopWords'] = teacher_cloud
504
+ return_dict['studentTopWords'] = student_cloud
505
 
506
  return return_dict