hylee719 committed
Commit ee3321b
1 Parent(s): 631742f

merge tested changes for math word cloud

Files changed (1):
  1. handler.py (+25, -11)
handler.py CHANGED
@@ -40,7 +40,7 @@ class Utterance:
             self.unit_measure = endtime - starttime
         else:
             self.unit_measure = None
-        self.aggregate_unit_measure = endtime
+        self.aggregate_unit_measure = endtime
         self.num_math_terms = None
         self.math_terms = None
 
@@ -151,7 +151,7 @@ class Transcript:
         avg_student_length = student_words / student_utt_count if student_utt_count > 0 else 0
         return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
 
-    def get_word_cloud_dicts(self):
+    def get_word_clouds(self):
         teacher_dict = {}
         student_dict = {}
         uptake_teacher_dict = {}
@@ -232,7 +232,7 @@ class QuestionModel:
                 max_length=self.max_length,
                 input_str=True)
             output = self.get_prediction(instance)
-            print(output)
+            # print(output)
             utt.question = np.argmax(
                 output["is_question_logits"][0].tolist())
 
@@ -377,6 +377,7 @@ def load_math_terms():
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
+    math_word_cloud = {}
     for i, utt in enumerate(transcript.utterances):
         text = utt.get_clean_text(remove_punct=False)
         num_matches = 0
@@ -387,12 +388,21 @@ def run_math_density(transcript):
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
+                if math_terms_dict[term] not in math_word_cloud:
+                    math_word_cloud[math_terms_dict[term]] = 0
+                math_word_cloud[math_terms_dict[term]] += len(matches)
                 match_list.append(math_terms_dict[term])
                 # Update matched positions
                 matched_positions.update((match.start(), match.end()) for match in matches)
                 num_matches += len(matches)
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
+    dict_list = []
+    for word in math_word_cloud.keys():
+        dict_list.append(
+            {'text': word, 'value': math_word_cloud[word]})
+    sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
+    return sorted_dict_list[:50]
 
 class EndpointHandler():
     def __init__(self, path="."):
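
For reference, the accumulation added here turns per-term match counts into word-cloud records of the form {'text': term, 'value': count}, sorted by count and truncated to the top 50. A minimal standalone sketch of the same pattern (the function name and sample data below are hypothetical and simplified; unlike handler.py, it does not filter overlapping matches against longer terms):

import re

def build_math_word_cloud(utterances, terms, top_k=50):
    # Count whole-word occurrences of each term across all utterances.
    counts = {}
    for text in utterances:
        for term in sorted(terms, key=len, reverse=True):
            matches = re.findall(r"\b" + re.escape(term) + r"\b", text.lower())
            if matches:
                counts[term] = counts.get(term, 0) + len(matches)
    # Same output shape as run_math_density: [{'text': ..., 'value': ...}, ...]
    cloud = [{'text': term, 'value': count} for term, count in counts.items()]
    return sorted(cloud, key=lambda x: x['value'], reverse=True)[:top_k]

print(build_math_word_cloud(
    ["Add the two numbers, then add three more."],
    ["add", "number", "sum"]))
# -> [{'text': 'add', 'value': 2}]
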
@@ -427,34 +437,38 @@ class EndpointHandler():
         uptake_speaker = params.pop("uptake_speaker", None)
         uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
                                    uptake_speaker=uptake_speaker)
+        del uptake_model
+
         # Reasoning
         reasoning_model = ReasoningModel(
             self.device, self.tokenizer, self.input_builder)
         reasoning_model.run_inference(transcript, uptake_speaker=uptake_speaker)
-
+        del reasoning_model
+
         # Question
         question_model = QuestionModel(
             self.device, self.tokenizer, self.input_builder)
         question_model.run_inference(transcript)
-
+        del question_model
+
         # Focusing Question
         focusing_question_model = FocusingQuestionModel(
             self.device, self.tokenizer, self.input_builder)
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
-
-        run_math_density(transcript)
-
+        del focusing_question_model
+
+        math_cloud = run_math_density(transcript)
         transcript.update_utterance_roles(uptake_speaker)
         transcript.calculate_aggregate_word_count()
-        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
+        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None, 'mathTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
         talk_moments = transcript.get_talk_timeline()
         return_dict['talkMoments'] = talk_moments
-        word_cloud, uptake_word_cloud = transcript.get_word_cloud_dicts()
+        word_cloud, uptake_word_cloud = transcript.get_word_clouds()
         return_dict['commonTopWords'] = word_cloud
         return_dict['uptakeTopWords'] = uptake_word_cloud
-
+        return_dict['mathTopWords'] = math_cloud
 
         return return_dict
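
The del statements appear intended to drop each model's last reference after its run_inference pass so it can be garbage-collected before the next model is constructed, and run_math_density's return value is now threaded through to a new 'mathTopWords' entry in the response. A rough usage sketch, assuming the standard Hugging Face custom-handler convention of calling the handler instance with a request dict (the payload fields shown are illustrative; only 'uptake_speaker' and 'uptake_min_num_words' appear in this diff):

from handler import EndpointHandler

transcript_payload = {}  # placeholder; the transcript schema is not shown in this diff

handler = EndpointHandler(path=".")
response = handler({
    "inputs": transcript_payload,
    "parameters": {"uptake_speaker": "teacher",   # speaker whose uptake is analyzed
                   "uptake_min_num_words": 5},    # illustrative threshold
})

# Keys returned after this commit; 'mathTopWords' is new and holds up to 50
# {'text': ..., 'value': ...} records produced by run_math_density().
for key in ("talkDistribution", "talkLength", "talkMoments",
            "commonTopWords", "uptakeTopWords", "mathTopWords"):
    print(key, response[key])
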
 