hylee719 committed on
Commit
212d174
1 Parent(s): a343bb9

fix math term detection

Browse files
Files changed (1) hide show
  1. handler.py +18 -12
handler.py CHANGED
@@ -265,23 +265,29 @@ def load_math_terms():
265
  math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
266
  math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
267
  else:
268
- math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
269
- math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
270
  return math_terms, math_terms_dict
271
 
272
  def run_math_density(transcript):
273
  math_terms, math_terms_dict = load_math_terms()
274
- for i, utt in enumerate(transcript.utterances):
275
- found_math_terms = set()
276
  text = utt.get_clean_text(remove_punct=False)
277
- num_math_terms = 0
278
- for term in math_terms:
279
- count = len(re.findall(term, text))
280
- if count > 0:
281
- found_math_terms.add(math_terms_dict[term])
282
- num_math_terms += count
283
- utt.num_math_terms = num_math_terms
284
- utt.math_terms = list(found_math_terms)
 
 
 
 
 
 
285
 
286
  class EndpointHandler():
287
  def __init__(self, path="."):
 
265
  math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
266
  math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
267
  else:
268
+ math_terms.append(term)
269
+ math_terms_dict[term] = term
270
  return math_terms, math_terms_dict
271
 
272
  def run_math_density(transcript):
273
  math_terms, math_terms_dict = load_math_terms()
274
+ sorted_terms = sorted(math_terms, key=len, reverse=True)
275
+ for i, utt in enumerate(transcript.utterances):
276
  text = utt.get_clean_text(remove_punct=False)
277
+ num_matches = 0
278
+ matched_positions = set()
279
+ match_list = []
280
+ for term in sorted_terms:
281
+ matches = list(re.finditer(term, text, re.IGNORECASE))
282
+ # Filter out matches that share positions with longer terms
283
+ matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
284
+ if len(matches) > 0:
285
+ match_list.append(math_terms_dict[term])
286
+ # Update existing match positions
287
+ matched_positions.update((match.start(), match.end()) for match in matches)
288
+ num_matches += len(matches)
289
+ utt.num_math_terms = num_matches
290
+ utt.math_terms = match_list
291
 
292
  class EndpointHandler():
293
  def __init__(self, path="."):