File size: 982 Bytes
8b414b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from functools import lru_cache

import pandas as pd
from tqdm import tqdm

from src.feature_extractors.text_statistics_utils import preprocess_test
from src.spell_checker import SmartSpellChecker
from src.text_preprocessings.base_preprocessing import BasePreprocessor


class SpellcheckTextPreprocessor(BasePreprocessor):
    """Preprocessor that cleans each text and then runs it through a spell checker.

    Every text is passed through ``preprocess_test`` and then through
    ``spellcheck.correct_text``. Results are memoized per instance so that
    repeated texts are only processed once.
    """

    # Upper bound on memoized texts per instance; equals the default
    # ``maxsize`` of ``functools.lru_cache``, which was used previously.
    _CACHE_SIZE = 128

    def __init__(self, spellcheck: SmartSpellChecker):
        """Store the spell checker and create an empty result cache.

        Args:
            spellcheck: Object whose ``correct_text`` method is applied to
                every text after ``preprocess_test``.
        """
        super().__init__()
        self._spellcheck = spellcheck
        # The cache is kept on the instance rather than in a class-level
        # ``lru_cache``: a class-level cache is keyed on ``self`` and would
        # keep every instance (and its spell checker) alive, and all
        # instances would compete for the same slots.
        self._text_cache = {}

    def preprocess_data(self, data: pd.Series) -> pd.Series:
        """Preprocess every text in ``data``.

        Args:
            data: Series of texts to preprocess.

        Returns:
            A new Series with the processed texts, carrying the same index
            as ``data``.
        """
        progress = tqdm(data, desc="Preprocessing texts (correcting mistakes, removing tokens, etc.)...")
        out_texts = [self._preprocess_text(text) for text in progress]
        return pd.Series(out_texts, index=data.index)

    def _preprocess_text(self, text: str) -> str:
        """Return the preprocessed, spell-corrected form of ``text``.

        Results are memoized in a per-instance least-recently-used cache
        holding at most ``_CACHE_SIZE`` entries.

        Args:
            text: Text to process. Must be hashable, as it is used as the
                cache key.

        Returns:
            The output of ``correct_text`` applied to the output of
            ``preprocess_test``.
        """
        cache = self._text_cache
        try:
            # Pop and re-insert below so the entry becomes the most recently
            # used one (dicts preserve insertion order).
            result = cache.pop(text)
        except KeyError:
            result = self._spellcheck.correct_text(preprocess_test(text))
            if len(cache) >= self._CACHE_SIZE:
                # The first key is the least recently used entry.
                del cache[next(iter(cache))]
        cache[text] = result
        return result