"""Route number-bearing tokens to the matching converter.

A token is classified by a series of cheap string predicates (date, time,
money, decimal, telephone, cardinal, range, fraction) and then handed to the
converter object registered for that class in ``labels``.
"""

import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

from converters.Date import DateVietnamese
from converters.Time import Time
from converters.Money import Money
from converters.Fraction import Fraction
from converters.Telephone import TelephoneVietnamese
from converters.Cardinal import CardinalVietnamese
from converters.Decimal import Decimal
from converters.Range import Range
from converters.Meansure import Measure

# Tokenizer used by the demo at the bottom of the file.
word_tokenize = TweetTokenizer().tokenize

# One shared converter instance per token class.
labels = {
    'DATE': DateVietnamese(),
    'TIME': Time(),
    'MONEY': Money(),
    'FRACTION': Fraction(),
    'TELEPHONE': TelephoneVietnamese(),
    'CARDINAL': CardinalVietnamese(),
    'DECIMAL': Decimal(),
    'RANGE': Range(),
    'MEANSURE': Measure(),
}


def has_numbers(inputString: str) -> bool:
    """Return True when at least one character of the token is a digit."""
    return any(char.isdigit() for char in inputString)


def has_date(inputString: str) -> bool:
    """Return True when the token looks like ``MM/YYYY`` or ``DD/MM/YYYY``.

    Every slash-separated part must be a decimal number. The month must be
    in 1..12, the day in 1..31 and the year must not exceed 2200.
    """
    if "/" not in inputString:
        return False

    parts = inputString.split("/")
    # isdecimal() (unlike isdigit()) only accepts characters that int() can
    # parse, so tokens such as "²/3" are rejected instead of raising.
    if len(parts) > 3 or not all(part.isdecimal() for part in parts):
        return False

    if len(parts) == 2:
        month, year = int(parts[0]), int(parts[1])
        if month > 12 or year > 2200 or month < 1:
            return False
    if len(parts) == 3:
        day, month, year = int(parts[0]), int(parts[1]), int(parts[2])
        if day > 31 or month > 12 or year > 2200 or day < 1 or month < 1:
            return False
    return True


def is_time(text: str) -> bool:
    """Return True when the token looks like ``HH:MM`` or ``HH:MM:SS``.

    A single trailing ``-`` is ignored so that the first half of a time span
    written as ``HH:MM-`` is still recognised. Tokens whose colon-separated
    parts are not plain decimal numbers (URLs, ``10:30-11:00`` ...) are
    rejected rather than raising ``ValueError``.
    """
    if ":" not in text:
        return False
    # Only a *trailing* dash is stripped; a dash elsewhere makes the
    # affected part non-numeric and the token is rejected below.
    if text.endswith("-"):
        text = text[:-1]

    parts = text.split(":")
    if len(parts) > 3 or not all(part.isdecimal() for part in parts):
        return False

    # NOTE(review): the bounds are permissive (24 h, 60 min and up to
    # 100 s are accepted); kept unchanged so routing of borderline tokens
    # stays the same -- confirm whether they should be tightened.
    hours, minutes = int(parts[0]), int(parts[1])
    if hours > 24 or minutes > 60:
        return False
    if len(parts) == 3 and int(parts[2]) > 100:
        return False
    return True


def is_money(inputString: str) -> bool:
    """Return True when the token starts with a supported currency symbol."""
    return inputString.startswith(('$', '€', '£', '¥'))


def is_fraction(inputString: str) -> bool:
    """Return True when the token contains a slash."""
    return "/" in inputString


def is_decimal(inputString: str) -> bool:
    """Return True when the token contains a dot."""
    return "." in inputString


def is_cardinal(inputString: str) -> bool:
    """Return True for tokens containing a comma or at most 3 characters long."""
    return "," in inputString or len(inputString) <= 3


def is_range(inputString: str) -> bool:
    """Return True when the token contains a dash."""
    return "-" in inputString


def is_telephone(inputString: str) -> bool:
    """Return True for tokens longer than 4 characters starting with 19, 18 or 0."""
    # Always return a real bool (the fall-through used to yield None).
    return inputString.startswith(("19", "18", "0")) and len(inputString) > 4


def is_meansure(text: str) -> bool:
    """Return True when the token is a key of the Measure converter's ``custom_dict``."""
    return text in labels['MEANSURE'].custom_dict


def normalize_single(text, previous=""):
    """Convert a single token with the first converter whose predicate matches.

    Args:
        text: The token to normalise.
        previous: Accepted for interface compatibility; currently unused.

    Returns:
        The converted token. Tokens without digits only get the ``%``, ``&``
        and ``°`` replacements applied.
    """
    if has_numbers(text):
        # NOTE(review): the order matters -- e.g. a short token such as
        # "1-5" satisfies is_cardinal() (length <= 3) before is_range() is
        # ever consulted. Confirm this is the intended priority.
        if has_date(text):
            text = labels["DATE"].convert_date(text)
        elif is_time(text):
            if text.endswith("-"):
                # "HH:MM-" is the start of a time span: convert the time
                # and append the connecting word.
                converted = labels['TIME'].convert(text[:-1])
                converted += " đến"
            else:
                converted = labels['TIME'].convert(text)
            text = converted
        elif is_money(text):
            text = labels['MONEY'].convert(text)
        elif is_decimal(text):
            text = labels['DECIMAL'].convert(text)
        elif is_telephone(text):
            text = labels['TELEPHONE'].convert(text)
        elif is_cardinal(text):
            text = labels['CARDINAL'].convert(text)
        elif is_range(text):
            text = labels['RANGE'].convert(text)

        # Second pass on whatever the first converter left behind.
        if is_fraction(text):
            text = labels['FRACTION'].convert(text)
        if has_numbers(text):
            text = labels['CARDINAL'].convert(text)

    text = text.replace("%", " phần trăm ")
    text = text.replace("&", " và ")
    text = text.replace("°", " độ ")
    return text


if __name__ == "__main__":
    sample = "90000"
    tokens = word_tokenize(sample)
    print(tokens)
    for token in tokens:
        normalized = normalize_single(token)
        print(token, normalized)