# NOTE: the lines below this comment block originally contained residue from
# the file-hosting web page, not Python source. Preserved for provenance:
#   author: txya900619
#   commit: "fix: number not parse prob" (4a0fd18)
#   page chrome: raw / history / blame / No virus / 4.28 kB
import os
import re
from pathlib import Path
import jieba
from omegaconf import OmegaConf
from ipa.convert_digits import parse_num
from ipa.proc_text import (
apply_v2f,
normalize_text,
prep_regex,
run_jieba,
update_jieba_dict,
)
# Load the IPA configuration once at import time.
ipa_configs = OmegaConf.to_object(OmegaConf.load("configs/ipa.yaml"))
# Words on the preserved list must never be rewritten by the v2f
# (variant-to-form) substitution, so drop them from the mapping up front.
for preserved_word in ipa_configs["preserved_list"]:
    ipa_configs["v2f_dict"].pop(preserved_word, None)
# Pre-compile the regexes shared by the parsing helpers below.
delimiter_regex, replace_regex, v2f_regex = prep_regex(
    ipa_configs["delimiter_list"],
    ipa_configs["replace_dict"],
    ipa_configs["v2f_dict"],
)
def get_ipa(
    raw_text: str, dialect: str
) -> tuple[str | list[str], str | list[str], str | list[str], list[str]]:
    """Convert mixed hanzi/pinyin text into aligned word, IPA and pinyin forms.

    ``raw_text`` may interleave hanzi with pinyin-plus-tone tokens such as
    ``"ni3"``; pinyin tokens are transcribed via ``parse_pinyin_to_ipa`` and
    everything else via ``parse_hanzi_to_ipa`` using the ``dialect`` lexicon.

    Returns:
        ``(words, ipa, pinyin, missing_words)``. On success the first three
        are space-joined strings (with `` , `` collapsed to ``,``) and
        ``missing_words`` is empty. If nothing was transcribed or any word
        was missing from the lexicon, the first three are returned as the
        raw token lists instead.  (The original annotation claimed plain
        ``str`` for all paths, which was wrong for the failure path.)
    """
    # Isolate embedded pinyin tokens (lowercase letters + tone digits) as
    # their own chunks; the capturing group keeps them in the split output.
    chunks = re.split(r"([a-z]+\d+)", raw_text)
    final_words: list[str] = []
    final_pinyin: list[str] = []
    final_ipa: list[str] = []
    final_missing_words: list[str] = []
    for hanzi_or_pinyin in chunks:
        if not hanzi_or_pinyin.strip():
            continue
        if re.search(r"[a-z]+\d+", hanzi_or_pinyin):
            # Pinyin token: keep it verbatim as both word and pinyin.
            final_words.append(hanzi_or_pinyin)
            final_pinyin.append(hanzi_or_pinyin)
            pinyin, tone = re.match(r"([a-z]+)(\d+)?", hanzi_or_pinyin).groups()
            tone = f"_{tone}" if tone else ""
            ipa = parse_pinyin_to_ipa(pinyin)
            if ipa is None:
                # Unknown syllable: record it and keep scanning.
                final_missing_words.append(pinyin)
                continue
            final_ipa.append(ipa + tone)
        else:
            words, ipa, pinyin, missing_words = parse_hanzi_to_ipa(
                hanzi_or_pinyin, dialect
            )
            final_words.extend(words)
            final_ipa.extend(ipa)
            final_pinyin.extend(pinyin)
            final_missing_words.extend(missing_words)
    if not final_ipa or final_missing_words:
        # Failure path: hand back the raw lists so callers can inspect them.
        return final_words, final_ipa, final_pinyin, final_missing_words
    return (
        " ".join(final_words).replace(" , ", ","),
        " ".join(final_ipa).replace(" , ", ","),
        " ".join(final_pinyin).replace(" , ", ","),
        final_missing_words,
    )
def parse_ipa(ipa: str, delete_chars="\+\-\|\_", as_space="") -> list[str]:
    """Tokenize an IPA string into single characters plus whole digit groups.

    Runs of digits (tone markers) are kept intact as one token; every other
    chunk is cleaned — characters in ``as_space`` become spaces, characters
    in ``delete_chars`` are removed, ``,`` is padded to `` , `` — and then
    split into individual characters.

    Args:
        ipa: the IPA string to tokenize.
        delete_chars: body of a regex character class of characters to drop.
        as_space: body of a regex character class of characters to turn
            into spaces (empty disables the substitution).

    Returns:
        List of single-character tokens and intact digit-group tokens.
    """
    # Fix: removed a leftover debug ``print(ipa_list)`` that wrote to stdout
    # on every call.
    tokens: list[str] = []
    # Zero-width split at every digit/non-digit boundary.
    chunks = re.split(r"(?<![\d])(?=[\d])|(?<=[\d])(?![\d])", ipa)
    for chunk in chunks:
        if chunk.isdigit():
            # Tone number: keep the whole run as one token.
            tokens.append(chunk)
            continue
        if len(as_space) > 0:
            chunk = re.sub(r"[{}]".format(as_space), " ", chunk)
        if len(delete_chars) > 0:
            chunk = re.sub(r"[{}]".format(delete_chars), "", chunk)
        # Pad commas, then emit each remaining character individually.
        tokens.extend(chunk.replace(",", " , "))
    return tokens
def parse_pinyin_to_ipa(pinyin: str) -> str | None:
    """Look up ``pinyin`` in the pinyin→IPA table.

    Returns the IPA candidates joined with ``+`` (spaces within a candidate
    replaced by ``-``), or ``None`` when the syllable is not in the table.
    """
    try:
        candidates = ipa_configs["pinyin_to_ipa_dict"][pinyin]
    except KeyError:
        return None
    return "+".join(candidates).replace(" ", "-")
def parse_hanzi_to_ipa(
    hanzi: str, dialect: str
) -> tuple[list[str], list[str], list[str], list[str]]:
    """Segment ``hanzi`` with jieba and transcribe each word via the lexicon.

    Returns ``(words, ipa, pinyin, missing_words)`` where the first three are
    aligned token lists (commas preserved as tokens) and ``missing_words``
    collects segments absent from the ``dialect`` lexicon.
    """
    lexicon = ipa_configs["lexicon"][dialect]
    # Refresh jieba's dictionary so lexicon entries segment as single words.
    update_jieba_dict(
        list(lexicon.keys()), Path(os.path.dirname(jieba.__file__)) / "dict.txt"
    )
    normalized = normalize_text(hanzi, ipa_configs["replace_dict"], replace_regex)
    normalized = parse_num(normalized)
    # Collapse delimiter-separated parts into a single comma-joined string.
    parts = [part.strip() for part in re.split(delimiter_regex, normalized) if part.strip()]
    segments = run_jieba(",".join(parts))
    segments = apply_v2f(segments, ipa_configs["v2f_dict"], v2f_regex)
    # Re-segment after the v2f substitutions may have changed word boundaries.
    segments = run_jieba("".join(segments))

    words: list[str] = []
    pinyin_out: list[str] = []
    ipa_out: list[str] = []
    missing: list[str] = []
    for token in segments:
        if not token.strip():
            continue
        if token == ",":
            words.append(",")
            pinyin_out.append(",")
            ipa_out.append(",")
        elif token not in lexicon:
            words.append(token)
            missing.append(token)
        else:
            words.append(token)
            # NOTE: only the first pinyin/IPA candidate in the lexicon entry
            # is considered.
            pinyin_out.append(lexicon[token]["pinyin"][0])
            ipa_out.append(lexicon[token]["ipa"][0])
    return words, ipa_out, pinyin_out, missing