txya900619's picture
feat: init upload
5e8e534
raw
history blame
No virus
2.97 kB
import os
import re
from pathlib import Path
import jieba
from omegaconf import OmegaConf
from ipa.convert_digits import parse_num
from ipa.proc_text import (
apply_v2f,
normalize_text,
prep_regex,
run_jieba,
update_jieba_dict,
)
# Load the IPA settings once at import time and convert them to plain
# Python containers so they can be indexed and mutated like ordinary dicts.
ipa_configs = OmegaConf.to_object(OmegaConf.load("configs/ipa.yaml"))

# Remove every preserved key from the v2f mapping *before* the regexes are
# built, so neither the mapping nor the regex derived from it contains them.
for key in ipa_configs["preserved_list"]:
    ipa_configs["v2f_dict"].pop(key, None)

# Regexes derived from the configuration; reused by every get_ipa() call.
delimiter_regex, replace_regex, v2f_regex = prep_regex(
    ipa_configs["delimiter_list"],
    ipa_configs["replace_dict"],
    ipa_configs["v2f_dict"],
)
def get_ipa(raw_text, dialect):
    """Look up words, IPA and pinyin for ``raw_text`` in a dialect lexicon.

    The text is normalized, passed through ``parse_num``, split on the
    configured delimiters and re-joined with ",". It is then segmented with
    ``run_jieba``, rewritten by ``apply_v2f`` and segmented a second time.
    Each resulting word is looked up in the lexicon of ``dialect``.

    Args:
        raw_text: Input text to convert.
        dialect: Key into ``ipa_configs["lexicon"]`` selecting the lexicon.

    Returns:
        A 4-tuple ``(final_words, final_ipa, final_pinyin, missing_words)``.
        ``missing_words`` is always a list of the words not found in the
        lexicon. If no IPA was produced or any word is missing, the first
        three items are returned as lists (missing words appear only in
        ``final_words``, so the lists can differ in length). Otherwise they
        are strings: the items joined with a single space, with every
        " , " collapsed to ",".
    """
    lexicon = ipa_configs["lexicon"][dialect]

    # Hand the lexicon's words and the path of jieba's bundled dict.txt to
    # the project helper before segmenting.
    jieba_dict_path = Path(os.path.dirname(jieba.__file__)) / "dict.txt"
    update_jieba_dict(list(lexicon.keys()), jieba_dict_path)

    normalized = normalize_text(raw_text, ipa_configs["replace_dict"], replace_regex)
    normalized = parse_num(normalized)

    # Split on the delimiters, drop blank pieces, and use "," as the single
    # separator from here on.
    segments = []
    for piece in re.split(delimiter_regex, normalized):
        piece = piece.strip()
        if piece:
            segments.append(piece)

    words = run_jieba(",".join(segments))
    words = apply_v2f(words, ipa_configs["v2f_dict"], v2f_regex)
    # Segment again because apply_v2f may have changed the text.
    words = run_jieba("".join(words))

    final_words, final_pinyin, final_ipa, missing_words = [], [], [], []
    for word in words:
        if not word.strip():
            continue

        if word == ",":
            # The separator is carried through all three outputs unchanged.
            final_words.append(",")
            final_pinyin.append(",")
            final_ipa.append(",")
            continue

        if word not in lexicon:
            # Unknown words are reported but get no pinyin/IPA entry.
            final_words.append(word)
            missing_words.append(word)
            continue

        entry = lexicon[word]
        final_words.append(word)
        final_pinyin.append(entry["pinyin"][0])
        # NOTE: only the first ipa entry of lexicon[word] is considered.
        final_ipa.append(entry["ipa"][0].replace(" ", "-"))

    # Nothing usable, or something missing: return the raw lists.
    if not final_ipa or missing_words:
        return final_words, final_ipa, final_pinyin, missing_words

    def _join(items):
        # Space-separate the items but keep commas tight against neighbours.
        return " ".join(items).replace(" , ", ",")

    return _join(final_words), _join(final_ipa), _join(final_pinyin), missing_words
def parse_ipa(ipa: str):
    """Flatten an IPA string into a list of phoneme, tone and separator tokens.

    The string is cut at every boundary between a run of separator
    characters (",", " ", "-") and a run of any other characters. Each
    resulting piece is then handled as follows:

    * ``" "`` is kept as one token.
    * ``","`` becomes the three tokens ``" "``, ``","``, ``" "``.
    * ``"-"`` is dropped.
    * Anything else is split on ``"_"``. The first part is emitted one
      character per token; if there are exactly two parts, the second
      (the tone) is appended as a single token. With any other number of
      parts only the first part is emitted.

    Args:
        ipa: IPA string to tokenize.

    Returns:
        The list of string tokens, in input order.
    """
    tokens = []
    pieces = re.split(r"(?<![, -])(?=[, -])|(?<=[, -])(?![, -])", ipa)

    for piece in pieces:
        if piece == "-":
            # Dropped: only " " is used as a split marker in the output
            # (original note: use " " to split words, or to split characters).
            continue
        if piece == " ":
            tokens.append(piece)
            continue
        if piece == ",":
            # extend() on a string adds its characters: " ", ",", " ".
            tokens.extend(" , ")
            continue

        parts = piece.split("_")
        # Phoneme characters become individual tokens ...
        tokens.extend(parts[0])
        # ... and the tone, when present, is one separate token.
        if len(parts) == 2:
            tokens.append(parts[1])

    return tokens