File size: 2,969 Bytes
5e8e534
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
import re
from pathlib import Path

import jieba
from omegaconf import OmegaConf

from ipa.convert_digits import parse_num
from ipa.proc_text import (
    apply_v2f,
    normalize_text,
    prep_regex,
    run_jieba,
    update_jieba_dict,
)

# Load the IPA configuration (lexicons, replacement tables, delimiter lists)
# once at import time, converted to a plain Python dict so it can be mutated.
ipa_configs = OmegaConf.to_object(OmegaConf.load("configs/ipa.yaml"))
# Entries on the preserved list must keep their written form, so remove them
# from the v2f substitution table (presumably variant-to-formal mapping —
# TODO confirm against ipa.proc_text.apply_v2f).
for key in ipa_configs["preserved_list"]:
    ipa_configs["v2f_dict"].pop(key, None)
# Precompile the delimiter / replacement / v2f regexes used by get_ipa().
delimiter_regex, replace_regex, v2f_regex = prep_regex(
    ipa_configs["delimiter_list"], ipa_configs["replace_dict"], ipa_configs["v2f_dict"]
)

def get_ipa(raw_text, dialect):
    """Transcribe `raw_text` into word, IPA, and pinyin sequences.

    Args:
        raw_text: input text to transcribe.
        dialect: key into ``ipa_configs["lexicon"]`` selecting the lexicon.

    Returns:
        Tuple ``(final_words, final_ipa, final_pinyin, missing_words)``.
        On full success the first three are space-joined strings (with
        ``" , "`` collapsed to ``","``); when nothing was transcribed or
        any word is missing from the lexicon, the unjoined lists are
        returned instead so callers can inspect ``missing_words``.
    """
    lexicon = ipa_configs["lexicon"][dialect]
    # Refresh jieba's bundled dictionary so segmentation prefers lexicon entries.
    update_jieba_dict(
        list(lexicon.keys()), Path(os.path.dirname(jieba.__file__)) / "dict.txt"
    )
    text = normalize_text(raw_text, ipa_configs["replace_dict"], replace_regex)
    text = parse_num(text)
    # Collapse every delimiter run into a single "," between non-empty parts.
    text_parts = [s.strip() for s in re.split(delimiter_regex, text) if s.strip()]
    text = ",".join(text_parts)
    word_list = run_jieba(text)
    word_list = apply_v2f(word_list, ipa_configs["v2f_dict"], v2f_regex)
    # Re-segment after the v2f substitutions may have changed word boundaries.
    word_list = run_jieba("".join(word_list))

    final_words = []
    final_pinyin = []
    final_ipa = []
    missing_words = []
    for word in word_list:
        if not word.strip():
            continue
        if word == ",":
            final_words.append(",")
            final_pinyin.append(",")
            final_ipa.append(",")
        elif word not in lexicon:
            # Keep the word in the output but record it as untranscribable.
            final_words.append(word)
            missing_words.append(word)
        else:
            final_words.append(word)
            final_pinyin.append(lexicon[word]["pinyin"][0])
            # NOTE: only the first IPA candidate in lexicon[word] is used.
            final_ipa.append(lexicon[word]["ipa"][0].replace(" ", "-"))

    # Incomplete or empty transcription: return the raw lists unjoined.
    if not final_ipa or missing_words:
        return final_words, final_ipa, final_pinyin, missing_words

    final_words = " ".join(final_words).replace(" , ", ",")
    final_ipa = " ".join(final_ipa).replace(" , ", ",")
    final_pinyin = " ".join(final_pinyin).replace(" , ", ",")

    return final_words, final_ipa, final_pinyin, missing_words

def parse_ipa(ipa: str):
    """Tokenize an IPA string into single characters, with each tone kept
    as its own whole token.

    The string is first split into alternating runs of separator characters
    (`,`, space, `-`) and phoneme text; separators are then mapped to their
    token forms while phoneme runs are exploded character by character.
    """
    tokens = []
    segments = re.split(r"(?<![, -])(?=[, -])|(?<=[, -])(?![, -])", ipa)
    for segment in segments:
        if segment == " ":
            tokens.append(segment)
        elif segment == ",":
            tokens.extend(" , ")
        elif segment == "-":
            # "-" joins 字 within a 詞 (" " separates 詞); it emits nothing.
            pass
        else:
            parts = segment.split("_")
            if len(parts) == 2:
                phoneme, tone = parts
                tokens.extend(phoneme)  # one token per phoneme character
                tokens.append(tone)     # tone stays a single token
            else:
                # No (or malformed) tone marker: keep only the leading part.
                tokens.extend(parts[0])
    return tokens