|
import string
|
|
from nltk.tokenize import TweetTokenizer
|
|
# Bind the tokenizer once at import time. TweetTokenizer is used (rather than
# nltk.word_tokenize) because it keeps contraction fragments like "n't"/"'ve"
# as separate tokens, matching the keys in `special_mappings` below.
word_tokenize = TweetTokenizer().tokenize
|
|
# Ad-hoc overrides mapping orthographic tokens (mostly contraction fragments
# emitted by TweetTokenizer, e.g. "n't", "'ve") to IPA phoneme strings.
# NOTE(review): not referenced anywhere in this chunk — presumably consumed by
# a caller elsewhere in the pipeline to patch up phonemizer output for these
# tokens; confirm before removing or reordering.
special_mappings = {
    "a": "ɐ",
    "'t": 't',
    "'ve": "v",
    "'m": "m",
    "'re": "ɹ",
    # Identity mappings ("d" -> "d") look redundant — presumably they guard
    # against the phonemizer expanding these fragments; TODO confirm.
    "d": "d",
    'll': "l",
    "n't": "nt",
    "'ll": "l",
    "'d": "d",
    "'": "ʔ",
    "wasn": "wˈɒzən",
    "hasn": "hˈæzn",
    "doesn": "dˈʌzən",
}
|
|
|
|
|
|
def phonemize(text, global_phonemizer, tokenizer):
    """Tokenize *text* and phonemize it word-by-word, aligned with token ids.

    Args:
        text: Raw input string.
        global_phonemizer: Object exposing ``phonemize(words, strip=True)``
            that returns one phoneme string per input word (e.g. a
            ``phonemizer`` backend).
        tokenizer: Mapping-like object supporting ``tokenizer[word]`` lookup
            from a word to its integer id. Words the tokenizer cannot map
            are silently skipped (along with their phonemes).

    Returns:
        dict with two parallel lists: ``'input_ids'`` (token ids) and
        ``'phonemes'`` (phoneme strings), one entry per kept word.
    """
    words = word_tokenize(text)

    # Punctuation passes through unchanged; every other token is phonemized
    # in isolation. NOTE: per-word phonemization loses cross-word context
    # (liaison, stress) — inherited behavior, kept deliberately.
    word_phonemes = [
        word if word in string.punctuation
        else global_phonemizer.phonemize([word], strip=True)[0]
        for word in words
    ]

    input_ids = []
    phonemes = []
    for word, phoneme in zip(words, word_phonemes):
        try:
            token = tokenizer[word]
        except KeyError:
            # Was a bare `except:` that swallowed everything (including
            # typos like a misspelled variable). Only an out-of-vocabulary
            # word should be skipped.
            continue
        input_ids.append(token)
        phonemes.append(phoneme)

    assert len(input_ids) == len(phonemes)
    return {'input_ids': input_ids, 'phonemes': phonemes}