# Generate_Audio / phonemize.py
# namkuner's picture
# Upload phonemize.py
# 5a6dd33 verified
import string
from nltk.tokenize import TweetTokenizer
# Bound ``tokenize`` method of a single TweetTokenizer instance created at
# import time; every call to ``word_tokenize`` reuses that one instance.
word_tokenize = TweetTokenizer().tokenize
# Hand-written token -> phoneme-string overrides.
# NOTE(review): nothing visible in this file reads this table; presumably it
# is meant for the "special cases" step in ``phonemize`` -- confirm with callers.
special_mappings = {
    # Single tokens and clitic-like fragments (some keys carry a leading
    # apostrophe, some do not; both spellings are kept as distinct keys).
    "a": "ɐ",
    "'t": "t",
    "'ve": "v",
    "'m": "m",
    "'re": "ɹ",
    "d": "d",
    "ll": "l",
    "n't": "nt",
    "'ll": "l",
    "'d": "d",
    # A lone apostrophe.
    "'": "ʔ",
    # Word stems ending in "n" (wasn / hasn / doesn).
    "wasn": "wˈɒzən",
    "hasn": "hˈæzn",
    "doesn": "dˈʌzən",
}
def phonemize(text, global_phonemizer, tokenizer):
    """Tokenize *text* and pair each known word's token id with its phonemes.

    Args:
        text: Input string; it is split into words with ``word_tokenize``.
        global_phonemizer: Object whose ``phonemize(list_of_str, strip=True)``
            returns a sequence with one entry per input string. It is called
            once per word with a single-element list.
        tokenizer: Object supporting ``tokenizer[word]`` lookup. Words whose
            lookup raises are skipped (best effort) together with their
            phonemes.

    Returns:
        A dict with two parallel lists of equal length:
        ``'input_ids'`` (the values returned by ``tokenizer[word]``) and
        ``'phonemes'`` (the phoneme string for the same word, or the word
        itself when it is treated as punctuation).
    """
    words = word_tokenize(text)

    # NOTE: ``word in string.punctuation`` is a substring test, so besides
    # single punctuation characters it also matches runs that occur
    # contiguously inside that constant (e.g. "()"). Such tokens are passed
    # through unchanged instead of being sent to the phonemizer.
    phonemes_bad = [
        word
        if word in string.punctuation
        else global_phonemizer.phonemize([word], strip=True)[0]
        for word in words
    ]

    input_ids = []
    phonemes = []
    for word, phoneme in zip(words, phonemes_bad):
        # process special cases (NOT COMPLETE)
        try:
            token = tokenizer[word]
        except Exception:
            # Best effort: drop words the tokenizer cannot map. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate instead of being silently swallowed.
            continue
        # Appended together, so the two lists always stay the same length.
        input_ids.append(token)
        phonemes.append(phoneme)

    return {'input_ids': input_ids, 'phonemes': phonemes}