|
from pydub import AudioSegment
|
|
import pandas as pd
|
|
import nltk
|
|
from nltk.tokenize import TweetTokenizer
|
|
word_tokenize = TweetTokenizer().tokenize
|
|
from text_nomalize import normalize_single
|
|
|
|
from gemini_normalize import call_api
|
|
def replace_t(word):
    """Rewrite the dental-t phoneme symbol ("t" + combining U+032A) as 't0'.

    eSpeak emits "t\u032a" for the Vietnamese dental t; downstream tooling
    expects the ASCII-safe tag 't0' instead.
    """
    dental_t = "t\u032a"
    return "t0".join(word.split(dental_t))
|
|
|
|
def replace_special_char(word):
    """Drop stray phonemization artifacts from *word*.

    Removes the combining dental mark (U+032A), combining tilde (U+0303),
    and the '/' and '^' delimiters entirely.
    """
    ignored = "\u032a\u0303/^"
    return ''.join(ch for ch in word if ch not in ignored)
|
|
def clean_word(input_string):
    """Replace every bracket-like character in *input_string* with '"'.

    All of {}, <>, [], () are normalized to a double quote so the
    phonemizer treats them uniformly as punctuation.
    """
    brackets = "{<[)}>(]"
    return ''.join('"' if ch in brackets else ch for ch in input_string)
|
|
|
|
import phonemizer


# Shared Vietnamese eSpeak backend used by word_process(): keeps punctuation
# and stress marks, strips language-switch flags, and tolerates word-count
# mismatches between input and phonemized output.
global_phonemizer = phonemizer.backend.EspeakBackend(language='vi', preserve_punctuation=True, with_stress=True,language_switch='remove-flags',words_mismatch='ignore')
|
|
def has_numbers(inputString):
    """Return True if *inputString* contains at least one digit character."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False
|
|
|
|
def word_process(parquet_path):
    """Phonemize the 'normal_text' column of a parquet file in place.

    For each row: tokenize the normalized text, map bracket characters to
    double quotes, re-join, phonemize with the shared Vietnamese eSpeak
    backend, then clean up the dental-t marker and stray diacritics.  The
    result is stored in the 'phonemes' column and the file is overwritten.

    Args:
        parquet_path: Path to a parquet file with a 'normal_text' column;
            it is both read and rewritten (fastparquet engine).
    """
    df = pd.read_parquet(parquet_path, engine='fastparquet')
    for index in df.index:
        text = df.loc[index, 'normal_text']
        # Tokenize, normalize brackets to quotes, and re-join so the
        # phonemizer sees a single space-separated sentence.
        tokens = [clean_word(word) for word in word_tokenize(text)]
        sentence = ' '.join(tokens)
        # phonemize() takes/returns a list; we pass one sentence per row.
        ps = global_phonemizer.phonemize([sentence])
        ps = [replace_special_char(replace_t(p)) for p in ps]
        df.loc[index, 'phonemes'] = ps[0]
    df.to_parquet(parquet_path, engine='fastparquet')
|
|
|
|
|
|
|
|
import os
|
|
|
|
def generate(root_path, parquet_file):
    """Select 6.2-8.2 s clips, pad them with silence, and export a ~30 min subset.

    Reads the source parquet, keeps clips whose duration is strictly between
    6.2 and 8.2 seconds, pads each with 400 ms of silence on both sides,
    writes the padded wav to audio_datasets/chunk_8/202/<row_index>.wav, and
    stops once roughly 30 minutes of padded audio has been collected — at
    which point the selected (path, text, speaker) rows are written to
    'betterversionvn_30p_chunk_8.parquet' under *root_path*.  If the duration
    threshold is never reached, no output parquet is written (matches the
    original behavior).

    Args:
        root_path: Directory containing *parquet_file* and receiving the output.
        parquet_file: Parquet with 'audio.path' and 'text' columns.
    """
    total_duration = 0
    out_dir = "audio_datasets/chunk_8/202/"
    silent = AudioSegment.silent(duration=400)  # 400 ms pad per side
    df = pd.read_parquet(root_path + "/" + parquet_file, engine='fastparquet')
    # Accumulate plain dicts instead of DataFrame.append(), which was
    # deprecated in pandas 1.4 and removed in pandas 2.0.
    rows = []
    # Create the output directory once, up front (exist_ok avoids the
    # per-iteration exists() check).
    os.makedirs(out_dir, exist_ok=True)
    for index in df.index:
        file = df.loc[index, 'audio.path']
        text = df.loc[index, 'text']
        audio = AudioSegment.from_wav(file)
        duration_s = len(audio) / 1000.0
        if 6.2 < duration_s < 8.2:
            audio = silent + audio + silent
            total_duration += len(audio)
            path = out_dir + str(index) + ".wav"
            audio.export(path, format="wav")
            rows.append({'path': path, 'text': text, 'speaker': 202})
            # Stop once ~30 minutes (in ms) of padded audio is collected.
            if total_duration / (1000 * 30 * 60) >= 1:
                new_df = pd.DataFrame(rows, columns=['path', 'text', 'speaker'])
                print(new_df.head())
                new_df = new_df.reset_index(drop=True)
                new_df.to_parquet(root_path + "/" + "betterversionvn_30p_chunk_8.parquet", engine='fastparquet')
                break
|
|
|
|
def normalize_text(parquet_file):
    """Batch-normalize the 'text' column via the Gemini API into 'normal_text'.

    Sends texts to call_api() in batches of 50, writes the responses back
    positionally, prints rows that still look suspicious, then overwrites
    the parquet file.

    NOTE(review): the write-back at df.loc[50*dem+idx, ...] assumes the
    DataFrame has a default RangeIndex 0..n-1 — TODO confirm for all inputs.
    NOTE(review): presumably call_api(str(req)) returns an iterable of
    normalized strings aligned one-to-one with the batch; verify against
    gemini_normalize.call_api.
    """
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    req = []  # current batch of raw texts awaiting normalization
    print(df.shape[0]-1)
    dem = 0  # number of batches already written back
    for index, row in df.iterrows():
        text = df.loc[index, 'text']
        req.append(text)
        # Flush on a full batch of 50, or on the final row (partial batch).
        if len(req)==50 or index == df.shape[0]-1:
            res = call_api(str(req))
            # Write responses back by position: batch dem starts at row 50*dem.
            for idx,r in enumerate(res):
                df.loc[50*dem+idx, 'normal_text'] = r
            dem+=1
            req = []
    # Verification pass: flag rows where digits survived normalization, or
    # where the text changed even though the source had no digits.
    for index, row in df.iterrows():
        if has_numbers(df.loc[index, 'normal_text']):
            print("has number",df.loc[index, 'normal_text'])
        elif df.loc[index, 'normal_text'] != df.loc[index, 'text'] and not has_numbers(df.loc[index, 'text']):
            print(df.loc[index, 'normal_text'])
            print(df.loc[index, 'text'])
    df.to_parquet(parquet_file, engine='fastparquet')
|
|
|
|
|
|
def read(parquet_file):
    """Inspect normalization results and revert suspicious rows in memory.

    Prints rows whose 'normal_text' still contains digits; for rows where
    normalization changed a digit-free text, prints both versions plus the
    audio path and resets 'normal_text' to the raw 'text'.

    NOTE(review): the DataFrame is modified in memory only — the parquet
    file is never rewritten here; confirm that is intentional.
    """
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    for idx in df.index:
        normal = df.loc[idx, 'normal_text']
        raw = df.loc[idx, 'text']
        if has_numbers(normal):
            print("has number", normal)
        elif normal != raw and not has_numbers(raw):
            print(normal)
            print(raw)
            print(df.loc[idx, 'audio.path'])
            df.loc[idx, 'normal_text'] = raw
|
|
|
|
|
|
def copy_audio(parquet_file):
    """Copy every audio file listed in the parquet to a mirror location.

    The destination path is the source path with 'dataset' replaced by
    'data'; copy2 preserves file metadata.
    """
    import shutil

    df = pd.read_parquet(parquet_file, engine='fastparquet')
    for src in df['audio.path']:
        shutil.copy2(src, src.replace("dataset", "data"))
|
|
|
|
def export(parquet_file, output_file):
    """Dump the dataset as 'audio.path|phonemes|0' lines to *output_file*.

    One line per parquet row; the trailing '0' is a fixed speaker id.
    """
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    lines = [
        f"{df.loc[idx, 'audio.path']}|{df.loc[idx, 'phonemes']}|0"
        for idx in df.index
    ]
    with open(output_file, 'w', encoding="utf-8") as f:
        for line in lines:
            f.write("%s\n" % line)
|
|
|
|
|
|
if __name__ == "__main__":
    # Entry point intentionally left empty: the pipeline steps (generate,
    # normalize_text, word_process, export, ...) are invoked manually.
    pass