|
import pandas as pd
|
|
import os
|
|
from pydub import AudioSegment
|
|
|
|
from nltk.tokenize import TweetTokenizer
|
|
# Tokenizer geared toward informal text; bound once at import time so the rest
# of the file can call it like a plain word_tokenize function.
word_tokenize = TweetTokenizer().tokenize
|
|
|
|
from gemini_normalize import call_api
|
|
|
|
import phonemizer
|
|
# Shared espeak backend for Vietnamese ('vi'): keeps punctuation and stress
# marks, strips language-switch flags, and ignores word-count mismatches
# between input and phonemized output.
global_phonemizer = phonemizer.backend.EspeakBackend(language='vi', preserve_punctuation=True, with_stress=True,language_switch='remove-flags',words_mismatch='ignore')
|
|
|
|
def has_numbers(inputString):
    """Return True if *inputString* contains at least one decimal digit."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False
|
|
def generate(root_path, parquet_file):
    """Build a small test subset from a source parquet of audio clips.

    Reads ``root_path/parquet_file`` (columns ``audio.path``, ``text``),
    skips the first 3000 rows, keeps only clips longer than 2 seconds,
    and stops once roughly 6 minutes of audio have been collected. The
    selected rows (with a constant ``speaker_id`` of 0) are written to
    ``root_path/Xanh24h_1h_test.parquet``.

    Args:
        root_path: Directory containing the source parquet file.
        parquet_file: File name of the source parquet inside root_path.

    Note:
        Nothing is written if the 6-minute target is never reached.
    """
    total_duration = 0  # accumulated clip length, in milliseconds
    df = pd.read_parquet(root_path + "/" + parquet_file, engine='fastparquet')
    # Collect rows as plain dicts and build the DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway.
    selected = []
    for index, row in df.iterrows():
        # Skip rows already used elsewhere (assumes a default RangeIndex —
        # TODO confirm against the source parquet).
        if index < 3000:
            continue
        file = df.loc[index, 'audio.path']
        text = df.loc[index, 'text']
        audio = AudioSegment.from_wav(file)
        # len(audio) is the duration in milliseconds; keep clips > 2 s.
        if len(audio) / 1000.0 > 2:
            total_duration += len(audio)
            selected.append({'audio.path': file, 'text': text, 'speaker_id': 0})
        # Stop once at least 6 minutes (1000 ms * 60 s * 6) are collected.
        if total_duration / (1000 * 60 * 6) >= 1:
            new_df = pd.DataFrame(selected, columns=['audio.path', 'text', 'speaker_id'])
            print(new_df.head())
            new_df = new_df.reset_index(drop=True)
            new_df.to_parquet(root_path + "/" + "Xanh24h_1h_test.parquet", engine='fastparquet')
            break
|
|
|
|
|
|
def normalize_text(parquet_file):
    """Normalize the 'text' column of a parquet file via an external API.

    Sends texts to call_api in batches of 50, stores the results in a
    'normal_text' column, then reverts normalizations that look spurious,
    and saves the file back in place.
    """
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    req = []  # pending batch of raw texts awaiting normalization
    print(df.shape[0]-1)
    # Count of batches already flushed; maps batch-local indices back to
    # absolute row positions as 50*dem + idx.
    dem = 0
    for index, row in df.iterrows():
        text = df.loc[index, 'text']
        req.append(text)

        # Flush on a full batch of 50 or on the last row.
        # NOTE(review): the end-of-frame test compares the row label against
        # df.shape[0]-1, which assumes a default RangeIndex — TODO confirm.
        if len(req)==50 or index == df.shape[0]-1:
            # call_api receives the batch as a single stringified list and is
            # expected to return one normalized string per input, in order.
            res = call_api(str(req))

            for idx,r in enumerate(res):
                df.loc[50*dem+idx, 'normal_text'] = r
            dem+=1
            req = []
    # Second pass: sanity-check the API output.
    for index, row in df.iterrows():
        if has_numbers(df.loc[index, 'normal_text']):
            # Digits surviving normalization suggest the API missed this row;
            # log it for manual inspection.
            print("has number",df.loc[index, 'normal_text'])
        elif df.loc[index, 'normal_text'] != df.loc[index, 'text'] and not has_numbers(df.loc[index, 'text']):
            print(df.loc[index, 'normal_text'])
            print(df.loc[index, 'text'])
            # The original had no digits, so normalization should have been a
            # no-op; revert to the raw text.
            df.loc[index, 'normal_text'] = df.loc[index, 'text']
    df.to_parquet(parquet_file, engine='fastparquet')
|
|
|
|
def word_process(parquet_path):
    """Phonemize the 'normal_text' column of a parquet file in place.

    Each row's normalized text is tokenized, re-joined with spaces, run
    through the module-level espeak phonemizer, and the result is stored
    in a 'phonemes' column. The file is overwritten with the new column.
    """
    df = pd.read_parquet(parquet_path, engine='fastparquet')
    for row_idx, _row in df.iterrows():
        tokens = word_tokenize(df.loc[row_idx, 'normal_text'])
        spaced = ' '.join(tokens)
        phoneme_list = global_phonemizer.phonemize([spaced])
        df.loc[row_idx, 'phonemes'] = phoneme_list[0]
    df.to_parquet(parquet_path, engine='fastparquet')
|
|
|
|
def copy_audio(parquet_file):
    """Copy every audio file listed in the parquet's 'audio.path' column.

    Each file is copied (with metadata, via shutil.copy2) to the same
    path with 'dataset' replaced by 'data'.
    """
    import shutil

    df = pd.read_parquet(parquet_file, engine='fastparquet')
    for row_idx, _row in df.iterrows():
        src = df.loc[row_idx, 'audio.path']
        dst = src.replace("dataset", "data")
        shutil.copy2(src, dst)
|
|
if __name__ == "__main__":
    # Entry point intentionally left empty: run generate / normalize_text /
    # word_process / copy_audio manually as needed for each pipeline stage.
    pass