# Generate_Audio / test_dataset.py
# Provenance: uploaded by namkuner ("Upload 10 files", commit d29da97, verified; 3.26 kB)
import pandas as pd
import os
from pydub import AudioSegment
from nltk.tokenize import TweetTokenizer
# TweetTokenizer keeps contractions/emoticons intact; used as a drop-in word tokenizer.
word_tokenize = TweetTokenizer().tokenize
from gemini_normalize import call_api
import phonemizer
# Vietnamese eSpeak phonemizer shared by word_process(); built once at import time
# because backend construction is expensive.
global_phonemizer = phonemizer.backend.EspeakBackend(language='vi', preserve_punctuation=True, with_stress=True,language_switch='remove-flags',words_mismatch='ignore')
def has_numbers(inputString):
    """Return True if *inputString* contains at least one decimal digit."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False
def generate(root_path, parquet_file):
    """Build a small test split: collect clips longer than 2 s until roughly
    6 minutes of audio is accumulated, then write them to a new parquet.

    Parameters
    ----------
    root_path : str
        Directory containing the source parquet; the output is written there too.
    parquet_file : str
        Name of the source parquet with 'audio.path' and 'text' columns.

    Side effect: writes "<root_path>/Xanh24h_1h_test.parquet".
    """
    total_duration = 0  # accumulated audio length, in milliseconds
    df = pd.read_parquet(os.path.join(root_path, parquet_file), engine='fastparquet')
    # Collect plain dicts and build the DataFrame once at the end:
    # DataFrame.append was deprecated in pandas 1.4 and REMOVED in 2.0,
    # and appending row-by-row is quadratic anyway.
    rows = []
    for index, row in df.iterrows():
        # Skip the first 3000 rows (presumably reserved for another split — TODO confirm).
        if index < 3000:
            continue
        file = row['audio.path']
        text = row['text']
        audio = AudioSegment.from_wav(file)
        # pydub reports length in milliseconds; keep only clips longer than 2 s.
        if len(audio) / 1000.0 > 2:
            total_duration += len(audio)
            rows.append({'audio.path': file, 'text': text, 'speaker_id': 0})
        # Stop once ~6 minutes (1000 ms * 60 s * 6) have been collected.
        if total_duration / (1000 * 60 * 6) >= 1:
            new_df = pd.DataFrame(rows, columns=['audio.path', 'text', 'speaker_id'])
            print(new_df.head())
            new_df.to_parquet(os.path.join(root_path, "Xanh24h_1h_test.parquet"),
                              engine='fastparquet')
            break
def normalize_text(parquet_file):
    """Populate a 'normal_text' column via the Gemini normalization API.

    Texts are sent in batches of 50; afterwards, any row where the model
    altered a digit-free sentence is reverted to the original text.
    The parquet file is rewritten in place.
    """
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    pending = []
    print(df.shape[0] - 1)
    batch_no = 0
    last_index = df.shape[0] - 1
    for index, _row in df.iterrows():
        pending.append(df.loc[index, 'text'])
        # Flush on a full batch of 50, or when the final row has been queued.
        if len(pending) == 50 or index == last_index:
            normalized = call_api(str(pending))
            # Results land at positions 50*batch_no + offset, mirroring the
            # order the texts were queued in.
            for offset, item in enumerate(normalized):
                df.loc[50 * batch_no + offset, 'normal_text'] = item
            batch_no += 1
            pending = []
    # Second pass: sanity-check the model output.
    for index, _row in df.iterrows():
        normal = df.loc[index, 'normal_text']
        raw = df.loc[index, 'text']
        if has_numbers(normal):
            print("has number", normal)
        elif normal != raw and not has_numbers(raw):
            # Model changed a sentence that had no digits to expand — keep the raw text.
            print(normal)
            print(raw)
            df.loc[index, 'normal_text'] = raw
    df.to_parquet(parquet_file, engine='fastparquet')
def word_process(parquet_path):
    """Tokenize each row's 'normal_text' and phonemize it with eSpeak,
    storing the result in a 'phonemes' column (file rewritten in place)."""
    df = pd.read_parquet(parquet_path, engine='fastparquet')
    for index, _row in df.iterrows():
        tokens = word_tokenize(df.loc[index, 'normal_text'])
        spaced = ' '.join(tokens)
        # phonemize() takes a list of utterances; we pass one and take its result.
        phoneme_list = global_phonemizer.phonemize([spaced])
        df.loc[index, 'phonemes'] = phoneme_list[0]
    df.to_parquet(parquet_path, engine='fastparquet')
def copy_audio(parquet_file):
    """Copy every listed audio file from the 'dataset' tree to the matching
    path under 'data', preserving file metadata (copy2)."""
    import shutil
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    for _, record in df.iterrows():
        source = record['audio.path']
        destination = source.replace("dataset", "data")
        shutil.copy2(source, destination)
if __name__ == "__main__":
    # Pipeline stages are run one at a time by uncommenting the relevant call
    # (each stage reads/writes parquet files under ./dataset).
    # generate("dataset","Xanh24h.parquet")
    # normalize_text("dataset/Xanh24h_1h_test.parquet")
    # read("dataset/Xanh24h_1h.parquet")
    # word_process("dataset/Xanh24h_1h_test.parquet")
    # copy_audio("dataset/Xanh24h_1h_test.parquet")
    pass