# Generate_Audio / main.py
# Uploaded via huggingface_hub by namkuner (commit 05819d1, verified)
from pydub import AudioSegment
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer
word_tokenize = TweetTokenizer().tokenize
from text_nomalize import normalize_single
from gemini_normalize import call_api
def replace_t(word):
    """Return *word* with every dental-t marker "t̪" rewritten as the ASCII token "t0"."""
    dental_t = "t̪"  # 't' followed by U+032A COMBINING BRIDGE BELOW (IPA dental diacritic)
    return word.replace(dental_t, 't0')
def replace_special_char(word):
    """Delete phonemizer artifacts (combining diacritics, '/', '^') from *word*.

    Uses a single C-level ``str.translate`` pass instead of one chained
    ``str.replace`` call per character.
    """
    # Characters to drop: combining bridge-below, combining tilde, '/', '^'.
    table = str.maketrans('', '', "̪̃/^")
    return word.translate(table)
def clean_word(input_string):
    """Return *input_string* with every bracket character replaced by a double quote."""
    brackets_to_quote = str.maketrans({c: '"' for c in "{<[)}>(]"})
    return input_string.translate(brackets_to_quote)
import phonemizer

# Module-level espeak backend for Vietnamese grapheme-to-phoneme conversion.
# Built once at import time because backend construction is expensive; reused
# by word_process(). Requires the espeak-ng binary to be installed.
global_phonemizer = phonemizer.backend.EspeakBackend(language='vi', preserve_punctuation=True, with_stress=True,language_switch='remove-flags',words_mismatch='ignore')
def has_numbers(inputString):
    """Return True if *inputString* contains at least one decimal digit."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False
def word_process(parquet_path):
    """Phonemize the 'normal_text' column of a parquet file, in place.

    For each row: tokenize the normalized text, normalize brackets to quotes,
    run the global espeak backend, post-process the phoneme string
    (``replace_t`` then ``replace_special_char``), and store the result in a
    'phonemes' column. The dataframe is written back to *parquet_path*.
    """
    df = pd.read_parquet(parquet_path, engine='fastparquet')
    for index in df.index:
        tokens = word_tokenize(df.loc[index, 'normal_text'])
        text = ' '.join(clean_word(tok) for tok in tokens)
        # Phonemize one row at a time so a per-row failure is easy to locate.
        ps = global_phonemizer.phonemize([text])
        # Removed leftover debug print(ps) calls; fold both cleanup passes
        # into a single comprehension.
        ps = [replace_special_char(replace_t(p)) for p in ps]
        df.loc[index, 'phonemes'] = ps[0]
    df.to_parquet(parquet_path, engine='fastparquet')
import os
def generate(root_path, parquet_file):
    """Build a ~30-minute padded-audio subset from *parquet_file*.

    Reads rows with an 'audio.path' and 'text' column; keeps clips whose raw
    duration is strictly between 6.2s and 8.2s, pads each with 400ms of
    silence on both sides, exports it under audio_datasets/chunk_8/202/, and
    stops once the accumulated (padded) duration reaches 30 minutes. The
    collected (path, text, speaker) rows are then saved as a new parquet file
    next to the source. If the 30-minute target is never reached, no parquet
    is written (matching the original behavior).
    """
    target_ms = 1000 * 30 * 60  # 30 minutes, in milliseconds
    silent = AudioSegment.silent(duration=400)
    df = pd.read_parquet(root_path + "/" + parquet_file, engine='fastparquet')
    out_dir = "audio_datasets/chunk_8/202/"
    # exist_ok avoids the check-then-create race of exists()+makedirs().
    os.makedirs(out_dir, exist_ok=True)

    rows = []  # accumulate plain dicts: DataFrame.append was removed in pandas 2.0
    total_duration = 0
    for index in df.index:
        file = df.loc[index, 'audio.path']
        text = df.loc[index, 'text']
        audio = AudioSegment.from_wav(file)
        if 6.2 < len(audio) / 1000.0 < 8.2:
            audio = silent + audio + silent
            total_duration += len(audio)  # padded length counts toward the target
            path = out_dir + str(index) + ".wav"
            audio.export(path, format="wav")
            rows.append({'path': path, 'text': text, 'speaker': 202})
            if total_duration >= target_ms:
                new_df = pd.DataFrame(rows, columns=['path', 'text', 'speaker'])
                print(new_df.head())
                new_df.to_parquet(root_path + "/" + "betterversionvn_30p_chunk_8.parquet", engine='fastparquet')
                break
def normalize_text(parquet_file):
    """Fill the 'normal_text' column of *parquet_file* by batch-calling the Gemini API.

    Texts are sent to call_api() in batches of 50 (plus a final partial
    batch); results are written back by positional offset, the dataframe is
    sanity-checked, and the file is overwritten in place.
    """
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    req = []  # current batch of raw texts awaiting normalization
    print(df.shape[0]-1)
    dem = 0  # number of completed batches; used to compute the target row offset
    for index, row in df.iterrows():
        text = df.loc[index, 'text']
        req.append(text)
        # print(index)
        # Flush every 50 texts, or at the last row (partial final batch).
        if len(req)==50 or index == df.shape[0]-1:
            res = call_api(str(req))
            # NOTE(review): writing to row 50*dem+idx assumes (a) call_api
            # returns exactly one result per request item, and (b) the
            # dataframe index is a 0-based RangeIndex — verify both, since a
            # mismatch silently misaligns normalized text with its row.
            for idx,r in enumerate(res):
                df.loc[50*dem+idx, 'normal_text'] = r
            dem+=1
            req = []
    # Report suspicious rows: digits survived normalization, or the text was
    # changed even though the original contained no digits.
    for index, row in df.iterrows():
        if has_numbers(df.loc[index, 'normal_text']):
            print("has number",df.loc[index, 'normal_text'])
        elif df.loc[index, 'normal_text'] != df.loc[index, 'text'] and not has_numbers(df.loc[index, 'text']):
            print(df.loc[index, 'normal_text'])
            print(df.loc[index, 'text'])
    df.to_parquet(parquet_file, engine='fastparquet')
def read(parquet_file):
    """Inspect normalization results in *parquet_file* and print suspicious rows.

    Rows whose 'normal_text' still contains digits are reported; rows where
    normalization changed a digit-free original are printed and reverted to
    the original text in memory. The save-back call is left commented out.
    """
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    for idx in df.index:
        normal = df.loc[idx, 'normal_text']
        original = df.loc[idx, 'text']
        if has_numbers(normal):
            print("has number", normal)
        elif normal != original and not has_numbers(original):
            print(normal)
            print(original)
            print(df.loc[idx, 'audio.path'])
            df.loc[idx, 'normal_text'] = original
    # df.to_parquet(parquet_file, engine='fastparquet')
def copy_audio(parquet_file):
    """Copy every file in the 'audio.path' column to a mirrored path.

    The destination path is the source path with "dataset" replaced by
    "data"; copy2 preserves file metadata.
    """
    import shutil
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    for src in df['audio.path']:
        shutil.copy2(src, src.replace("dataset", "data"))
def export(parquet_file, output_file):
    """Write a pipe-delimited training list ("audio_path|phonemes|0") to *output_file*.

    One line per dataframe row; the trailing 0 is a fixed speaker id.
    """
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    with open(output_file, 'w', encoding="utf-8") as f:
        for idx in df.index:
            f.write(f"{df.loc[idx, 'audio.path']}|{df.loc[idx, 'phonemes']}|0\n")
if __name__ == "__main__":
    # Example invocations of the pipeline stages, kept for reference.
    # Intended order appears to be: generate -> normalize_text -> read ->
    # word_process -> export; run one stage at a time by uncommenting it.
    # generate("dataset","betterversionvn.parquet")
    # normalize_text("dataset/Xanh24h_1h.parquet")
    # read("dataset/Xanh24h_1h.parquet")
    # word_process("dataset/Xanh24h_1h_test.parquet")
    # # copy_audio("dataset/Xanh24h_1h.parquet")
    # export("dataset/Xanh24h_1h_test.parquet","val_list.txt")
    pass