import os
import shutil

import pandas as pd
import phonemizer
from nltk.tokenize import TweetTokenizer
from pydub import AudioSegment

from text_nomalize import normalize_single  # project-local module (unused here)
from gemini_normalize import call_api

word_tokenize = TweetTokenizer().tokenize

# Vietnamese espeak backend shared by word_process(); requires espeak-ng to be installed.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='vi',
    preserve_punctuation=True,
    with_stress=True,
    language_switch='remove-flags',
    words_mismatch='ignore',
)


def replace_t(word):
    """Rewrite the dental 't̪' (t + combining U+032A) emitted by espeak as 't0'."""
    return word.replace("t̪", 't0')


def replace_special_char(word):
    """Strip leftover combining marks (U+032A, U+0303) and '/', '^' from a phoneme string."""
    special_chars_to_ignore = "̪̃/^"
    for char in special_chars_to_ignore:
        word = word.replace(char, '')
    return word


def clean_word(input_string):
    """Replace bracket characters with a plain double quote before phonemization."""
    special_chars = "{<[)}>(]"
    for char in special_chars:
        input_string = input_string.replace(char, '"')
    return input_string


def has_numbers(input_string):
    return any(char.isdigit() for char in input_string)


def word_process(parquet_path):
    """Phonemize the 'normal_text' column and store the result in a 'phonemes' column."""
    df = pd.read_parquet(parquet_path, engine='fastparquet')
    for index, row in df.iterrows():
        text = df.loc[index, 'normal_text']
        text = ' '.join(clean_word(word) for word in word_tokenize(text))
        ps = global_phonemizer.phonemize([text])
        print(ps)
        ps = [replace_t(p) for p in ps]
        print(ps)
        ps = [replace_special_char(p) for p in ps]
        df.loc[index, 'phonemes'] = ps[0]
    df.to_parquet(parquet_path, engine='fastparquet')


def generate(root_path, parquet_file):
    """Pad 6.2-8.2 s clips with 400 ms of silence on each side and collect ~30 minutes of audio."""
    total_duration = 0  # milliseconds
    silent = AudioSegment.silent(duration=400)
    df = pd.read_parquet(root_path + "/" + parquet_file, engine='fastparquet')
    new_df = pd.DataFrame(columns=['path', 'text', 'speaker'])
    for index, row in df.iterrows():
        file = df.loc[index, 'audio.path']
        text = df.loc[index, 'text']
        audio = AudioSegment.from_wav(file)
        if 6.2 < len(audio) / 1000.0 < 8.2:
            audio = silent + audio + silent
            total_duration += len(audio)
            os.makedirs("audio_datasets/chunk_8/202", exist_ok=True)
            path = "audio_datasets/chunk_8/202/" + str(index) + ".wav"
            audio.export(path, format="wav")
            # DataFrame.append() was removed in pandas 2.0; pd.concat is the replacement.
            new_df = pd.concat(
                [new_df, pd.DataFrame([{'path': path, 'text': text, 'speaker': 202}])],
                ignore_index=True,
            )
            if total_duration / (1000 * 30 * 60) >= 1:  # stop once 30 minutes are collected
                print(new_df.head())
                new_df = new_df.reset_index(drop=True)
                new_df.to_parquet(root_path + "/betterversionvn_30p_chunk_8.parquet", engine='fastparquet')
                break


def normalize_text(parquet_file):
    """Normalize 'text' in batches of 50 via the Gemini API and store it in 'normal_text'."""
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    req = []
    print(df.shape[0] - 1)
    dem = 0  # number of completed batches; assumes a 0-based RangeIndex
    for index, row in df.iterrows():
        req.append(df.loc[index, 'text'])
        # print(index)
        if len(req) == 50 or index == df.shape[0] - 1:
            res = call_api(str(req))
            for idx, r in enumerate(res):
                df.loc[50 * dem + idx, 'normal_text'] = r
            dem += 1
            req = []
    # Flag rows whose normalization still contains digits or changed the original text.
    for index, row in df.iterrows():
        if has_numbers(df.loc[index, 'normal_text']):
            print("has number", df.loc[index, 'normal_text'])
        elif df.loc[index, 'normal_text'] != df.loc[index, 'text'] and not has_numbers(df.loc[index, 'text']):
            print(df.loc[index, 'normal_text'])
            print(df.loc[index, 'text'])
    df.to_parquet(parquet_file, engine='fastparquet')


def read(parquet_file):
    """Inspect normalization results and revert unexpected changes back to the original text."""
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    for index, row in df.iterrows():
        if has_numbers(df.loc[index, 'normal_text']):
            print("has number", df.loc[index, 'normal_text'])
        elif df.loc[index, 'normal_text'] != df.loc[index, 'text'] and not has_numbers(df.loc[index, 'text']):
            print(df.loc[index, 'normal_text'])
            print(df.loc[index, 'text'])
            print(df.loc[index, 'audio.path'])
            df.loc[index, 'normal_text'] = df.loc[index, 'text']
    # df.to_parquet(parquet_file, engine='fastparquet')


def copy_audio(parquet_file):
    """Copy each referenced audio file from the 'dataset' tree into a parallel 'data' tree."""
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    for index, row in df.iterrows():
        file = df.loc[index, 'audio.path']
        shutil.copy2(file, file.replace("dataset", "data"))


def export(parquet_file, output_file):
    """Write one 'audio.path|phonemes|0' line per row to a plain-text file list."""
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    data = []
    for index, row in df.iterrows():
        data.append(f"{df.loc[index, 'audio.path']}|{df.loc[index, 'phonemes']}|0")
    with open(output_file, 'w', encoding="utf-8") as f:
        for item in data:
            f.write("%s\n" % item)


if __name__ == "__main__":
    # generate("dataset", "betterversionvn.parquet")
    # normalize_text("dataset/Xanh24h_1h.parquet")
    # read("dataset/Xanh24h_1h.parquet")
    # word_process("dataset/Xanh24h_1h_test.parquet")
    # copy_audio("dataset/Xanh24h_1h.parquet")
    # export("dataset/Xanh24h_1h_test.parquet", "val_list.txt")
    pass