import os
import shutil

import pandas as pd
import phonemizer
from nltk.tokenize import TweetTokenizer
from pydub import AudioSegment

from gemini_normalize import call_api

word_tokenize = TweetTokenizer().tokenize

# Vietnamese espeak backend for grapheme-to-phoneme conversion.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='vi',
    preserve_punctuation=True,
    with_stress=True,
    language_switch='remove-flags',
    words_mismatch='ignore',
)


def has_numbers(input_string):
    return any(char.isdigit() for char in input_string)


def generate(root_path, parquet_file):
    """Select clips longer than 2 s until six minutes of audio are collected,
    then write them to a test parquet file."""
    total_duration = 0  # milliseconds
    df = pd.read_parquet(os.path.join(root_path, parquet_file), engine='fastparquet')
    rows = []
    for index, row in df.iterrows():
        # Skip the first 3000 rows.
        if index < 3000:
            continue
        file = row['audio.path']
        text = row['text']
        audio = AudioSegment.from_wav(file)
        # Keep only clips longer than 2 seconds; len(audio) is in milliseconds.
        if len(audio) / 1000.0 > 2:
            total_duration += len(audio)
            rows.append({'audio.path': file, 'text': text, 'speaker_id': 0})
        # Stop once six minutes (1000 ms * 60 s * 6) of audio are collected.
        # (DataFrame.append was removed in pandas 2.0, so rows are collected
        # in a list and converted once at write time.)
        if total_duration / (1000 * 60 * 6) >= 1:
            new_df = pd.DataFrame(rows)
            print(new_df.head())
            new_df.to_parquet(os.path.join(root_path, "Xanh24h_1h_test.parquet"),
                              engine='fastparquet')
            break


def normalize_text(parquet_file):
    """Normalize transcripts in batches of 50 via the Gemini API, then revert
    normalizations that changed a digit-free original text."""
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    req = []
    print(df.shape[0] - 1)
    dem = 0  # number of completed batches
    for index, row in df.iterrows():
        req.append(row['text'])
        # Send a batch once 50 texts are queued or the last row is reached.
        if len(req) == 50 or index == df.shape[0] - 1:
            res = call_api(str(req))
            for idx, r in enumerate(res):
                df.loc[50 * dem + idx, 'normal_text'] = r
            dem += 1
            req = []
    for index, row in df.iterrows():
        if has_numbers(df.loc[index, 'normal_text']):
            print("has number", df.loc[index, 'normal_text'])
        elif (df.loc[index, 'normal_text'] != df.loc[index, 'text']
              and not has_numbers(df.loc[index, 'text'])):
            # The original text had no digits, so normalization should not
            # have changed it; restore the original.
            print(df.loc[index, 'normal_text'])
            print(df.loc[index, 'text'])
            df.loc[index, 'normal_text'] = df.loc[index, 'text']
    df.to_parquet(parquet_file, engine='fastparquet')


def word_process(parquet_path):
    """Tokenize each normalized transcript and phonemize it with espeak."""
    df = pd.read_parquet(parquet_path, engine='fastparquet')
    for index, row in df.iterrows():
        text = ' '.join(word_tokenize(row['normal_text']))
        ps = global_phonemizer.phonemize([text])
        df.loc[index, 'phonemes'] = ps[0]
    df.to_parquet(parquet_path, engine='fastparquet')


def copy_audio(parquet_file):
    """Copy each referenced wav file from the 'dataset' tree to 'data'."""
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    for index, row in df.iterrows():
        file = row['audio.path']
        shutil.copy2(file, file.replace("dataset", "data"))


if __name__ == "__main__":
    # generate("dataset", "Xanh24h.parquet")
    # normalize_text("dataset/Xanh24h_1h_test.parquet")
    # read("dataset/Xanh24h_1h.parquet")
    # word_process("dataset/Xanh24h_1h_test.parquet")
    # copy_audio("dataset/Xanh24h_1h_test.parquet")
    pass