|
from pydub import AudioSegment
|
|
import pandas as pd
|
|
import nltk
|
|
from nltk.tokenize import TweetTokenizer
|
|
word_tokenize = TweetTokenizer().tokenize
|
|
from text_nomalize import normalize_single
|
|
|
|
from gemini_normalize import call_api
|
|
def replace_t(word):
    """Rewrite the dental-t phoneme symbol ("t" + combining U+032A) as 't0'.

    eSpeak emits "t\u032a" for the Vietnamese dental t; downstream tooling
    expects the ASCII-safe tag 't0' instead.
    """
    dental_t = "t\u032a"
    return "t0".join(word.split(dental_t))
|
|
|
|
def replace_special_char(word):
    """Drop stray phonemization artifacts from *word*.

    Removes the combining dental mark (U+032A), combining tilde (U+0303),
    and the '/' and '^' delimiters entirely.
    """
    ignored = "\u032a\u0303/^"
    return ''.join(ch for ch in word if ch not in ignored)
|
|
def clean_word(input_string):
    """Replace every bracket-like character in *input_string* with '"'.

    All of {}, <>, [], () are normalized to a double quote so the
    phonemizer treats them uniformly as punctuation.
    """
    brackets = "{<[)}>(]"
    return ''.join('"' if ch in brackets else ch for ch in input_string)
|
|
|
|
import phonemizer


# Shared Vietnamese eSpeak backend used by word_process(): keeps punctuation
# and stress marks, strips language-switch flags, and tolerates word-count
# mismatches between input and phonemized output.
global_phonemizer = phonemizer.backend.EspeakBackend(language='vi', preserve_punctuation=True, with_stress=True,language_switch='remove-flags',words_mismatch='ignore')
|
|
def has_numbers(inputString):
    """Return True if *inputString* contains at least one digit character."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False
|
|
|
|
def word_process(parquet_path):
    """Phonemize the 'normal_text' column of a parquet file in place.

    For each row: tokenize the normalized text, map bracket characters to
    double quotes, re-join, phonemize with the shared Vietnamese eSpeak
    backend, then clean up the dental-t marker and stray diacritics.  The
    result is stored in the 'phonemes' column and the file is overwritten.

    Args:
        parquet_path: Path to a parquet file with a 'normal_text' column;
            it is both read and rewritten (fastparquet engine).
    """
    df = pd.read_parquet(parquet_path, engine='fastparquet')
    for index in df.index:
        text = df.loc[index, 'normal_text']
        # Tokenize, normalize brackets to quotes, and re-join so the
        # phonemizer sees a single space-separated sentence.
        tokens = [clean_word(word) for word in word_tokenize(text)]
        sentence = ' '.join(tokens)
        # phonemize() takes/returns a list; we pass one sentence per row.
        ps = global_phonemizer.phonemize([sentence])
        ps = [replace_special_char(replace_t(p)) for p in ps]
        df.loc[index, 'phonemes'] = ps[0]
    df.to_parquet(parquet_path, engine='fastparquet')
|
|
|
|
|
|
|
|
import os
|
|
|
|
def generate(root_path, parquet_file):
    """Select 6.2-8.2 s clips, pad them with silence, and export a ~30 min subset.

    Reads the source parquet, keeps clips whose duration is strictly between
    6.2 and 8.2 seconds, pads each with 400 ms of silence on both sides,
    writes the padded wav to audio_datasets/chunk_8/202/<row_index>.wav, and
    stops once roughly 30 minutes of padded audio has been collected — at
    which point the selected (path, text, speaker) rows are written to
    'betterversionvn_30p_chunk_8.parquet' under *root_path*.  If the duration
    threshold is never reached, no output parquet is written (matches the
    original behavior).

    Args:
        root_path: Directory containing *parquet_file* and receiving the output.
        parquet_file: Parquet with 'audio.path' and 'text' columns.
    """
    total_duration = 0
    out_dir = "audio_datasets/chunk_8/202/"
    silent = AudioSegment.silent(duration=400)  # 400 ms pad per side
    df = pd.read_parquet(root_path + "/" + parquet_file, engine='fastparquet')
    # Accumulate plain dicts instead of DataFrame.append(), which was
    # deprecated in pandas 1.4 and removed in pandas 2.0.
    rows = []
    # Create the output directory once, up front (exist_ok avoids the
    # per-iteration exists() check).
    os.makedirs(out_dir, exist_ok=True)
    for index in df.index:
        file = df.loc[index, 'audio.path']
        text = df.loc[index, 'text']
        audio = AudioSegment.from_wav(file)
        duration_s = len(audio) / 1000.0
        if 6.2 < duration_s < 8.2:
            audio = silent + audio + silent
            total_duration += len(audio)
            path = out_dir + str(index) + ".wav"
            audio.export(path, format="wav")
            rows.append({'path': path, 'text': text, 'speaker': 202})
            # Stop once ~30 minutes (in ms) of padded audio is collected.
            if total_duration / (1000 * 30 * 60) >= 1:
                new_df = pd.DataFrame(rows, columns=['path', 'text', 'speaker'])
                print(new_df.head())
                new_df = new_df.reset_index(drop=True)
                new_df.to_parquet(root_path + "/" + "betterversionvn_30p_chunk_8.parquet", engine='fastparquet')
                break
|
|
|
|
def normalize_text(parquet_file):
    """Batch-normalize the 'text' column via the Gemini API into 'normal_text'.

    Sends texts to call_api() in batches of 50, writes the responses back
    positionally, prints rows that still look suspicious, then overwrites
    the parquet file.

    NOTE(review): the write-back at df.loc[50*dem+idx, ...] assumes the
    DataFrame has a default RangeIndex 0..n-1 — TODO confirm for all inputs.
    NOTE(review): presumably call_api(str(req)) returns an iterable of
    normalized strings aligned one-to-one with the batch; verify against
    gemini_normalize.call_api.
    """
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    req = []  # current batch of raw texts awaiting normalization
    print(df.shape[0]-1)
    dem = 0  # number of batches already written back
    for index, row in df.iterrows():
        text = df.loc[index, 'text']
        req.append(text)
        # Flush on a full batch of 50, or on the final row (partial batch).
        if len(req)==50 or index == df.shape[0]-1:
            res = call_api(str(req))
            # Write responses back by position: batch dem starts at row 50*dem.
            for idx,r in enumerate(res):
                df.loc[50*dem+idx, 'normal_text'] = r
            dem+=1
            req = []
    # Verification pass: flag rows where digits survived normalization, or
    # where the text changed even though the source had no digits.
    for index, row in df.iterrows():
        if has_numbers(df.loc[index, 'normal_text']):
            print("has number",df.loc[index, 'normal_text'])
        elif df.loc[index, 'normal_text'] != df.loc[index, 'text'] and not has_numbers(df.loc[index, 'text']):
            print(df.loc[index, 'normal_text'])
            print(df.loc[index, 'text'])
    df.to_parquet(parquet_file, engine='fastparquet')
|
|
|
|
|
|
def read(parquet_file):
    """Inspect normalization results and revert suspicious rows in memory.

    Prints rows whose 'normal_text' still contains digits; for rows where
    normalization changed a digit-free text, prints both versions plus the
    audio path and resets 'normal_text' to the raw 'text'.

    NOTE(review): the DataFrame is modified in memory only — the parquet
    file is never rewritten here; confirm that is intentional.
    """
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    for idx in df.index:
        normal = df.loc[idx, 'normal_text']
        raw = df.loc[idx, 'text']
        if has_numbers(normal):
            print("has number", normal)
        elif normal != raw and not has_numbers(raw):
            print(normal)
            print(raw)
            print(df.loc[idx, 'audio.path'])
            df.loc[idx, 'normal_text'] = raw
|
|
|
|
|
|
def copy_audio(parquet_file):
    """Copy every audio file listed in the parquet to a mirror location.

    The destination path is the source path with 'dataset' replaced by
    'data'; copy2 preserves file metadata.
    """
    import shutil

    df = pd.read_parquet(parquet_file, engine='fastparquet')
    for src in df['audio.path']:
        shutil.copy2(src, src.replace("dataset", "data"))
|
|
|
|
def export(parquet_file, output_file):
    """Dump the dataset as 'audio.path|phonemes|0' lines to *output_file*.

    One line per parquet row; the trailing '0' is a fixed speaker id.
    """
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    lines = [
        f"{df.loc[idx, 'audio.path']}|{df.loc[idx, 'phonemes']}|0"
        for idx in df.index
    ]
    with open(output_file, 'w', encoding="utf-8") as f:
        for line in lines:
            f.write("%s\n" % line)
|
|
|
|
|
|
if __name__ == "__main__":
    # Entry point intentionally left empty: the pipeline steps (generate,
    # normalize_text, word_process, export, ...) are invoked manually.
    pass