File size: 3,258 Bytes
d29da97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import os
from pydub import AudioSegment

# Use NLTK's TweetTokenizer as the word tokenizer; it keeps punctuation and
# emoticon-like tokens intact, which plain word_tokenize would split.
from nltk.tokenize import TweetTokenizer
word_tokenize = TweetTokenizer().tokenize

# Project-local helper that sends a batch of texts to the Gemini API for
# text normalization (numbers -> words, etc.); see gemini_normalize module.
from gemini_normalize import call_api

import phonemizer
# Shared espeak backend configured for Vietnamese ('vi'); keeps punctuation
# and stress marks. Constructed once at import time because backend creation
# is expensive. NOTE(review): requires the espeak-ng binary to be installed.
global_phonemizer = phonemizer.backend.EspeakBackend(language='vi', preserve_punctuation=True,  with_stress=True,language_switch='remove-flags',words_mismatch='ignore')

def has_numbers(inputString):
    """Return True if the string contains at least one decimal digit."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False
def generate(root_path, parquet_file, skip_rows=3000, min_clip_seconds=2.0,
             target_minutes=6, out_file="Xanh24h_1h_test.parquet"):
    """Build a small test subset of the dataset by accumulating audio clips.

    Reads the manifest parquet under ``root_path``, skips the first
    ``skip_rows`` rows, keeps clips longer than ``min_clip_seconds``, and
    stops once the kept clips total at least ``target_minutes`` minutes.
    The selected rows are written to ``root_path/out_file``.

    Args:
        root_path: Directory containing the input manifest.
        parquet_file: Manifest file name with 'audio.path' and 'text' columns.
        skip_rows: Number of leading rows to skip (default matches original).
        min_clip_seconds: Minimum clip length to keep, in seconds.
        target_minutes: Total duration to accumulate before stopping.
        out_file: Name of the output parquet written into root_path.
    """
    total_duration = 0  # accumulated duration in milliseconds (pydub units)

    df = pd.read_parquet(os.path.join(root_path, parquet_file), engine='fastparquet')
    # Accumulate rows in a plain list: DataFrame.append was removed in
    # pandas 2.0 and was O(n^2); building the frame once at the end is both
    # compatible and linear.
    kept_rows = []
    for index, row in df.iterrows():
        if index < skip_rows:
            continue
        file = row['audio.path']
        text = row['text']

        audio = AudioSegment.from_wav(file)
        # len(audio) is milliseconds; keep only clips longer than the minimum
        if len(audio) / 1000.0 > min_clip_seconds:
            total_duration += len(audio)
            kept_rows.append({'audio.path': file, 'text': text, 'speaker_id': 0})

        # Stop once the accumulated duration reaches the target
        if total_duration / (1000 * 60 * target_minutes) >= 1:
            new_df = pd.DataFrame(kept_rows, columns=['audio.path', 'text', 'speaker_id'])
            print(new_df.head())
            new_df.to_parquet(os.path.join(root_path, out_file), engine='fastparquet')
            break


def normalize_text(parquet_file):
    """Normalize the 'text' column via the Gemini API in batches of 50.

    Sends texts to call_api in batches, writes results to a new
    'normal_text' column, then reverts normalizations that changed a text
    which contained no digits (i.e. needed no normalization). The parquet
    file is rewritten in place.

    NOTE(review): the write-back index 50*dem+idx assumes the DataFrame
    index is the contiguous range 0..n-1 and that call_api returns exactly
    one result per request string, in order — confirm against call_api.
    """
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    req = []            # current batch of raw texts awaiting normalization
    print(df.shape[0]-1)
    dem = 0             # number of completed batches ("dem" = "count" in Vietnamese)
    for index, row in df.iterrows():
        text = df.loc[index, 'text']
        req.append(text)
        # print(index)
        # Flush when the batch is full, or on the last row (partial batch)
        if len(req)==50 or index == df.shape[0]-1:
            # The whole batch is serialized as one Python-repr string;
            # presumably call_api parses it back into a list — verify.
            res = call_api(str(req))

            # Map batch-relative positions back to absolute row labels
            for idx,r in enumerate(res):
                df.loc[50*dem+idx, 'normal_text'] = r
            dem+=1
            req = []
    # Post-pass: audit the API output and undo unnecessary edits
    for index, row in df.iterrows():
        if has_numbers(df.loc[index, 'normal_text']):
            # Normalization should have spelled out all digits — flag leftovers
            print("has number",df.loc[index, 'normal_text'])
        elif df.loc[index, 'normal_text'] != df.loc[index, 'text'] and  not has_numbers(df.loc[index, 'text']):
            # The original had no digits, so the API should not have changed
            # it; restore the original text
            print(df.loc[index, 'normal_text'])
            print(df.loc[index, 'text'])
            df.loc[index, 'normal_text'] = df.loc[index, 'text']
    df.to_parquet(parquet_file, engine='fastparquet')

def word_process(parquet_path):
    """Phonemize each row's normalized text and store it in a 'phonemes' column.

    Tokenizes 'normal_text' with the module-level tokenizer, re-joins the
    tokens with single spaces, runs the shared espeak backend, and rewrites
    the parquet file in place.
    """
    df = pd.read_parquet(parquet_path, engine='fastparquet')

    for idx, _ in df.iterrows():
        tokens = word_tokenize(df.loc[idx, 'normal_text'])
        joined = ' '.join(tokens)
        # phonemize() takes a list of utterances and returns a parallel list;
        # we pass a single utterance and keep its single result.
        df.loc[idx, 'phonemes'] = global_phonemizer.phonemize([joined])[0]

    df.to_parquet(parquet_path, engine='fastparquet')

def copy_audio(parquet_file):
    """Copy every audio file listed in the parquet manifest to a mirror tree.

    The destination path is the source path with 'dataset' replaced by
    'data'. Destination directories are created as needed — the original
    code crashed with FileNotFoundError when the target directory was
    missing.

    Args:
        parquet_file: Parquet manifest with an 'audio.path' column.
    """
    import shutil
    df = pd.read_parquet(parquet_file, engine='fastparquet')
    for index, row in df.iterrows():
        src = df.loc[index, 'audio.path']
        # NOTE: str.replace substitutes EVERY occurrence of 'dataset' in the
        # path, not just the leading directory — intentional per original code.
        dst = src.replace("dataset", "data")
        # Ensure the target directory exists before copying
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        shutil.copy2(src, dst)
if __name__ == "__main__":
    # generate("dataset","Xanh24h.parquet")
    # normalize_text("dataset/Xanh24h_1h_test.parquet")
    # read("dataset/Xanh24h_1h.parquet")
    # word_process("dataset/Xanh24h_1h_test.parquet")
    # copy_audio("dataset/Xanh24h_1h_test.parquet")
    pass