BilalSardar committed
Commit
f2f06bf
1 Parent(s): ad0737d

Delete app.py

Files changed (1)
  1. app.py +0 -165
app.py DELETED
@@ -1,165 +0,0 @@
- import gradio as gr
-
- import os
- os.system('git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS')
- os.system('pip install -q -e TTS/')
- os.system('pip install -q torchaudio==0.9.0')
-
- import sys
- TTS_PATH = "TTS/"
-
- # add the cloned TTS repo to the import path (set this if TTS is not installed globally)
- sys.path.append(TTS_PATH)
-
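- # NOTE (assumption, not from the original app): torchaudio 0.9.0 is the release
- # paired with the torch 1.9.x line, so the pin above presumes that torch version
- # is what the multilingual-torchaudio-SE branch expects.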
- import IPython
- from IPython.display import Audio
-
- import torch
-
- from TTS.tts.utils.synthesis import synthesis
- from TTS.utils.audio import AudioProcessor
-
- from TTS.tts.models import setup_model
- from TTS.config import load_config
- from TTS.tts.models.vits import *
-
- OUT_PATH = 'out/'
-
- # create output path
- os.makedirs(OUT_PATH, exist_ok=True)
-
- # model vars
- MODEL_PATH = '/home/user/app/best_model_latest.pth.tar'
- CONFIG_PATH = '/home/user/app/config.json'
- TTS_LANGUAGES = "/home/user/app/language_ids.json"
- TTS_SPEAKERS = "/home/user/app/speakers.json"
- USE_CUDA = torch.cuda.is_available()
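- # NOTE: /home/user/app is where a Hugging Face Space checks out its repo, so these
- # paths assume the checkpoint and JSON files are committed at the repo root.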
-
- # load the config
- C = load_config(CONFIG_PATH)
-
- # load the audio processor
- ap = AudioProcessor(**C.audio)
-
- speaker_embedding = None
-
- C.model_args['d_vector_file'] = TTS_SPEAKERS
- C.model_args['use_speaker_encoder_as_loss'] = False
-
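- # d_vector_file points the model at precomputed speaker embeddings (d-vectors);
- # the speaker-encoder loss only matters during training, so it is switched off here.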
- model = setup_model(C)
- model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
- cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
- # remove the speaker-encoder weights, which are only needed during training
- model_weights = cp['model'].copy()
- for key in list(model_weights.keys()):
-     if "speaker_encoder" in key:
-         del model_weights[key]
-
- model.load_state_dict(model_weights)
-
- model.eval()
-
- if USE_CUDA:
-     model = model.cuda()
-
- os.system('pip install -q pydub ffmpeg-normalize')
-
- CONFIG_SE_PATH = "config_se.json"
- CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
-
- from TTS.tts.utils.speakers import SpeakerManager
- import librosa
-
- SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encoder_config_path=CONFIG_SE_PATH, use_cuda=USE_CUDA)
-
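- # SpeakerManager wraps the speaker-encoder checkpoint; its compute_d_vector_from_clip
- # (used in greet below) turns a reference recording into the d-vector that conditions
- # the TTS model on the target voice.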
- def compute_spec(ref_file):
-     y, sr = librosa.load(ref_file, sr=ap.sample_rate)
-     spec = ap.spectrogram(y)
-     spec = torch.FloatTensor(spec).unsqueeze(0)
-     return spec
-
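- # NOTE: compute_spec is never called below; it appears to be a leftover helper for
- # spectrogram-based experiments.
-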
- def greet(Text, Voicetoclone, VoiceMicrophone):
-     text = str(Text)
-     # prefer the uploaded clip; fall back to the microphone recording
-     if Voicetoclone is not None:
-         reference_file = str(Voicetoclone)
-     else:
-         reference_file = str(VoiceMicrophone)
-     print("reference path:", reference_file)
-     # reject oversized inputs before doing any work
-     size_mb = os.path.getsize(reference_file) / 1_000_000
-     if size_mb > 30 or len(text) > 2000:
-         message = "File is greater than 30 MB or the text is longer than 2000 characters. Please retry with smaller inputs."
-         print(message)
-         raise ValueError(message)
-     else:
-         # loudness-normalize the reference and resample it to 16 kHz in place
-         os.system(f'ffmpeg-normalize {reference_file} -nt rms -t=-27 -o {reference_file} -ar 16000 -f')
-         reference_emb = SE_speaker_manager.compute_d_vector_from_clip(reference_file)
-         model.length_scale = 1  # scaler for the duration predictor; the larger it is, the slower the speech
-         model.inference_noise_scale = 0.3  # noise variance applied to the random z vector at inference
-         model.inference_noise_scale_dp = 0.3  # noise variance applied to the duration predictor's z vector
-         language_id = 0  # first entry in model.language_manager.language_id_mapping
-
-         print(" > text: {}".format(text))
-         wav, alignment, _, _ = synthesis(
-             model,
-             text,
-             C,
-             "cuda" in str(next(model.parameters()).device),
-             ap,
-             speaker_id=None,
-             d_vector=reference_emb,
-             style_wav=None,
-             language_id=language_id,
-             enable_eos_bos_chars=C.enable_eos_bos_chars,
-             use_griffin_lim=True,
-             do_trim_silence=False,
-         ).values()
-         print("Generated Audio")
-         IPython.display.display(Audio(wav, rate=ap.sample_rate))
-         file_name = "Audio.wav"
-         out_path = os.path.join(OUT_PATH, file_name)
-         print(" > Saving output to {}".format(out_path))
-         ap.save_wav(wav, out_path)
-         return out_path
-
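- # Hypothetical local smoke test (not part of the Space UI); assumes a short
- # 16 kHz mono WAV named "reference.wav" sitting next to app.py:
- #   greet("Hello world.", "reference.wav", None)   # returns "out/Audio.wav"
-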
- demo = gr.Interface(
-     fn=greet,
-     inputs=[
-         gr.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),
-         gr.Audio(type="filepath", source="upload", label='Please upload a voice to clone (max. 30 MB)'),
-         gr.Audio(type="filepath", source="microphone", label='Or record a voice to clone'),
-     ],
-     outputs="audio",
-     title="Bilal's Voice Cloning Tool",
- )
- demo.launch()