Dizayee committed on
Commit
504881d
1 Parent(s): df6ce86

Upload 5 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ trainer_0_log.txt filter=lfs diff=lfs merge=lfs -text
ZD_trainer (copy).py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Train a grapheme-level VITS TTS model (Coqui TTS) on the ZD Kurdish dataset.

Flat training script: builds dataset/audio/character configs, initializes the
audio processor, tokenizer, data samples and VITS model, then runs the Coqui
Trainer with HF Accelerate enabled.
"""

import os

# NOTE(review): GPU pinning was disabled by the original author; kept for reference.
# os.environ["CUDA_VISIBLE_DEVICES"] = "7"

import wandb  # retained for optional experiment tracking (wandb.init below is disabled)
from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsAudioConfig
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

# Optional: mirror TensorBoard logs to Weights & Biases by enabling this call.
# wandb.init(project="persian-tts-vits-grapheme-azure-fa", group="GPU 6,7 accel mixed fp16 64x64", sync_tensorboard=True)

# All checkpoints, logs and the generated config.json land under this directory.
output_path = "ZD_output"

print("output path is:")
print(output_path)

# Root for cached phoneme / input-sequence artifacts.
cache_path = "cache"

# Dataset in the Mozilla layout: <path>/metadata.csv plus the audio clips.
dataset_config = BaseDatasetConfig(
    formatter="mozilla",
    meta_file_train="metadata.csv",
    path="/home/bargh1/ZD_Final",
)

# Grapheme inventory (Kurdish/Arabic script). The phoneme set is supplied for
# completeness but is inert because use_phonemes=False in the VitsConfig below.
character_config = CharactersConfig(
    characters=' ي ء ا ب ت ث ج ح خ د ذ ر ز ژ س ش ع غ ف ق ل م ن ه و ۆ ی ڕ چ ڕ گ ک پ ە ڤ ھ ێ ك',
    punctuations='!(),-.:;? ̠،؛؟‌<>',
    phonemes='ˈˌːˑpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟaegiouwyɪʊ̩æɑɔəɚɛɝɨ̃ʉʌʍ0123456789"#$%*+/=ABCDEFGHIJKLMNOPRSTUVWXYZ[]^_{}',
    pad="<PAD>",
    eos="<EOS>",
    bos="<BOS>",
    blank="<BLNK>",
    characters_class="TTS.tts.models.vits.VitsCharacters",
)

# Generic audio-processing settings. NOTE(review): constructed as in the
# original script but never passed to the model — VitsConfig below consumes
# vits_audio_config instead; kept to preserve behavior exactly.
audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    min_level_db=-1,
    signal_norm=True,
    clip_norm=True,
    symmetric_norm=True,
    max_norm=0.9,
    resample=True,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
)

# Audio settings actually consumed by the VITS model.
vits_audio_config = VitsAudioConfig(
    sample_rate=22050,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
)

# Full training configuration for the grapheme-based run.
config = VitsConfig(
    audio=vits_audio_config,
    run_name="persian-tts-vits-grapheme-azure",
    batch_size=16,
    batch_group_size=16,
    eval_batch_size=4,
    num_loader_workers=4,
    num_eval_loader_workers=2,
    run_eval=True,
    run_eval_steps=200,
    print_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    save_step=200,
    text_cleaner="basic_cleaners",
    use_phonemes=False,  # train directly on graphemes, no phonemizer
    characters=character_config,
    phoneme_cache_path=os.path.join(cache_path, "phoneme_cache_grapheme_azure"),
    compute_input_seq_cache=True,
    print_step=200,
    mixed_precision=False,  # True triggered "Expected reduction dim" in TTS
    test_sentences=[
        ["دەتوانی لەم بەرهەمە دەخوێنیت بەشێوەیەکی خوشەویست."],
        ["ئەو پاشانی کاردەکات بە دڵخوازی و دەچێت بەهەڵە دڵی دوایی."],
        ["سەرەتا دەبێت بە هەرێمی نەخشەی بەکاربێنیت."],
    ],
    output_path=output_path,
    datasets=[dataset_config],
)

# Audio processor: feature extraction and audio I/O for the data loader and
# the training loggers.
ap = AudioProcessor.init_from_config(config)

# Tokenizer converts text to token-ID sequences; config is updated with the
# default characters when none are defined.
tokenizer, config = TTSTokenizer.init_from_config(config)

# Each sample is [text, audio_file_path, speaker_name]; split train/eval here.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# Single-speaker VITS model (no speaker manager).
model = Vits(config, ap, tokenizer, speaker_manager=None)

# Initialize the trainer (HF Accelerate enabled) and launch training. 🚀
trainer = Trainer(
    TrainerArgs(use_accelerate=True),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f58c07ad9699fe61a24aa5dfabc42134678151bdfbb9f8aa27315d0c426fcae
3
+ size 998154294
config.json ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "ZD_output",
3
+ "logger_uri": null,
4
+ "run_name": "persian-tts-vits-grapheme-azure",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 200,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": null,
14
+ "save_step": 200,
15
+ "save_n_checkpoints": 5,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 0,
19
+ "target_loss": null,
20
+ "print_eval": true,
21
+ "test_delay_epochs": -1,
22
+ "run_eval": true,
23
+ "run_eval_steps": 200,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 1000,
29
+ "batch_size": 16,
30
+ "eval_batch_size": 4,
31
+ "grad_clip": [
32
+ 1000,
33
+ 1000
34
+ ],
35
+ "scheduler_after_epoch": true,
36
+ "lr": 0.001,
37
+ "optimizer": "AdamW",
38
+ "optimizer_params": {
39
+ "betas": [
40
+ 0.8,
41
+ 0.99
42
+ ],
43
+ "eps": 1e-09,
44
+ "weight_decay": 0.01
45
+ },
46
+ "lr_scheduler": null,
47
+ "lr_scheduler_params": {},
48
+ "use_grad_scaler": false,
49
+ "allow_tf32": false,
50
+ "cudnn_enable": true,
51
+ "cudnn_deterministic": false,
52
+ "cudnn_benchmark": false,
53
+ "training_seed": 54321,
54
+ "model": "vits",
55
+ "num_loader_workers": 4,
56
+ "num_eval_loader_workers": 2,
57
+ "use_noise_augment": false,
58
+ "audio": {
59
+ "fft_size": 1024,
60
+ "sample_rate": 22050,
61
+ "win_length": 1024,
62
+ "hop_length": 256,
63
+ "num_mels": 80,
64
+ "mel_fmin": 0,
65
+ "mel_fmax": null
66
+ },
67
+ "use_phonemes": false,
68
+ "phonemizer": null,
69
+ "phoneme_language": null,
70
+ "compute_input_seq_cache": true,
71
+ "text_cleaner": "basic_cleaners",
72
+ "enable_eos_bos_chars": false,
73
+ "test_sentences_file": "",
74
+ "phoneme_cache_path": "cache/phoneme_cache_grapheme_azure",
75
+ "characters": {
76
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
77
+ "vocab_dict": null,
78
+ "pad": "<PAD>",
79
+ "eos": "<EOS>",
80
+ "bos": "<BOS>",
81
+ "blank": "<BLNK>",
82
+ "characters": " \u064a \u0621 \u0627 \u0628 \u062a \u062b \u062c \u062d \u062e \u062f \u0630 \u0631 \u0632 \u0698 \u0633 \u0634 \u0639 \u063a \u0641 \u0642 \u0644 \u0645 \u0646 \u0647 \u0648 \u06c6 \u06cc \u0695 \u0686 \u0695 \u06af \u06a9 \u067e \u06d5 \u06a4 \u06be \u06ce \u0643",
83
+ "punctuations": "!(),-.:;? \u0320\u060c\u061b\u061f\u200c<>",
84
+ "phonemes": "\u02c8\u02cc\u02d0\u02d1pbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029faegiouwy\u026a\u028a\u0329\u00e6\u0251\u0254\u0259\u025a\u025b\u025d\u0268\u0303\u0289\u028c\u028d0123456789\"#$%*+/=ABCDEFGHIJKLMNOPRSTUVWXYZ[]^_{}",
85
+ "is_unique": true,
86
+ "is_sorted": true
87
+ },
88
+ "add_blank": true,
89
+ "batch_group_size": 16,
90
+ "loss_masking": null,
91
+ "min_audio_len": 1,
92
+ "max_audio_len": Infinity,
93
+ "min_text_len": 1,
94
+ "max_text_len": Infinity,
95
+ "compute_f0": false,
96
+ "compute_energy": false,
97
+ "compute_linear_spec": true,
98
+ "precompute_num_workers": 0,
99
+ "start_by_longest": false,
100
+ "shuffle": false,
101
+ "drop_last": false,
102
+ "datasets": [
103
+ {
104
+ "formatter": "mozilla",
105
+ "dataset_name": "",
106
+ "path": "/home/bargh1/ZD_Final",
107
+ "meta_file_train": "metadata.csv",
108
+ "ignored_speakers": null,
109
+ "language": "",
110
+ "phonemizer": "",
111
+ "meta_file_val": "",
112
+ "meta_file_attn_mask": ""
113
+ }
114
+ ],
115
+ "test_sentences": [
116
+ [
117
+ "\u062f\u06d5\u062a\u0648\u0627\u0646\u06cc \u0644\u06d5\u0645 \u0628\u06d5\u0631\u0647\u06d5\u0645\u06d5 \u062f\u06d5\u062e\u0648\u06ce\u0646\u06cc\u062a \u0628\u06d5\u0634\u06ce\u0648\u06d5\u06cc\u06d5\u06a9\u06cc \u062e\u0648\u0634\u06d5\u0648\u06cc\u0633\u062a."
118
+ ],
119
+ [
120
+ "\u0626\u06d5\u0648 \u067e\u0627\u0634\u0627\u0646\u06cc \u06a9\u0627\u0631\u062f\u06d5\u06a9\u0627\u062a \u0628\u06d5 \u062f\u06b5\u062e\u0648\u0627\u0632\u06cc \u0648 \u062f\u06d5\u0686\u06ce\u062a \u0628\u06d5\u0647\u06d5\u06b5\u06d5 \u062f\u06b5\u06cc \u062f\u0648\u0627\u06cc\u06cc."
121
+ ],
122
+ [
123
+ "\u0633\u06d5\u0631\u06d5\u062a\u0627 \u062f\u06d5\u0628\u06ce\u062a \u0628\u06d5 \u0647\u06d5\u0631\u06ce\u0645\u06cc \u0646\u06d5\u062e\u0634\u06d5\u06cc \u0628\u06d5\u06a9\u0627\u0631\u0628\u06ce\u0646\u06cc\u062a."
124
+ ]
125
+ ],
126
+ "eval_split_max_size": null,
127
+ "eval_split_size": 0.01,
128
+ "use_speaker_weighted_sampler": false,
129
+ "speaker_weighted_sampler_alpha": 1.0,
130
+ "use_language_weighted_sampler": false,
131
+ "language_weighted_sampler_alpha": 1.0,
132
+ "use_length_weighted_sampler": false,
133
+ "length_weighted_sampler_alpha": 1.0,
134
+ "model_args": {
135
+ "num_chars": 232,
136
+ "out_channels": 513,
137
+ "spec_segment_size": 32,
138
+ "hidden_channels": 192,
139
+ "hidden_channels_ffn_text_encoder": 768,
140
+ "num_heads_text_encoder": 2,
141
+ "num_layers_text_encoder": 6,
142
+ "kernel_size_text_encoder": 3,
143
+ "dropout_p_text_encoder": 0.1,
144
+ "dropout_p_duration_predictor": 0.5,
145
+ "kernel_size_posterior_encoder": 5,
146
+ "dilation_rate_posterior_encoder": 1,
147
+ "num_layers_posterior_encoder": 16,
148
+ "kernel_size_flow": 5,
149
+ "dilation_rate_flow": 1,
150
+ "num_layers_flow": 4,
151
+ "resblock_type_decoder": "1",
152
+ "resblock_kernel_sizes_decoder": [
153
+ 3,
154
+ 7,
155
+ 11
156
+ ],
157
+ "resblock_dilation_sizes_decoder": [
158
+ [
159
+ 1,
160
+ 3,
161
+ 5
162
+ ],
163
+ [
164
+ 1,
165
+ 3,
166
+ 5
167
+ ],
168
+ [
169
+ 1,
170
+ 3,
171
+ 5
172
+ ]
173
+ ],
174
+ "upsample_rates_decoder": [
175
+ 8,
176
+ 8,
177
+ 2,
178
+ 2
179
+ ],
180
+ "upsample_initial_channel_decoder": 512,
181
+ "upsample_kernel_sizes_decoder": [
182
+ 16,
183
+ 16,
184
+ 4,
185
+ 4
186
+ ],
187
+ "periods_multi_period_discriminator": [
188
+ 2,
189
+ 3,
190
+ 5,
191
+ 7,
192
+ 11
193
+ ],
194
+ "use_sdp": true,
195
+ "noise_scale": 1.0,
196
+ "inference_noise_scale": 0.667,
197
+ "length_scale": 1,
198
+ "noise_scale_dp": 1.0,
199
+ "inference_noise_scale_dp": 1.0,
200
+ "max_inference_len": null,
201
+ "init_discriminator": true,
202
+ "use_spectral_norm_disriminator": false,
203
+ "use_speaker_embedding": false,
204
+ "num_speakers": 0,
205
+ "speakers_file": null,
206
+ "d_vector_file": null,
207
+ "speaker_embedding_channels": 256,
208
+ "use_d_vector_file": false,
209
+ "d_vector_dim": 0,
210
+ "detach_dp_input": true,
211
+ "use_language_embedding": false,
212
+ "embedded_language_dim": 4,
213
+ "num_languages": 0,
214
+ "language_ids_file": null,
215
+ "use_speaker_encoder_as_loss": false,
216
+ "speaker_encoder_config_path": "",
217
+ "speaker_encoder_model_path": "",
218
+ "condition_dp_on_speaker": true,
219
+ "freeze_encoder": false,
220
+ "freeze_DP": false,
221
+ "freeze_PE": false,
222
+ "freeze_flow_decoder": false,
223
+ "freeze_waveform_decoder": false,
224
+ "encoder_sample_rate": null,
225
+ "interpolate_z": true,
226
+ "reinit_DP": false,
227
+ "reinit_text_encoder": false
228
+ },
229
+ "lr_gen": 0.0002,
230
+ "lr_disc": 0.0002,
231
+ "lr_scheduler_gen": "ExponentialLR",
232
+ "lr_scheduler_gen_params": {
233
+ "gamma": 0.999875,
234
+ "last_epoch": -1
235
+ },
236
+ "lr_scheduler_disc": "ExponentialLR",
237
+ "lr_scheduler_disc_params": {
238
+ "gamma": 0.999875,
239
+ "last_epoch": -1
240
+ },
241
+ "kl_loss_alpha": 1.0,
242
+ "disc_loss_alpha": 1.0,
243
+ "gen_loss_alpha": 1.0,
244
+ "feat_loss_alpha": 1.0,
245
+ "mel_loss_alpha": 45.0,
246
+ "dur_loss_alpha": 1.0,
247
+ "speaker_encoder_loss_alpha": 1.0,
248
+ "return_wav": true,
249
+ "use_weighted_sampler": false,
250
+ "weighted_sampler_attrs": {},
251
+ "weighted_sampler_multipliers": {},
252
+ "r": 1,
253
+ "num_speakers": 0,
254
+ "use_speaker_embedding": false,
255
+ "speakers_file": null,
256
+ "speaker_embedding_channels": 256,
257
+ "language_ids_file": null,
258
+ "use_language_embedding": false,
259
+ "use_d_vector_file": false,
260
+ "d_vector_file": null,
261
+ "d_vector_dim": 0,
262
+ "github_branch": "* dev"
263
+ }
events.out.tfevents.1713913523.lambda-01.15004.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c53ae448214d1f559e8bfe01721d24e6b979f9f35e295612f67f18cc06c901b
3
+ size 765208893
trainer_0_log.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0f1c8be5120e86a74c99dc0dac7d58e8e84e4a8ea82492f8b1063be6a83f7c7
3
+ size 12234499