emilios commited on
Commit
0476d44
1 Parent(s): c93577b

Training in progress, step 1000

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +0 -1
  2. checkpoint-1000/config.json +41 -0
  3. checkpoint-1000/global_step1000/mp_rank_00_model_states.pt +3 -0
  4. checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  5. checkpoint-1000/latest +1 -0
  6. checkpoint-1000/preprocessor_config.json +0 -0
  7. checkpoint-1000/pytorch_model.bin +3 -0
  8. checkpoint-1000/rng_state.pth +3 -0
  9. checkpoint-1000/trainer_state.json +265 -0
  10. checkpoint-1000/training_args.bin +3 -0
  11. checkpoint-1000/zero_to_fp32.py +482 -0
  12. checkpoint-12000/config.json +41 -0
  13. checkpoint-12000/global_step12000/mp_rank_00_model_states.pt +3 -0
  14. checkpoint-12000/global_step12000/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  15. checkpoint-12000/latest +1 -0
  16. checkpoint-12000/preprocessor_config.json +0 -0
  17. checkpoint-12000/pytorch_model.bin +3 -0
  18. checkpoint-12000/rng_state.pth +3 -0
  19. checkpoint-12000/trainer_state.json +3004 -0
  20. checkpoint-12000/training_args.bin +3 -0
  21. checkpoint-12000/zero_to_fp32.py +482 -0
  22. checkpoint-17000/config.json +41 -0
  23. checkpoint-17000/global_step17000/mp_rank_00_model_states.pt +3 -0
  24. checkpoint-17000/global_step17000/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  25. checkpoint-17000/latest +1 -0
  26. checkpoint-17000/preprocessor_config.json +0 -0
  27. checkpoint-17000/pytorch_model.bin +3 -0
  28. checkpoint-17000/rng_state.pth +3 -0
  29. checkpoint-17000/trainer_state.json +4249 -0
  30. checkpoint-17000/training_args.bin +3 -0
  31. checkpoint-17000/zero_to_fp32.py +482 -0
  32. checkpoint-18000/config.json +41 -0
  33. checkpoint-18000/global_step18000/mp_rank_00_model_states.pt +3 -0
  34. checkpoint-18000/global_step18000/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  35. checkpoint-18000/latest +1 -0
  36. checkpoint-18000/preprocessor_config.json +0 -0
  37. checkpoint-18000/pytorch_model.bin +3 -0
  38. checkpoint-18000/rng_state.pth +3 -0
  39. checkpoint-18000/trainer_state.json +4498 -0
  40. checkpoint-18000/training_args.bin +3 -0
  41. checkpoint-18000/zero_to_fp32.py +482 -0
  42. checkpoint-19000/config.json +41 -0
  43. checkpoint-19000/global_step19000/mp_rank_00_model_states.pt +3 -0
  44. checkpoint-19000/global_step19000/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  45. checkpoint-19000/latest +1 -0
  46. checkpoint-19000/preprocessor_config.json +0 -0
  47. checkpoint-19000/pytorch_model.bin +3 -0
  48. checkpoint-19000/rng_state.pth +3 -0
  49. checkpoint-19000/trainer_state.json +0 -0
  50. checkpoint-19000/training_args.bin +3 -0
.gitignore CHANGED
@@ -1 +0,0 @@
1
- checkpoint-*/
 
 
checkpoint-1000/config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "emilios/whisper-medium-el-n2",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50257
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 24,
19
+ "decoder_start_token_id": 50258,
20
+ "dropout": 0.1,
21
+ "encoder_attention_heads": 16,
22
+ "encoder_ffn_dim": 4096,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 24,
25
+ "eos_token_id": 50257,
26
+ "forced_decoder_ids": null,
27
+ "init_std": 0.02,
28
+ "is_encoder_decoder": true,
29
+ "max_length": 448,
30
+ "max_source_positions": 1500,
31
+ "max_target_positions": 448,
32
+ "model_type": "whisper",
33
+ "num_hidden_layers": 24,
34
+ "num_mel_bins": 80,
35
+ "pad_token_id": 50257,
36
+ "scale_embedding": false,
37
+ "torch_dtype": "float16",
38
+ "transformers_version": "4.26.0.dev0",
39
+ "use_cache": false,
40
+ "vocab_size": 51865
41
+ }
checkpoint-1000/global_step1000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb8f431b487681c52c81a3a62a99a63d952a5bc428be168eb056374b66b0831d
3
+ size 1527967899
checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2329af6bfc8cdeb9f375d3b3911f30dea002bbc406ce09034e255002ccfd7c87
3
+ size 9166378846
checkpoint-1000/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step1000
checkpoint-1000/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c003ccc0e40b6e5ee396b9f08631bc9a513d9d258d81e20345f01d9d83e99efd
3
+ size 1527847357
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e36ccd83d6f112f5f2cd8edf5ad0910ba5cb52adc5a9745e1bd9ca403d362b3
3
+ size 14575
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 10.36404160475483,
3
+ "best_model_checkpoint": "./checkpoint-1000",
4
+ "epoch": 58.8235294117647,
5
+ "global_step": 1000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 1.47,
12
+ "learning_rate": 1.5136083400296205e-06,
13
+ "loss": 0.0024,
14
+ "step": 25
15
+ },
16
+ {
17
+ "epoch": 2.94,
18
+ "learning_rate": 1.8687587131475301e-06,
19
+ "loss": 0.0024,
20
+ "step": 50
21
+ },
22
+ {
23
+ "epoch": 4.41,
24
+ "learning_rate": 2.0711488350670174e-06,
25
+ "loss": 0.0023,
26
+ "step": 75
27
+ },
28
+ {
29
+ "epoch": 5.88,
30
+ "learning_rate": 2.213317753617305e-06,
31
+ "loss": 0.0023,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 7.35,
36
+ "learning_rate": 2.3230029693718747e-06,
37
+ "loss": 0.002,
38
+ "step": 125
39
+ },
40
+ {
41
+ "epoch": 8.82,
42
+ "learning_rate": 2.412322158351148e-06,
43
+ "loss": 0.002,
44
+ "step": 150
45
+ },
46
+ {
47
+ "epoch": 10.29,
48
+ "learning_rate": 2.4876668872198717e-06,
49
+ "loss": 0.0022,
50
+ "step": 175
51
+ },
52
+ {
53
+ "epoch": 11.76,
54
+ "learning_rate": 2.552824062407326e-06,
55
+ "loss": 0.0021,
56
+ "step": 200
57
+ },
58
+ {
59
+ "epoch": 13.24,
60
+ "learning_rate": 2.610223373296667e-06,
61
+ "loss": 0.0034,
62
+ "step": 225
63
+ },
64
+ {
65
+ "epoch": 14.71,
66
+ "learning_rate": 2.661517182828361e-06,
67
+ "loss": 0.0019,
68
+ "step": 250
69
+ },
70
+ {
71
+ "epoch": 16.18,
72
+ "learning_rate": 2.7078803874740543e-06,
73
+ "loss": 0.0018,
74
+ "step": 275
75
+ },
76
+ {
77
+ "epoch": 17.65,
78
+ "learning_rate": 2.750178319990197e-06,
79
+ "loss": 0.0023,
80
+ "step": 300
81
+ },
82
+ {
83
+ "epoch": 19.12,
84
+ "learning_rate": 2.7890667754365044e-06,
85
+ "loss": 0.0019,
86
+ "step": 325
87
+ },
88
+ {
89
+ "epoch": 20.59,
90
+ "learning_rate": 2.8250546392106077e-06,
91
+ "loss": 0.0021,
92
+ "step": 350
93
+ },
94
+ {
95
+ "epoch": 22.06,
96
+ "learning_rate": 2.8585447348549113e-06,
97
+ "loss": 0.0023,
98
+ "step": 375
99
+ },
100
+ {
101
+ "epoch": 23.53,
102
+ "learning_rate": 2.889861392935294e-06,
103
+ "loss": 0.0021,
104
+ "step": 400
105
+ },
106
+ {
107
+ "epoch": 25.0,
108
+ "learning_rate": 2.9192696063561725e-06,
109
+ "loss": 0.0016,
110
+ "step": 425
111
+ },
112
+ {
113
+ "epoch": 26.47,
114
+ "learning_rate": 2.946988676871634e-06,
115
+ "loss": 0.0018,
116
+ "step": 450
117
+ },
118
+ {
119
+ "epoch": 27.94,
120
+ "learning_rate": 2.973202150939645e-06,
121
+ "loss": 0.0022,
122
+ "step": 475
123
+ },
124
+ {
125
+ "epoch": 29.41,
126
+ "learning_rate": 2.998065193492142e-06,
127
+ "loss": 0.0018,
128
+ "step": 500
129
+ },
130
+ {
131
+ "epoch": 30.88,
132
+ "learning_rate": 2.9559999999999997e-06,
133
+ "loss": 0.0018,
134
+ "step": 525
135
+ },
136
+ {
137
+ "epoch": 32.35,
138
+ "learning_rate": 2.9060000000000002e-06,
139
+ "loss": 0.0021,
140
+ "step": 550
141
+ },
142
+ {
143
+ "epoch": 33.82,
144
+ "learning_rate": 2.856e-06,
145
+ "loss": 0.0017,
146
+ "step": 575
147
+ },
148
+ {
149
+ "epoch": 35.29,
150
+ "learning_rate": 2.8060000000000003e-06,
151
+ "loss": 0.0018,
152
+ "step": 600
153
+ },
154
+ {
155
+ "epoch": 36.76,
156
+ "learning_rate": 2.756e-06,
157
+ "loss": 0.0016,
158
+ "step": 625
159
+ },
160
+ {
161
+ "epoch": 38.24,
162
+ "learning_rate": 2.706e-06,
163
+ "loss": 0.0017,
164
+ "step": 650
165
+ },
166
+ {
167
+ "epoch": 39.71,
168
+ "learning_rate": 2.656e-06,
169
+ "loss": 0.0016,
170
+ "step": 675
171
+ },
172
+ {
173
+ "epoch": 41.18,
174
+ "learning_rate": 2.606e-06,
175
+ "loss": 0.0013,
176
+ "step": 700
177
+ },
178
+ {
179
+ "epoch": 42.65,
180
+ "learning_rate": 2.556e-06,
181
+ "loss": 0.0013,
182
+ "step": 725
183
+ },
184
+ {
185
+ "epoch": 44.12,
186
+ "learning_rate": 2.5060000000000002e-06,
187
+ "loss": 0.0012,
188
+ "step": 750
189
+ },
190
+ {
191
+ "epoch": 45.59,
192
+ "learning_rate": 2.456e-06,
193
+ "loss": 0.0013,
194
+ "step": 775
195
+ },
196
+ {
197
+ "epoch": 47.06,
198
+ "learning_rate": 2.4060000000000003e-06,
199
+ "loss": 0.0013,
200
+ "step": 800
201
+ },
202
+ {
203
+ "epoch": 48.53,
204
+ "learning_rate": 2.356e-06,
205
+ "loss": 0.0013,
206
+ "step": 825
207
+ },
208
+ {
209
+ "epoch": 50.0,
210
+ "learning_rate": 2.306e-06,
211
+ "loss": 0.0011,
212
+ "step": 850
213
+ },
214
+ {
215
+ "epoch": 51.47,
216
+ "learning_rate": 2.256e-06,
217
+ "loss": 0.0013,
218
+ "step": 875
219
+ },
220
+ {
221
+ "epoch": 52.94,
222
+ "learning_rate": 2.2059999999999997e-06,
223
+ "loss": 0.0014,
224
+ "step": 900
225
+ },
226
+ {
227
+ "epoch": 54.41,
228
+ "learning_rate": 2.156e-06,
229
+ "loss": 0.0011,
230
+ "step": 925
231
+ },
232
+ {
233
+ "epoch": 55.88,
234
+ "learning_rate": 2.106e-06,
235
+ "loss": 0.0024,
236
+ "step": 950
237
+ },
238
+ {
239
+ "epoch": 57.35,
240
+ "learning_rate": 2.0560000000000003e-06,
241
+ "loss": 0.0014,
242
+ "step": 975
243
+ },
244
+ {
245
+ "epoch": 58.82,
246
+ "learning_rate": 2.006e-06,
247
+ "loss": 0.0014,
248
+ "step": 1000
249
+ },
250
+ {
251
+ "epoch": 58.82,
252
+ "eval_loss": 0.494384765625,
253
+ "eval_runtime": 154.3699,
254
+ "eval_samples_per_second": 1.762,
255
+ "eval_steps_per_second": 0.11,
256
+ "eval_wer": 10.36404160475483,
257
+ "step": 1000
258
+ }
259
+ ],
260
+ "max_steps": 2000,
261
+ "num_train_epochs": 118,
262
+ "total_flos": 3.0824308756665336e+19,
263
+ "trial_name": null,
264
+ "trial_params": null
265
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ed73d2b317cd1e368b1f5f7eb5b1eb41b01338f297addbfd473d2f8fb949e5d
3
+ size 4731
checkpoint-1000/zero_to_fp32.py ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
4
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
5
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
6
+ # application.
7
+ #
8
+ # example: python zero_to_fp32.py . pytorch_model.bin
9
+
10
+ import argparse
11
+ import torch
12
+ import glob
13
+ import math
14
+ import os
15
+ import re
16
+ from collections import OrderedDict
17
+
18
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
19
+ # DeepSpeed data structures it has to be available in the current python environment.
20
+ from deepspeed.utils import logger
21
+ from deepspeed.checkpoint.constants import (DS_VERSION,
22
+ OPTIMIZER_STATE_DICT,
23
+ SINGLE_PARTITION_OF_FP32_GROUPS,
24
+ FP32_FLAT_GROUPS,
25
+ ZERO_STAGE,
26
+ PARTITION_COUNT,
27
+ PARAM_SHAPES,
28
+ BUFFER_NAMES)
29
+
30
+ debug = 0
31
+
32
+ # load to cpu
33
+ device = torch.device('cpu')
34
+
35
+
36
+ def atoi(text):
37
+ return int(text) if text.isdigit() else text
38
+
39
+
40
+ def natural_keys(text):
41
+ '''
42
+ alist.sort(key=natural_keys) sorts in human order
43
+ http://nedbatchelder.com/blog/200712/human_sorting.html
44
+ (See Toothy's implementation in the comments)
45
+ '''
46
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
47
+
48
+
49
+ def get_model_state_file(checkpoint_dir, zero_stage):
50
+ if not os.path.isdir(checkpoint_dir):
51
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
52
+
53
+ # there should be only one file
54
+ if zero_stage == 2:
55
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
56
+ elif zero_stage == 3:
57
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
58
+
59
+ if not os.path.exists(file):
60
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
61
+
62
+ return file
63
+
64
+
65
+ def get_optim_files(checkpoint_dir):
66
+ # XXX: need to test that this simple glob rule works for multi-node setup too
67
+ optim_files = sorted(glob.glob(os.path.join(checkpoint_dir,
68
+ "*_optim_states.pt")),
69
+ key=natural_keys)
70
+
71
+ if len(optim_files) == 0:
72
+ raise FileNotFoundError(
73
+ f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")
74
+
75
+ return optim_files
76
+
77
+
78
+ def parse_model_state(file):
79
+ state_dict = torch.load(file, map_location=device)
80
+
81
+ if BUFFER_NAMES not in state_dict:
82
+ raise ValueError(f"{file} is not a model state checkpoint")
83
+ buffer_names = state_dict[BUFFER_NAMES]
84
+ if debug:
85
+ print("Found buffers:", buffer_names)
86
+
87
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
88
+ buffers = {
89
+ k: v.float()
90
+ for k,
91
+ v in state_dict["module"].items() if k in buffer_names
92
+ }
93
+ param_shapes = state_dict[PARAM_SHAPES]
94
+
95
+ ds_version = state_dict.get(DS_VERSION, None)
96
+
97
+ return buffers, param_shapes, ds_version
98
+
99
+
100
+ def parse_optim_states(files, ds_checkpoint_dir):
101
+
102
+ total_files = len(files)
103
+ state_dicts = []
104
+ for f in files:
105
+ state_dicts.append(torch.load(f, map_location=device))
106
+
107
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
108
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
109
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
110
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
111
+
112
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
113
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
114
+ # use the max of the partition_count to get the dp world_size.
115
+
116
+ if type(world_size) is list:
117
+ world_size = max(world_size)
118
+
119
+ if world_size != total_files:
120
+ raise ValueError(
121
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
122
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
123
+ )
124
+
125
+ # the groups are named differently in each stage
126
+ if zero_stage == 2:
127
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
128
+ elif zero_stage == 3:
129
+ fp32_groups_key = FP32_FLAT_GROUPS
130
+ else:
131
+ raise ValueError(f"unknown zero stage {zero_stage}")
132
+
133
+ if zero_stage == 2:
134
+ fp32_flat_groups = [
135
+ state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key]
136
+ for i in range(len(state_dicts))
137
+ ]
138
+ elif zero_stage == 3:
139
+ # if there is more than one param group, there will be multiple flattened tensors - one
140
+ # flattened tensor per group - for simplicity merge them into a single tensor
141
+ #
142
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
143
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
144
+
145
+ fp32_flat_groups = [
146
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key],
147
+ 0) for i in range(len(state_dicts))
148
+ ]
149
+
150
+ return zero_stage, world_size, fp32_flat_groups
151
+
152
+
153
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
154
+ """
155
+ Returns fp32 state_dict reconstructed from ds checkpoint
156
+
157
+ Args:
158
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
159
+
160
+ """
161
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
162
+
163
+ optim_files = get_optim_files(ds_checkpoint_dir)
164
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
165
+ print(
166
+ f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
167
+
168
+ model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
169
+ buffers, param_shapes, ds_version = parse_model_state(model_file)
170
+ print(f'Parsing checkpoint created by deepspeed=={ds_version}')
171
+
172
+ if zero_stage == 2:
173
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size,
174
+ param_shapes,
175
+ fp32_flat_groups,
176
+ buffers)
177
+ elif zero_stage == 3:
178
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size,
179
+ param_shapes,
180
+ fp32_flat_groups,
181
+ buffers)
182
+
183
+
184
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
185
+ param_shapes,
186
+ fp32_flat_groups,
187
+ buffers):
188
+
189
+ # Reconstruction protocol:
190
+ #
191
+ # XXX: document this
192
+
193
+ if debug:
194
+ for i in range(world_size):
195
+ for j in range(len(fp32_flat_groups[0])):
196
+ print(
197
+ f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
198
+
199
+ # XXX: memory usage doubles here (zero2)
200
+ num_param_groups = len(fp32_flat_groups[0])
201
+ merged_single_partition_of_fp32_groups = []
202
+ for i in range(num_param_groups):
203
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
204
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
205
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
206
+ avail_numel = sum([
207
+ full_single_fp32_vector.numel()
208
+ for full_single_fp32_vector in merged_single_partition_of_fp32_groups
209
+ ])
210
+
211
+ if debug:
212
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
213
+ wanted_numel = sum(
214
+ [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
215
+ # not asserting if there is a mismatch due to possible padding
216
+ print(f"Have {avail_numel} numels to process.")
217
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
218
+
219
+ state_dict = OrderedDict()
220
+
221
+ # buffers
222
+ state_dict.update(buffers)
223
+ if debug:
224
+ print(f"added {len(buffers)} buffers")
225
+
226
+ # params
227
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
228
+ # out-of-core computing solution
229
+ total_numel = 0
230
+ total_params = 0
231
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
232
+ offset = 0
233
+ avail_numel = full_single_fp32_vector.numel()
234
+ for name, shape in shapes.items():
235
+
236
+ unpartitioned_numel = shape.numel()
237
+ total_numel += unpartitioned_numel
238
+ total_params += 1
239
+
240
+ if debug:
241
+ print(
242
+ f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
243
+ )
244
+ state_dict[name] = full_single_fp32_vector.narrow(
245
+ 0,
246
+ offset,
247
+ unpartitioned_numel).view(shape)
248
+ offset += unpartitioned_numel
249
+
250
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
251
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
252
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
253
+ # live optimizer object, so we are checking that the numbers are within the right range
254
+ align_to = 2 * world_size
255
+
256
+ def zero2_align(x):
257
+ return align_to * math.ceil(x / align_to)
258
+
259
+ if debug:
260
+ print(f"original offset={offset}, avail_numel={avail_numel}")
261
+
262
+ offset = zero2_align(offset)
263
+ avail_numel = zero2_align(avail_numel)
264
+
265
+ if debug:
266
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
267
+
268
+ # Sanity check
269
+ if offset != avail_numel:
270
+ raise ValueError(
271
+ f"consumed {offset} numels out of {avail_numel} - something is wrong")
272
+
273
+ print(
274
+ f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
275
+ )
276
+
277
+ return state_dict
278
+
279
+
280
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
281
+ remainder = unpartitioned_numel % world_size
282
+ padding_numel = (world_size - remainder) if remainder else 0
283
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
284
+ return partitioned_numel, padding_numel
285
+
286
+
287
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
288
+ param_shapes,
289
+ fp32_flat_groups,
290
+ buffers):
291
+
292
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
293
+ # param, re-consolidating each param, while dealing with padding if any
294
+
295
+ avail_numel = fp32_flat_groups[0].numel() * world_size
296
+ # merge list of dicts, preserving order
297
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
298
+
299
+ if debug:
300
+ for i in range(world_size):
301
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
302
+
303
+ wanted_params = len(param_shapes)
304
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
305
+ # not asserting if there is a mismatch due to possible padding
306
+ print(f"Have {avail_numel} numels to process.")
307
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
308
+
309
+ state_dict = OrderedDict()
310
+
311
+ # buffers
312
+ state_dict.update(buffers)
313
+ if debug:
314
+ print(f"added {len(buffers)} buffers")
315
+
316
+ # params
317
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
318
+ # out-of-core computing solution
319
+ offset = 0
320
+ total_numel = 0
321
+ total_params = 0
322
+ for name, shape in param_shapes.items():
323
+
324
+ unpartitioned_numel = shape.numel()
325
+ total_numel += unpartitioned_numel
326
+ total_params += 1
327
+
328
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
329
+
330
+ if debug:
331
+ print(
332
+ f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
333
+ )
334
+
335
+ # XXX: memory usage doubles here
336
+ state_dict[name] = torch.cat(
337
+ tuple(fp32_flat_groups[i].narrow(0,
338
+ offset,
339
+ partitioned_numel)
340
+ for i in range(world_size)),
341
+ 0).narrow(0,
342
+ 0,
343
+ unpartitioned_numel).view(shape)
344
+ offset += partitioned_numel
345
+
346
+ offset *= world_size
347
+
348
+ # Sanity check
349
+ if offset != avail_numel:
350
+ raise ValueError(
351
+ f"consumed {offset} numels out of {avail_numel} - something is wrong")
352
+
353
+ print(
354
+ f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
355
+ )
356
+
357
+ return state_dict
358
+
359
+
360
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
361
+ """
362
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
363
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
364
+ via a model hub.
365
+
366
+ Args:
367
+ - ``checkpoint_dir``: path to the desired checkpoint folder
368
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
369
+
370
+ Returns:
371
+ - pytorch ``state_dict``
372
+
373
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
374
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
375
+ the checkpoint.
376
+
377
+ A typical usage might be ::
378
+
379
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
380
+ # do the training and checkpoint saving
381
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
382
+ model = model.cpu() # move to cpu
383
+ model.load_state_dict(state_dict)
384
+ # submit to model hub or save the model to share with others
385
+
386
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
387
+ application. i.e. you will need to re-initialize the deepspeed engine, since
388
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
389
+
390
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
391
+
392
+ """
393
+ if tag is None:
394
+ latest_path = os.path.join(checkpoint_dir, 'latest')
395
+ if os.path.isfile(latest_path):
396
+ with open(latest_path, 'r') as fd:
397
+ tag = fd.read().strip()
398
+ else:
399
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
400
+
401
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
402
+
403
+ if not os.path.isdir(ds_checkpoint_dir):
404
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
405
+
406
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
407
+
408
+
409
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
410
+ """
411
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
412
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
413
+
414
+ Args:
415
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
416
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
417
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
418
+ """
419
+
420
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
421
+ print(f"Saving fp32 state dict to {output_file}")
422
+ torch.save(state_dict, output_file)
423
+
424
+
425
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
426
+ """
427
+ 1. Put the provided model to cpu
428
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
429
+ 3. Load it into the provided model
430
+
431
+ Args:
432
+ - ``model``: the model object to update
433
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
434
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
435
+
436
+ Returns:
437
+ - ``model`: modified model
438
+
439
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
440
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
441
+ conveniently placed for you in the checkpoint folder.
442
+
443
+ A typical usage might be ::
444
+
445
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
446
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
447
+ # submit to model hub or save the model to share with others
448
+
449
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
450
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
451
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
452
+
453
+ """
454
+ logger.info(f"Extracting fp32 weights")
455
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
456
+
457
+ logger.info(f"Overwriting model with fp32 weights")
458
+ model = model.cpu()
459
+ model.load_state_dict(state_dict, strict=False)
460
+
461
+ return model
462
+
463
+
464
+ if __name__ == "__main__":
465
+
466
+ parser = argparse.ArgumentParser()
467
+ parser.add_argument(
468
+ "checkpoint_dir",
469
+ type=str,
470
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
471
+ parser.add_argument(
472
+ "output_file",
473
+ type=str,
474
+ help=
475
+ "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)"
476
+ )
477
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
478
+ args = parser.parse_args()
479
+
480
+ debug = args.debug
481
+
482
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
checkpoint-12000/config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "emilios/whisper-medium-el-n2",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50257
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 24,
19
+ "decoder_start_token_id": 50258,
20
+ "dropout": 0.1,
21
+ "encoder_attention_heads": 16,
22
+ "encoder_ffn_dim": 4096,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 24,
25
+ "eos_token_id": 50257,
26
+ "forced_decoder_ids": null,
27
+ "init_std": 0.02,
28
+ "is_encoder_decoder": true,
29
+ "max_length": 448,
30
+ "max_source_positions": 1500,
31
+ "max_target_positions": 448,
32
+ "model_type": "whisper",
33
+ "num_hidden_layers": 24,
34
+ "num_mel_bins": 80,
35
+ "pad_token_id": 50257,
36
+ "scale_embedding": false,
37
+ "torch_dtype": "float16",
38
+ "transformers_version": "4.26.0.dev0",
39
+ "use_cache": false,
40
+ "vocab_size": 51865
41
+ }
checkpoint-12000/global_step12000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b1921338329bb83611c120057c3264ab1bec41f7243849ddbd999cc08e4388f
3
+ size 1527967899
checkpoint-12000/global_step12000/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25d3e4edd40f51bf3e4552546763afb7a68abde6020caca458418d54cace498d
3
+ size 9166378846
checkpoint-12000/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step12000
checkpoint-12000/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-12000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da3cc32424000ff954bf49215879ed2e1a0d4eaab55388c0687d6ddcca9269e4
3
+ size 1527847357
checkpoint-12000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c544d588f957f253e15a4fe9c58988611a6059c716db44b3095342db8e857f6
3
+ size 14575
checkpoint-12000/trainer_state.json ADDED
@@ -0,0 +1,3004 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 9.778974739970282,
3
+ "best_model_checkpoint": "./checkpoint-9000",
4
+ "epoch": 705.7647058823529,
5
+ "global_step": 12000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 2.78,
12
+ "learning_rate": 5.0453611334320685e-06,
13
+ "loss": 0.6804,
14
+ "step": 25
15
+ },
16
+ {
17
+ "epoch": 5.56,
18
+ "learning_rate": 6.229195710491767e-06,
19
+ "loss": 0.1847,
20
+ "step": 50
21
+ },
22
+ {
23
+ "epoch": 8.33,
24
+ "learning_rate": 6.903829450223392e-06,
25
+ "loss": 0.0821,
26
+ "step": 75
27
+ },
28
+ {
29
+ "epoch": 11.11,
30
+ "learning_rate": 7.377725845391017e-06,
31
+ "loss": 0.0485,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 13.89,
36
+ "learning_rate": 7.743343231239583e-06,
37
+ "loss": 0.0432,
38
+ "step": 125
39
+ },
40
+ {
41
+ "epoch": 16.67,
42
+ "learning_rate": 8.041073861170494e-06,
43
+ "loss": 0.0328,
44
+ "step": 150
45
+ },
46
+ {
47
+ "epoch": 19.44,
48
+ "learning_rate": 8.292222957399574e-06,
49
+ "loss": 0.0291,
50
+ "step": 175
51
+ },
52
+ {
53
+ "epoch": 22.22,
54
+ "learning_rate": 8.509413541357755e-06,
55
+ "loss": 0.0298,
56
+ "step": 200
57
+ },
58
+ {
59
+ "epoch": 25.0,
60
+ "learning_rate": 8.700744577655557e-06,
61
+ "loss": 0.0269,
62
+ "step": 225
63
+ },
64
+ {
65
+ "epoch": 27.78,
66
+ "learning_rate": 8.871723942761204e-06,
67
+ "loss": 0.0272,
68
+ "step": 250
69
+ },
70
+ {
71
+ "epoch": 30.56,
72
+ "learning_rate": 9.026267958246849e-06,
73
+ "loss": 0.027,
74
+ "step": 275
75
+ },
76
+ {
77
+ "epoch": 33.33,
78
+ "learning_rate": 9.16726106663399e-06,
79
+ "loss": 0.0213,
80
+ "step": 300
81
+ },
82
+ {
83
+ "epoch": 36.11,
84
+ "learning_rate": 9.296889251455016e-06,
85
+ "loss": 0.0215,
86
+ "step": 325
87
+ },
88
+ {
89
+ "epoch": 38.89,
90
+ "learning_rate": 9.416848797368692e-06,
91
+ "loss": 0.0195,
92
+ "step": 350
93
+ },
94
+ {
95
+ "epoch": 41.67,
96
+ "learning_rate": 9.528482449516371e-06,
97
+ "loss": 0.0167,
98
+ "step": 375
99
+ },
100
+ {
101
+ "epoch": 44.44,
102
+ "learning_rate": 9.632871309784314e-06,
103
+ "loss": 0.0184,
104
+ "step": 400
105
+ },
106
+ {
107
+ "epoch": 47.22,
108
+ "learning_rate": 9.73089868785391e-06,
109
+ "loss": 0.0159,
110
+ "step": 425
111
+ },
112
+ {
113
+ "epoch": 50.0,
114
+ "learning_rate": 9.823295589572114e-06,
115
+ "loss": 0.0172,
116
+ "step": 450
117
+ },
118
+ {
119
+ "epoch": 52.78,
120
+ "learning_rate": 9.910673836465484e-06,
121
+ "loss": 0.0123,
122
+ "step": 475
123
+ },
124
+ {
125
+ "epoch": 55.56,
126
+ "learning_rate": 9.993550644973805e-06,
127
+ "loss": 0.0144,
128
+ "step": 500
129
+ },
130
+ {
131
+ "epoch": 58.33,
132
+ "learning_rate": 9.951111111111111e-06,
133
+ "loss": 0.0135,
134
+ "step": 525
135
+ },
136
+ {
137
+ "epoch": 61.11,
138
+ "learning_rate": 9.895555555555557e-06,
139
+ "loss": 0.0128,
140
+ "step": 550
141
+ },
142
+ {
143
+ "epoch": 63.89,
144
+ "learning_rate": 9.84e-06,
145
+ "loss": 0.0115,
146
+ "step": 575
147
+ },
148
+ {
149
+ "epoch": 66.67,
150
+ "learning_rate": 9.784444444444445e-06,
151
+ "loss": 0.0105,
152
+ "step": 600
153
+ },
154
+ {
155
+ "epoch": 69.44,
156
+ "learning_rate": 9.72888888888889e-06,
157
+ "loss": 0.0104,
158
+ "step": 625
159
+ },
160
+ {
161
+ "epoch": 72.22,
162
+ "learning_rate": 9.673333333333334e-06,
163
+ "loss": 0.0087,
164
+ "step": 650
165
+ },
166
+ {
167
+ "epoch": 75.0,
168
+ "learning_rate": 9.617777777777778e-06,
169
+ "loss": 0.0091,
170
+ "step": 675
171
+ },
172
+ {
173
+ "epoch": 77.78,
174
+ "learning_rate": 9.562222222222223e-06,
175
+ "loss": 0.0085,
176
+ "step": 700
177
+ },
178
+ {
179
+ "epoch": 80.56,
180
+ "learning_rate": 9.506666666666667e-06,
181
+ "loss": 0.011,
182
+ "step": 725
183
+ },
184
+ {
185
+ "epoch": 83.33,
186
+ "learning_rate": 9.451111111111112e-06,
187
+ "loss": 0.0117,
188
+ "step": 750
189
+ },
190
+ {
191
+ "epoch": 86.11,
192
+ "learning_rate": 9.395555555555556e-06,
193
+ "loss": 0.0088,
194
+ "step": 775
195
+ },
196
+ {
197
+ "epoch": 88.89,
198
+ "learning_rate": 9.340000000000002e-06,
199
+ "loss": 0.0077,
200
+ "step": 800
201
+ },
202
+ {
203
+ "epoch": 91.67,
204
+ "learning_rate": 9.284444444444444e-06,
205
+ "loss": 0.0091,
206
+ "step": 825
207
+ },
208
+ {
209
+ "epoch": 94.44,
210
+ "learning_rate": 9.22888888888889e-06,
211
+ "loss": 0.0067,
212
+ "step": 850
213
+ },
214
+ {
215
+ "epoch": 97.22,
216
+ "learning_rate": 9.173333333333334e-06,
217
+ "loss": 0.0082,
218
+ "step": 875
219
+ },
220
+ {
221
+ "epoch": 100.0,
222
+ "learning_rate": 9.117777777777778e-06,
223
+ "loss": 0.0055,
224
+ "step": 900
225
+ },
226
+ {
227
+ "epoch": 102.78,
228
+ "learning_rate": 9.062222222222224e-06,
229
+ "loss": 0.0077,
230
+ "step": 925
231
+ },
232
+ {
233
+ "epoch": 105.56,
234
+ "learning_rate": 9.006666666666666e-06,
235
+ "loss": 0.0055,
236
+ "step": 950
237
+ },
238
+ {
239
+ "epoch": 108.33,
240
+ "learning_rate": 8.951111111111112e-06,
241
+ "loss": 0.005,
242
+ "step": 975
243
+ },
244
+ {
245
+ "epoch": 111.11,
246
+ "learning_rate": 8.895555555555556e-06,
247
+ "loss": 0.0066,
248
+ "step": 1000
249
+ },
250
+ {
251
+ "epoch": 111.11,
252
+ "eval_loss": 0.2357177734375,
253
+ "eval_runtime": 64.7785,
254
+ "eval_samples_per_second": 2.022,
255
+ "eval_steps_per_second": 0.139,
256
+ "eval_wer": 23.044096728307252,
257
+ "step": 1000
258
+ },
259
+ {
260
+ "epoch": 113.89,
261
+ "learning_rate": 8.844444444444445e-06,
262
+ "loss": 0.0057,
263
+ "step": 1025
264
+ },
265
+ {
266
+ "epoch": 116.67,
267
+ "learning_rate": 8.788888888888891e-06,
268
+ "loss": 0.0096,
269
+ "step": 1050
270
+ },
271
+ {
272
+ "epoch": 119.44,
273
+ "learning_rate": 8.733333333333333e-06,
274
+ "loss": 0.0063,
275
+ "step": 1075
276
+ },
277
+ {
278
+ "epoch": 122.22,
279
+ "learning_rate": 8.677777777777779e-06,
280
+ "loss": 0.0069,
281
+ "step": 1100
282
+ },
283
+ {
284
+ "epoch": 125.0,
285
+ "learning_rate": 8.622222222222223e-06,
286
+ "loss": 0.0069,
287
+ "step": 1125
288
+ },
289
+ {
290
+ "epoch": 127.78,
291
+ "learning_rate": 8.566666666666667e-06,
292
+ "loss": 0.0046,
293
+ "step": 1150
294
+ },
295
+ {
296
+ "epoch": 130.56,
297
+ "learning_rate": 8.511111111111113e-06,
298
+ "loss": 0.0051,
299
+ "step": 1175
300
+ },
301
+ {
302
+ "epoch": 133.33,
303
+ "learning_rate": 8.455555555555555e-06,
304
+ "loss": 0.0055,
305
+ "step": 1200
306
+ },
307
+ {
308
+ "epoch": 136.11,
309
+ "learning_rate": 8.400000000000001e-06,
310
+ "loss": 0.0042,
311
+ "step": 1225
312
+ },
313
+ {
314
+ "epoch": 138.89,
315
+ "learning_rate": 8.344444444444445e-06,
316
+ "loss": 0.0042,
317
+ "step": 1250
318
+ },
319
+ {
320
+ "epoch": 141.67,
321
+ "learning_rate": 8.288888888888889e-06,
322
+ "loss": 0.005,
323
+ "step": 1275
324
+ },
325
+ {
326
+ "epoch": 144.44,
327
+ "learning_rate": 8.233333333333335e-06,
328
+ "loss": 0.0054,
329
+ "step": 1300
330
+ },
331
+ {
332
+ "epoch": 147.22,
333
+ "learning_rate": 8.177777777777779e-06,
334
+ "loss": 0.0052,
335
+ "step": 1325
336
+ },
337
+ {
338
+ "epoch": 150.0,
339
+ "learning_rate": 8.122222222222223e-06,
340
+ "loss": 0.0057,
341
+ "step": 1350
342
+ },
343
+ {
344
+ "epoch": 152.78,
345
+ "learning_rate": 8.066666666666667e-06,
346
+ "loss": 0.0039,
347
+ "step": 1375
348
+ },
349
+ {
350
+ "epoch": 155.56,
351
+ "learning_rate": 8.011111111111113e-06,
352
+ "loss": 0.0032,
353
+ "step": 1400
354
+ },
355
+ {
356
+ "epoch": 158.33,
357
+ "learning_rate": 7.955555555555557e-06,
358
+ "loss": 0.0034,
359
+ "step": 1425
360
+ },
361
+ {
362
+ "epoch": 161.11,
363
+ "learning_rate": 7.902222222222223e-06,
364
+ "loss": 0.0068,
365
+ "step": 1450
366
+ },
367
+ {
368
+ "epoch": 163.89,
369
+ "learning_rate": 7.846666666666667e-06,
370
+ "loss": 0.0034,
371
+ "step": 1475
372
+ },
373
+ {
374
+ "epoch": 166.67,
375
+ "learning_rate": 7.791111111111111e-06,
376
+ "loss": 0.0026,
377
+ "step": 1500
378
+ },
379
+ {
380
+ "epoch": 169.44,
381
+ "learning_rate": 7.735555555555557e-06,
382
+ "loss": 0.0036,
383
+ "step": 1525
384
+ },
385
+ {
386
+ "epoch": 172.22,
387
+ "learning_rate": 7.680000000000001e-06,
388
+ "loss": 0.0033,
389
+ "step": 1550
390
+ },
391
+ {
392
+ "epoch": 175.0,
393
+ "learning_rate": 7.624444444444445e-06,
394
+ "loss": 0.0021,
395
+ "step": 1575
396
+ },
397
+ {
398
+ "epoch": 177.78,
399
+ "learning_rate": 7.56888888888889e-06,
400
+ "loss": 0.0033,
401
+ "step": 1600
402
+ },
403
+ {
404
+ "epoch": 180.56,
405
+ "learning_rate": 7.513333333333334e-06,
406
+ "loss": 0.0037,
407
+ "step": 1625
408
+ },
409
+ {
410
+ "epoch": 183.33,
411
+ "learning_rate": 7.457777777777778e-06,
412
+ "loss": 0.0032,
413
+ "step": 1650
414
+ },
415
+ {
416
+ "epoch": 186.11,
417
+ "learning_rate": 7.402222222222223e-06,
418
+ "loss": 0.0037,
419
+ "step": 1675
420
+ },
421
+ {
422
+ "epoch": 188.89,
423
+ "learning_rate": 7.346666666666668e-06,
424
+ "loss": 0.0022,
425
+ "step": 1700
426
+ },
427
+ {
428
+ "epoch": 191.67,
429
+ "learning_rate": 7.291111111111112e-06,
430
+ "loss": 0.0024,
431
+ "step": 1725
432
+ },
433
+ {
434
+ "epoch": 194.44,
435
+ "learning_rate": 7.235555555555556e-06,
436
+ "loss": 0.0026,
437
+ "step": 1750
438
+ },
439
+ {
440
+ "epoch": 197.22,
441
+ "learning_rate": 7.180000000000001e-06,
442
+ "loss": 0.0022,
443
+ "step": 1775
444
+ },
445
+ {
446
+ "epoch": 200.0,
447
+ "learning_rate": 7.124444444444445e-06,
448
+ "loss": 0.0026,
449
+ "step": 1800
450
+ },
451
+ {
452
+ "epoch": 202.78,
453
+ "learning_rate": 7.06888888888889e-06,
454
+ "loss": 0.0032,
455
+ "step": 1825
456
+ },
457
+ {
458
+ "epoch": 205.56,
459
+ "learning_rate": 7.0133333333333345e-06,
460
+ "loss": 0.0033,
461
+ "step": 1850
462
+ },
463
+ {
464
+ "epoch": 208.33,
465
+ "learning_rate": 6.9577777777777785e-06,
466
+ "loss": 0.0027,
467
+ "step": 1875
468
+ },
469
+ {
470
+ "epoch": 211.11,
471
+ "learning_rate": 6.902222222222223e-06,
472
+ "loss": 0.0043,
473
+ "step": 1900
474
+ },
475
+ {
476
+ "epoch": 213.89,
477
+ "learning_rate": 6.846666666666667e-06,
478
+ "loss": 0.0028,
479
+ "step": 1925
480
+ },
481
+ {
482
+ "epoch": 216.67,
483
+ "learning_rate": 6.7911111111111115e-06,
484
+ "loss": 0.0012,
485
+ "step": 1950
486
+ },
487
+ {
488
+ "epoch": 219.44,
489
+ "learning_rate": 6.735555555555556e-06,
490
+ "loss": 0.0015,
491
+ "step": 1975
492
+ },
493
+ {
494
+ "epoch": 222.22,
495
+ "learning_rate": 6.680000000000001e-06,
496
+ "loss": 0.0024,
497
+ "step": 2000
498
+ },
499
+ {
500
+ "epoch": 222.22,
501
+ "eval_loss": 0.2607421875,
502
+ "eval_runtime": 57.0802,
503
+ "eval_samples_per_second": 2.295,
504
+ "eval_steps_per_second": 0.158,
505
+ "eval_wer": 19.665718349928877,
506
+ "step": 2000
507
+ },
508
+ {
509
+ "epoch": 225.0,
510
+ "learning_rate": 6.6244444444444445e-06,
511
+ "loss": 0.0029,
512
+ "step": 2025
513
+ },
514
+ {
515
+ "epoch": 227.78,
516
+ "learning_rate": 6.568888888888889e-06,
517
+ "loss": 0.0021,
518
+ "step": 2050
519
+ },
520
+ {
521
+ "epoch": 230.56,
522
+ "learning_rate": 6.513333333333333e-06,
523
+ "loss": 0.0022,
524
+ "step": 2075
525
+ },
526
+ {
527
+ "epoch": 233.33,
528
+ "learning_rate": 6.457777777777778e-06,
529
+ "loss": 0.0022,
530
+ "step": 2100
531
+ },
532
+ {
533
+ "epoch": 236.11,
534
+ "learning_rate": 6.402222222222223e-06,
535
+ "loss": 0.0011,
536
+ "step": 2125
537
+ },
538
+ {
539
+ "epoch": 238.89,
540
+ "learning_rate": 6.346666666666668e-06,
541
+ "loss": 0.0026,
542
+ "step": 2150
543
+ },
544
+ {
545
+ "epoch": 241.67,
546
+ "learning_rate": 6.291111111111111e-06,
547
+ "loss": 0.0021,
548
+ "step": 2175
549
+ },
550
+ {
551
+ "epoch": 244.44,
552
+ "learning_rate": 6.235555555555556e-06,
553
+ "loss": 0.0016,
554
+ "step": 2200
555
+ },
556
+ {
557
+ "epoch": 247.22,
558
+ "learning_rate": 6.18e-06,
559
+ "loss": 0.0024,
560
+ "step": 2225
561
+ },
562
+ {
563
+ "epoch": 250.0,
564
+ "learning_rate": 6.124444444444445e-06,
565
+ "loss": 0.0046,
566
+ "step": 2250
567
+ },
568
+ {
569
+ "epoch": 252.78,
570
+ "learning_rate": 6.06888888888889e-06,
571
+ "loss": 0.0018,
572
+ "step": 2275
573
+ },
574
+ {
575
+ "epoch": 255.56,
576
+ "learning_rate": 6.013333333333335e-06,
577
+ "loss": 0.0012,
578
+ "step": 2300
579
+ },
580
+ {
581
+ "epoch": 258.33,
582
+ "learning_rate": 5.957777777777778e-06,
583
+ "loss": 0.0014,
584
+ "step": 2325
585
+ },
586
+ {
587
+ "epoch": 261.11,
588
+ "learning_rate": 5.902222222222223e-06,
589
+ "loss": 0.0007,
590
+ "step": 2350
591
+ },
592
+ {
593
+ "epoch": 263.89,
594
+ "learning_rate": 5.846666666666667e-06,
595
+ "loss": 0.0014,
596
+ "step": 2375
597
+ },
598
+ {
599
+ "epoch": 266.67,
600
+ "learning_rate": 5.791111111111112e-06,
601
+ "loss": 0.0009,
602
+ "step": 2400
603
+ },
604
+ {
605
+ "epoch": 269.44,
606
+ "learning_rate": 5.735555555555557e-06,
607
+ "loss": 0.0008,
608
+ "step": 2425
609
+ },
610
+ {
611
+ "epoch": 272.22,
612
+ "learning_rate": 5.68e-06,
613
+ "loss": 0.0028,
614
+ "step": 2450
615
+ },
616
+ {
617
+ "epoch": 275.0,
618
+ "learning_rate": 5.624444444444445e-06,
619
+ "loss": 0.002,
620
+ "step": 2475
621
+ },
622
+ {
623
+ "epoch": 277.78,
624
+ "learning_rate": 5.56888888888889e-06,
625
+ "loss": 0.0011,
626
+ "step": 2500
627
+ },
628
+ {
629
+ "epoch": 280.56,
630
+ "learning_rate": 5.513333333333334e-06,
631
+ "loss": 0.001,
632
+ "step": 2525
633
+ },
634
+ {
635
+ "epoch": 283.33,
636
+ "learning_rate": 5.4577777777777785e-06,
637
+ "loss": 0.0007,
638
+ "step": 2550
639
+ },
640
+ {
641
+ "epoch": 286.11,
642
+ "learning_rate": 5.402222222222223e-06,
643
+ "loss": 0.0007,
644
+ "step": 2575
645
+ },
646
+ {
647
+ "epoch": 288.89,
648
+ "learning_rate": 5.346666666666667e-06,
649
+ "loss": 0.0008,
650
+ "step": 2600
651
+ },
652
+ {
653
+ "epoch": 291.67,
654
+ "learning_rate": 5.2911111111111115e-06,
655
+ "loss": 0.0012,
656
+ "step": 2625
657
+ },
658
+ {
659
+ "epoch": 294.44,
660
+ "learning_rate": 5.235555555555556e-06,
661
+ "loss": 0.0016,
662
+ "step": 2650
663
+ },
664
+ {
665
+ "epoch": 297.22,
666
+ "learning_rate": 5.18e-06,
667
+ "loss": 0.0012,
668
+ "step": 2675
669
+ },
670
+ {
671
+ "epoch": 300.0,
672
+ "learning_rate": 5.124444444444445e-06,
673
+ "loss": 0.001,
674
+ "step": 2700
675
+ },
676
+ {
677
+ "epoch": 302.78,
678
+ "learning_rate": 5.06888888888889e-06,
679
+ "loss": 0.0012,
680
+ "step": 2725
681
+ },
682
+ {
683
+ "epoch": 305.56,
684
+ "learning_rate": 5.013333333333333e-06,
685
+ "loss": 0.001,
686
+ "step": 2750
687
+ },
688
+ {
689
+ "epoch": 308.33,
690
+ "learning_rate": 4.957777777777778e-06,
691
+ "loss": 0.0013,
692
+ "step": 2775
693
+ },
694
+ {
695
+ "epoch": 311.11,
696
+ "learning_rate": 4.902222222222222e-06,
697
+ "loss": 0.0015,
698
+ "step": 2800
699
+ },
700
+ {
701
+ "epoch": 313.89,
702
+ "learning_rate": 4.846666666666667e-06,
703
+ "loss": 0.0014,
704
+ "step": 2825
705
+ },
706
+ {
707
+ "epoch": 316.67,
708
+ "learning_rate": 4.791111111111111e-06,
709
+ "loss": 0.0007,
710
+ "step": 2850
711
+ },
712
+ {
713
+ "epoch": 319.44,
714
+ "learning_rate": 4.735555555555556e-06,
715
+ "loss": 0.0009,
716
+ "step": 2875
717
+ },
718
+ {
719
+ "epoch": 322.22,
720
+ "learning_rate": 4.680000000000001e-06,
721
+ "loss": 0.0021,
722
+ "step": 2900
723
+ },
724
+ {
725
+ "epoch": 325.0,
726
+ "learning_rate": 4.624444444444445e-06,
727
+ "loss": 0.0015,
728
+ "step": 2925
729
+ },
730
+ {
731
+ "epoch": 327.78,
732
+ "learning_rate": 4.568888888888889e-06,
733
+ "loss": 0.0012,
734
+ "step": 2950
735
+ },
736
+ {
737
+ "epoch": 330.56,
738
+ "learning_rate": 4.513333333333333e-06,
739
+ "loss": 0.0009,
740
+ "step": 2975
741
+ },
742
+ {
743
+ "epoch": 333.33,
744
+ "learning_rate": 4.457777777777778e-06,
745
+ "loss": 0.0011,
746
+ "step": 3000
747
+ },
748
+ {
749
+ "epoch": 333.33,
750
+ "eval_loss": 0.277099609375,
751
+ "eval_runtime": 58.1634,
752
+ "eval_samples_per_second": 2.252,
753
+ "eval_steps_per_second": 0.155,
754
+ "eval_wer": 20.874822190611663,
755
+ "step": 3000
756
+ },
757
+ {
758
+ "epoch": 177.47,
759
+ "learning_rate": 1.760888888888889e-06,
760
+ "loss": 0.5801,
761
+ "step": 3025
762
+ },
763
+ {
764
+ "epoch": 178.94,
765
+ "learning_rate": 1.7386666666666666e-06,
766
+ "loss": 0.1501,
767
+ "step": 3050
768
+ },
769
+ {
770
+ "epoch": 180.41,
771
+ "learning_rate": 1.7164444444444444e-06,
772
+ "loss": 0.0789,
773
+ "step": 3075
774
+ },
775
+ {
776
+ "epoch": 181.88,
777
+ "learning_rate": 1.6942222222222222e-06,
778
+ "loss": 0.0531,
779
+ "step": 3100
780
+ },
781
+ {
782
+ "epoch": 183.35,
783
+ "learning_rate": 1.6719999999999998e-06,
784
+ "loss": 0.0409,
785
+ "step": 3125
786
+ },
787
+ {
788
+ "epoch": 184.82,
789
+ "learning_rate": 1.6497777777777777e-06,
790
+ "loss": 0.032,
791
+ "step": 3150
792
+ },
793
+ {
794
+ "epoch": 186.29,
795
+ "learning_rate": 1.6275555555555555e-06,
796
+ "loss": 0.0251,
797
+ "step": 3175
798
+ },
799
+ {
800
+ "epoch": 187.76,
801
+ "learning_rate": 1.6053333333333333e-06,
802
+ "loss": 0.0203,
803
+ "step": 3200
804
+ },
805
+ {
806
+ "epoch": 189.24,
807
+ "learning_rate": 1.5831111111111111e-06,
808
+ "loss": 0.0167,
809
+ "step": 3225
810
+ },
811
+ {
812
+ "epoch": 190.71,
813
+ "learning_rate": 1.560888888888889e-06,
814
+ "loss": 0.0159,
815
+ "step": 3250
816
+ },
817
+ {
818
+ "epoch": 192.18,
819
+ "learning_rate": 1.5386666666666666e-06,
820
+ "loss": 0.0137,
821
+ "step": 3275
822
+ },
823
+ {
824
+ "epoch": 193.65,
825
+ "learning_rate": 1.5164444444444444e-06,
826
+ "loss": 0.0122,
827
+ "step": 3300
828
+ },
829
+ {
830
+ "epoch": 195.12,
831
+ "learning_rate": 1.494222222222222e-06,
832
+ "loss": 0.0106,
833
+ "step": 3325
834
+ },
835
+ {
836
+ "epoch": 196.59,
837
+ "learning_rate": 1.4719999999999998e-06,
838
+ "loss": 0.0094,
839
+ "step": 3350
840
+ },
841
+ {
842
+ "epoch": 198.06,
843
+ "learning_rate": 1.4497777777777777e-06,
844
+ "loss": 0.009,
845
+ "step": 3375
846
+ },
847
+ {
848
+ "epoch": 199.53,
849
+ "learning_rate": 1.4275555555555555e-06,
850
+ "loss": 0.0104,
851
+ "step": 3400
852
+ },
853
+ {
854
+ "epoch": 201.0,
855
+ "learning_rate": 1.4053333333333333e-06,
856
+ "loss": 0.0069,
857
+ "step": 3425
858
+ },
859
+ {
860
+ "epoch": 202.47,
861
+ "learning_rate": 1.3848888888888889e-06,
862
+ "loss": 0.0073,
863
+ "step": 3450
864
+ },
865
+ {
866
+ "epoch": 203.94,
867
+ "learning_rate": 1.3626666666666667e-06,
868
+ "loss": 0.0073,
869
+ "step": 3475
870
+ },
871
+ {
872
+ "epoch": 205.41,
873
+ "learning_rate": 1.3404444444444445e-06,
874
+ "loss": 0.0063,
875
+ "step": 3500
876
+ },
877
+ {
878
+ "epoch": 206.88,
879
+ "learning_rate": 1.3182222222222221e-06,
880
+ "loss": 0.007,
881
+ "step": 3525
882
+ },
883
+ {
884
+ "epoch": 208.35,
885
+ "learning_rate": 1.296e-06,
886
+ "loss": 0.0061,
887
+ "step": 3550
888
+ },
889
+ {
890
+ "epoch": 209.82,
891
+ "learning_rate": 1.2737777777777776e-06,
892
+ "loss": 0.0053,
893
+ "step": 3575
894
+ },
895
+ {
896
+ "epoch": 211.29,
897
+ "learning_rate": 1.2515555555555554e-06,
898
+ "loss": 0.0056,
899
+ "step": 3600
900
+ },
901
+ {
902
+ "epoch": 212.76,
903
+ "learning_rate": 1.2293333333333334e-06,
904
+ "loss": 0.005,
905
+ "step": 3625
906
+ },
907
+ {
908
+ "epoch": 214.24,
909
+ "learning_rate": 1.207111111111111e-06,
910
+ "loss": 0.0047,
911
+ "step": 3650
912
+ },
913
+ {
914
+ "epoch": 215.71,
915
+ "learning_rate": 1.1848888888888889e-06,
916
+ "loss": 0.0052,
917
+ "step": 3675
918
+ },
919
+ {
920
+ "epoch": 217.18,
921
+ "learning_rate": 1.1626666666666667e-06,
922
+ "loss": 0.0044,
923
+ "step": 3700
924
+ },
925
+ {
926
+ "epoch": 218.65,
927
+ "learning_rate": 1.1404444444444443e-06,
928
+ "loss": 0.0046,
929
+ "step": 3725
930
+ },
931
+ {
932
+ "epoch": 220.12,
933
+ "learning_rate": 1.1182222222222221e-06,
934
+ "loss": 0.0045,
935
+ "step": 3750
936
+ },
937
+ {
938
+ "epoch": 221.59,
939
+ "learning_rate": 1.096e-06,
940
+ "loss": 0.0041,
941
+ "step": 3775
942
+ },
943
+ {
944
+ "epoch": 223.06,
945
+ "learning_rate": 1.0737777777777776e-06,
946
+ "loss": 0.0054,
947
+ "step": 3800
948
+ },
949
+ {
950
+ "epoch": 224.53,
951
+ "learning_rate": 1.0515555555555556e-06,
952
+ "loss": 0.0038,
953
+ "step": 3825
954
+ },
955
+ {
956
+ "epoch": 226.0,
957
+ "learning_rate": 1.0293333333333334e-06,
958
+ "loss": 0.0038,
959
+ "step": 3850
960
+ },
961
+ {
962
+ "epoch": 227.47,
963
+ "learning_rate": 1.007111111111111e-06,
964
+ "loss": 0.004,
965
+ "step": 3875
966
+ },
967
+ {
968
+ "epoch": 228.94,
969
+ "learning_rate": 9.848888888888889e-07,
970
+ "loss": 0.0036,
971
+ "step": 3900
972
+ },
973
+ {
974
+ "epoch": 230.41,
975
+ "learning_rate": 9.626666666666667e-07,
976
+ "loss": 0.0041,
977
+ "step": 3925
978
+ },
979
+ {
980
+ "epoch": 231.88,
981
+ "learning_rate": 9.404444444444443e-07,
982
+ "loss": 0.0032,
983
+ "step": 3950
984
+ },
985
+ {
986
+ "epoch": 233.35,
987
+ "learning_rate": 9.182222222222223e-07,
988
+ "loss": 0.0038,
989
+ "step": 3975
990
+ },
991
+ {
992
+ "epoch": 234.82,
993
+ "learning_rate": 8.96e-07,
994
+ "loss": 0.0043,
995
+ "step": 4000
996
+ },
997
+ {
998
+ "epoch": 234.82,
999
+ "eval_loss": 0.45361328125,
1000
+ "eval_runtime": 157.593,
1001
+ "eval_samples_per_second": 1.726,
1002
+ "eval_steps_per_second": 0.108,
1003
+ "eval_wer": 10.707652303120357,
1004
+ "step": 4000
1005
+ },
1006
+ {
1007
+ "epoch": 236.29,
1008
+ "learning_rate": 8.737777777777777e-07,
1009
+ "loss": 0.004,
1010
+ "step": 4025
1011
+ },
1012
+ {
1013
+ "epoch": 237.76,
1014
+ "learning_rate": 8.515555555555555e-07,
1015
+ "loss": 0.0029,
1016
+ "step": 4050
1017
+ },
1018
+ {
1019
+ "epoch": 239.24,
1020
+ "learning_rate": 8.293333333333333e-07,
1021
+ "loss": 0.0034,
1022
+ "step": 4075
1023
+ },
1024
+ {
1025
+ "epoch": 240.71,
1026
+ "learning_rate": 8.071111111111111e-07,
1027
+ "loss": 0.0032,
1028
+ "step": 4100
1029
+ },
1030
+ {
1031
+ "epoch": 242.18,
1032
+ "learning_rate": 7.848888888888888e-07,
1033
+ "loss": 0.003,
1034
+ "step": 4125
1035
+ },
1036
+ {
1037
+ "epoch": 243.65,
1038
+ "learning_rate": 7.626666666666667e-07,
1039
+ "loss": 0.0034,
1040
+ "step": 4150
1041
+ },
1042
+ {
1043
+ "epoch": 245.12,
1044
+ "learning_rate": 7.404444444444444e-07,
1045
+ "loss": 0.0032,
1046
+ "step": 4175
1047
+ },
1048
+ {
1049
+ "epoch": 246.59,
1050
+ "learning_rate": 7.182222222222222e-07,
1051
+ "loss": 0.0032,
1052
+ "step": 4200
1053
+ },
1054
+ {
1055
+ "epoch": 248.06,
1056
+ "learning_rate": 6.959999999999999e-07,
1057
+ "loss": 0.0028,
1058
+ "step": 4225
1059
+ },
1060
+ {
1061
+ "epoch": 249.53,
1062
+ "learning_rate": 6.737777777777778e-07,
1063
+ "loss": 0.0028,
1064
+ "step": 4250
1065
+ },
1066
+ {
1067
+ "epoch": 251.0,
1068
+ "learning_rate": 6.515555555555555e-07,
1069
+ "loss": 0.0025,
1070
+ "step": 4275
1071
+ },
1072
+ {
1073
+ "epoch": 252.47,
1074
+ "learning_rate": 6.293333333333333e-07,
1075
+ "loss": 0.0026,
1076
+ "step": 4300
1077
+ },
1078
+ {
1079
+ "epoch": 253.94,
1080
+ "learning_rate": 6.071111111111111e-07,
1081
+ "loss": 0.003,
1082
+ "step": 4325
1083
+ },
1084
+ {
1085
+ "epoch": 255.41,
1086
+ "learning_rate": 5.848888888888889e-07,
1087
+ "loss": 0.0026,
1088
+ "step": 4350
1089
+ },
1090
+ {
1091
+ "epoch": 256.88,
1092
+ "learning_rate": 5.626666666666666e-07,
1093
+ "loss": 0.0027,
1094
+ "step": 4375
1095
+ },
1096
+ {
1097
+ "epoch": 258.35,
1098
+ "learning_rate": 5.404444444444443e-07,
1099
+ "loss": 0.003,
1100
+ "step": 4400
1101
+ },
1102
+ {
1103
+ "epoch": 259.82,
1104
+ "learning_rate": 5.182222222222223e-07,
1105
+ "loss": 0.0027,
1106
+ "step": 4425
1107
+ },
1108
+ {
1109
+ "epoch": 261.29,
1110
+ "learning_rate": 4.977777777777777e-07,
1111
+ "loss": 0.0026,
1112
+ "step": 4450
1113
+ },
1114
+ {
1115
+ "epoch": 262.76,
1116
+ "learning_rate": 4.7555555555555554e-07,
1117
+ "loss": 0.0023,
1118
+ "step": 4475
1119
+ },
1120
+ {
1121
+ "epoch": 264.24,
1122
+ "learning_rate": 4.5333333333333326e-07,
1123
+ "loss": 0.0021,
1124
+ "step": 4500
1125
+ },
1126
+ {
1127
+ "epoch": 265.71,
1128
+ "learning_rate": 4.311111111111111e-07,
1129
+ "loss": 0.0022,
1130
+ "step": 4525
1131
+ },
1132
+ {
1133
+ "epoch": 267.18,
1134
+ "learning_rate": 4.088888888888889e-07,
1135
+ "loss": 0.0034,
1136
+ "step": 4550
1137
+ },
1138
+ {
1139
+ "epoch": 268.65,
1140
+ "learning_rate": 3.8666666666666664e-07,
1141
+ "loss": 0.0023,
1142
+ "step": 4575
1143
+ },
1144
+ {
1145
+ "epoch": 270.12,
1146
+ "learning_rate": 3.6444444444444446e-07,
1147
+ "loss": 0.0022,
1148
+ "step": 4600
1149
+ },
1150
+ {
1151
+ "epoch": 271.59,
1152
+ "learning_rate": 3.422222222222222e-07,
1153
+ "loss": 0.0022,
1154
+ "step": 4625
1155
+ },
1156
+ {
1157
+ "epoch": 273.06,
1158
+ "learning_rate": 3.2e-07,
1159
+ "loss": 0.0024,
1160
+ "step": 4650
1161
+ },
1162
+ {
1163
+ "epoch": 274.53,
1164
+ "learning_rate": 2.9777777777777773e-07,
1165
+ "loss": 0.0031,
1166
+ "step": 4675
1167
+ },
1168
+ {
1169
+ "epoch": 276.0,
1170
+ "learning_rate": 2.7555555555555555e-07,
1171
+ "loss": 0.0022,
1172
+ "step": 4700
1173
+ },
1174
+ {
1175
+ "epoch": 277.47,
1176
+ "learning_rate": 2.533333333333333e-07,
1177
+ "loss": 0.0022,
1178
+ "step": 4725
1179
+ },
1180
+ {
1181
+ "epoch": 278.94,
1182
+ "learning_rate": 2.311111111111111e-07,
1183
+ "loss": 0.0021,
1184
+ "step": 4750
1185
+ },
1186
+ {
1187
+ "epoch": 280.41,
1188
+ "learning_rate": 2.088888888888889e-07,
1189
+ "loss": 0.0023,
1190
+ "step": 4775
1191
+ },
1192
+ {
1193
+ "epoch": 281.88,
1194
+ "learning_rate": 1.8666666666666667e-07,
1195
+ "loss": 0.0025,
1196
+ "step": 4800
1197
+ },
1198
+ {
1199
+ "epoch": 283.35,
1200
+ "learning_rate": 1.6444444444444444e-07,
1201
+ "loss": 0.0022,
1202
+ "step": 4825
1203
+ },
1204
+ {
1205
+ "epoch": 284.82,
1206
+ "learning_rate": 1.4222222222222222e-07,
1207
+ "loss": 0.0022,
1208
+ "step": 4850
1209
+ },
1210
+ {
1211
+ "epoch": 286.29,
1212
+ "learning_rate": 1.2e-07,
1213
+ "loss": 0.0021,
1214
+ "step": 4875
1215
+ },
1216
+ {
1217
+ "epoch": 287.76,
1218
+ "learning_rate": 9.777777777777778e-08,
1219
+ "loss": 0.0023,
1220
+ "step": 4900
1221
+ },
1222
+ {
1223
+ "epoch": 289.24,
1224
+ "learning_rate": 7.555555555555555e-08,
1225
+ "loss": 0.002,
1226
+ "step": 4925
1227
+ },
1228
+ {
1229
+ "epoch": 290.71,
1230
+ "learning_rate": 5.3333333333333334e-08,
1231
+ "loss": 0.0025,
1232
+ "step": 4950
1233
+ },
1234
+ {
1235
+ "epoch": 292.18,
1236
+ "learning_rate": 3.111111111111111e-08,
1237
+ "loss": 0.002,
1238
+ "step": 4975
1239
+ },
1240
+ {
1241
+ "epoch": 293.65,
1242
+ "learning_rate": 8.888888888888889e-09,
1243
+ "loss": 0.0024,
1244
+ "step": 5000
1245
+ },
1246
+ {
1247
+ "epoch": 293.65,
1248
+ "eval_loss": 0.465576171875,
1249
+ "eval_runtime": 158.123,
1250
+ "eval_samples_per_second": 1.72,
1251
+ "eval_steps_per_second": 0.108,
1252
+ "eval_wer": 10.642644873699851,
1253
+ "step": 5000
1254
+ },
1255
+ {
1256
+ "epoch": 295.47,
1257
+ "learning_rate": 2.7544827586206896e-06,
1258
+ "loss": 0.0021,
1259
+ "step": 5025
1260
+ },
1261
+ {
1262
+ "epoch": 296.94,
1263
+ "learning_rate": 2.7475862068965512e-06,
1264
+ "loss": 0.0024,
1265
+ "step": 5050
1266
+ },
1267
+ {
1268
+ "epoch": 298.41,
1269
+ "learning_rate": 2.7406896551724137e-06,
1270
+ "loss": 0.0025,
1271
+ "step": 5075
1272
+ },
1273
+ {
1274
+ "epoch": 299.88,
1275
+ "learning_rate": 2.7337931034482757e-06,
1276
+ "loss": 0.0022,
1277
+ "step": 5100
1278
+ },
1279
+ {
1280
+ "epoch": 301.35,
1281
+ "learning_rate": 2.7268965517241378e-06,
1282
+ "loss": 0.0027,
1283
+ "step": 5125
1284
+ },
1285
+ {
1286
+ "epoch": 302.82,
1287
+ "learning_rate": 2.7200000000000002e-06,
1288
+ "loss": 0.0024,
1289
+ "step": 5150
1290
+ },
1291
+ {
1292
+ "epoch": 304.29,
1293
+ "learning_rate": 2.713103448275862e-06,
1294
+ "loss": 0.0024,
1295
+ "step": 5175
1296
+ },
1297
+ {
1298
+ "epoch": 305.76,
1299
+ "learning_rate": 2.7062068965517243e-06,
1300
+ "loss": 0.0023,
1301
+ "step": 5200
1302
+ },
1303
+ {
1304
+ "epoch": 307.24,
1305
+ "learning_rate": 2.699310344827586e-06,
1306
+ "loss": 0.0027,
1307
+ "step": 5225
1308
+ },
1309
+ {
1310
+ "epoch": 308.71,
1311
+ "learning_rate": 2.6924137931034483e-06,
1312
+ "loss": 0.0023,
1313
+ "step": 5250
1314
+ },
1315
+ {
1316
+ "epoch": 310.18,
1317
+ "learning_rate": 2.68551724137931e-06,
1318
+ "loss": 0.0021,
1319
+ "step": 5275
1320
+ },
1321
+ {
1322
+ "epoch": 311.65,
1323
+ "learning_rate": 2.6786206896551724e-06,
1324
+ "loss": 0.0025,
1325
+ "step": 5300
1326
+ },
1327
+ {
1328
+ "epoch": 313.12,
1329
+ "learning_rate": 2.6717241379310344e-06,
1330
+ "loss": 0.0021,
1331
+ "step": 5325
1332
+ },
1333
+ {
1334
+ "epoch": 314.59,
1335
+ "learning_rate": 2.6648275862068965e-06,
1336
+ "loss": 0.0019,
1337
+ "step": 5350
1338
+ },
1339
+ {
1340
+ "epoch": 316.06,
1341
+ "learning_rate": 2.6579310344827585e-06,
1342
+ "loss": 0.0019,
1343
+ "step": 5375
1344
+ },
1345
+ {
1346
+ "epoch": 317.53,
1347
+ "learning_rate": 2.6510344827586205e-06,
1348
+ "loss": 0.0018,
1349
+ "step": 5400
1350
+ },
1351
+ {
1352
+ "epoch": 319.0,
1353
+ "learning_rate": 2.6441379310344826e-06,
1354
+ "loss": 0.0022,
1355
+ "step": 5425
1356
+ },
1357
+ {
1358
+ "epoch": 320.47,
1359
+ "learning_rate": 2.6377931034482757e-06,
1360
+ "loss": 0.0019,
1361
+ "step": 5450
1362
+ },
1363
+ {
1364
+ "epoch": 321.94,
1365
+ "learning_rate": 2.6308965517241377e-06,
1366
+ "loss": 0.0016,
1367
+ "step": 5475
1368
+ },
1369
+ {
1370
+ "epoch": 323.41,
1371
+ "learning_rate": 2.624e-06,
1372
+ "loss": 0.0013,
1373
+ "step": 5500
1374
+ },
1375
+ {
1376
+ "epoch": 324.88,
1377
+ "learning_rate": 2.617103448275862e-06,
1378
+ "loss": 0.0019,
1379
+ "step": 5525
1380
+ },
1381
+ {
1382
+ "epoch": 326.35,
1383
+ "learning_rate": 2.6102068965517243e-06,
1384
+ "loss": 0.0017,
1385
+ "step": 5550
1386
+ },
1387
+ {
1388
+ "epoch": 327.82,
1389
+ "learning_rate": 2.603310344827586e-06,
1390
+ "loss": 0.0018,
1391
+ "step": 5575
1392
+ },
1393
+ {
1394
+ "epoch": 329.29,
1395
+ "learning_rate": 2.5964137931034483e-06,
1396
+ "loss": 0.0013,
1397
+ "step": 5600
1398
+ },
1399
+ {
1400
+ "epoch": 330.76,
1401
+ "learning_rate": 2.58951724137931e-06,
1402
+ "loss": 0.0016,
1403
+ "step": 5625
1404
+ },
1405
+ {
1406
+ "epoch": 332.24,
1407
+ "learning_rate": 2.5826206896551724e-06,
1408
+ "loss": 0.0013,
1409
+ "step": 5650
1410
+ },
1411
+ {
1412
+ "epoch": 333.71,
1413
+ "learning_rate": 2.575724137931034e-06,
1414
+ "loss": 0.0018,
1415
+ "step": 5675
1416
+ },
1417
+ {
1418
+ "epoch": 335.18,
1419
+ "learning_rate": 2.5688275862068965e-06,
1420
+ "loss": 0.0014,
1421
+ "step": 5700
1422
+ },
1423
+ {
1424
+ "epoch": 336.65,
1425
+ "learning_rate": 2.561931034482759e-06,
1426
+ "loss": 0.0013,
1427
+ "step": 5725
1428
+ },
1429
+ {
1430
+ "epoch": 338.12,
1431
+ "learning_rate": 2.5550344827586205e-06,
1432
+ "loss": 0.0011,
1433
+ "step": 5750
1434
+ },
1435
+ {
1436
+ "epoch": 339.59,
1437
+ "learning_rate": 2.548137931034483e-06,
1438
+ "loss": 0.0018,
1439
+ "step": 5775
1440
+ },
1441
+ {
1442
+ "epoch": 341.06,
1443
+ "learning_rate": 2.5412413793103446e-06,
1444
+ "loss": 0.0013,
1445
+ "step": 5800
1446
+ },
1447
+ {
1448
+ "epoch": 342.53,
1449
+ "learning_rate": 2.534344827586207e-06,
1450
+ "loss": 0.0012,
1451
+ "step": 5825
1452
+ },
1453
+ {
1454
+ "epoch": 344.0,
1455
+ "learning_rate": 2.5274482758620687e-06,
1456
+ "loss": 0.0014,
1457
+ "step": 5850
1458
+ },
1459
+ {
1460
+ "epoch": 345.47,
1461
+ "learning_rate": 2.520551724137931e-06,
1462
+ "loss": 0.001,
1463
+ "step": 5875
1464
+ },
1465
+ {
1466
+ "epoch": 346.94,
1467
+ "learning_rate": 2.5136551724137927e-06,
1468
+ "loss": 0.0012,
1469
+ "step": 5900
1470
+ },
1471
+ {
1472
+ "epoch": 348.41,
1473
+ "learning_rate": 2.506758620689655e-06,
1474
+ "loss": 0.0012,
1475
+ "step": 5925
1476
+ },
1477
+ {
1478
+ "epoch": 349.88,
1479
+ "learning_rate": 2.499862068965517e-06,
1480
+ "loss": 0.0012,
1481
+ "step": 5950
1482
+ },
1483
+ {
1484
+ "epoch": 351.35,
1485
+ "learning_rate": 2.4929655172413792e-06,
1486
+ "loss": 0.0013,
1487
+ "step": 5975
1488
+ },
1489
+ {
1490
+ "epoch": 352.82,
1491
+ "learning_rate": 2.4860689655172413e-06,
1492
+ "loss": 0.0015,
1493
+ "step": 6000
1494
+ },
1495
+ {
1496
+ "epoch": 352.82,
1497
+ "eval_loss": 0.497802734375,
1498
+ "eval_runtime": 156.7207,
1499
+ "eval_samples_per_second": 1.736,
1500
+ "eval_steps_per_second": 0.108,
1501
+ "eval_wer": 10.503343239227341,
1502
+ "step": 6000
1503
+ },
1504
+ {
1505
+ "epoch": 354.29,
1506
+ "learning_rate": 2.4791724137931033e-06,
1507
+ "loss": 0.0013,
1508
+ "step": 6025
1509
+ },
1510
+ {
1511
+ "epoch": 355.76,
1512
+ "learning_rate": 2.4722758620689653e-06,
1513
+ "loss": 0.0012,
1514
+ "step": 6050
1515
+ },
1516
+ {
1517
+ "epoch": 357.24,
1518
+ "learning_rate": 2.4653793103448274e-06,
1519
+ "loss": 0.0011,
1520
+ "step": 6075
1521
+ },
1522
+ {
1523
+ "epoch": 358.71,
1524
+ "learning_rate": 2.4584827586206894e-06,
1525
+ "loss": 0.0008,
1526
+ "step": 6100
1527
+ },
1528
+ {
1529
+ "epoch": 360.18,
1530
+ "learning_rate": 2.4515862068965514e-06,
1531
+ "loss": 0.0008,
1532
+ "step": 6125
1533
+ },
1534
+ {
1535
+ "epoch": 361.65,
1536
+ "learning_rate": 2.444689655172414e-06,
1537
+ "loss": 0.0011,
1538
+ "step": 6150
1539
+ },
1540
+ {
1541
+ "epoch": 363.12,
1542
+ "learning_rate": 2.4377931034482755e-06,
1543
+ "loss": 0.0012,
1544
+ "step": 6175
1545
+ },
1546
+ {
1547
+ "epoch": 364.59,
1548
+ "learning_rate": 2.430896551724138e-06,
1549
+ "loss": 0.0013,
1550
+ "step": 6200
1551
+ },
1552
+ {
1553
+ "epoch": 366.06,
1554
+ "learning_rate": 2.424e-06,
1555
+ "loss": 0.0011,
1556
+ "step": 6225
1557
+ },
1558
+ {
1559
+ "epoch": 367.53,
1560
+ "learning_rate": 2.417103448275862e-06,
1561
+ "loss": 0.0012,
1562
+ "step": 6250
1563
+ },
1564
+ {
1565
+ "epoch": 369.0,
1566
+ "learning_rate": 2.410206896551724e-06,
1567
+ "loss": 0.0011,
1568
+ "step": 6275
1569
+ },
1570
+ {
1571
+ "epoch": 370.47,
1572
+ "learning_rate": 2.403310344827586e-06,
1573
+ "loss": 0.0009,
1574
+ "step": 6300
1575
+ },
1576
+ {
1577
+ "epoch": 371.94,
1578
+ "learning_rate": 2.396413793103448e-06,
1579
+ "loss": 0.0014,
1580
+ "step": 6325
1581
+ },
1582
+ {
1583
+ "epoch": 373.41,
1584
+ "learning_rate": 2.38951724137931e-06,
1585
+ "loss": 0.0018,
1586
+ "step": 6350
1587
+ },
1588
+ {
1589
+ "epoch": 374.88,
1590
+ "learning_rate": 2.382620689655172e-06,
1591
+ "loss": 0.0009,
1592
+ "step": 6375
1593
+ },
1594
+ {
1595
+ "epoch": 376.35,
1596
+ "learning_rate": 2.3757241379310342e-06,
1597
+ "loss": 0.001,
1598
+ "step": 6400
1599
+ },
1600
+ {
1601
+ "epoch": 377.82,
1602
+ "learning_rate": 2.3688275862068963e-06,
1603
+ "loss": 0.0009,
1604
+ "step": 6425
1605
+ },
1606
+ {
1607
+ "epoch": 379.29,
1608
+ "learning_rate": 2.36248275862069e-06,
1609
+ "loss": 0.0008,
1610
+ "step": 6450
1611
+ },
1612
+ {
1613
+ "epoch": 380.76,
1614
+ "learning_rate": 2.3555862068965514e-06,
1615
+ "loss": 0.0009,
1616
+ "step": 6475
1617
+ },
1618
+ {
1619
+ "epoch": 382.24,
1620
+ "learning_rate": 2.348689655172414e-06,
1621
+ "loss": 0.0009,
1622
+ "step": 6500
1623
+ },
1624
+ {
1625
+ "epoch": 383.71,
1626
+ "learning_rate": 2.3417931034482755e-06,
1627
+ "loss": 0.0011,
1628
+ "step": 6525
1629
+ },
1630
+ {
1631
+ "epoch": 385.18,
1632
+ "learning_rate": 2.334896551724138e-06,
1633
+ "loss": 0.0008,
1634
+ "step": 6550
1635
+ },
1636
+ {
1637
+ "epoch": 386.65,
1638
+ "learning_rate": 2.3279999999999996e-06,
1639
+ "loss": 0.0006,
1640
+ "step": 6575
1641
+ },
1642
+ {
1643
+ "epoch": 388.12,
1644
+ "learning_rate": 2.321103448275862e-06,
1645
+ "loss": 0.001,
1646
+ "step": 6600
1647
+ },
1648
+ {
1649
+ "epoch": 389.59,
1650
+ "learning_rate": 2.314206896551724e-06,
1651
+ "loss": 0.0009,
1652
+ "step": 6625
1653
+ },
1654
+ {
1655
+ "epoch": 391.06,
1656
+ "learning_rate": 2.307310344827586e-06,
1657
+ "loss": 0.0008,
1658
+ "step": 6650
1659
+ },
1660
+ {
1661
+ "epoch": 392.53,
1662
+ "learning_rate": 2.300413793103448e-06,
1663
+ "loss": 0.001,
1664
+ "step": 6675
1665
+ },
1666
+ {
1667
+ "epoch": 394.0,
1668
+ "learning_rate": 2.29351724137931e-06,
1669
+ "loss": 0.0009,
1670
+ "step": 6700
1671
+ },
1672
+ {
1673
+ "epoch": 395.47,
1674
+ "learning_rate": 2.2866206896551726e-06,
1675
+ "loss": 0.0011,
1676
+ "step": 6725
1677
+ },
1678
+ {
1679
+ "epoch": 396.94,
1680
+ "learning_rate": 2.2797241379310342e-06,
1681
+ "loss": 0.0008,
1682
+ "step": 6750
1683
+ },
1684
+ {
1685
+ "epoch": 398.41,
1686
+ "learning_rate": 2.2728275862068967e-06,
1687
+ "loss": 0.0007,
1688
+ "step": 6775
1689
+ },
1690
+ {
1691
+ "epoch": 399.88,
1692
+ "learning_rate": 2.2659310344827583e-06,
1693
+ "loss": 0.0006,
1694
+ "step": 6800
1695
+ },
1696
+ {
1697
+ "epoch": 401.35,
1698
+ "learning_rate": 2.2590344827586207e-06,
1699
+ "loss": 0.0007,
1700
+ "step": 6825
1701
+ },
1702
+ {
1703
+ "epoch": 402.82,
1704
+ "learning_rate": 2.2521379310344828e-06,
1705
+ "loss": 0.0011,
1706
+ "step": 6850
1707
+ },
1708
+ {
1709
+ "epoch": 404.29,
1710
+ "learning_rate": 2.245241379310345e-06,
1711
+ "loss": 0.001,
1712
+ "step": 6875
1713
+ },
1714
+ {
1715
+ "epoch": 405.76,
1716
+ "learning_rate": 2.238344827586207e-06,
1717
+ "loss": 0.0007,
1718
+ "step": 6900
1719
+ },
1720
+ {
1721
+ "epoch": 407.24,
1722
+ "learning_rate": 2.231448275862069e-06,
1723
+ "loss": 0.0008,
1724
+ "step": 6925
1725
+ },
1726
+ {
1727
+ "epoch": 408.71,
1728
+ "learning_rate": 2.224551724137931e-06,
1729
+ "loss": 0.0007,
1730
+ "step": 6950
1731
+ },
1732
+ {
1733
+ "epoch": 410.18,
1734
+ "learning_rate": 2.217655172413793e-06,
1735
+ "loss": 0.0008,
1736
+ "step": 6975
1737
+ },
1738
+ {
1739
+ "epoch": 411.65,
1740
+ "learning_rate": 2.210758620689655e-06,
1741
+ "loss": 0.0007,
1742
+ "step": 7000
1743
+ },
1744
+ {
1745
+ "epoch": 411.65,
1746
+ "eval_loss": 0.5146484375,
1747
+ "eval_runtime": 159.9051,
1748
+ "eval_samples_per_second": 1.701,
1749
+ "eval_steps_per_second": 0.106,
1750
+ "eval_wer": 10.057578008915305,
1751
+ "step": 7000
1752
+ },
1753
+ {
1754
+ "epoch": 413.12,
1755
+ "learning_rate": 2.203862068965517e-06,
1756
+ "loss": 0.0007,
1757
+ "step": 7025
1758
+ },
1759
+ {
1760
+ "epoch": 414.59,
1761
+ "learning_rate": 2.196965517241379e-06,
1762
+ "loss": 0.0006,
1763
+ "step": 7050
1764
+ },
1765
+ {
1766
+ "epoch": 416.06,
1767
+ "learning_rate": 2.1900689655172415e-06,
1768
+ "loss": 0.0009,
1769
+ "step": 7075
1770
+ },
1771
+ {
1772
+ "epoch": 417.53,
1773
+ "learning_rate": 2.183172413793103e-06,
1774
+ "loss": 0.0008,
1775
+ "step": 7100
1776
+ },
1777
+ {
1778
+ "epoch": 419.0,
1779
+ "learning_rate": 2.1762758620689656e-06,
1780
+ "loss": 0.0007,
1781
+ "step": 7125
1782
+ },
1783
+ {
1784
+ "epoch": 420.47,
1785
+ "learning_rate": 2.1693793103448276e-06,
1786
+ "loss": 0.0008,
1787
+ "step": 7150
1788
+ },
1789
+ {
1790
+ "epoch": 421.94,
1791
+ "learning_rate": 2.1624827586206896e-06,
1792
+ "loss": 0.0007,
1793
+ "step": 7175
1794
+ },
1795
+ {
1796
+ "epoch": 423.41,
1797
+ "learning_rate": 2.1555862068965517e-06,
1798
+ "loss": 0.0005,
1799
+ "step": 7200
1800
+ },
1801
+ {
1802
+ "epoch": 424.88,
1803
+ "learning_rate": 2.1486896551724137e-06,
1804
+ "loss": 0.0008,
1805
+ "step": 7225
1806
+ },
1807
+ {
1808
+ "epoch": 426.35,
1809
+ "learning_rate": 2.1417931034482757e-06,
1810
+ "loss": 0.0009,
1811
+ "step": 7250
1812
+ },
1813
+ {
1814
+ "epoch": 427.82,
1815
+ "learning_rate": 2.1348965517241378e-06,
1816
+ "loss": 0.0009,
1817
+ "step": 7275
1818
+ },
1819
+ {
1820
+ "epoch": 429.29,
1821
+ "learning_rate": 2.128e-06,
1822
+ "loss": 0.0006,
1823
+ "step": 7300
1824
+ },
1825
+ {
1826
+ "epoch": 430.76,
1827
+ "learning_rate": 2.121103448275862e-06,
1828
+ "loss": 0.0006,
1829
+ "step": 7325
1830
+ },
1831
+ {
1832
+ "epoch": 432.24,
1833
+ "learning_rate": 2.1142068965517243e-06,
1834
+ "loss": 0.0006,
1835
+ "step": 7350
1836
+ },
1837
+ {
1838
+ "epoch": 433.71,
1839
+ "learning_rate": 2.107310344827586e-06,
1840
+ "loss": 0.0006,
1841
+ "step": 7375
1842
+ },
1843
+ {
1844
+ "epoch": 435.18,
1845
+ "learning_rate": 2.1004137931034483e-06,
1846
+ "loss": 0.0007,
1847
+ "step": 7400
1848
+ },
1849
+ {
1850
+ "epoch": 436.65,
1851
+ "learning_rate": 2.09351724137931e-06,
1852
+ "loss": 0.0006,
1853
+ "step": 7425
1854
+ },
1855
+ {
1856
+ "epoch": 438.12,
1857
+ "learning_rate": 2.0871724137931035e-06,
1858
+ "loss": 0.0007,
1859
+ "step": 7450
1860
+ },
1861
+ {
1862
+ "epoch": 439.59,
1863
+ "learning_rate": 2.080275862068965e-06,
1864
+ "loss": 0.0006,
1865
+ "step": 7475
1866
+ },
1867
+ {
1868
+ "epoch": 441.06,
1869
+ "learning_rate": 2.0733793103448276e-06,
1870
+ "loss": 0.0009,
1871
+ "step": 7500
1872
+ },
1873
+ {
1874
+ "epoch": 442.53,
1875
+ "learning_rate": 2.0664827586206896e-06,
1876
+ "loss": 0.0008,
1877
+ "step": 7525
1878
+ },
1879
+ {
1880
+ "epoch": 444.0,
1881
+ "learning_rate": 2.0595862068965516e-06,
1882
+ "loss": 0.0005,
1883
+ "step": 7550
1884
+ },
1885
+ {
1886
+ "epoch": 445.47,
1887
+ "learning_rate": 2.0526896551724137e-06,
1888
+ "loss": 0.0004,
1889
+ "step": 7575
1890
+ },
1891
+ {
1892
+ "epoch": 446.94,
1893
+ "learning_rate": 2.0457931034482757e-06,
1894
+ "loss": 0.0006,
1895
+ "step": 7600
1896
+ },
1897
+ {
1898
+ "epoch": 448.41,
1899
+ "learning_rate": 2.0388965517241377e-06,
1900
+ "loss": 0.0007,
1901
+ "step": 7625
1902
+ },
1903
+ {
1904
+ "epoch": 449.88,
1905
+ "learning_rate": 2.0319999999999998e-06,
1906
+ "loss": 0.0005,
1907
+ "step": 7650
1908
+ },
1909
+ {
1910
+ "epoch": 451.35,
1911
+ "learning_rate": 2.025103448275862e-06,
1912
+ "loss": 0.0005,
1913
+ "step": 7675
1914
+ },
1915
+ {
1916
+ "epoch": 452.82,
1917
+ "learning_rate": 2.018206896551724e-06,
1918
+ "loss": 0.0009,
1919
+ "step": 7700
1920
+ },
1921
+ {
1922
+ "epoch": 454.29,
1923
+ "learning_rate": 2.0113103448275863e-06,
1924
+ "loss": 0.0005,
1925
+ "step": 7725
1926
+ },
1927
+ {
1928
+ "epoch": 455.76,
1929
+ "learning_rate": 2.0044137931034483e-06,
1930
+ "loss": 0.0005,
1931
+ "step": 7750
1932
+ },
1933
+ {
1934
+ "epoch": 457.24,
1935
+ "learning_rate": 1.9975172413793104e-06,
1936
+ "loss": 0.0006,
1937
+ "step": 7775
1938
+ },
1939
+ {
1940
+ "epoch": 458.71,
1941
+ "learning_rate": 1.9906206896551724e-06,
1942
+ "loss": 0.0005,
1943
+ "step": 7800
1944
+ },
1945
+ {
1946
+ "epoch": 460.18,
1947
+ "learning_rate": 1.9837241379310344e-06,
1948
+ "loss": 0.0005,
1949
+ "step": 7825
1950
+ },
1951
+ {
1952
+ "epoch": 461.65,
1953
+ "learning_rate": 1.9768275862068965e-06,
1954
+ "loss": 0.0006,
1955
+ "step": 7850
1956
+ },
1957
+ {
1958
+ "epoch": 463.12,
1959
+ "learning_rate": 1.9699310344827585e-06,
1960
+ "loss": 0.0004,
1961
+ "step": 7875
1962
+ },
1963
+ {
1964
+ "epoch": 464.59,
1965
+ "learning_rate": 1.9630344827586205e-06,
1966
+ "loss": 0.0007,
1967
+ "step": 7900
1968
+ },
1969
+ {
1970
+ "epoch": 466.06,
1971
+ "learning_rate": 1.956137931034483e-06,
1972
+ "loss": 0.0005,
1973
+ "step": 7925
1974
+ },
1975
+ {
1976
+ "epoch": 467.53,
1977
+ "learning_rate": 1.949241379310345e-06,
1978
+ "loss": 0.0006,
1979
+ "step": 7950
1980
+ },
1981
+ {
1982
+ "epoch": 469.0,
1983
+ "learning_rate": 1.942344827586207e-06,
1984
+ "loss": 0.0006,
1985
+ "step": 7975
1986
+ },
1987
+ {
1988
+ "epoch": 470.47,
1989
+ "learning_rate": 1.935448275862069e-06,
1990
+ "loss": 0.0007,
1991
+ "step": 8000
1992
+ },
1993
+ {
1994
+ "epoch": 470.47,
1995
+ "eval_loss": 0.53857421875,
1996
+ "eval_runtime": 158.4391,
1997
+ "eval_samples_per_second": 1.717,
1998
+ "eval_steps_per_second": 0.107,
1999
+ "eval_wer": 10.131872213967311,
2000
+ "step": 8000
2001
+ },
2002
+ {
2003
+ "epoch": 471.94,
2004
+ "learning_rate": 1.928551724137931e-06,
2005
+ "loss": 0.0005,
2006
+ "step": 8025
2007
+ },
2008
+ {
2009
+ "epoch": 473.41,
2010
+ "learning_rate": 1.921655172413793e-06,
2011
+ "loss": 0.0008,
2012
+ "step": 8050
2013
+ },
2014
+ {
2015
+ "epoch": 474.88,
2016
+ "learning_rate": 1.914758620689655e-06,
2017
+ "loss": 0.0005,
2018
+ "step": 8075
2019
+ },
2020
+ {
2021
+ "epoch": 476.35,
2022
+ "learning_rate": 1.907862068965517e-06,
2023
+ "loss": 0.0004,
2024
+ "step": 8100
2025
+ },
2026
+ {
2027
+ "epoch": 477.82,
2028
+ "learning_rate": 1.9009655172413792e-06,
2029
+ "loss": 0.0005,
2030
+ "step": 8125
2031
+ },
2032
+ {
2033
+ "epoch": 479.29,
2034
+ "learning_rate": 1.8940689655172413e-06,
2035
+ "loss": 0.0004,
2036
+ "step": 8150
2037
+ },
2038
+ {
2039
+ "epoch": 480.76,
2040
+ "learning_rate": 1.8871724137931033e-06,
2041
+ "loss": 0.0007,
2042
+ "step": 8175
2043
+ },
2044
+ {
2045
+ "epoch": 482.24,
2046
+ "learning_rate": 1.8802758620689653e-06,
2047
+ "loss": 0.0005,
2048
+ "step": 8200
2049
+ },
2050
+ {
2051
+ "epoch": 483.71,
2052
+ "learning_rate": 1.8733793103448274e-06,
2053
+ "loss": 0.0007,
2054
+ "step": 8225
2055
+ },
2056
+ {
2057
+ "epoch": 485.18,
2058
+ "learning_rate": 1.8664827586206894e-06,
2059
+ "loss": 0.0005,
2060
+ "step": 8250
2061
+ },
2062
+ {
2063
+ "epoch": 486.65,
2064
+ "learning_rate": 1.8595862068965517e-06,
2065
+ "loss": 0.0004,
2066
+ "step": 8275
2067
+ },
2068
+ {
2069
+ "epoch": 488.12,
2070
+ "learning_rate": 1.8526896551724137e-06,
2071
+ "loss": 0.0005,
2072
+ "step": 8300
2073
+ },
2074
+ {
2075
+ "epoch": 489.59,
2076
+ "learning_rate": 1.845793103448276e-06,
2077
+ "loss": 0.0004,
2078
+ "step": 8325
2079
+ },
2080
+ {
2081
+ "epoch": 491.06,
2082
+ "learning_rate": 1.838896551724138e-06,
2083
+ "loss": 0.0004,
2084
+ "step": 8350
2085
+ },
2086
+ {
2087
+ "epoch": 492.53,
2088
+ "learning_rate": 1.832e-06,
2089
+ "loss": 0.0005,
2090
+ "step": 8375
2091
+ },
2092
+ {
2093
+ "epoch": 494.0,
2094
+ "learning_rate": 1.825103448275862e-06,
2095
+ "loss": 0.0004,
2096
+ "step": 8400
2097
+ },
2098
+ {
2099
+ "epoch": 495.47,
2100
+ "learning_rate": 1.818206896551724e-06,
2101
+ "loss": 0.0007,
2102
+ "step": 8425
2103
+ },
2104
+ {
2105
+ "epoch": 496.94,
2106
+ "learning_rate": 1.811862068965517e-06,
2107
+ "loss": 0.0008,
2108
+ "step": 8450
2109
+ },
2110
+ {
2111
+ "epoch": 498.41,
2112
+ "learning_rate": 1.8049655172413792e-06,
2113
+ "loss": 0.0005,
2114
+ "step": 8475
2115
+ },
2116
+ {
2117
+ "epoch": 499.88,
2118
+ "learning_rate": 1.7980689655172413e-06,
2119
+ "loss": 0.0006,
2120
+ "step": 8500
2121
+ },
2122
+ {
2123
+ "epoch": 501.35,
2124
+ "learning_rate": 1.7911724137931035e-06,
2125
+ "loss": 0.0004,
2126
+ "step": 8525
2127
+ },
2128
+ {
2129
+ "epoch": 502.82,
2130
+ "learning_rate": 1.7842758620689655e-06,
2131
+ "loss": 0.0004,
2132
+ "step": 8550
2133
+ },
2134
+ {
2135
+ "epoch": 504.29,
2136
+ "learning_rate": 1.7773793103448276e-06,
2137
+ "loss": 0.0006,
2138
+ "step": 8575
2139
+ },
2140
+ {
2141
+ "epoch": 505.76,
2142
+ "learning_rate": 1.7704827586206896e-06,
2143
+ "loss": 0.0004,
2144
+ "step": 8600
2145
+ },
2146
+ {
2147
+ "epoch": 507.24,
2148
+ "learning_rate": 1.7635862068965516e-06,
2149
+ "loss": 0.0004,
2150
+ "step": 8625
2151
+ },
2152
+ {
2153
+ "epoch": 508.71,
2154
+ "learning_rate": 1.7566896551724137e-06,
2155
+ "loss": 0.0006,
2156
+ "step": 8650
2157
+ },
2158
+ {
2159
+ "epoch": 510.18,
2160
+ "learning_rate": 1.7497931034482757e-06,
2161
+ "loss": 0.0004,
2162
+ "step": 8675
2163
+ },
2164
+ {
2165
+ "epoch": 511.65,
2166
+ "learning_rate": 1.742896551724138e-06,
2167
+ "loss": 0.0005,
2168
+ "step": 8700
2169
+ },
2170
+ {
2171
+ "epoch": 513.12,
2172
+ "learning_rate": 1.736e-06,
2173
+ "loss": 0.0006,
2174
+ "step": 8725
2175
+ },
2176
+ {
2177
+ "epoch": 514.59,
2178
+ "learning_rate": 1.729103448275862e-06,
2179
+ "loss": 0.0006,
2180
+ "step": 8750
2181
+ },
2182
+ {
2183
+ "epoch": 516.06,
2184
+ "learning_rate": 1.722206896551724e-06,
2185
+ "loss": 0.0004,
2186
+ "step": 8775
2187
+ },
2188
+ {
2189
+ "epoch": 517.53,
2190
+ "learning_rate": 1.715310344827586e-06,
2191
+ "loss": 0.0003,
2192
+ "step": 8800
2193
+ },
2194
+ {
2195
+ "epoch": 519.0,
2196
+ "learning_rate": 1.7084137931034481e-06,
2197
+ "loss": 0.0003,
2198
+ "step": 8825
2199
+ },
2200
+ {
2201
+ "epoch": 520.47,
2202
+ "learning_rate": 1.7015172413793101e-06,
2203
+ "loss": 0.0004,
2204
+ "step": 8850
2205
+ },
2206
+ {
2207
+ "epoch": 521.94,
2208
+ "learning_rate": 1.6946206896551722e-06,
2209
+ "loss": 0.0006,
2210
+ "step": 8875
2211
+ },
2212
+ {
2213
+ "epoch": 523.41,
2214
+ "learning_rate": 1.6877241379310342e-06,
2215
+ "loss": 0.0005,
2216
+ "step": 8900
2217
+ },
2218
+ {
2219
+ "epoch": 524.88,
2220
+ "learning_rate": 1.6808275862068967e-06,
2221
+ "loss": 0.0029,
2222
+ "step": 8925
2223
+ },
2224
+ {
2225
+ "epoch": 526.35,
2226
+ "learning_rate": 1.6739310344827587e-06,
2227
+ "loss": 0.0004,
2228
+ "step": 8950
2229
+ },
2230
+ {
2231
+ "epoch": 527.82,
2232
+ "learning_rate": 1.6670344827586207e-06,
2233
+ "loss": 0.0003,
2234
+ "step": 8975
2235
+ },
2236
+ {
2237
+ "epoch": 529.29,
2238
+ "learning_rate": 1.6601379310344828e-06,
2239
+ "loss": 0.0004,
2240
+ "step": 9000
2241
+ },
2242
+ {
2243
+ "epoch": 529.29,
2244
+ "eval_loss": 0.5361328125,
2245
+ "eval_runtime": 156.9399,
2246
+ "eval_samples_per_second": 1.733,
2247
+ "eval_steps_per_second": 0.108,
2248
+ "eval_wer": 9.778974739970282,
2249
+ "step": 9000
2250
+ },
2251
+ {
2252
+ "epoch": 530.76,
2253
+ "learning_rate": 1.6532413793103448e-06,
2254
+ "loss": 0.0006,
2255
+ "step": 9025
2256
+ },
2257
+ {
2258
+ "epoch": 532.24,
2259
+ "learning_rate": 1.6463448275862068e-06,
2260
+ "loss": 0.0003,
2261
+ "step": 9050
2262
+ },
2263
+ {
2264
+ "epoch": 533.71,
2265
+ "learning_rate": 1.6394482758620689e-06,
2266
+ "loss": 0.0003,
2267
+ "step": 9075
2268
+ },
2269
+ {
2270
+ "epoch": 535.18,
2271
+ "learning_rate": 1.632551724137931e-06,
2272
+ "loss": 0.0005,
2273
+ "step": 9100
2274
+ },
2275
+ {
2276
+ "epoch": 536.65,
2277
+ "learning_rate": 1.625655172413793e-06,
2278
+ "loss": 0.0006,
2279
+ "step": 9125
2280
+ },
2281
+ {
2282
+ "epoch": 538.12,
2283
+ "learning_rate": 1.6187586206896552e-06,
2284
+ "loss": 0.0003,
2285
+ "step": 9150
2286
+ },
2287
+ {
2288
+ "epoch": 539.59,
2289
+ "learning_rate": 1.6118620689655172e-06,
2290
+ "loss": 0.0004,
2291
+ "step": 9175
2292
+ },
2293
+ {
2294
+ "epoch": 541.06,
2295
+ "learning_rate": 1.6049655172413792e-06,
2296
+ "loss": 0.0003,
2297
+ "step": 9200
2298
+ },
2299
+ {
2300
+ "epoch": 542.53,
2301
+ "learning_rate": 1.5980689655172413e-06,
2302
+ "loss": 0.0004,
2303
+ "step": 9225
2304
+ },
2305
+ {
2306
+ "epoch": 544.0,
2307
+ "learning_rate": 1.5911724137931033e-06,
2308
+ "loss": 0.0006,
2309
+ "step": 9250
2310
+ },
2311
+ {
2312
+ "epoch": 545.47,
2313
+ "learning_rate": 1.5842758620689653e-06,
2314
+ "loss": 0.0002,
2315
+ "step": 9275
2316
+ },
2317
+ {
2318
+ "epoch": 546.94,
2319
+ "learning_rate": 1.5773793103448274e-06,
2320
+ "loss": 0.0003,
2321
+ "step": 9300
2322
+ },
2323
+ {
2324
+ "epoch": 548.41,
2325
+ "learning_rate": 1.5704827586206896e-06,
2326
+ "loss": 0.0003,
2327
+ "step": 9325
2328
+ },
2329
+ {
2330
+ "epoch": 549.88,
2331
+ "learning_rate": 1.5635862068965516e-06,
2332
+ "loss": 0.0003,
2333
+ "step": 9350
2334
+ },
2335
+ {
2336
+ "epoch": 551.35,
2337
+ "learning_rate": 1.5566896551724139e-06,
2338
+ "loss": 0.0004,
2339
+ "step": 9375
2340
+ },
2341
+ {
2342
+ "epoch": 552.82,
2343
+ "learning_rate": 1.549793103448276e-06,
2344
+ "loss": 0.0004,
2345
+ "step": 9400
2346
+ },
2347
+ {
2348
+ "epoch": 554.29,
2349
+ "learning_rate": 1.542896551724138e-06,
2350
+ "loss": 0.0005,
2351
+ "step": 9425
2352
+ },
2353
+ {
2354
+ "epoch": 555.76,
2355
+ "learning_rate": 1.5365517241379309e-06,
2356
+ "loss": 0.0004,
2357
+ "step": 9450
2358
+ },
2359
+ {
2360
+ "epoch": 557.24,
2361
+ "learning_rate": 1.529655172413793e-06,
2362
+ "loss": 0.0003,
2363
+ "step": 9475
2364
+ },
2365
+ {
2366
+ "epoch": 558.71,
2367
+ "learning_rate": 1.522758620689655e-06,
2368
+ "loss": 0.0003,
2369
+ "step": 9500
2370
+ },
2371
+ {
2372
+ "epoch": 560.18,
2373
+ "learning_rate": 1.5158620689655172e-06,
2374
+ "loss": 0.0003,
2375
+ "step": 9525
2376
+ },
2377
+ {
2378
+ "epoch": 561.65,
2379
+ "learning_rate": 1.5089655172413792e-06,
2380
+ "loss": 0.0005,
2381
+ "step": 9550
2382
+ },
2383
+ {
2384
+ "epoch": 563.12,
2385
+ "learning_rate": 1.5020689655172415e-06,
2386
+ "loss": 0.0004,
2387
+ "step": 9575
2388
+ },
2389
+ {
2390
+ "epoch": 564.59,
2391
+ "learning_rate": 1.4951724137931035e-06,
2392
+ "loss": 0.0004,
2393
+ "step": 9600
2394
+ },
2395
+ {
2396
+ "epoch": 566.06,
2397
+ "learning_rate": 1.4882758620689655e-06,
2398
+ "loss": 0.0003,
2399
+ "step": 9625
2400
+ },
2401
+ {
2402
+ "epoch": 567.53,
2403
+ "learning_rate": 1.4813793103448276e-06,
2404
+ "loss": 0.0005,
2405
+ "step": 9650
2406
+ },
2407
+ {
2408
+ "epoch": 569.0,
2409
+ "learning_rate": 1.4744827586206896e-06,
2410
+ "loss": 0.0003,
2411
+ "step": 9675
2412
+ },
2413
+ {
2414
+ "epoch": 570.47,
2415
+ "learning_rate": 1.4675862068965516e-06,
2416
+ "loss": 0.0003,
2417
+ "step": 9700
2418
+ },
2419
+ {
2420
+ "epoch": 571.94,
2421
+ "learning_rate": 1.4606896551724137e-06,
2422
+ "loss": 0.0003,
2423
+ "step": 9725
2424
+ },
2425
+ {
2426
+ "epoch": 573.41,
2427
+ "learning_rate": 1.4537931034482757e-06,
2428
+ "loss": 0.0002,
2429
+ "step": 9750
2430
+ },
2431
+ {
2432
+ "epoch": 574.88,
2433
+ "learning_rate": 1.4468965517241377e-06,
2434
+ "loss": 0.0002,
2435
+ "step": 9775
2436
+ },
2437
+ {
2438
+ "epoch": 576.35,
2439
+ "learning_rate": 1.44e-06,
2440
+ "loss": 0.0004,
2441
+ "step": 9800
2442
+ },
2443
+ {
2444
+ "epoch": 577.82,
2445
+ "learning_rate": 1.433103448275862e-06,
2446
+ "loss": 0.0002,
2447
+ "step": 9825
2448
+ },
2449
+ {
2450
+ "epoch": 579.29,
2451
+ "learning_rate": 1.426206896551724e-06,
2452
+ "loss": 0.0005,
2453
+ "step": 9850
2454
+ },
2455
+ {
2456
+ "epoch": 580.76,
2457
+ "learning_rate": 1.419310344827586e-06,
2458
+ "loss": 0.0004,
2459
+ "step": 9875
2460
+ },
2461
+ {
2462
+ "epoch": 582.24,
2463
+ "learning_rate": 1.4124137931034481e-06,
2464
+ "loss": 0.0003,
2465
+ "step": 9900
2466
+ },
2467
+ {
2468
+ "epoch": 583.71,
2469
+ "learning_rate": 1.4055172413793104e-06,
2470
+ "loss": 0.0004,
2471
+ "step": 9925
2472
+ },
2473
+ {
2474
+ "epoch": 585.18,
2475
+ "learning_rate": 1.3986206896551724e-06,
2476
+ "loss": 0.0004,
2477
+ "step": 9950
2478
+ },
2479
+ {
2480
+ "epoch": 586.65,
2481
+ "learning_rate": 1.3917241379310344e-06,
2482
+ "loss": 0.0004,
2483
+ "step": 9975
2484
+ },
2485
+ {
2486
+ "epoch": 588.12,
2487
+ "learning_rate": 1.3848275862068965e-06,
2488
+ "loss": 0.0003,
2489
+ "step": 10000
2490
+ },
2491
+ {
2492
+ "epoch": 588.12,
2493
+ "eval_loss": 0.54296875,
2494
+ "eval_runtime": 156.5622,
2495
+ "eval_samples_per_second": 1.737,
2496
+ "eval_steps_per_second": 0.109,
2497
+ "eval_wer": 9.973997028231798,
2498
+ "step": 10000
2499
+ },
2500
+ {
2501
+ "epoch": 589.59,
2502
+ "learning_rate": 1.3779310344827587e-06,
2503
+ "loss": 0.0002,
2504
+ "step": 10025
2505
+ },
2506
+ {
2507
+ "epoch": 591.06,
2508
+ "learning_rate": 1.3710344827586207e-06,
2509
+ "loss": 0.0003,
2510
+ "step": 10050
2511
+ },
2512
+ {
2513
+ "epoch": 592.53,
2514
+ "learning_rate": 1.3641379310344828e-06,
2515
+ "loss": 0.0002,
2516
+ "step": 10075
2517
+ },
2518
+ {
2519
+ "epoch": 594.0,
2520
+ "learning_rate": 1.3572413793103448e-06,
2521
+ "loss": 0.0003,
2522
+ "step": 10100
2523
+ },
2524
+ {
2525
+ "epoch": 595.47,
2526
+ "learning_rate": 1.3503448275862068e-06,
2527
+ "loss": 0.0003,
2528
+ "step": 10125
2529
+ },
2530
+ {
2531
+ "epoch": 596.94,
2532
+ "learning_rate": 1.3434482758620689e-06,
2533
+ "loss": 0.0002,
2534
+ "step": 10150
2535
+ },
2536
+ {
2537
+ "epoch": 598.41,
2538
+ "learning_rate": 1.3365517241379309e-06,
2539
+ "loss": 0.0004,
2540
+ "step": 10175
2541
+ },
2542
+ {
2543
+ "epoch": 599.88,
2544
+ "learning_rate": 1.329655172413793e-06,
2545
+ "loss": 0.0002,
2546
+ "step": 10200
2547
+ },
2548
+ {
2549
+ "epoch": 601.35,
2550
+ "learning_rate": 1.322758620689655e-06,
2551
+ "loss": 0.0003,
2552
+ "step": 10225
2553
+ },
2554
+ {
2555
+ "epoch": 602.82,
2556
+ "learning_rate": 1.3158620689655172e-06,
2557
+ "loss": 0.0003,
2558
+ "step": 10250
2559
+ },
2560
+ {
2561
+ "epoch": 604.29,
2562
+ "learning_rate": 1.3089655172413792e-06,
2563
+ "loss": 0.0002,
2564
+ "step": 10275
2565
+ },
2566
+ {
2567
+ "epoch": 605.76,
2568
+ "learning_rate": 1.3020689655172413e-06,
2569
+ "loss": 0.0002,
2570
+ "step": 10300
2571
+ },
2572
+ {
2573
+ "epoch": 607.24,
2574
+ "learning_rate": 1.2951724137931035e-06,
2575
+ "loss": 0.0003,
2576
+ "step": 10325
2577
+ },
2578
+ {
2579
+ "epoch": 608.71,
2580
+ "learning_rate": 1.2882758620689655e-06,
2581
+ "loss": 0.0002,
2582
+ "step": 10350
2583
+ },
2584
+ {
2585
+ "epoch": 610.18,
2586
+ "learning_rate": 1.2813793103448276e-06,
2587
+ "loss": 0.0003,
2588
+ "step": 10375
2589
+ },
2590
+ {
2591
+ "epoch": 611.65,
2592
+ "learning_rate": 1.2744827586206896e-06,
2593
+ "loss": 0.0003,
2594
+ "step": 10400
2595
+ },
2596
+ {
2597
+ "epoch": 613.12,
2598
+ "learning_rate": 1.2675862068965516e-06,
2599
+ "loss": 0.0003,
2600
+ "step": 10425
2601
+ },
2602
+ {
2603
+ "epoch": 614.59,
2604
+ "learning_rate": 1.2612413793103448e-06,
2605
+ "loss": 0.0005,
2606
+ "step": 10450
2607
+ },
2608
+ {
2609
+ "epoch": 616.06,
2610
+ "learning_rate": 1.2543448275862068e-06,
2611
+ "loss": 0.0003,
2612
+ "step": 10475
2613
+ },
2614
+ {
2615
+ "epoch": 617.53,
2616
+ "learning_rate": 1.2474482758620688e-06,
2617
+ "loss": 0.0003,
2618
+ "step": 10500
2619
+ },
2620
+ {
2621
+ "epoch": 619.0,
2622
+ "learning_rate": 1.240551724137931e-06,
2623
+ "loss": 0.0001,
2624
+ "step": 10525
2625
+ },
2626
+ {
2627
+ "epoch": 620.47,
2628
+ "learning_rate": 1.2336551724137931e-06,
2629
+ "loss": 0.0002,
2630
+ "step": 10550
2631
+ },
2632
+ {
2633
+ "epoch": 621.94,
2634
+ "learning_rate": 1.2267586206896552e-06,
2635
+ "loss": 0.0005,
2636
+ "step": 10575
2637
+ },
2638
+ {
2639
+ "epoch": 623.41,
2640
+ "learning_rate": 1.2198620689655172e-06,
2641
+ "loss": 0.0002,
2642
+ "step": 10600
2643
+ },
2644
+ {
2645
+ "epoch": 624.88,
2646
+ "learning_rate": 1.2129655172413792e-06,
2647
+ "loss": 0.0003,
2648
+ "step": 10625
2649
+ },
2650
+ {
2651
+ "epoch": 626.35,
2652
+ "learning_rate": 1.2060689655172413e-06,
2653
+ "loss": 0.0002,
2654
+ "step": 10650
2655
+ },
2656
+ {
2657
+ "epoch": 627.82,
2658
+ "learning_rate": 1.1991724137931035e-06,
2659
+ "loss": 0.0003,
2660
+ "step": 10675
2661
+ },
2662
+ {
2663
+ "epoch": 629.29,
2664
+ "learning_rate": 1.1922758620689655e-06,
2665
+ "loss": 0.0003,
2666
+ "step": 10700
2667
+ },
2668
+ {
2669
+ "epoch": 630.76,
2670
+ "learning_rate": 1.1853793103448276e-06,
2671
+ "loss": 0.0003,
2672
+ "step": 10725
2673
+ },
2674
+ {
2675
+ "epoch": 632.24,
2676
+ "learning_rate": 1.1784827586206896e-06,
2677
+ "loss": 0.0002,
2678
+ "step": 10750
2679
+ },
2680
+ {
2681
+ "epoch": 633.71,
2682
+ "learning_rate": 1.1715862068965516e-06,
2683
+ "loss": 0.0002,
2684
+ "step": 10775
2685
+ },
2686
+ {
2687
+ "epoch": 635.18,
2688
+ "learning_rate": 1.1646896551724137e-06,
2689
+ "loss": 0.0004,
2690
+ "step": 10800
2691
+ },
2692
+ {
2693
+ "epoch": 636.65,
2694
+ "learning_rate": 1.1577931034482757e-06,
2695
+ "loss": 0.0003,
2696
+ "step": 10825
2697
+ },
2698
+ {
2699
+ "epoch": 638.12,
2700
+ "learning_rate": 1.1508965517241377e-06,
2701
+ "loss": 0.0002,
2702
+ "step": 10850
2703
+ },
2704
+ {
2705
+ "epoch": 639.59,
2706
+ "learning_rate": 1.1439999999999998e-06,
2707
+ "loss": 0.0002,
2708
+ "step": 10875
2709
+ },
2710
+ {
2711
+ "epoch": 641.06,
2712
+ "learning_rate": 1.137103448275862e-06,
2713
+ "loss": 0.0003,
2714
+ "step": 10900
2715
+ },
2716
+ {
2717
+ "epoch": 642.53,
2718
+ "learning_rate": 1.1302068965517243e-06,
2719
+ "loss": 0.0002,
2720
+ "step": 10925
2721
+ },
2722
+ {
2723
+ "epoch": 644.0,
2724
+ "learning_rate": 1.1233103448275863e-06,
2725
+ "loss": 0.0004,
2726
+ "step": 10950
2727
+ },
2728
+ {
2729
+ "epoch": 645.47,
2730
+ "learning_rate": 1.1164137931034483e-06,
2731
+ "loss": 0.0004,
2732
+ "step": 10975
2733
+ },
2734
+ {
2735
+ "epoch": 646.94,
2736
+ "learning_rate": 1.1095172413793103e-06,
2737
+ "loss": 0.0002,
2738
+ "step": 11000
2739
+ },
2740
+ {
2741
+ "epoch": 646.94,
2742
+ "eval_loss": 0.5458984375,
2743
+ "eval_runtime": 157.5866,
2744
+ "eval_samples_per_second": 1.726,
2745
+ "eval_steps_per_second": 0.108,
2746
+ "eval_wer": 9.955423476968797,
2747
+ "step": 11000
2748
+ },
2749
+ {
2750
+ "epoch": 648.41,
2751
+ "learning_rate": 1.1026206896551724e-06,
2752
+ "loss": 0.0003,
2753
+ "step": 11025
2754
+ },
2755
+ {
2756
+ "epoch": 649.88,
2757
+ "learning_rate": 1.0957241379310344e-06,
2758
+ "loss": 0.0002,
2759
+ "step": 11050
2760
+ },
2761
+ {
2762
+ "epoch": 651.35,
2763
+ "learning_rate": 1.0888275862068964e-06,
2764
+ "loss": 0.0002,
2765
+ "step": 11075
2766
+ },
2767
+ {
2768
+ "epoch": 652.82,
2769
+ "learning_rate": 1.0819310344827585e-06,
2770
+ "loss": 0.0003,
2771
+ "step": 11100
2772
+ },
2773
+ {
2774
+ "epoch": 654.29,
2775
+ "learning_rate": 1.0750344827586207e-06,
2776
+ "loss": 0.0002,
2777
+ "step": 11125
2778
+ },
2779
+ {
2780
+ "epoch": 655.76,
2781
+ "learning_rate": 1.0681379310344828e-06,
2782
+ "loss": 0.0003,
2783
+ "step": 11150
2784
+ },
2785
+ {
2786
+ "epoch": 657.24,
2787
+ "learning_rate": 1.0612413793103448e-06,
2788
+ "loss": 0.0003,
2789
+ "step": 11175
2790
+ },
2791
+ {
2792
+ "epoch": 658.71,
2793
+ "learning_rate": 1.0543448275862068e-06,
2794
+ "loss": 0.0005,
2795
+ "step": 11200
2796
+ },
2797
+ {
2798
+ "epoch": 660.18,
2799
+ "learning_rate": 1.0474482758620689e-06,
2800
+ "loss": 0.0002,
2801
+ "step": 11225
2802
+ },
2803
+ {
2804
+ "epoch": 661.65,
2805
+ "learning_rate": 1.0405517241379309e-06,
2806
+ "loss": 0.0002,
2807
+ "step": 11250
2808
+ },
2809
+ {
2810
+ "epoch": 663.12,
2811
+ "learning_rate": 1.033655172413793e-06,
2812
+ "loss": 0.0003,
2813
+ "step": 11275
2814
+ },
2815
+ {
2816
+ "epoch": 664.59,
2817
+ "learning_rate": 1.026758620689655e-06,
2818
+ "loss": 0.0002,
2819
+ "step": 11300
2820
+ },
2821
+ {
2822
+ "epoch": 666.06,
2823
+ "learning_rate": 1.0198620689655172e-06,
2824
+ "loss": 0.0002,
2825
+ "step": 11325
2826
+ },
2827
+ {
2828
+ "epoch": 667.53,
2829
+ "learning_rate": 1.0129655172413794e-06,
2830
+ "loss": 0.0003,
2831
+ "step": 11350
2832
+ },
2833
+ {
2834
+ "epoch": 669.0,
2835
+ "learning_rate": 1.0060689655172415e-06,
2836
+ "loss": 0.0009,
2837
+ "step": 11375
2838
+ },
2839
+ {
2840
+ "epoch": 670.47,
2841
+ "learning_rate": 9.991724137931033e-07,
2842
+ "loss": 0.0002,
2843
+ "step": 11400
2844
+ },
2845
+ {
2846
+ "epoch": 671.94,
2847
+ "learning_rate": 9.922758620689655e-07,
2848
+ "loss": 0.0002,
2849
+ "step": 11425
2850
+ },
2851
+ {
2852
+ "epoch": 673.41,
2853
+ "learning_rate": 9.859310344827587e-07,
2854
+ "loss": 0.0003,
2855
+ "step": 11450
2856
+ },
2857
+ {
2858
+ "epoch": 674.88,
2859
+ "learning_rate": 9.790344827586207e-07,
2860
+ "loss": 0.0002,
2861
+ "step": 11475
2862
+ },
2863
+ {
2864
+ "epoch": 676.35,
2865
+ "learning_rate": 9.721379310344827e-07,
2866
+ "loss": 0.0002,
2867
+ "step": 11500
2868
+ },
2869
+ {
2870
+ "epoch": 677.82,
2871
+ "learning_rate": 9.652413793103448e-07,
2872
+ "loss": 0.0002,
2873
+ "step": 11525
2874
+ },
2875
+ {
2876
+ "epoch": 679.29,
2877
+ "learning_rate": 9.583448275862068e-07,
2878
+ "loss": 0.0003,
2879
+ "step": 11550
2880
+ },
2881
+ {
2882
+ "epoch": 680.76,
2883
+ "learning_rate": 9.514482758620688e-07,
2884
+ "loss": 0.0003,
2885
+ "step": 11575
2886
+ },
2887
+ {
2888
+ "epoch": 682.24,
2889
+ "learning_rate": 9.44551724137931e-07,
2890
+ "loss": 0.0003,
2891
+ "step": 11600
2892
+ },
2893
+ {
2894
+ "epoch": 683.71,
2895
+ "learning_rate": 9.376551724137931e-07,
2896
+ "loss": 0.0002,
2897
+ "step": 11625
2898
+ },
2899
+ {
2900
+ "epoch": 685.18,
2901
+ "learning_rate": 9.307586206896552e-07,
2902
+ "loss": 0.0002,
2903
+ "step": 11650
2904
+ },
2905
+ {
2906
+ "epoch": 686.65,
2907
+ "learning_rate": 9.238620689655172e-07,
2908
+ "loss": 0.0003,
2909
+ "step": 11675
2910
+ },
2911
+ {
2912
+ "epoch": 688.12,
2913
+ "learning_rate": 9.169655172413792e-07,
2914
+ "loss": 0.0003,
2915
+ "step": 11700
2916
+ },
2917
+ {
2918
+ "epoch": 689.59,
2919
+ "learning_rate": 9.100689655172414e-07,
2920
+ "loss": 0.0001,
2921
+ "step": 11725
2922
+ },
2923
+ {
2924
+ "epoch": 691.06,
2925
+ "learning_rate": 9.031724137931034e-07,
2926
+ "loss": 0.0004,
2927
+ "step": 11750
2928
+ },
2929
+ {
2930
+ "epoch": 692.53,
2931
+ "learning_rate": 8.962758620689654e-07,
2932
+ "loss": 0.0003,
2933
+ "step": 11775
2934
+ },
2935
+ {
2936
+ "epoch": 694.0,
2937
+ "learning_rate": 8.893793103448275e-07,
2938
+ "loss": 0.0005,
2939
+ "step": 11800
2940
+ },
2941
+ {
2942
+ "epoch": 695.47,
2943
+ "learning_rate": 8.824827586206897e-07,
2944
+ "loss": 0.0002,
2945
+ "step": 11825
2946
+ },
2947
+ {
2948
+ "epoch": 696.94,
2949
+ "learning_rate": 8.755862068965517e-07,
2950
+ "loss": 0.0002,
2951
+ "step": 11850
2952
+ },
2953
+ {
2954
+ "epoch": 698.41,
2955
+ "learning_rate": 8.686896551724138e-07,
2956
+ "loss": 0.0002,
2957
+ "step": 11875
2958
+ },
2959
+ {
2960
+ "epoch": 699.88,
2961
+ "learning_rate": 8.617931034482758e-07,
2962
+ "loss": 0.0002,
2963
+ "step": 11900
2964
+ },
2965
+ {
2966
+ "epoch": 701.35,
2967
+ "learning_rate": 8.548965517241378e-07,
2968
+ "loss": 0.0003,
2969
+ "step": 11925
2970
+ },
2971
+ {
2972
+ "epoch": 702.82,
2973
+ "learning_rate": 8.48e-07,
2974
+ "loss": 0.0002,
2975
+ "step": 11950
2976
+ },
2977
+ {
2978
+ "epoch": 704.29,
2979
+ "learning_rate": 8.41103448275862e-07,
2980
+ "loss": 0.0002,
2981
+ "step": 11975
2982
+ },
2983
+ {
2984
+ "epoch": 705.76,
2985
+ "learning_rate": 8.34206896551724e-07,
2986
+ "loss": 0.0003,
2987
+ "step": 12000
2988
+ },
2989
+ {
2990
+ "epoch": 705.76,
2991
+ "eval_loss": 0.55615234375,
2992
+ "eval_runtime": 158.1148,
2993
+ "eval_samples_per_second": 1.72,
2994
+ "eval_steps_per_second": 0.108,
2995
+ "eval_wer": 9.9832838038633,
2996
+ "step": 12000
2997
+ }
2998
+ ],
2999
+ "max_steps": 15000,
3000
+ "num_train_epochs": 883,
3001
+ "total_flos": 3.7112868308030325e+20,
3002
+ "trial_name": null,
3003
+ "trial_params": null
3004
+ }
checkpoint-12000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ace94bd686a3b86a956daef6b1762ce143186f5cba98160acd0413dfd90d6e56
3
+ size 4795
checkpoint-12000/zero_to_fp32.py ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
4
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
5
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
6
+ # application.
7
+ #
8
+ # example: python zero_to_fp32.py . pytorch_model.bin
9
+
10
+ import argparse
11
+ import torch
12
+ import glob
13
+ import math
14
+ import os
15
+ import re
16
+ from collections import OrderedDict
17
+
18
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
19
+ # DeepSpeed data structures it has to be available in the current python environment.
20
+ from deepspeed.utils import logger
21
+ from deepspeed.checkpoint.constants import (DS_VERSION,
22
+ OPTIMIZER_STATE_DICT,
23
+ SINGLE_PARTITION_OF_FP32_GROUPS,
24
+ FP32_FLAT_GROUPS,
25
+ ZERO_STAGE,
26
+ PARTITION_COUNT,
27
+ PARAM_SHAPES,
28
+ BUFFER_NAMES)
29
+
30
+ debug = 0
31
+
32
+ # load to cpu
33
+ device = torch.device('cpu')
34
+
35
+
36
+ def atoi(text):
37
+ return int(text) if text.isdigit() else text
38
+
39
+
40
+ def natural_keys(text):
41
+ '''
42
+ alist.sort(key=natural_keys) sorts in human order
43
+ http://nedbatchelder.com/blog/200712/human_sorting.html
44
+ (See Toothy's implementation in the comments)
45
+ '''
46
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
47
+
48
+
49
+ def get_model_state_file(checkpoint_dir, zero_stage):
50
+ if not os.path.isdir(checkpoint_dir):
51
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
52
+
53
+ # there should be only one file
54
+ if zero_stage == 2:
55
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
56
+ elif zero_stage == 3:
57
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
58
+
59
+ if not os.path.exists(file):
60
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
61
+
62
+ return file
63
+
64
+
65
+ def get_optim_files(checkpoint_dir):
66
+ # XXX: need to test that this simple glob rule works for multi-node setup too
67
+ optim_files = sorted(glob.glob(os.path.join(checkpoint_dir,
68
+ "*_optim_states.pt")),
69
+ key=natural_keys)
70
+
71
+ if len(optim_files) == 0:
72
+ raise FileNotFoundError(
73
+ f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")
74
+
75
+ return optim_files
76
+
77
+
78
+ def parse_model_state(file):
79
+ state_dict = torch.load(file, map_location=device)
80
+
81
+ if BUFFER_NAMES not in state_dict:
82
+ raise ValueError(f"{file} is not a model state checkpoint")
83
+ buffer_names = state_dict[BUFFER_NAMES]
84
+ if debug:
85
+ print("Found buffers:", buffer_names)
86
+
87
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
88
+ buffers = {
89
+ k: v.float()
90
+ for k,
91
+ v in state_dict["module"].items() if k in buffer_names
92
+ }
93
+ param_shapes = state_dict[PARAM_SHAPES]
94
+
95
+ ds_version = state_dict.get(DS_VERSION, None)
96
+
97
+ return buffers, param_shapes, ds_version
98
+
99
+
100
+ def parse_optim_states(files, ds_checkpoint_dir):
101
+
102
+ total_files = len(files)
103
+ state_dicts = []
104
+ for f in files:
105
+ state_dicts.append(torch.load(f, map_location=device))
106
+
107
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
108
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
109
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
110
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
111
+
112
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
113
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
114
+ # use the max of the partition_count to get the dp world_size.
115
+
116
+ if type(world_size) is list:
117
+ world_size = max(world_size)
118
+
119
+ if world_size != total_files:
120
+ raise ValueError(
121
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
122
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
123
+ )
124
+
125
+ # the groups are named differently in each stage
126
+ if zero_stage == 2:
127
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
128
+ elif zero_stage == 3:
129
+ fp32_groups_key = FP32_FLAT_GROUPS
130
+ else:
131
+ raise ValueError(f"unknown zero stage {zero_stage}")
132
+
133
+ if zero_stage == 2:
134
+ fp32_flat_groups = [
135
+ state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key]
136
+ for i in range(len(state_dicts))
137
+ ]
138
+ elif zero_stage == 3:
139
+ # if there is more than one param group, there will be multiple flattened tensors - one
140
+ # flattened tensor per group - for simplicity merge them into a single tensor
141
+ #
142
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
143
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
144
+
145
+ fp32_flat_groups = [
146
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key],
147
+ 0) for i in range(len(state_dicts))
148
+ ]
149
+
150
+ return zero_stage, world_size, fp32_flat_groups
151
+
152
+
153
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
154
+ """
155
+ Returns fp32 state_dict reconstructed from ds checkpoint
156
+
157
+ Args:
158
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
159
+
160
+ """
161
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
162
+
163
+ optim_files = get_optim_files(ds_checkpoint_dir)
164
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
165
+ print(
166
+ f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
167
+
168
+ model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
169
+ buffers, param_shapes, ds_version = parse_model_state(model_file)
170
+ print(f'Parsing checkpoint created by deepspeed=={ds_version}')
171
+
172
+ if zero_stage == 2:
173
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size,
174
+ param_shapes,
175
+ fp32_flat_groups,
176
+ buffers)
177
+ elif zero_stage == 3:
178
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size,
179
+ param_shapes,
180
+ fp32_flat_groups,
181
+ buffers)
182
+
183
+
184
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
185
+ param_shapes,
186
+ fp32_flat_groups,
187
+ buffers):
188
+
189
+ # Reconstruction protocol:
190
+ #
191
+ # XXX: document this
192
+
193
+ if debug:
194
+ for i in range(world_size):
195
+ for j in range(len(fp32_flat_groups[0])):
196
+ print(
197
+ f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
198
+
199
+ # XXX: memory usage doubles here (zero2)
200
+ num_param_groups = len(fp32_flat_groups[0])
201
+ merged_single_partition_of_fp32_groups = []
202
+ for i in range(num_param_groups):
203
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
204
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
205
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
206
+ avail_numel = sum([
207
+ full_single_fp32_vector.numel()
208
+ for full_single_fp32_vector in merged_single_partition_of_fp32_groups
209
+ ])
210
+
211
+ if debug:
212
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
213
+ wanted_numel = sum(
214
+ [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
215
+ # not asserting if there is a mismatch due to possible padding
216
+ print(f"Have {avail_numel} numels to process.")
217
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
218
+
219
+ state_dict = OrderedDict()
220
+
221
+ # buffers
222
+ state_dict.update(buffers)
223
+ if debug:
224
+ print(f"added {len(buffers)} buffers")
225
+
226
+ # params
227
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
228
+ # out-of-core computing solution
229
+ total_numel = 0
230
+ total_params = 0
231
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
232
+ offset = 0
233
+ avail_numel = full_single_fp32_vector.numel()
234
+ for name, shape in shapes.items():
235
+
236
+ unpartitioned_numel = shape.numel()
237
+ total_numel += unpartitioned_numel
238
+ total_params += 1
239
+
240
+ if debug:
241
+ print(
242
+ f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
243
+ )
244
+ state_dict[name] = full_single_fp32_vector.narrow(
245
+ 0,
246
+ offset,
247
+ unpartitioned_numel).view(shape)
248
+ offset += unpartitioned_numel
249
+
250
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
251
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
252
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
253
+ # live optimizer object, so we are checking that the numbers are within the right range
254
+ align_to = 2 * world_size
255
+
256
+ def zero2_align(x):
257
+ return align_to * math.ceil(x / align_to)
258
+
259
+ if debug:
260
+ print(f"original offset={offset}, avail_numel={avail_numel}")
261
+
262
+ offset = zero2_align(offset)
263
+ avail_numel = zero2_align(avail_numel)
264
+
265
+ if debug:
266
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
267
+
268
+ # Sanity check
269
+ if offset != avail_numel:
270
+ raise ValueError(
271
+ f"consumed {offset} numels out of {avail_numel} - something is wrong")
272
+
273
+ print(
274
+ f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
275
+ )
276
+
277
+ return state_dict
278
+
279
+
280
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
281
+ remainder = unpartitioned_numel % world_size
282
+ padding_numel = (world_size - remainder) if remainder else 0
283
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
284
+ return partitioned_numel, padding_numel
285
+
286
+
287
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
288
+ param_shapes,
289
+ fp32_flat_groups,
290
+ buffers):
291
+
292
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
293
+ # param, re-consolidating each param, while dealing with padding if any
294
+
295
+ avail_numel = fp32_flat_groups[0].numel() * world_size
296
+ # merge list of dicts, preserving order
297
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
298
+
299
+ if debug:
300
+ for i in range(world_size):
301
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
302
+
303
+ wanted_params = len(param_shapes)
304
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
305
+ # not asserting if there is a mismatch due to possible padding
306
+ print(f"Have {avail_numel} numels to process.")
307
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
308
+
309
+ state_dict = OrderedDict()
310
+
311
+ # buffers
312
+ state_dict.update(buffers)
313
+ if debug:
314
+ print(f"added {len(buffers)} buffers")
315
+
316
+ # params
317
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
318
+ # out-of-core computing solution
319
+ offset = 0
320
+ total_numel = 0
321
+ total_params = 0
322
+ for name, shape in param_shapes.items():
323
+
324
+ unpartitioned_numel = shape.numel()
325
+ total_numel += unpartitioned_numel
326
+ total_params += 1
327
+
328
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
329
+
330
+ if debug:
331
+ print(
332
+ f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
333
+ )
334
+
335
+ # XXX: memory usage doubles here
336
+ state_dict[name] = torch.cat(
337
+ tuple(fp32_flat_groups[i].narrow(0,
338
+ offset,
339
+ partitioned_numel)
340
+ for i in range(world_size)),
341
+ 0).narrow(0,
342
+ 0,
343
+ unpartitioned_numel).view(shape)
344
+ offset += partitioned_numel
345
+
346
+ offset *= world_size
347
+
348
+ # Sanity check
349
+ if offset != avail_numel:
350
+ raise ValueError(
351
+ f"consumed {offset} numels out of {avail_numel} - something is wrong")
352
+
353
+ print(
354
+ f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
355
+ )
356
+
357
+ return state_dict
358
+
359
+
360
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
361
+ """
362
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
363
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
364
+ via a model hub.
365
+
366
+ Args:
367
+ - ``checkpoint_dir``: path to the desired checkpoint folder
368
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
369
+
370
+ Returns:
371
+ - pytorch ``state_dict``
372
+
373
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
374
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
375
+ the checkpoint.
376
+
377
+ A typical usage might be ::
378
+
379
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
380
+ # do the training and checkpoint saving
381
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
382
+ model = model.cpu() # move to cpu
383
+ model.load_state_dict(state_dict)
384
+ # submit to model hub or save the model to share with others
385
+
386
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
387
+ application. i.e. you will need to re-initialize the deepspeed engine, since
388
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
389
+
390
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
391
+
392
+ """
393
+ if tag is None:
394
+ latest_path = os.path.join(checkpoint_dir, 'latest')
395
+ if os.path.isfile(latest_path):
396
+ with open(latest_path, 'r') as fd:
397
+ tag = fd.read().strip()
398
+ else:
399
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
400
+
401
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
402
+
403
+ if not os.path.isdir(ds_checkpoint_dir):
404
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
405
+
406
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
407
+
408
+
409
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
410
+ """
411
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
412
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
413
+
414
+ Args:
415
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
416
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
417
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
418
+ """
419
+
420
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
421
+ print(f"Saving fp32 state dict to {output_file}")
422
+ torch.save(state_dict, output_file)
423
+
424
+
425
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
426
+ """
427
+ 1. Put the provided model to cpu
428
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
429
+ 3. Load it into the provided model
430
+
431
+ Args:
432
+ - ``model``: the model object to update
433
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
434
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
435
+
436
+ Returns:
437
+ - ``model`: modified model
438
+
439
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
440
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
441
+ conveniently placed for you in the checkpoint folder.
442
+
443
+ A typical usage might be ::
444
+
445
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
446
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
447
+ # submit to model hub or save the model to share with others
448
+
449
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
450
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
451
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
452
+
453
+ """
454
+ logger.info(f"Extracting fp32 weights")
455
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
456
+
457
+ logger.info(f"Overwriting model with fp32 weights")
458
+ model = model.cpu()
459
+ model.load_state_dict(state_dict, strict=False)
460
+
461
+ return model
462
+
463
+
464
+ if __name__ == "__main__":
465
+
466
+ parser = argparse.ArgumentParser()
467
+ parser.add_argument(
468
+ "checkpoint_dir",
469
+ type=str,
470
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
471
+ parser.add_argument(
472
+ "output_file",
473
+ type=str,
474
+ help=
475
+ "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)"
476
+ )
477
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
478
+ args = parser.parse_args()
479
+
480
+ debug = args.debug
481
+
482
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
checkpoint-17000/config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "emilios/whisper-medium-el-n2",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50257
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 24,
19
+ "decoder_start_token_id": 50258,
20
+ "dropout": 0.1,
21
+ "encoder_attention_heads": 16,
22
+ "encoder_ffn_dim": 4096,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 24,
25
+ "eos_token_id": 50257,
26
+ "forced_decoder_ids": null,
27
+ "init_std": 0.02,
28
+ "is_encoder_decoder": true,
29
+ "max_length": 448,
30
+ "max_source_positions": 1500,
31
+ "max_target_positions": 448,
32
+ "model_type": "whisper",
33
+ "num_hidden_layers": 24,
34
+ "num_mel_bins": 80,
35
+ "pad_token_id": 50257,
36
+ "scale_embedding": false,
37
+ "torch_dtype": "float16",
38
+ "transformers_version": "4.26.0.dev0",
39
+ "use_cache": false,
40
+ "vocab_size": 51865
41
+ }
checkpoint-17000/global_step17000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96607ca6b5b82f3f0d2424f2865fcce2168723ca17a494e51444d00f0dcaf232
3
+ size 1527967899
checkpoint-17000/global_step17000/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:032229e6f538be4579c8648b034364d5f2107de2701e361cde50d96bfe61d009
3
+ size 9166378846
checkpoint-17000/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step17000
checkpoint-17000/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-17000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9fa2de829d24e99eb20c8dd7280e503f96b588ca8051fab51af5f6ba304a842
3
+ size 1527847357
checkpoint-17000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ec4fc26fdaa6459af915c01326877a645e4e4b0be7d9a250bf019a689d1dba3
3
+ size 14639
checkpoint-17000/trainer_state.json ADDED
@@ -0,0 +1,4249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 9.778974739970282,
3
+ "best_model_checkpoint": "./checkpoint-9000",
4
+ "epoch": 999.4705882352941,
5
+ "global_step": 17000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 2.78,
12
+ "learning_rate": 5.0453611334320685e-06,
13
+ "loss": 0.6804,
14
+ "step": 25
15
+ },
16
+ {
17
+ "epoch": 5.56,
18
+ "learning_rate": 6.229195710491767e-06,
19
+ "loss": 0.1847,
20
+ "step": 50
21
+ },
22
+ {
23
+ "epoch": 8.33,
24
+ "learning_rate": 6.903829450223392e-06,
25
+ "loss": 0.0821,
26
+ "step": 75
27
+ },
28
+ {
29
+ "epoch": 11.11,
30
+ "learning_rate": 7.377725845391017e-06,
31
+ "loss": 0.0485,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 13.89,
36
+ "learning_rate": 7.743343231239583e-06,
37
+ "loss": 0.0432,
38
+ "step": 125
39
+ },
40
+ {
41
+ "epoch": 16.67,
42
+ "learning_rate": 8.041073861170494e-06,
43
+ "loss": 0.0328,
44
+ "step": 150
45
+ },
46
+ {
47
+ "epoch": 19.44,
48
+ "learning_rate": 8.292222957399574e-06,
49
+ "loss": 0.0291,
50
+ "step": 175
51
+ },
52
+ {
53
+ "epoch": 22.22,
54
+ "learning_rate": 8.509413541357755e-06,
55
+ "loss": 0.0298,
56
+ "step": 200
57
+ },
58
+ {
59
+ "epoch": 25.0,
60
+ "learning_rate": 8.700744577655557e-06,
61
+ "loss": 0.0269,
62
+ "step": 225
63
+ },
64
+ {
65
+ "epoch": 27.78,
66
+ "learning_rate": 8.871723942761204e-06,
67
+ "loss": 0.0272,
68
+ "step": 250
69
+ },
70
+ {
71
+ "epoch": 30.56,
72
+ "learning_rate": 9.026267958246849e-06,
73
+ "loss": 0.027,
74
+ "step": 275
75
+ },
76
+ {
77
+ "epoch": 33.33,
78
+ "learning_rate": 9.16726106663399e-06,
79
+ "loss": 0.0213,
80
+ "step": 300
81
+ },
82
+ {
83
+ "epoch": 36.11,
84
+ "learning_rate": 9.296889251455016e-06,
85
+ "loss": 0.0215,
86
+ "step": 325
87
+ },
88
+ {
89
+ "epoch": 38.89,
90
+ "learning_rate": 9.416848797368692e-06,
91
+ "loss": 0.0195,
92
+ "step": 350
93
+ },
94
+ {
95
+ "epoch": 41.67,
96
+ "learning_rate": 9.528482449516371e-06,
97
+ "loss": 0.0167,
98
+ "step": 375
99
+ },
100
+ {
101
+ "epoch": 44.44,
102
+ "learning_rate": 9.632871309784314e-06,
103
+ "loss": 0.0184,
104
+ "step": 400
105
+ },
106
+ {
107
+ "epoch": 47.22,
108
+ "learning_rate": 9.73089868785391e-06,
109
+ "loss": 0.0159,
110
+ "step": 425
111
+ },
112
+ {
113
+ "epoch": 50.0,
114
+ "learning_rate": 9.823295589572114e-06,
115
+ "loss": 0.0172,
116
+ "step": 450
117
+ },
118
+ {
119
+ "epoch": 52.78,
120
+ "learning_rate": 9.910673836465484e-06,
121
+ "loss": 0.0123,
122
+ "step": 475
123
+ },
124
+ {
125
+ "epoch": 55.56,
126
+ "learning_rate": 9.993550644973805e-06,
127
+ "loss": 0.0144,
128
+ "step": 500
129
+ },
130
+ {
131
+ "epoch": 58.33,
132
+ "learning_rate": 9.951111111111111e-06,
133
+ "loss": 0.0135,
134
+ "step": 525
135
+ },
136
+ {
137
+ "epoch": 61.11,
138
+ "learning_rate": 9.895555555555557e-06,
139
+ "loss": 0.0128,
140
+ "step": 550
141
+ },
142
+ {
143
+ "epoch": 63.89,
144
+ "learning_rate": 9.84e-06,
145
+ "loss": 0.0115,
146
+ "step": 575
147
+ },
148
+ {
149
+ "epoch": 66.67,
150
+ "learning_rate": 9.784444444444445e-06,
151
+ "loss": 0.0105,
152
+ "step": 600
153
+ },
154
+ {
155
+ "epoch": 69.44,
156
+ "learning_rate": 9.72888888888889e-06,
157
+ "loss": 0.0104,
158
+ "step": 625
159
+ },
160
+ {
161
+ "epoch": 72.22,
162
+ "learning_rate": 9.673333333333334e-06,
163
+ "loss": 0.0087,
164
+ "step": 650
165
+ },
166
+ {
167
+ "epoch": 75.0,
168
+ "learning_rate": 9.617777777777778e-06,
169
+ "loss": 0.0091,
170
+ "step": 675
171
+ },
172
+ {
173
+ "epoch": 77.78,
174
+ "learning_rate": 9.562222222222223e-06,
175
+ "loss": 0.0085,
176
+ "step": 700
177
+ },
178
+ {
179
+ "epoch": 80.56,
180
+ "learning_rate": 9.506666666666667e-06,
181
+ "loss": 0.011,
182
+ "step": 725
183
+ },
184
+ {
185
+ "epoch": 83.33,
186
+ "learning_rate": 9.451111111111112e-06,
187
+ "loss": 0.0117,
188
+ "step": 750
189
+ },
190
+ {
191
+ "epoch": 86.11,
192
+ "learning_rate": 9.395555555555556e-06,
193
+ "loss": 0.0088,
194
+ "step": 775
195
+ },
196
+ {
197
+ "epoch": 88.89,
198
+ "learning_rate": 9.340000000000002e-06,
199
+ "loss": 0.0077,
200
+ "step": 800
201
+ },
202
+ {
203
+ "epoch": 91.67,
204
+ "learning_rate": 9.284444444444444e-06,
205
+ "loss": 0.0091,
206
+ "step": 825
207
+ },
208
+ {
209
+ "epoch": 94.44,
210
+ "learning_rate": 9.22888888888889e-06,
211
+ "loss": 0.0067,
212
+ "step": 850
213
+ },
214
+ {
215
+ "epoch": 97.22,
216
+ "learning_rate": 9.173333333333334e-06,
217
+ "loss": 0.0082,
218
+ "step": 875
219
+ },
220
+ {
221
+ "epoch": 100.0,
222
+ "learning_rate": 9.117777777777778e-06,
223
+ "loss": 0.0055,
224
+ "step": 900
225
+ },
226
+ {
227
+ "epoch": 102.78,
228
+ "learning_rate": 9.062222222222224e-06,
229
+ "loss": 0.0077,
230
+ "step": 925
231
+ },
232
+ {
233
+ "epoch": 105.56,
234
+ "learning_rate": 9.006666666666666e-06,
235
+ "loss": 0.0055,
236
+ "step": 950
237
+ },
238
+ {
239
+ "epoch": 108.33,
240
+ "learning_rate": 8.951111111111112e-06,
241
+ "loss": 0.005,
242
+ "step": 975
243
+ },
244
+ {
245
+ "epoch": 111.11,
246
+ "learning_rate": 8.895555555555556e-06,
247
+ "loss": 0.0066,
248
+ "step": 1000
249
+ },
250
+ {
251
+ "epoch": 111.11,
252
+ "eval_loss": 0.2357177734375,
253
+ "eval_runtime": 64.7785,
254
+ "eval_samples_per_second": 2.022,
255
+ "eval_steps_per_second": 0.139,
256
+ "eval_wer": 23.044096728307252,
257
+ "step": 1000
258
+ },
259
+ {
260
+ "epoch": 113.89,
261
+ "learning_rate": 8.844444444444445e-06,
262
+ "loss": 0.0057,
263
+ "step": 1025
264
+ },
265
+ {
266
+ "epoch": 116.67,
267
+ "learning_rate": 8.788888888888891e-06,
268
+ "loss": 0.0096,
269
+ "step": 1050
270
+ },
271
+ {
272
+ "epoch": 119.44,
273
+ "learning_rate": 8.733333333333333e-06,
274
+ "loss": 0.0063,
275
+ "step": 1075
276
+ },
277
+ {
278
+ "epoch": 122.22,
279
+ "learning_rate": 8.677777777777779e-06,
280
+ "loss": 0.0069,
281
+ "step": 1100
282
+ },
283
+ {
284
+ "epoch": 125.0,
285
+ "learning_rate": 8.622222222222223e-06,
286
+ "loss": 0.0069,
287
+ "step": 1125
288
+ },
289
+ {
290
+ "epoch": 127.78,
291
+ "learning_rate": 8.566666666666667e-06,
292
+ "loss": 0.0046,
293
+ "step": 1150
294
+ },
295
+ {
296
+ "epoch": 130.56,
297
+ "learning_rate": 8.511111111111113e-06,
298
+ "loss": 0.0051,
299
+ "step": 1175
300
+ },
301
+ {
302
+ "epoch": 133.33,
303
+ "learning_rate": 8.455555555555555e-06,
304
+ "loss": 0.0055,
305
+ "step": 1200
306
+ },
307
+ {
308
+ "epoch": 136.11,
309
+ "learning_rate": 8.400000000000001e-06,
310
+ "loss": 0.0042,
311
+ "step": 1225
312
+ },
313
+ {
314
+ "epoch": 138.89,
315
+ "learning_rate": 8.344444444444445e-06,
316
+ "loss": 0.0042,
317
+ "step": 1250
318
+ },
319
+ {
320
+ "epoch": 141.67,
321
+ "learning_rate": 8.288888888888889e-06,
322
+ "loss": 0.005,
323
+ "step": 1275
324
+ },
325
+ {
326
+ "epoch": 144.44,
327
+ "learning_rate": 8.233333333333335e-06,
328
+ "loss": 0.0054,
329
+ "step": 1300
330
+ },
331
+ {
332
+ "epoch": 147.22,
333
+ "learning_rate": 8.177777777777779e-06,
334
+ "loss": 0.0052,
335
+ "step": 1325
336
+ },
337
+ {
338
+ "epoch": 150.0,
339
+ "learning_rate": 8.122222222222223e-06,
340
+ "loss": 0.0057,
341
+ "step": 1350
342
+ },
343
+ {
344
+ "epoch": 152.78,
345
+ "learning_rate": 8.066666666666667e-06,
346
+ "loss": 0.0039,
347
+ "step": 1375
348
+ },
349
+ {
350
+ "epoch": 155.56,
351
+ "learning_rate": 8.011111111111113e-06,
352
+ "loss": 0.0032,
353
+ "step": 1400
354
+ },
355
+ {
356
+ "epoch": 158.33,
357
+ "learning_rate": 7.955555555555557e-06,
358
+ "loss": 0.0034,
359
+ "step": 1425
360
+ },
361
+ {
362
+ "epoch": 161.11,
363
+ "learning_rate": 7.902222222222223e-06,
364
+ "loss": 0.0068,
365
+ "step": 1450
366
+ },
367
+ {
368
+ "epoch": 163.89,
369
+ "learning_rate": 7.846666666666667e-06,
370
+ "loss": 0.0034,
371
+ "step": 1475
372
+ },
373
+ {
374
+ "epoch": 166.67,
375
+ "learning_rate": 7.791111111111111e-06,
376
+ "loss": 0.0026,
377
+ "step": 1500
378
+ },
379
+ {
380
+ "epoch": 169.44,
381
+ "learning_rate": 7.735555555555557e-06,
382
+ "loss": 0.0036,
383
+ "step": 1525
384
+ },
385
+ {
386
+ "epoch": 172.22,
387
+ "learning_rate": 7.680000000000001e-06,
388
+ "loss": 0.0033,
389
+ "step": 1550
390
+ },
391
+ {
392
+ "epoch": 175.0,
393
+ "learning_rate": 7.624444444444445e-06,
394
+ "loss": 0.0021,
395
+ "step": 1575
396
+ },
397
+ {
398
+ "epoch": 177.78,
399
+ "learning_rate": 7.56888888888889e-06,
400
+ "loss": 0.0033,
401
+ "step": 1600
402
+ },
403
+ {
404
+ "epoch": 180.56,
405
+ "learning_rate": 7.513333333333334e-06,
406
+ "loss": 0.0037,
407
+ "step": 1625
408
+ },
409
+ {
410
+ "epoch": 183.33,
411
+ "learning_rate": 7.457777777777778e-06,
412
+ "loss": 0.0032,
413
+ "step": 1650
414
+ },
415
+ {
416
+ "epoch": 186.11,
417
+ "learning_rate": 7.402222222222223e-06,
418
+ "loss": 0.0037,
419
+ "step": 1675
420
+ },
421
+ {
422
+ "epoch": 188.89,
423
+ "learning_rate": 7.346666666666668e-06,
424
+ "loss": 0.0022,
425
+ "step": 1700
426
+ },
427
+ {
428
+ "epoch": 191.67,
429
+ "learning_rate": 7.291111111111112e-06,
430
+ "loss": 0.0024,
431
+ "step": 1725
432
+ },
433
+ {
434
+ "epoch": 194.44,
435
+ "learning_rate": 7.235555555555556e-06,
436
+ "loss": 0.0026,
437
+ "step": 1750
438
+ },
439
+ {
440
+ "epoch": 197.22,
441
+ "learning_rate": 7.180000000000001e-06,
442
+ "loss": 0.0022,
443
+ "step": 1775
444
+ },
445
+ {
446
+ "epoch": 200.0,
447
+ "learning_rate": 7.124444444444445e-06,
448
+ "loss": 0.0026,
449
+ "step": 1800
450
+ },
451
+ {
452
+ "epoch": 202.78,
453
+ "learning_rate": 7.06888888888889e-06,
454
+ "loss": 0.0032,
455
+ "step": 1825
456
+ },
457
+ {
458
+ "epoch": 205.56,
459
+ "learning_rate": 7.0133333333333345e-06,
460
+ "loss": 0.0033,
461
+ "step": 1850
462
+ },
463
+ {
464
+ "epoch": 208.33,
465
+ "learning_rate": 6.9577777777777785e-06,
466
+ "loss": 0.0027,
467
+ "step": 1875
468
+ },
469
+ {
470
+ "epoch": 211.11,
471
+ "learning_rate": 6.902222222222223e-06,
472
+ "loss": 0.0043,
473
+ "step": 1900
474
+ },
475
+ {
476
+ "epoch": 213.89,
477
+ "learning_rate": 6.846666666666667e-06,
478
+ "loss": 0.0028,
479
+ "step": 1925
480
+ },
481
+ {
482
+ "epoch": 216.67,
483
+ "learning_rate": 6.7911111111111115e-06,
484
+ "loss": 0.0012,
485
+ "step": 1950
486
+ },
487
+ {
488
+ "epoch": 219.44,
489
+ "learning_rate": 6.735555555555556e-06,
490
+ "loss": 0.0015,
491
+ "step": 1975
492
+ },
493
+ {
494
+ "epoch": 222.22,
495
+ "learning_rate": 6.680000000000001e-06,
496
+ "loss": 0.0024,
497
+ "step": 2000
498
+ },
499
+ {
500
+ "epoch": 222.22,
501
+ "eval_loss": 0.2607421875,
502
+ "eval_runtime": 57.0802,
503
+ "eval_samples_per_second": 2.295,
504
+ "eval_steps_per_second": 0.158,
505
+ "eval_wer": 19.665718349928877,
506
+ "step": 2000
507
+ },
508
+ {
509
+ "epoch": 225.0,
510
+ "learning_rate": 6.6244444444444445e-06,
511
+ "loss": 0.0029,
512
+ "step": 2025
513
+ },
514
+ {
515
+ "epoch": 227.78,
516
+ "learning_rate": 6.568888888888889e-06,
517
+ "loss": 0.0021,
518
+ "step": 2050
519
+ },
520
+ {
521
+ "epoch": 230.56,
522
+ "learning_rate": 6.513333333333333e-06,
523
+ "loss": 0.0022,
524
+ "step": 2075
525
+ },
526
+ {
527
+ "epoch": 233.33,
528
+ "learning_rate": 6.457777777777778e-06,
529
+ "loss": 0.0022,
530
+ "step": 2100
531
+ },
532
+ {
533
+ "epoch": 236.11,
534
+ "learning_rate": 6.402222222222223e-06,
535
+ "loss": 0.0011,
536
+ "step": 2125
537
+ },
538
+ {
539
+ "epoch": 238.89,
540
+ "learning_rate": 6.346666666666668e-06,
541
+ "loss": 0.0026,
542
+ "step": 2150
543
+ },
544
+ {
545
+ "epoch": 241.67,
546
+ "learning_rate": 6.291111111111111e-06,
547
+ "loss": 0.0021,
548
+ "step": 2175
549
+ },
550
+ {
551
+ "epoch": 244.44,
552
+ "learning_rate": 6.235555555555556e-06,
553
+ "loss": 0.0016,
554
+ "step": 2200
555
+ },
556
+ {
557
+ "epoch": 247.22,
558
+ "learning_rate": 6.18e-06,
559
+ "loss": 0.0024,
560
+ "step": 2225
561
+ },
562
+ {
563
+ "epoch": 250.0,
564
+ "learning_rate": 6.124444444444445e-06,
565
+ "loss": 0.0046,
566
+ "step": 2250
567
+ },
568
+ {
569
+ "epoch": 252.78,
570
+ "learning_rate": 6.06888888888889e-06,
571
+ "loss": 0.0018,
572
+ "step": 2275
573
+ },
574
+ {
575
+ "epoch": 255.56,
576
+ "learning_rate": 6.013333333333335e-06,
577
+ "loss": 0.0012,
578
+ "step": 2300
579
+ },
580
+ {
581
+ "epoch": 258.33,
582
+ "learning_rate": 5.957777777777778e-06,
583
+ "loss": 0.0014,
584
+ "step": 2325
585
+ },
586
+ {
587
+ "epoch": 261.11,
588
+ "learning_rate": 5.902222222222223e-06,
589
+ "loss": 0.0007,
590
+ "step": 2350
591
+ },
592
+ {
593
+ "epoch": 263.89,
594
+ "learning_rate": 5.846666666666667e-06,
595
+ "loss": 0.0014,
596
+ "step": 2375
597
+ },
598
+ {
599
+ "epoch": 266.67,
600
+ "learning_rate": 5.791111111111112e-06,
601
+ "loss": 0.0009,
602
+ "step": 2400
603
+ },
604
+ {
605
+ "epoch": 269.44,
606
+ "learning_rate": 5.735555555555557e-06,
607
+ "loss": 0.0008,
608
+ "step": 2425
609
+ },
610
+ {
611
+ "epoch": 272.22,
612
+ "learning_rate": 5.68e-06,
613
+ "loss": 0.0028,
614
+ "step": 2450
615
+ },
616
+ {
617
+ "epoch": 275.0,
618
+ "learning_rate": 5.624444444444445e-06,
619
+ "loss": 0.002,
620
+ "step": 2475
621
+ },
622
+ {
623
+ "epoch": 277.78,
624
+ "learning_rate": 5.56888888888889e-06,
625
+ "loss": 0.0011,
626
+ "step": 2500
627
+ },
628
+ {
629
+ "epoch": 280.56,
630
+ "learning_rate": 5.513333333333334e-06,
631
+ "loss": 0.001,
632
+ "step": 2525
633
+ },
634
+ {
635
+ "epoch": 283.33,
636
+ "learning_rate": 5.4577777777777785e-06,
637
+ "loss": 0.0007,
638
+ "step": 2550
639
+ },
640
+ {
641
+ "epoch": 286.11,
642
+ "learning_rate": 5.402222222222223e-06,
643
+ "loss": 0.0007,
644
+ "step": 2575
645
+ },
646
+ {
647
+ "epoch": 288.89,
648
+ "learning_rate": 5.346666666666667e-06,
649
+ "loss": 0.0008,
650
+ "step": 2600
651
+ },
652
+ {
653
+ "epoch": 291.67,
654
+ "learning_rate": 5.2911111111111115e-06,
655
+ "loss": 0.0012,
656
+ "step": 2625
657
+ },
658
+ {
659
+ "epoch": 294.44,
660
+ "learning_rate": 5.235555555555556e-06,
661
+ "loss": 0.0016,
662
+ "step": 2650
663
+ },
664
+ {
665
+ "epoch": 297.22,
666
+ "learning_rate": 5.18e-06,
667
+ "loss": 0.0012,
668
+ "step": 2675
669
+ },
670
+ {
671
+ "epoch": 300.0,
672
+ "learning_rate": 5.124444444444445e-06,
673
+ "loss": 0.001,
674
+ "step": 2700
675
+ },
676
+ {
677
+ "epoch": 302.78,
678
+ "learning_rate": 5.06888888888889e-06,
679
+ "loss": 0.0012,
680
+ "step": 2725
681
+ },
682
+ {
683
+ "epoch": 305.56,
684
+ "learning_rate": 5.013333333333333e-06,
685
+ "loss": 0.001,
686
+ "step": 2750
687
+ },
688
+ {
689
+ "epoch": 308.33,
690
+ "learning_rate": 4.957777777777778e-06,
691
+ "loss": 0.0013,
692
+ "step": 2775
693
+ },
694
+ {
695
+ "epoch": 311.11,
696
+ "learning_rate": 4.902222222222222e-06,
697
+ "loss": 0.0015,
698
+ "step": 2800
699
+ },
700
+ {
701
+ "epoch": 313.89,
702
+ "learning_rate": 4.846666666666667e-06,
703
+ "loss": 0.0014,
704
+ "step": 2825
705
+ },
706
+ {
707
+ "epoch": 316.67,
708
+ "learning_rate": 4.791111111111111e-06,
709
+ "loss": 0.0007,
710
+ "step": 2850
711
+ },
712
+ {
713
+ "epoch": 319.44,
714
+ "learning_rate": 4.735555555555556e-06,
715
+ "loss": 0.0009,
716
+ "step": 2875
717
+ },
718
+ {
719
+ "epoch": 322.22,
720
+ "learning_rate": 4.680000000000001e-06,
721
+ "loss": 0.0021,
722
+ "step": 2900
723
+ },
724
+ {
725
+ "epoch": 325.0,
726
+ "learning_rate": 4.624444444444445e-06,
727
+ "loss": 0.0015,
728
+ "step": 2925
729
+ },
730
+ {
731
+ "epoch": 327.78,
732
+ "learning_rate": 4.568888888888889e-06,
733
+ "loss": 0.0012,
734
+ "step": 2950
735
+ },
736
+ {
737
+ "epoch": 330.56,
738
+ "learning_rate": 4.513333333333333e-06,
739
+ "loss": 0.0009,
740
+ "step": 2975
741
+ },
742
+ {
743
+ "epoch": 333.33,
744
+ "learning_rate": 4.457777777777778e-06,
745
+ "loss": 0.0011,
746
+ "step": 3000
747
+ },
748
+ {
749
+ "epoch": 333.33,
750
+ "eval_loss": 0.277099609375,
751
+ "eval_runtime": 58.1634,
752
+ "eval_samples_per_second": 2.252,
753
+ "eval_steps_per_second": 0.155,
754
+ "eval_wer": 20.874822190611663,
755
+ "step": 3000
756
+ },
757
+ {
758
+ "epoch": 177.47,
759
+ "learning_rate": 1.760888888888889e-06,
760
+ "loss": 0.5801,
761
+ "step": 3025
762
+ },
763
+ {
764
+ "epoch": 178.94,
765
+ "learning_rate": 1.7386666666666666e-06,
766
+ "loss": 0.1501,
767
+ "step": 3050
768
+ },
769
+ {
770
+ "epoch": 180.41,
771
+ "learning_rate": 1.7164444444444444e-06,
772
+ "loss": 0.0789,
773
+ "step": 3075
774
+ },
775
+ {
776
+ "epoch": 181.88,
777
+ "learning_rate": 1.6942222222222222e-06,
778
+ "loss": 0.0531,
779
+ "step": 3100
780
+ },
781
+ {
782
+ "epoch": 183.35,
783
+ "learning_rate": 1.6719999999999998e-06,
784
+ "loss": 0.0409,
785
+ "step": 3125
786
+ },
787
+ {
788
+ "epoch": 184.82,
789
+ "learning_rate": 1.6497777777777777e-06,
790
+ "loss": 0.032,
791
+ "step": 3150
792
+ },
793
+ {
794
+ "epoch": 186.29,
795
+ "learning_rate": 1.6275555555555555e-06,
796
+ "loss": 0.0251,
797
+ "step": 3175
798
+ },
799
+ {
800
+ "epoch": 187.76,
801
+ "learning_rate": 1.6053333333333333e-06,
802
+ "loss": 0.0203,
803
+ "step": 3200
804
+ },
805
+ {
806
+ "epoch": 189.24,
807
+ "learning_rate": 1.5831111111111111e-06,
808
+ "loss": 0.0167,
809
+ "step": 3225
810
+ },
811
+ {
812
+ "epoch": 190.71,
813
+ "learning_rate": 1.560888888888889e-06,
814
+ "loss": 0.0159,
815
+ "step": 3250
816
+ },
817
+ {
818
+ "epoch": 192.18,
819
+ "learning_rate": 1.5386666666666666e-06,
820
+ "loss": 0.0137,
821
+ "step": 3275
822
+ },
823
+ {
824
+ "epoch": 193.65,
825
+ "learning_rate": 1.5164444444444444e-06,
826
+ "loss": 0.0122,
827
+ "step": 3300
828
+ },
829
+ {
830
+ "epoch": 195.12,
831
+ "learning_rate": 1.494222222222222e-06,
832
+ "loss": 0.0106,
833
+ "step": 3325
834
+ },
835
+ {
836
+ "epoch": 196.59,
837
+ "learning_rate": 1.4719999999999998e-06,
838
+ "loss": 0.0094,
839
+ "step": 3350
840
+ },
841
+ {
842
+ "epoch": 198.06,
843
+ "learning_rate": 1.4497777777777777e-06,
844
+ "loss": 0.009,
845
+ "step": 3375
846
+ },
847
+ {
848
+ "epoch": 199.53,
849
+ "learning_rate": 1.4275555555555555e-06,
850
+ "loss": 0.0104,
851
+ "step": 3400
852
+ },
853
+ {
854
+ "epoch": 201.0,
855
+ "learning_rate": 1.4053333333333333e-06,
856
+ "loss": 0.0069,
857
+ "step": 3425
858
+ },
859
+ {
860
+ "epoch": 202.47,
861
+ "learning_rate": 1.3848888888888889e-06,
862
+ "loss": 0.0073,
863
+ "step": 3450
864
+ },
865
+ {
866
+ "epoch": 203.94,
867
+ "learning_rate": 1.3626666666666667e-06,
868
+ "loss": 0.0073,
869
+ "step": 3475
870
+ },
871
+ {
872
+ "epoch": 205.41,
873
+ "learning_rate": 1.3404444444444445e-06,
874
+ "loss": 0.0063,
875
+ "step": 3500
876
+ },
877
+ {
878
+ "epoch": 206.88,
879
+ "learning_rate": 1.3182222222222221e-06,
880
+ "loss": 0.007,
881
+ "step": 3525
882
+ },
883
+ {
884
+ "epoch": 208.35,
885
+ "learning_rate": 1.296e-06,
886
+ "loss": 0.0061,
887
+ "step": 3550
888
+ },
889
+ {
890
+ "epoch": 209.82,
891
+ "learning_rate": 1.2737777777777776e-06,
892
+ "loss": 0.0053,
893
+ "step": 3575
894
+ },
895
+ {
896
+ "epoch": 211.29,
897
+ "learning_rate": 1.2515555555555554e-06,
898
+ "loss": 0.0056,
899
+ "step": 3600
900
+ },
901
+ {
902
+ "epoch": 212.76,
903
+ "learning_rate": 1.2293333333333334e-06,
904
+ "loss": 0.005,
905
+ "step": 3625
906
+ },
907
+ {
908
+ "epoch": 214.24,
909
+ "learning_rate": 1.207111111111111e-06,
910
+ "loss": 0.0047,
911
+ "step": 3650
912
+ },
913
+ {
914
+ "epoch": 215.71,
915
+ "learning_rate": 1.1848888888888889e-06,
916
+ "loss": 0.0052,
917
+ "step": 3675
918
+ },
919
+ {
920
+ "epoch": 217.18,
921
+ "learning_rate": 1.1626666666666667e-06,
922
+ "loss": 0.0044,
923
+ "step": 3700
924
+ },
925
+ {
926
+ "epoch": 218.65,
927
+ "learning_rate": 1.1404444444444443e-06,
928
+ "loss": 0.0046,
929
+ "step": 3725
930
+ },
931
+ {
932
+ "epoch": 220.12,
933
+ "learning_rate": 1.1182222222222221e-06,
934
+ "loss": 0.0045,
935
+ "step": 3750
936
+ },
937
+ {
938
+ "epoch": 221.59,
939
+ "learning_rate": 1.096e-06,
940
+ "loss": 0.0041,
941
+ "step": 3775
942
+ },
943
+ {
944
+ "epoch": 223.06,
945
+ "learning_rate": 1.0737777777777776e-06,
946
+ "loss": 0.0054,
947
+ "step": 3800
948
+ },
949
+ {
950
+ "epoch": 224.53,
951
+ "learning_rate": 1.0515555555555556e-06,
952
+ "loss": 0.0038,
953
+ "step": 3825
954
+ },
955
+ {
956
+ "epoch": 226.0,
957
+ "learning_rate": 1.0293333333333334e-06,
958
+ "loss": 0.0038,
959
+ "step": 3850
960
+ },
961
+ {
962
+ "epoch": 227.47,
963
+ "learning_rate": 1.007111111111111e-06,
964
+ "loss": 0.004,
965
+ "step": 3875
966
+ },
967
+ {
968
+ "epoch": 228.94,
969
+ "learning_rate": 9.848888888888889e-07,
970
+ "loss": 0.0036,
971
+ "step": 3900
972
+ },
973
+ {
974
+ "epoch": 230.41,
975
+ "learning_rate": 9.626666666666667e-07,
976
+ "loss": 0.0041,
977
+ "step": 3925
978
+ },
979
+ {
980
+ "epoch": 231.88,
981
+ "learning_rate": 9.404444444444443e-07,
982
+ "loss": 0.0032,
983
+ "step": 3950
984
+ },
985
+ {
986
+ "epoch": 233.35,
987
+ "learning_rate": 9.182222222222223e-07,
988
+ "loss": 0.0038,
989
+ "step": 3975
990
+ },
991
+ {
992
+ "epoch": 234.82,
993
+ "learning_rate": 8.96e-07,
994
+ "loss": 0.0043,
995
+ "step": 4000
996
+ },
997
+ {
998
+ "epoch": 234.82,
999
+ "eval_loss": 0.45361328125,
1000
+ "eval_runtime": 157.593,
1001
+ "eval_samples_per_second": 1.726,
1002
+ "eval_steps_per_second": 0.108,
1003
+ "eval_wer": 10.707652303120357,
1004
+ "step": 4000
1005
+ },
1006
+ {
1007
+ "epoch": 236.29,
1008
+ "learning_rate": 8.737777777777777e-07,
1009
+ "loss": 0.004,
1010
+ "step": 4025
1011
+ },
1012
+ {
1013
+ "epoch": 237.76,
1014
+ "learning_rate": 8.515555555555555e-07,
1015
+ "loss": 0.0029,
1016
+ "step": 4050
1017
+ },
1018
+ {
1019
+ "epoch": 239.24,
1020
+ "learning_rate": 8.293333333333333e-07,
1021
+ "loss": 0.0034,
1022
+ "step": 4075
1023
+ },
1024
+ {
1025
+ "epoch": 240.71,
1026
+ "learning_rate": 8.071111111111111e-07,
1027
+ "loss": 0.0032,
1028
+ "step": 4100
1029
+ },
1030
+ {
1031
+ "epoch": 242.18,
1032
+ "learning_rate": 7.848888888888888e-07,
1033
+ "loss": 0.003,
1034
+ "step": 4125
1035
+ },
1036
+ {
1037
+ "epoch": 243.65,
1038
+ "learning_rate": 7.626666666666667e-07,
1039
+ "loss": 0.0034,
1040
+ "step": 4150
1041
+ },
1042
+ {
1043
+ "epoch": 245.12,
1044
+ "learning_rate": 7.404444444444444e-07,
1045
+ "loss": 0.0032,
1046
+ "step": 4175
1047
+ },
1048
+ {
1049
+ "epoch": 246.59,
1050
+ "learning_rate": 7.182222222222222e-07,
1051
+ "loss": 0.0032,
1052
+ "step": 4200
1053
+ },
1054
+ {
1055
+ "epoch": 248.06,
1056
+ "learning_rate": 6.959999999999999e-07,
1057
+ "loss": 0.0028,
1058
+ "step": 4225
1059
+ },
1060
+ {
1061
+ "epoch": 249.53,
1062
+ "learning_rate": 6.737777777777778e-07,
1063
+ "loss": 0.0028,
1064
+ "step": 4250
1065
+ },
1066
+ {
1067
+ "epoch": 251.0,
1068
+ "learning_rate": 6.515555555555555e-07,
1069
+ "loss": 0.0025,
1070
+ "step": 4275
1071
+ },
1072
+ {
1073
+ "epoch": 252.47,
1074
+ "learning_rate": 6.293333333333333e-07,
1075
+ "loss": 0.0026,
1076
+ "step": 4300
1077
+ },
1078
+ {
1079
+ "epoch": 253.94,
1080
+ "learning_rate": 6.071111111111111e-07,
1081
+ "loss": 0.003,
1082
+ "step": 4325
1083
+ },
1084
+ {
1085
+ "epoch": 255.41,
1086
+ "learning_rate": 5.848888888888889e-07,
1087
+ "loss": 0.0026,
1088
+ "step": 4350
1089
+ },
1090
+ {
1091
+ "epoch": 256.88,
1092
+ "learning_rate": 5.626666666666666e-07,
1093
+ "loss": 0.0027,
1094
+ "step": 4375
1095
+ },
1096
+ {
1097
+ "epoch": 258.35,
1098
+ "learning_rate": 5.404444444444443e-07,
1099
+ "loss": 0.003,
1100
+ "step": 4400
1101
+ },
1102
+ {
1103
+ "epoch": 259.82,
1104
+ "learning_rate": 5.182222222222223e-07,
1105
+ "loss": 0.0027,
1106
+ "step": 4425
1107
+ },
1108
+ {
1109
+ "epoch": 261.29,
1110
+ "learning_rate": 4.977777777777777e-07,
1111
+ "loss": 0.0026,
1112
+ "step": 4450
1113
+ },
1114
+ {
1115
+ "epoch": 262.76,
1116
+ "learning_rate": 4.7555555555555554e-07,
1117
+ "loss": 0.0023,
1118
+ "step": 4475
1119
+ },
1120
+ {
1121
+ "epoch": 264.24,
1122
+ "learning_rate": 4.5333333333333326e-07,
1123
+ "loss": 0.0021,
1124
+ "step": 4500
1125
+ },
1126
+ {
1127
+ "epoch": 265.71,
1128
+ "learning_rate": 4.311111111111111e-07,
1129
+ "loss": 0.0022,
1130
+ "step": 4525
1131
+ },
1132
+ {
1133
+ "epoch": 267.18,
1134
+ "learning_rate": 4.088888888888889e-07,
1135
+ "loss": 0.0034,
1136
+ "step": 4550
1137
+ },
1138
+ {
1139
+ "epoch": 268.65,
1140
+ "learning_rate": 3.8666666666666664e-07,
1141
+ "loss": 0.0023,
1142
+ "step": 4575
1143
+ },
1144
+ {
1145
+ "epoch": 270.12,
1146
+ "learning_rate": 3.6444444444444446e-07,
1147
+ "loss": 0.0022,
1148
+ "step": 4600
1149
+ },
1150
+ {
1151
+ "epoch": 271.59,
1152
+ "learning_rate": 3.422222222222222e-07,
1153
+ "loss": 0.0022,
1154
+ "step": 4625
1155
+ },
1156
+ {
1157
+ "epoch": 273.06,
1158
+ "learning_rate": 3.2e-07,
1159
+ "loss": 0.0024,
1160
+ "step": 4650
1161
+ },
1162
+ {
1163
+ "epoch": 274.53,
1164
+ "learning_rate": 2.9777777777777773e-07,
1165
+ "loss": 0.0031,
1166
+ "step": 4675
1167
+ },
1168
+ {
1169
+ "epoch": 276.0,
1170
+ "learning_rate": 2.7555555555555555e-07,
1171
+ "loss": 0.0022,
1172
+ "step": 4700
1173
+ },
1174
+ {
1175
+ "epoch": 277.47,
1176
+ "learning_rate": 2.533333333333333e-07,
1177
+ "loss": 0.0022,
1178
+ "step": 4725
1179
+ },
1180
+ {
1181
+ "epoch": 278.94,
1182
+ "learning_rate": 2.311111111111111e-07,
1183
+ "loss": 0.0021,
1184
+ "step": 4750
1185
+ },
1186
+ {
1187
+ "epoch": 280.41,
1188
+ "learning_rate": 2.088888888888889e-07,
1189
+ "loss": 0.0023,
1190
+ "step": 4775
1191
+ },
1192
+ {
1193
+ "epoch": 281.88,
1194
+ "learning_rate": 1.8666666666666667e-07,
1195
+ "loss": 0.0025,
1196
+ "step": 4800
1197
+ },
1198
+ {
1199
+ "epoch": 283.35,
1200
+ "learning_rate": 1.6444444444444444e-07,
1201
+ "loss": 0.0022,
1202
+ "step": 4825
1203
+ },
1204
+ {
1205
+ "epoch": 284.82,
1206
+ "learning_rate": 1.4222222222222222e-07,
1207
+ "loss": 0.0022,
1208
+ "step": 4850
1209
+ },
1210
+ {
1211
+ "epoch": 286.29,
1212
+ "learning_rate": 1.2e-07,
1213
+ "loss": 0.0021,
1214
+ "step": 4875
1215
+ },
1216
+ {
1217
+ "epoch": 287.76,
1218
+ "learning_rate": 9.777777777777778e-08,
1219
+ "loss": 0.0023,
1220
+ "step": 4900
1221
+ },
1222
+ {
1223
+ "epoch": 289.24,
1224
+ "learning_rate": 7.555555555555555e-08,
1225
+ "loss": 0.002,
1226
+ "step": 4925
1227
+ },
1228
+ {
1229
+ "epoch": 290.71,
1230
+ "learning_rate": 5.3333333333333334e-08,
1231
+ "loss": 0.0025,
1232
+ "step": 4950
1233
+ },
1234
+ {
1235
+ "epoch": 292.18,
1236
+ "learning_rate": 3.111111111111111e-08,
1237
+ "loss": 0.002,
1238
+ "step": 4975
1239
+ },
1240
+ {
1241
+ "epoch": 293.65,
1242
+ "learning_rate": 8.888888888888889e-09,
1243
+ "loss": 0.0024,
1244
+ "step": 5000
1245
+ },
1246
+ {
1247
+ "epoch": 293.65,
1248
+ "eval_loss": 0.465576171875,
1249
+ "eval_runtime": 158.123,
1250
+ "eval_samples_per_second": 1.72,
1251
+ "eval_steps_per_second": 0.108,
1252
+ "eval_wer": 10.642644873699851,
1253
+ "step": 5000
1254
+ },
1255
+ {
1256
+ "epoch": 295.47,
1257
+ "learning_rate": 2.7544827586206896e-06,
1258
+ "loss": 0.0021,
1259
+ "step": 5025
1260
+ },
1261
+ {
1262
+ "epoch": 296.94,
1263
+ "learning_rate": 2.7475862068965512e-06,
1264
+ "loss": 0.0024,
1265
+ "step": 5050
1266
+ },
1267
+ {
1268
+ "epoch": 298.41,
1269
+ "learning_rate": 2.7406896551724137e-06,
1270
+ "loss": 0.0025,
1271
+ "step": 5075
1272
+ },
1273
+ {
1274
+ "epoch": 299.88,
1275
+ "learning_rate": 2.7337931034482757e-06,
1276
+ "loss": 0.0022,
1277
+ "step": 5100
1278
+ },
1279
+ {
1280
+ "epoch": 301.35,
1281
+ "learning_rate": 2.7268965517241378e-06,
1282
+ "loss": 0.0027,
1283
+ "step": 5125
1284
+ },
1285
+ {
1286
+ "epoch": 302.82,
1287
+ "learning_rate": 2.7200000000000002e-06,
1288
+ "loss": 0.0024,
1289
+ "step": 5150
1290
+ },
1291
+ {
1292
+ "epoch": 304.29,
1293
+ "learning_rate": 2.713103448275862e-06,
1294
+ "loss": 0.0024,
1295
+ "step": 5175
1296
+ },
1297
+ {
1298
+ "epoch": 305.76,
1299
+ "learning_rate": 2.7062068965517243e-06,
1300
+ "loss": 0.0023,
1301
+ "step": 5200
1302
+ },
1303
+ {
1304
+ "epoch": 307.24,
1305
+ "learning_rate": 2.699310344827586e-06,
1306
+ "loss": 0.0027,
1307
+ "step": 5225
1308
+ },
1309
+ {
1310
+ "epoch": 308.71,
1311
+ "learning_rate": 2.6924137931034483e-06,
1312
+ "loss": 0.0023,
1313
+ "step": 5250
1314
+ },
1315
+ {
1316
+ "epoch": 310.18,
1317
+ "learning_rate": 2.68551724137931e-06,
1318
+ "loss": 0.0021,
1319
+ "step": 5275
1320
+ },
1321
+ {
1322
+ "epoch": 311.65,
1323
+ "learning_rate": 2.6786206896551724e-06,
1324
+ "loss": 0.0025,
1325
+ "step": 5300
1326
+ },
1327
+ {
1328
+ "epoch": 313.12,
1329
+ "learning_rate": 2.6717241379310344e-06,
1330
+ "loss": 0.0021,
1331
+ "step": 5325
1332
+ },
1333
+ {
1334
+ "epoch": 314.59,
1335
+ "learning_rate": 2.6648275862068965e-06,
1336
+ "loss": 0.0019,
1337
+ "step": 5350
1338
+ },
1339
+ {
1340
+ "epoch": 316.06,
1341
+ "learning_rate": 2.6579310344827585e-06,
1342
+ "loss": 0.0019,
1343
+ "step": 5375
1344
+ },
1345
+ {
1346
+ "epoch": 317.53,
1347
+ "learning_rate": 2.6510344827586205e-06,
1348
+ "loss": 0.0018,
1349
+ "step": 5400
1350
+ },
1351
+ {
1352
+ "epoch": 319.0,
1353
+ "learning_rate": 2.6441379310344826e-06,
1354
+ "loss": 0.0022,
1355
+ "step": 5425
1356
+ },
1357
+ {
1358
+ "epoch": 320.47,
1359
+ "learning_rate": 2.6377931034482757e-06,
1360
+ "loss": 0.0019,
1361
+ "step": 5450
1362
+ },
1363
+ {
1364
+ "epoch": 321.94,
1365
+ "learning_rate": 2.6308965517241377e-06,
1366
+ "loss": 0.0016,
1367
+ "step": 5475
1368
+ },
1369
+ {
1370
+ "epoch": 323.41,
1371
+ "learning_rate": 2.624e-06,
1372
+ "loss": 0.0013,
1373
+ "step": 5500
1374
+ },
1375
+ {
1376
+ "epoch": 324.88,
1377
+ "learning_rate": 2.617103448275862e-06,
1378
+ "loss": 0.0019,
1379
+ "step": 5525
1380
+ },
1381
+ {
1382
+ "epoch": 326.35,
1383
+ "learning_rate": 2.6102068965517243e-06,
1384
+ "loss": 0.0017,
1385
+ "step": 5550
1386
+ },
1387
+ {
1388
+ "epoch": 327.82,
1389
+ "learning_rate": 2.603310344827586e-06,
1390
+ "loss": 0.0018,
1391
+ "step": 5575
1392
+ },
1393
+ {
1394
+ "epoch": 329.29,
1395
+ "learning_rate": 2.5964137931034483e-06,
1396
+ "loss": 0.0013,
1397
+ "step": 5600
1398
+ },
1399
+ {
1400
+ "epoch": 330.76,
1401
+ "learning_rate": 2.58951724137931e-06,
1402
+ "loss": 0.0016,
1403
+ "step": 5625
1404
+ },
1405
+ {
1406
+ "epoch": 332.24,
1407
+ "learning_rate": 2.5826206896551724e-06,
1408
+ "loss": 0.0013,
1409
+ "step": 5650
1410
+ },
1411
+ {
1412
+ "epoch": 333.71,
1413
+ "learning_rate": 2.575724137931034e-06,
1414
+ "loss": 0.0018,
1415
+ "step": 5675
1416
+ },
1417
+ {
1418
+ "epoch": 335.18,
1419
+ "learning_rate": 2.5688275862068965e-06,
1420
+ "loss": 0.0014,
1421
+ "step": 5700
1422
+ },
1423
+ {
1424
+ "epoch": 336.65,
1425
+ "learning_rate": 2.561931034482759e-06,
1426
+ "loss": 0.0013,
1427
+ "step": 5725
1428
+ },
1429
+ {
1430
+ "epoch": 338.12,
1431
+ "learning_rate": 2.5550344827586205e-06,
1432
+ "loss": 0.0011,
1433
+ "step": 5750
1434
+ },
1435
+ {
1436
+ "epoch": 339.59,
1437
+ "learning_rate": 2.548137931034483e-06,
1438
+ "loss": 0.0018,
1439
+ "step": 5775
1440
+ },
1441
+ {
1442
+ "epoch": 341.06,
1443
+ "learning_rate": 2.5412413793103446e-06,
1444
+ "loss": 0.0013,
1445
+ "step": 5800
1446
+ },
1447
+ {
1448
+ "epoch": 342.53,
1449
+ "learning_rate": 2.534344827586207e-06,
1450
+ "loss": 0.0012,
1451
+ "step": 5825
1452
+ },
1453
+ {
1454
+ "epoch": 344.0,
1455
+ "learning_rate": 2.5274482758620687e-06,
1456
+ "loss": 0.0014,
1457
+ "step": 5850
1458
+ },
1459
+ {
1460
+ "epoch": 345.47,
1461
+ "learning_rate": 2.520551724137931e-06,
1462
+ "loss": 0.001,
1463
+ "step": 5875
1464
+ },
1465
+ {
1466
+ "epoch": 346.94,
1467
+ "learning_rate": 2.5136551724137927e-06,
1468
+ "loss": 0.0012,
1469
+ "step": 5900
1470
+ },
1471
+ {
1472
+ "epoch": 348.41,
1473
+ "learning_rate": 2.506758620689655e-06,
1474
+ "loss": 0.0012,
1475
+ "step": 5925
1476
+ },
1477
+ {
1478
+ "epoch": 349.88,
1479
+ "learning_rate": 2.499862068965517e-06,
1480
+ "loss": 0.0012,
1481
+ "step": 5950
1482
+ },
1483
+ {
1484
+ "epoch": 351.35,
1485
+ "learning_rate": 2.4929655172413792e-06,
1486
+ "loss": 0.0013,
1487
+ "step": 5975
1488
+ },
1489
+ {
1490
+ "epoch": 352.82,
1491
+ "learning_rate": 2.4860689655172413e-06,
1492
+ "loss": 0.0015,
1493
+ "step": 6000
1494
+ },
1495
+ {
1496
+ "epoch": 352.82,
1497
+ "eval_loss": 0.497802734375,
1498
+ "eval_runtime": 156.7207,
1499
+ "eval_samples_per_second": 1.736,
1500
+ "eval_steps_per_second": 0.108,
1501
+ "eval_wer": 10.503343239227341,
1502
+ "step": 6000
1503
+ },
1504
+ {
1505
+ "epoch": 354.29,
1506
+ "learning_rate": 2.4791724137931033e-06,
1507
+ "loss": 0.0013,
1508
+ "step": 6025
1509
+ },
1510
+ {
1511
+ "epoch": 355.76,
1512
+ "learning_rate": 2.4722758620689653e-06,
1513
+ "loss": 0.0012,
1514
+ "step": 6050
1515
+ },
1516
+ {
1517
+ "epoch": 357.24,
1518
+ "learning_rate": 2.4653793103448274e-06,
1519
+ "loss": 0.0011,
1520
+ "step": 6075
1521
+ },
1522
+ {
1523
+ "epoch": 358.71,
1524
+ "learning_rate": 2.4584827586206894e-06,
1525
+ "loss": 0.0008,
1526
+ "step": 6100
1527
+ },
1528
+ {
1529
+ "epoch": 360.18,
1530
+ "learning_rate": 2.4515862068965514e-06,
1531
+ "loss": 0.0008,
1532
+ "step": 6125
1533
+ },
1534
+ {
1535
+ "epoch": 361.65,
1536
+ "learning_rate": 2.444689655172414e-06,
1537
+ "loss": 0.0011,
1538
+ "step": 6150
1539
+ },
1540
+ {
1541
+ "epoch": 363.12,
1542
+ "learning_rate": 2.4377931034482755e-06,
1543
+ "loss": 0.0012,
1544
+ "step": 6175
1545
+ },
1546
+ {
1547
+ "epoch": 364.59,
1548
+ "learning_rate": 2.430896551724138e-06,
1549
+ "loss": 0.0013,
1550
+ "step": 6200
1551
+ },
1552
+ {
1553
+ "epoch": 366.06,
1554
+ "learning_rate": 2.424e-06,
1555
+ "loss": 0.0011,
1556
+ "step": 6225
1557
+ },
1558
+ {
1559
+ "epoch": 367.53,
1560
+ "learning_rate": 2.417103448275862e-06,
1561
+ "loss": 0.0012,
1562
+ "step": 6250
1563
+ },
1564
+ {
1565
+ "epoch": 369.0,
1566
+ "learning_rate": 2.410206896551724e-06,
1567
+ "loss": 0.0011,
1568
+ "step": 6275
1569
+ },
1570
+ {
1571
+ "epoch": 370.47,
1572
+ "learning_rate": 2.403310344827586e-06,
1573
+ "loss": 0.0009,
1574
+ "step": 6300
1575
+ },
1576
+ {
1577
+ "epoch": 371.94,
1578
+ "learning_rate": 2.396413793103448e-06,
1579
+ "loss": 0.0014,
1580
+ "step": 6325
1581
+ },
1582
+ {
1583
+ "epoch": 373.41,
1584
+ "learning_rate": 2.38951724137931e-06,
1585
+ "loss": 0.0018,
1586
+ "step": 6350
1587
+ },
1588
+ {
1589
+ "epoch": 374.88,
1590
+ "learning_rate": 2.382620689655172e-06,
1591
+ "loss": 0.0009,
1592
+ "step": 6375
1593
+ },
1594
+ {
1595
+ "epoch": 376.35,
1596
+ "learning_rate": 2.3757241379310342e-06,
1597
+ "loss": 0.001,
1598
+ "step": 6400
1599
+ },
1600
+ {
1601
+ "epoch": 377.82,
1602
+ "learning_rate": 2.3688275862068963e-06,
1603
+ "loss": 0.0009,
1604
+ "step": 6425
1605
+ },
1606
+ {
1607
+ "epoch": 379.29,
1608
+ "learning_rate": 2.36248275862069e-06,
1609
+ "loss": 0.0008,
1610
+ "step": 6450
1611
+ },
1612
+ {
1613
+ "epoch": 380.76,
1614
+ "learning_rate": 2.3555862068965514e-06,
1615
+ "loss": 0.0009,
1616
+ "step": 6475
1617
+ },
1618
+ {
1619
+ "epoch": 382.24,
1620
+ "learning_rate": 2.348689655172414e-06,
1621
+ "loss": 0.0009,
1622
+ "step": 6500
1623
+ },
1624
+ {
1625
+ "epoch": 383.71,
1626
+ "learning_rate": 2.3417931034482755e-06,
1627
+ "loss": 0.0011,
1628
+ "step": 6525
1629
+ },
1630
+ {
1631
+ "epoch": 385.18,
1632
+ "learning_rate": 2.334896551724138e-06,
1633
+ "loss": 0.0008,
1634
+ "step": 6550
1635
+ },
1636
+ {
1637
+ "epoch": 386.65,
1638
+ "learning_rate": 2.3279999999999996e-06,
1639
+ "loss": 0.0006,
1640
+ "step": 6575
1641
+ },
1642
+ {
1643
+ "epoch": 388.12,
1644
+ "learning_rate": 2.321103448275862e-06,
1645
+ "loss": 0.001,
1646
+ "step": 6600
1647
+ },
1648
+ {
1649
+ "epoch": 389.59,
1650
+ "learning_rate": 2.314206896551724e-06,
1651
+ "loss": 0.0009,
1652
+ "step": 6625
1653
+ },
1654
+ {
1655
+ "epoch": 391.06,
1656
+ "learning_rate": 2.307310344827586e-06,
1657
+ "loss": 0.0008,
1658
+ "step": 6650
1659
+ },
1660
+ {
1661
+ "epoch": 392.53,
1662
+ "learning_rate": 2.300413793103448e-06,
1663
+ "loss": 0.001,
1664
+ "step": 6675
1665
+ },
1666
+ {
1667
+ "epoch": 394.0,
1668
+ "learning_rate": 2.29351724137931e-06,
1669
+ "loss": 0.0009,
1670
+ "step": 6700
1671
+ },
1672
+ {
1673
+ "epoch": 395.47,
1674
+ "learning_rate": 2.2866206896551726e-06,
1675
+ "loss": 0.0011,
1676
+ "step": 6725
1677
+ },
1678
+ {
1679
+ "epoch": 396.94,
1680
+ "learning_rate": 2.2797241379310342e-06,
1681
+ "loss": 0.0008,
1682
+ "step": 6750
1683
+ },
1684
+ {
1685
+ "epoch": 398.41,
1686
+ "learning_rate": 2.2728275862068967e-06,
1687
+ "loss": 0.0007,
1688
+ "step": 6775
1689
+ },
1690
+ {
1691
+ "epoch": 399.88,
1692
+ "learning_rate": 2.2659310344827583e-06,
1693
+ "loss": 0.0006,
1694
+ "step": 6800
1695
+ },
1696
+ {
1697
+ "epoch": 401.35,
1698
+ "learning_rate": 2.2590344827586207e-06,
1699
+ "loss": 0.0007,
1700
+ "step": 6825
1701
+ },
1702
+ {
1703
+ "epoch": 402.82,
1704
+ "learning_rate": 2.2521379310344828e-06,
1705
+ "loss": 0.0011,
1706
+ "step": 6850
1707
+ },
1708
+ {
1709
+ "epoch": 404.29,
1710
+ "learning_rate": 2.245241379310345e-06,
1711
+ "loss": 0.001,
1712
+ "step": 6875
1713
+ },
1714
+ {
1715
+ "epoch": 405.76,
1716
+ "learning_rate": 2.238344827586207e-06,
1717
+ "loss": 0.0007,
1718
+ "step": 6900
1719
+ },
1720
+ {
1721
+ "epoch": 407.24,
1722
+ "learning_rate": 2.231448275862069e-06,
1723
+ "loss": 0.0008,
1724
+ "step": 6925
1725
+ },
1726
+ {
1727
+ "epoch": 408.71,
1728
+ "learning_rate": 2.224551724137931e-06,
1729
+ "loss": 0.0007,
1730
+ "step": 6950
1731
+ },
1732
+ {
1733
+ "epoch": 410.18,
1734
+ "learning_rate": 2.217655172413793e-06,
1735
+ "loss": 0.0008,
1736
+ "step": 6975
1737
+ },
1738
+ {
1739
+ "epoch": 411.65,
1740
+ "learning_rate": 2.210758620689655e-06,
1741
+ "loss": 0.0007,
1742
+ "step": 7000
1743
+ },
1744
+ {
1745
+ "epoch": 411.65,
1746
+ "eval_loss": 0.5146484375,
1747
+ "eval_runtime": 159.9051,
1748
+ "eval_samples_per_second": 1.701,
1749
+ "eval_steps_per_second": 0.106,
1750
+ "eval_wer": 10.057578008915305,
1751
+ "step": 7000
1752
+ },
1753
+ {
1754
+ "epoch": 413.12,
1755
+ "learning_rate": 2.203862068965517e-06,
1756
+ "loss": 0.0007,
1757
+ "step": 7025
1758
+ },
1759
+ {
1760
+ "epoch": 414.59,
1761
+ "learning_rate": 2.196965517241379e-06,
1762
+ "loss": 0.0006,
1763
+ "step": 7050
1764
+ },
1765
+ {
1766
+ "epoch": 416.06,
1767
+ "learning_rate": 2.1900689655172415e-06,
1768
+ "loss": 0.0009,
1769
+ "step": 7075
1770
+ },
1771
+ {
1772
+ "epoch": 417.53,
1773
+ "learning_rate": 2.183172413793103e-06,
1774
+ "loss": 0.0008,
1775
+ "step": 7100
1776
+ },
1777
+ {
1778
+ "epoch": 419.0,
1779
+ "learning_rate": 2.1762758620689656e-06,
1780
+ "loss": 0.0007,
1781
+ "step": 7125
1782
+ },
1783
+ {
1784
+ "epoch": 420.47,
1785
+ "learning_rate": 2.1693793103448276e-06,
1786
+ "loss": 0.0008,
1787
+ "step": 7150
1788
+ },
1789
+ {
1790
+ "epoch": 421.94,
1791
+ "learning_rate": 2.1624827586206896e-06,
1792
+ "loss": 0.0007,
1793
+ "step": 7175
1794
+ },
1795
+ {
1796
+ "epoch": 423.41,
1797
+ "learning_rate": 2.1555862068965517e-06,
1798
+ "loss": 0.0005,
1799
+ "step": 7200
1800
+ },
1801
+ {
1802
+ "epoch": 424.88,
1803
+ "learning_rate": 2.1486896551724137e-06,
1804
+ "loss": 0.0008,
1805
+ "step": 7225
1806
+ },
1807
+ {
1808
+ "epoch": 426.35,
1809
+ "learning_rate": 2.1417931034482757e-06,
1810
+ "loss": 0.0009,
1811
+ "step": 7250
1812
+ },
1813
+ {
1814
+ "epoch": 427.82,
1815
+ "learning_rate": 2.1348965517241378e-06,
1816
+ "loss": 0.0009,
1817
+ "step": 7275
1818
+ },
1819
+ {
1820
+ "epoch": 429.29,
1821
+ "learning_rate": 2.128e-06,
1822
+ "loss": 0.0006,
1823
+ "step": 7300
1824
+ },
1825
+ {
1826
+ "epoch": 430.76,
1827
+ "learning_rate": 2.121103448275862e-06,
1828
+ "loss": 0.0006,
1829
+ "step": 7325
1830
+ },
1831
+ {
1832
+ "epoch": 432.24,
1833
+ "learning_rate": 2.1142068965517243e-06,
1834
+ "loss": 0.0006,
1835
+ "step": 7350
1836
+ },
1837
+ {
1838
+ "epoch": 433.71,
1839
+ "learning_rate": 2.107310344827586e-06,
1840
+ "loss": 0.0006,
1841
+ "step": 7375
1842
+ },
1843
+ {
1844
+ "epoch": 435.18,
1845
+ "learning_rate": 2.1004137931034483e-06,
1846
+ "loss": 0.0007,
1847
+ "step": 7400
1848
+ },
1849
+ {
1850
+ "epoch": 436.65,
1851
+ "learning_rate": 2.09351724137931e-06,
1852
+ "loss": 0.0006,
1853
+ "step": 7425
1854
+ },
1855
+ {
1856
+ "epoch": 438.12,
1857
+ "learning_rate": 2.0871724137931035e-06,
1858
+ "loss": 0.0007,
1859
+ "step": 7450
1860
+ },
1861
+ {
1862
+ "epoch": 439.59,
1863
+ "learning_rate": 2.080275862068965e-06,
1864
+ "loss": 0.0006,
1865
+ "step": 7475
1866
+ },
1867
+ {
1868
+ "epoch": 441.06,
1869
+ "learning_rate": 2.0733793103448276e-06,
1870
+ "loss": 0.0009,
1871
+ "step": 7500
1872
+ },
1873
+ {
1874
+ "epoch": 442.53,
1875
+ "learning_rate": 2.0664827586206896e-06,
1876
+ "loss": 0.0008,
1877
+ "step": 7525
1878
+ },
1879
+ {
1880
+ "epoch": 444.0,
1881
+ "learning_rate": 2.0595862068965516e-06,
1882
+ "loss": 0.0005,
1883
+ "step": 7550
1884
+ },
1885
+ {
1886
+ "epoch": 445.47,
1887
+ "learning_rate": 2.0526896551724137e-06,
1888
+ "loss": 0.0004,
1889
+ "step": 7575
1890
+ },
1891
+ {
1892
+ "epoch": 446.94,
1893
+ "learning_rate": 2.0457931034482757e-06,
1894
+ "loss": 0.0006,
1895
+ "step": 7600
1896
+ },
1897
+ {
1898
+ "epoch": 448.41,
1899
+ "learning_rate": 2.0388965517241377e-06,
1900
+ "loss": 0.0007,
1901
+ "step": 7625
1902
+ },
1903
+ {
1904
+ "epoch": 449.88,
1905
+ "learning_rate": 2.0319999999999998e-06,
1906
+ "loss": 0.0005,
1907
+ "step": 7650
1908
+ },
1909
+ {
1910
+ "epoch": 451.35,
1911
+ "learning_rate": 2.025103448275862e-06,
1912
+ "loss": 0.0005,
1913
+ "step": 7675
1914
+ },
1915
+ {
1916
+ "epoch": 452.82,
1917
+ "learning_rate": 2.018206896551724e-06,
1918
+ "loss": 0.0009,
1919
+ "step": 7700
1920
+ },
1921
+ {
1922
+ "epoch": 454.29,
1923
+ "learning_rate": 2.0113103448275863e-06,
1924
+ "loss": 0.0005,
1925
+ "step": 7725
1926
+ },
1927
+ {
1928
+ "epoch": 455.76,
1929
+ "learning_rate": 2.0044137931034483e-06,
1930
+ "loss": 0.0005,
1931
+ "step": 7750
1932
+ },
1933
+ {
1934
+ "epoch": 457.24,
1935
+ "learning_rate": 1.9975172413793104e-06,
1936
+ "loss": 0.0006,
1937
+ "step": 7775
1938
+ },
1939
+ {
1940
+ "epoch": 458.71,
1941
+ "learning_rate": 1.9906206896551724e-06,
1942
+ "loss": 0.0005,
1943
+ "step": 7800
1944
+ },
1945
+ {
1946
+ "epoch": 460.18,
1947
+ "learning_rate": 1.9837241379310344e-06,
1948
+ "loss": 0.0005,
1949
+ "step": 7825
1950
+ },
1951
+ {
1952
+ "epoch": 461.65,
1953
+ "learning_rate": 1.9768275862068965e-06,
1954
+ "loss": 0.0006,
1955
+ "step": 7850
1956
+ },
1957
+ {
1958
+ "epoch": 463.12,
1959
+ "learning_rate": 1.9699310344827585e-06,
1960
+ "loss": 0.0004,
1961
+ "step": 7875
1962
+ },
1963
+ {
1964
+ "epoch": 464.59,
1965
+ "learning_rate": 1.9630344827586205e-06,
1966
+ "loss": 0.0007,
1967
+ "step": 7900
1968
+ },
1969
+ {
1970
+ "epoch": 466.06,
1971
+ "learning_rate": 1.956137931034483e-06,
1972
+ "loss": 0.0005,
1973
+ "step": 7925
1974
+ },
1975
+ {
1976
+ "epoch": 467.53,
1977
+ "learning_rate": 1.949241379310345e-06,
1978
+ "loss": 0.0006,
1979
+ "step": 7950
1980
+ },
1981
+ {
1982
+ "epoch": 469.0,
1983
+ "learning_rate": 1.942344827586207e-06,
1984
+ "loss": 0.0006,
1985
+ "step": 7975
1986
+ },
1987
+ {
1988
+ "epoch": 470.47,
1989
+ "learning_rate": 1.935448275862069e-06,
1990
+ "loss": 0.0007,
1991
+ "step": 8000
1992
+ },
1993
+ {
1994
+ "epoch": 470.47,
1995
+ "eval_loss": 0.53857421875,
1996
+ "eval_runtime": 158.4391,
1997
+ "eval_samples_per_second": 1.717,
1998
+ "eval_steps_per_second": 0.107,
1999
+ "eval_wer": 10.131872213967311,
2000
+ "step": 8000
2001
+ },
2002
+ {
2003
+ "epoch": 471.94,
2004
+ "learning_rate": 1.928551724137931e-06,
2005
+ "loss": 0.0005,
2006
+ "step": 8025
2007
+ },
2008
+ {
2009
+ "epoch": 473.41,
2010
+ "learning_rate": 1.921655172413793e-06,
2011
+ "loss": 0.0008,
2012
+ "step": 8050
2013
+ },
2014
+ {
2015
+ "epoch": 474.88,
2016
+ "learning_rate": 1.914758620689655e-06,
2017
+ "loss": 0.0005,
2018
+ "step": 8075
2019
+ },
2020
+ {
2021
+ "epoch": 476.35,
2022
+ "learning_rate": 1.907862068965517e-06,
2023
+ "loss": 0.0004,
2024
+ "step": 8100
2025
+ },
2026
+ {
2027
+ "epoch": 477.82,
2028
+ "learning_rate": 1.9009655172413792e-06,
2029
+ "loss": 0.0005,
2030
+ "step": 8125
2031
+ },
2032
+ {
2033
+ "epoch": 479.29,
2034
+ "learning_rate": 1.8940689655172413e-06,
2035
+ "loss": 0.0004,
2036
+ "step": 8150
2037
+ },
2038
+ {
2039
+ "epoch": 480.76,
2040
+ "learning_rate": 1.8871724137931033e-06,
2041
+ "loss": 0.0007,
2042
+ "step": 8175
2043
+ },
2044
+ {
2045
+ "epoch": 482.24,
2046
+ "learning_rate": 1.8802758620689653e-06,
2047
+ "loss": 0.0005,
2048
+ "step": 8200
2049
+ },
2050
+ {
2051
+ "epoch": 483.71,
2052
+ "learning_rate": 1.8733793103448274e-06,
2053
+ "loss": 0.0007,
2054
+ "step": 8225
2055
+ },
2056
+ {
2057
+ "epoch": 485.18,
2058
+ "learning_rate": 1.8664827586206894e-06,
2059
+ "loss": 0.0005,
2060
+ "step": 8250
2061
+ },
2062
+ {
2063
+ "epoch": 486.65,
2064
+ "learning_rate": 1.8595862068965517e-06,
2065
+ "loss": 0.0004,
2066
+ "step": 8275
2067
+ },
2068
+ {
2069
+ "epoch": 488.12,
2070
+ "learning_rate": 1.8526896551724137e-06,
2071
+ "loss": 0.0005,
2072
+ "step": 8300
2073
+ },
2074
+ {
2075
+ "epoch": 489.59,
2076
+ "learning_rate": 1.845793103448276e-06,
2077
+ "loss": 0.0004,
2078
+ "step": 8325
2079
+ },
2080
+ {
2081
+ "epoch": 491.06,
2082
+ "learning_rate": 1.838896551724138e-06,
2083
+ "loss": 0.0004,
2084
+ "step": 8350
2085
+ },
2086
+ {
2087
+ "epoch": 492.53,
2088
+ "learning_rate": 1.832e-06,
2089
+ "loss": 0.0005,
2090
+ "step": 8375
2091
+ },
2092
+ {
2093
+ "epoch": 494.0,
2094
+ "learning_rate": 1.825103448275862e-06,
2095
+ "loss": 0.0004,
2096
+ "step": 8400
2097
+ },
2098
+ {
2099
+ "epoch": 495.47,
2100
+ "learning_rate": 1.818206896551724e-06,
2101
+ "loss": 0.0007,
2102
+ "step": 8425
2103
+ },
2104
+ {
2105
+ "epoch": 496.94,
2106
+ "learning_rate": 1.811862068965517e-06,
2107
+ "loss": 0.0008,
2108
+ "step": 8450
2109
+ },
2110
+ {
2111
+ "epoch": 498.41,
2112
+ "learning_rate": 1.8049655172413792e-06,
2113
+ "loss": 0.0005,
2114
+ "step": 8475
2115
+ },
2116
+ {
2117
+ "epoch": 499.88,
2118
+ "learning_rate": 1.7980689655172413e-06,
2119
+ "loss": 0.0006,
2120
+ "step": 8500
2121
+ },
2122
+ {
2123
+ "epoch": 501.35,
2124
+ "learning_rate": 1.7911724137931035e-06,
2125
+ "loss": 0.0004,
2126
+ "step": 8525
2127
+ },
2128
+ {
2129
+ "epoch": 502.82,
2130
+ "learning_rate": 1.7842758620689655e-06,
2131
+ "loss": 0.0004,
2132
+ "step": 8550
2133
+ },
2134
+ {
2135
+ "epoch": 504.29,
2136
+ "learning_rate": 1.7773793103448276e-06,
2137
+ "loss": 0.0006,
2138
+ "step": 8575
2139
+ },
2140
+ {
2141
+ "epoch": 505.76,
2142
+ "learning_rate": 1.7704827586206896e-06,
2143
+ "loss": 0.0004,
2144
+ "step": 8600
2145
+ },
2146
+ {
2147
+ "epoch": 507.24,
2148
+ "learning_rate": 1.7635862068965516e-06,
2149
+ "loss": 0.0004,
2150
+ "step": 8625
2151
+ },
2152
+ {
2153
+ "epoch": 508.71,
2154
+ "learning_rate": 1.7566896551724137e-06,
2155
+ "loss": 0.0006,
2156
+ "step": 8650
2157
+ },
2158
+ {
2159
+ "epoch": 510.18,
2160
+ "learning_rate": 1.7497931034482757e-06,
2161
+ "loss": 0.0004,
2162
+ "step": 8675
2163
+ },
2164
+ {
2165
+ "epoch": 511.65,
2166
+ "learning_rate": 1.742896551724138e-06,
2167
+ "loss": 0.0005,
2168
+ "step": 8700
2169
+ },
2170
+ {
2171
+ "epoch": 513.12,
2172
+ "learning_rate": 1.736e-06,
2173
+ "loss": 0.0006,
2174
+ "step": 8725
2175
+ },
2176
+ {
2177
+ "epoch": 514.59,
2178
+ "learning_rate": 1.729103448275862e-06,
2179
+ "loss": 0.0006,
2180
+ "step": 8750
2181
+ },
2182
+ {
2183
+ "epoch": 516.06,
2184
+ "learning_rate": 1.722206896551724e-06,
2185
+ "loss": 0.0004,
2186
+ "step": 8775
2187
+ },
2188
+ {
2189
+ "epoch": 517.53,
2190
+ "learning_rate": 1.715310344827586e-06,
2191
+ "loss": 0.0003,
2192
+ "step": 8800
2193
+ },
2194
+ {
2195
+ "epoch": 519.0,
2196
+ "learning_rate": 1.7084137931034481e-06,
2197
+ "loss": 0.0003,
2198
+ "step": 8825
2199
+ },
2200
+ {
2201
+ "epoch": 520.47,
2202
+ "learning_rate": 1.7015172413793101e-06,
2203
+ "loss": 0.0004,
2204
+ "step": 8850
2205
+ },
2206
+ {
2207
+ "epoch": 521.94,
2208
+ "learning_rate": 1.6946206896551722e-06,
2209
+ "loss": 0.0006,
2210
+ "step": 8875
2211
+ },
2212
+ {
2213
+ "epoch": 523.41,
2214
+ "learning_rate": 1.6877241379310342e-06,
2215
+ "loss": 0.0005,
2216
+ "step": 8900
2217
+ },
2218
+ {
2219
+ "epoch": 524.88,
2220
+ "learning_rate": 1.6808275862068967e-06,
2221
+ "loss": 0.0029,
2222
+ "step": 8925
2223
+ },
2224
+ {
2225
+ "epoch": 526.35,
2226
+ "learning_rate": 1.6739310344827587e-06,
2227
+ "loss": 0.0004,
2228
+ "step": 8950
2229
+ },
2230
+ {
2231
+ "epoch": 527.82,
2232
+ "learning_rate": 1.6670344827586207e-06,
2233
+ "loss": 0.0003,
2234
+ "step": 8975
2235
+ },
2236
+ {
2237
+ "epoch": 529.29,
2238
+ "learning_rate": 1.6601379310344828e-06,
2239
+ "loss": 0.0004,
2240
+ "step": 9000
2241
+ },
2242
+ {
2243
+ "epoch": 529.29,
2244
+ "eval_loss": 0.5361328125,
2245
+ "eval_runtime": 156.9399,
2246
+ "eval_samples_per_second": 1.733,
2247
+ "eval_steps_per_second": 0.108,
2248
+ "eval_wer": 9.778974739970282,
2249
+ "step": 9000
2250
+ },
2251
+ {
2252
+ "epoch": 530.76,
2253
+ "learning_rate": 1.6532413793103448e-06,
2254
+ "loss": 0.0006,
2255
+ "step": 9025
2256
+ },
2257
+ {
2258
+ "epoch": 532.24,
2259
+ "learning_rate": 1.6463448275862068e-06,
2260
+ "loss": 0.0003,
2261
+ "step": 9050
2262
+ },
2263
+ {
2264
+ "epoch": 533.71,
2265
+ "learning_rate": 1.6394482758620689e-06,
2266
+ "loss": 0.0003,
2267
+ "step": 9075
2268
+ },
2269
+ {
2270
+ "epoch": 535.18,
2271
+ "learning_rate": 1.632551724137931e-06,
2272
+ "loss": 0.0005,
2273
+ "step": 9100
2274
+ },
2275
+ {
2276
+ "epoch": 536.65,
2277
+ "learning_rate": 1.625655172413793e-06,
2278
+ "loss": 0.0006,
2279
+ "step": 9125
2280
+ },
2281
+ {
2282
+ "epoch": 538.12,
2283
+ "learning_rate": 1.6187586206896552e-06,
2284
+ "loss": 0.0003,
2285
+ "step": 9150
2286
+ },
2287
+ {
2288
+ "epoch": 539.59,
2289
+ "learning_rate": 1.6118620689655172e-06,
2290
+ "loss": 0.0004,
2291
+ "step": 9175
2292
+ },
2293
+ {
2294
+ "epoch": 541.06,
2295
+ "learning_rate": 1.6049655172413792e-06,
2296
+ "loss": 0.0003,
2297
+ "step": 9200
2298
+ },
2299
+ {
2300
+ "epoch": 542.53,
2301
+ "learning_rate": 1.5980689655172413e-06,
2302
+ "loss": 0.0004,
2303
+ "step": 9225
2304
+ },
2305
+ {
2306
+ "epoch": 544.0,
2307
+ "learning_rate": 1.5911724137931033e-06,
2308
+ "loss": 0.0006,
2309
+ "step": 9250
2310
+ },
2311
+ {
2312
+ "epoch": 545.47,
2313
+ "learning_rate": 1.5842758620689653e-06,
2314
+ "loss": 0.0002,
2315
+ "step": 9275
2316
+ },
2317
+ {
2318
+ "epoch": 546.94,
2319
+ "learning_rate": 1.5773793103448274e-06,
2320
+ "loss": 0.0003,
2321
+ "step": 9300
2322
+ },
2323
+ {
2324
+ "epoch": 548.41,
2325
+ "learning_rate": 1.5704827586206896e-06,
2326
+ "loss": 0.0003,
2327
+ "step": 9325
2328
+ },
2329
+ {
2330
+ "epoch": 549.88,
2331
+ "learning_rate": 1.5635862068965516e-06,
2332
+ "loss": 0.0003,
2333
+ "step": 9350
2334
+ },
2335
+ {
2336
+ "epoch": 551.35,
2337
+ "learning_rate": 1.5566896551724139e-06,
2338
+ "loss": 0.0004,
2339
+ "step": 9375
2340
+ },
2341
+ {
2342
+ "epoch": 552.82,
2343
+ "learning_rate": 1.549793103448276e-06,
2344
+ "loss": 0.0004,
2345
+ "step": 9400
2346
+ },
2347
+ {
2348
+ "epoch": 554.29,
2349
+ "learning_rate": 1.542896551724138e-06,
2350
+ "loss": 0.0005,
2351
+ "step": 9425
2352
+ },
2353
+ {
2354
+ "epoch": 555.76,
2355
+ "learning_rate": 1.5365517241379309e-06,
2356
+ "loss": 0.0004,
2357
+ "step": 9450
2358
+ },
2359
+ {
2360
+ "epoch": 557.24,
2361
+ "learning_rate": 1.529655172413793e-06,
2362
+ "loss": 0.0003,
2363
+ "step": 9475
2364
+ },
2365
+ {
2366
+ "epoch": 558.71,
2367
+ "learning_rate": 1.522758620689655e-06,
2368
+ "loss": 0.0003,
2369
+ "step": 9500
2370
+ },
2371
+ {
2372
+ "epoch": 560.18,
2373
+ "learning_rate": 1.5158620689655172e-06,
2374
+ "loss": 0.0003,
2375
+ "step": 9525
2376
+ },
2377
+ {
2378
+ "epoch": 561.65,
2379
+ "learning_rate": 1.5089655172413792e-06,
2380
+ "loss": 0.0005,
2381
+ "step": 9550
2382
+ },
2383
+ {
2384
+ "epoch": 563.12,
2385
+ "learning_rate": 1.5020689655172415e-06,
2386
+ "loss": 0.0004,
2387
+ "step": 9575
2388
+ },
2389
+ {
2390
+ "epoch": 564.59,
2391
+ "learning_rate": 1.4951724137931035e-06,
2392
+ "loss": 0.0004,
2393
+ "step": 9600
2394
+ },
2395
+ {
2396
+ "epoch": 566.06,
2397
+ "learning_rate": 1.4882758620689655e-06,
2398
+ "loss": 0.0003,
2399
+ "step": 9625
2400
+ },
2401
+ {
2402
+ "epoch": 567.53,
2403
+ "learning_rate": 1.4813793103448276e-06,
2404
+ "loss": 0.0005,
2405
+ "step": 9650
2406
+ },
2407
+ {
2408
+ "epoch": 569.0,
2409
+ "learning_rate": 1.4744827586206896e-06,
2410
+ "loss": 0.0003,
2411
+ "step": 9675
2412
+ },
2413
+ {
2414
+ "epoch": 570.47,
2415
+ "learning_rate": 1.4675862068965516e-06,
2416
+ "loss": 0.0003,
2417
+ "step": 9700
2418
+ },
2419
+ {
2420
+ "epoch": 571.94,
2421
+ "learning_rate": 1.4606896551724137e-06,
2422
+ "loss": 0.0003,
2423
+ "step": 9725
2424
+ },
2425
+ {
2426
+ "epoch": 573.41,
2427
+ "learning_rate": 1.4537931034482757e-06,
2428
+ "loss": 0.0002,
2429
+ "step": 9750
2430
+ },
2431
+ {
2432
+ "epoch": 574.88,
2433
+ "learning_rate": 1.4468965517241377e-06,
2434
+ "loss": 0.0002,
2435
+ "step": 9775
2436
+ },
2437
+ {
2438
+ "epoch": 576.35,
2439
+ "learning_rate": 1.44e-06,
2440
+ "loss": 0.0004,
2441
+ "step": 9800
2442
+ },
2443
+ {
2444
+ "epoch": 577.82,
2445
+ "learning_rate": 1.433103448275862e-06,
2446
+ "loss": 0.0002,
2447
+ "step": 9825
2448
+ },
2449
+ {
2450
+ "epoch": 579.29,
2451
+ "learning_rate": 1.426206896551724e-06,
2452
+ "loss": 0.0005,
2453
+ "step": 9850
2454
+ },
2455
+ {
2456
+ "epoch": 580.76,
2457
+ "learning_rate": 1.419310344827586e-06,
2458
+ "loss": 0.0004,
2459
+ "step": 9875
2460
+ },
2461
+ {
2462
+ "epoch": 582.24,
2463
+ "learning_rate": 1.4124137931034481e-06,
2464
+ "loss": 0.0003,
2465
+ "step": 9900
2466
+ },
2467
+ {
2468
+ "epoch": 583.71,
2469
+ "learning_rate": 1.4055172413793104e-06,
2470
+ "loss": 0.0004,
2471
+ "step": 9925
2472
+ },
2473
+ {
2474
+ "epoch": 585.18,
2475
+ "learning_rate": 1.3986206896551724e-06,
2476
+ "loss": 0.0004,
2477
+ "step": 9950
2478
+ },
2479
+ {
2480
+ "epoch": 586.65,
2481
+ "learning_rate": 1.3917241379310344e-06,
2482
+ "loss": 0.0004,
2483
+ "step": 9975
2484
+ },
2485
+ {
2486
+ "epoch": 588.12,
2487
+ "learning_rate": 1.3848275862068965e-06,
2488
+ "loss": 0.0003,
2489
+ "step": 10000
2490
+ },
2491
+ {
2492
+ "epoch": 588.12,
2493
+ "eval_loss": 0.54296875,
2494
+ "eval_runtime": 156.5622,
2495
+ "eval_samples_per_second": 1.737,
2496
+ "eval_steps_per_second": 0.109,
2497
+ "eval_wer": 9.973997028231798,
2498
+ "step": 10000
2499
+ },
2500
+ {
2501
+ "epoch": 589.59,
2502
+ "learning_rate": 1.3779310344827587e-06,
2503
+ "loss": 0.0002,
2504
+ "step": 10025
2505
+ },
2506
+ {
2507
+ "epoch": 591.06,
2508
+ "learning_rate": 1.3710344827586207e-06,
2509
+ "loss": 0.0003,
2510
+ "step": 10050
2511
+ },
2512
+ {
2513
+ "epoch": 592.53,
2514
+ "learning_rate": 1.3641379310344828e-06,
2515
+ "loss": 0.0002,
2516
+ "step": 10075
2517
+ },
2518
+ {
2519
+ "epoch": 594.0,
2520
+ "learning_rate": 1.3572413793103448e-06,
2521
+ "loss": 0.0003,
2522
+ "step": 10100
2523
+ },
2524
+ {
2525
+ "epoch": 595.47,
2526
+ "learning_rate": 1.3503448275862068e-06,
2527
+ "loss": 0.0003,
2528
+ "step": 10125
2529
+ },
2530
+ {
2531
+ "epoch": 596.94,
2532
+ "learning_rate": 1.3434482758620689e-06,
2533
+ "loss": 0.0002,
2534
+ "step": 10150
2535
+ },
2536
+ {
2537
+ "epoch": 598.41,
2538
+ "learning_rate": 1.3365517241379309e-06,
2539
+ "loss": 0.0004,
2540
+ "step": 10175
2541
+ },
2542
+ {
2543
+ "epoch": 599.88,
2544
+ "learning_rate": 1.329655172413793e-06,
2545
+ "loss": 0.0002,
2546
+ "step": 10200
2547
+ },
2548
+ {
2549
+ "epoch": 601.35,
2550
+ "learning_rate": 1.322758620689655e-06,
2551
+ "loss": 0.0003,
2552
+ "step": 10225
2553
+ },
2554
+ {
2555
+ "epoch": 602.82,
2556
+ "learning_rate": 1.3158620689655172e-06,
2557
+ "loss": 0.0003,
2558
+ "step": 10250
2559
+ },
2560
+ {
2561
+ "epoch": 604.29,
2562
+ "learning_rate": 1.3089655172413792e-06,
2563
+ "loss": 0.0002,
2564
+ "step": 10275
2565
+ },
2566
+ {
2567
+ "epoch": 605.76,
2568
+ "learning_rate": 1.3020689655172413e-06,
2569
+ "loss": 0.0002,
2570
+ "step": 10300
2571
+ },
2572
+ {
2573
+ "epoch": 607.24,
2574
+ "learning_rate": 1.2951724137931035e-06,
2575
+ "loss": 0.0003,
2576
+ "step": 10325
2577
+ },
2578
+ {
2579
+ "epoch": 608.71,
2580
+ "learning_rate": 1.2882758620689655e-06,
2581
+ "loss": 0.0002,
2582
+ "step": 10350
2583
+ },
2584
+ {
2585
+ "epoch": 610.18,
2586
+ "learning_rate": 1.2813793103448276e-06,
2587
+ "loss": 0.0003,
2588
+ "step": 10375
2589
+ },
2590
+ {
2591
+ "epoch": 611.65,
2592
+ "learning_rate": 1.2744827586206896e-06,
2593
+ "loss": 0.0003,
2594
+ "step": 10400
2595
+ },
2596
+ {
2597
+ "epoch": 613.12,
2598
+ "learning_rate": 1.2675862068965516e-06,
2599
+ "loss": 0.0003,
2600
+ "step": 10425
2601
+ },
2602
+ {
2603
+ "epoch": 614.59,
2604
+ "learning_rate": 1.2612413793103448e-06,
2605
+ "loss": 0.0005,
2606
+ "step": 10450
2607
+ },
2608
+ {
2609
+ "epoch": 616.06,
2610
+ "learning_rate": 1.2543448275862068e-06,
2611
+ "loss": 0.0003,
2612
+ "step": 10475
2613
+ },
2614
+ {
2615
+ "epoch": 617.53,
2616
+ "learning_rate": 1.2474482758620688e-06,
2617
+ "loss": 0.0003,
2618
+ "step": 10500
2619
+ },
2620
+ {
2621
+ "epoch": 619.0,
2622
+ "learning_rate": 1.240551724137931e-06,
2623
+ "loss": 0.0001,
2624
+ "step": 10525
2625
+ },
2626
+ {
2627
+ "epoch": 620.47,
2628
+ "learning_rate": 1.2336551724137931e-06,
2629
+ "loss": 0.0002,
2630
+ "step": 10550
2631
+ },
2632
+ {
2633
+ "epoch": 621.94,
2634
+ "learning_rate": 1.2267586206896552e-06,
2635
+ "loss": 0.0005,
2636
+ "step": 10575
2637
+ },
2638
+ {
2639
+ "epoch": 623.41,
2640
+ "learning_rate": 1.2198620689655172e-06,
2641
+ "loss": 0.0002,
2642
+ "step": 10600
2643
+ },
2644
+ {
2645
+ "epoch": 624.88,
2646
+ "learning_rate": 1.2129655172413792e-06,
2647
+ "loss": 0.0003,
2648
+ "step": 10625
2649
+ },
2650
+ {
2651
+ "epoch": 626.35,
2652
+ "learning_rate": 1.2060689655172413e-06,
2653
+ "loss": 0.0002,
2654
+ "step": 10650
2655
+ },
2656
+ {
2657
+ "epoch": 627.82,
2658
+ "learning_rate": 1.1991724137931035e-06,
2659
+ "loss": 0.0003,
2660
+ "step": 10675
2661
+ },
2662
+ {
2663
+ "epoch": 629.29,
2664
+ "learning_rate": 1.1922758620689655e-06,
2665
+ "loss": 0.0003,
2666
+ "step": 10700
2667
+ },
2668
+ {
2669
+ "epoch": 630.76,
2670
+ "learning_rate": 1.1853793103448276e-06,
2671
+ "loss": 0.0003,
2672
+ "step": 10725
2673
+ },
2674
+ {
2675
+ "epoch": 632.24,
2676
+ "learning_rate": 1.1784827586206896e-06,
2677
+ "loss": 0.0002,
2678
+ "step": 10750
2679
+ },
2680
+ {
2681
+ "epoch": 633.71,
2682
+ "learning_rate": 1.1715862068965516e-06,
2683
+ "loss": 0.0002,
2684
+ "step": 10775
2685
+ },
2686
+ {
2687
+ "epoch": 635.18,
2688
+ "learning_rate": 1.1646896551724137e-06,
2689
+ "loss": 0.0004,
2690
+ "step": 10800
2691
+ },
2692
+ {
2693
+ "epoch": 636.65,
2694
+ "learning_rate": 1.1577931034482757e-06,
2695
+ "loss": 0.0003,
2696
+ "step": 10825
2697
+ },
2698
+ {
2699
+ "epoch": 638.12,
2700
+ "learning_rate": 1.1508965517241377e-06,
2701
+ "loss": 0.0002,
2702
+ "step": 10850
2703
+ },
2704
+ {
2705
+ "epoch": 639.59,
2706
+ "learning_rate": 1.1439999999999998e-06,
2707
+ "loss": 0.0002,
2708
+ "step": 10875
2709
+ },
2710
+ {
2711
+ "epoch": 641.06,
2712
+ "learning_rate": 1.137103448275862e-06,
2713
+ "loss": 0.0003,
2714
+ "step": 10900
2715
+ },
2716
+ {
2717
+ "epoch": 642.53,
2718
+ "learning_rate": 1.1302068965517243e-06,
2719
+ "loss": 0.0002,
2720
+ "step": 10925
2721
+ },
2722
+ {
2723
+ "epoch": 644.0,
2724
+ "learning_rate": 1.1233103448275863e-06,
2725
+ "loss": 0.0004,
2726
+ "step": 10950
2727
+ },
2728
+ {
2729
+ "epoch": 645.47,
2730
+ "learning_rate": 1.1164137931034483e-06,
2731
+ "loss": 0.0004,
2732
+ "step": 10975
2733
+ },
2734
+ {
2735
+ "epoch": 646.94,
2736
+ "learning_rate": 1.1095172413793103e-06,
2737
+ "loss": 0.0002,
2738
+ "step": 11000
2739
+ },
2740
+ {
2741
+ "epoch": 646.94,
2742
+ "eval_loss": 0.5458984375,
2743
+ "eval_runtime": 157.5866,
2744
+ "eval_samples_per_second": 1.726,
2745
+ "eval_steps_per_second": 0.108,
2746
+ "eval_wer": 9.955423476968797,
2747
+ "step": 11000
2748
+ },
2749
+ {
2750
+ "epoch": 648.41,
2751
+ "learning_rate": 1.1026206896551724e-06,
2752
+ "loss": 0.0003,
2753
+ "step": 11025
2754
+ },
2755
+ {
2756
+ "epoch": 649.88,
2757
+ "learning_rate": 1.0957241379310344e-06,
2758
+ "loss": 0.0002,
2759
+ "step": 11050
2760
+ },
2761
+ {
2762
+ "epoch": 651.35,
2763
+ "learning_rate": 1.0888275862068964e-06,
2764
+ "loss": 0.0002,
2765
+ "step": 11075
2766
+ },
2767
+ {
2768
+ "epoch": 652.82,
2769
+ "learning_rate": 1.0819310344827585e-06,
2770
+ "loss": 0.0003,
2771
+ "step": 11100
2772
+ },
2773
+ {
2774
+ "epoch": 654.29,
2775
+ "learning_rate": 1.0750344827586207e-06,
2776
+ "loss": 0.0002,
2777
+ "step": 11125
2778
+ },
2779
+ {
2780
+ "epoch": 655.76,
2781
+ "learning_rate": 1.0681379310344828e-06,
2782
+ "loss": 0.0003,
2783
+ "step": 11150
2784
+ },
2785
+ {
2786
+ "epoch": 657.24,
2787
+ "learning_rate": 1.0612413793103448e-06,
2788
+ "loss": 0.0003,
2789
+ "step": 11175
2790
+ },
2791
+ {
2792
+ "epoch": 658.71,
2793
+ "learning_rate": 1.0543448275862068e-06,
2794
+ "loss": 0.0005,
2795
+ "step": 11200
2796
+ },
2797
+ {
2798
+ "epoch": 660.18,
2799
+ "learning_rate": 1.0474482758620689e-06,
2800
+ "loss": 0.0002,
2801
+ "step": 11225
2802
+ },
2803
+ {
2804
+ "epoch": 661.65,
2805
+ "learning_rate": 1.0405517241379309e-06,
2806
+ "loss": 0.0002,
2807
+ "step": 11250
2808
+ },
2809
+ {
2810
+ "epoch": 663.12,
2811
+ "learning_rate": 1.033655172413793e-06,
2812
+ "loss": 0.0003,
2813
+ "step": 11275
2814
+ },
2815
+ {
2816
+ "epoch": 664.59,
2817
+ "learning_rate": 1.026758620689655e-06,
2818
+ "loss": 0.0002,
2819
+ "step": 11300
2820
+ },
2821
+ {
2822
+ "epoch": 666.06,
2823
+ "learning_rate": 1.0198620689655172e-06,
2824
+ "loss": 0.0002,
2825
+ "step": 11325
2826
+ },
2827
+ {
2828
+ "epoch": 667.53,
2829
+ "learning_rate": 1.0129655172413794e-06,
2830
+ "loss": 0.0003,
2831
+ "step": 11350
2832
+ },
2833
+ {
2834
+ "epoch": 669.0,
2835
+ "learning_rate": 1.0060689655172415e-06,
2836
+ "loss": 0.0009,
2837
+ "step": 11375
2838
+ },
2839
+ {
2840
+ "epoch": 670.47,
2841
+ "learning_rate": 9.991724137931033e-07,
2842
+ "loss": 0.0002,
2843
+ "step": 11400
2844
+ },
2845
+ {
2846
+ "epoch": 671.94,
2847
+ "learning_rate": 9.922758620689655e-07,
2848
+ "loss": 0.0002,
2849
+ "step": 11425
2850
+ },
2851
+ {
2852
+ "epoch": 673.41,
2853
+ "learning_rate": 9.859310344827587e-07,
2854
+ "loss": 0.0003,
2855
+ "step": 11450
2856
+ },
2857
+ {
2858
+ "epoch": 674.88,
2859
+ "learning_rate": 9.790344827586207e-07,
2860
+ "loss": 0.0002,
2861
+ "step": 11475
2862
+ },
2863
+ {
2864
+ "epoch": 676.35,
2865
+ "learning_rate": 9.721379310344827e-07,
2866
+ "loss": 0.0002,
2867
+ "step": 11500
2868
+ },
2869
+ {
2870
+ "epoch": 677.82,
2871
+ "learning_rate": 9.652413793103448e-07,
2872
+ "loss": 0.0002,
2873
+ "step": 11525
2874
+ },
2875
+ {
2876
+ "epoch": 679.29,
2877
+ "learning_rate": 9.583448275862068e-07,
2878
+ "loss": 0.0003,
2879
+ "step": 11550
2880
+ },
2881
+ {
2882
+ "epoch": 680.76,
2883
+ "learning_rate": 9.514482758620688e-07,
2884
+ "loss": 0.0003,
2885
+ "step": 11575
2886
+ },
2887
+ {
2888
+ "epoch": 682.24,
2889
+ "learning_rate": 9.44551724137931e-07,
2890
+ "loss": 0.0003,
2891
+ "step": 11600
2892
+ },
2893
+ {
2894
+ "epoch": 683.71,
2895
+ "learning_rate": 9.376551724137931e-07,
2896
+ "loss": 0.0002,
2897
+ "step": 11625
2898
+ },
2899
+ {
2900
+ "epoch": 685.18,
2901
+ "learning_rate": 9.307586206896552e-07,
2902
+ "loss": 0.0002,
2903
+ "step": 11650
2904
+ },
2905
+ {
2906
+ "epoch": 686.65,
2907
+ "learning_rate": 9.238620689655172e-07,
2908
+ "loss": 0.0003,
2909
+ "step": 11675
2910
+ },
2911
+ {
2912
+ "epoch": 688.12,
2913
+ "learning_rate": 9.169655172413792e-07,
2914
+ "loss": 0.0003,
2915
+ "step": 11700
2916
+ },
2917
+ {
2918
+ "epoch": 689.59,
2919
+ "learning_rate": 9.100689655172414e-07,
2920
+ "loss": 0.0001,
2921
+ "step": 11725
2922
+ },
2923
+ {
2924
+ "epoch": 691.06,
2925
+ "learning_rate": 9.031724137931034e-07,
2926
+ "loss": 0.0004,
2927
+ "step": 11750
2928
+ },
2929
+ {
2930
+ "epoch": 692.53,
2931
+ "learning_rate": 8.962758620689654e-07,
2932
+ "loss": 0.0003,
2933
+ "step": 11775
2934
+ },
2935
+ {
2936
+ "epoch": 694.0,
2937
+ "learning_rate": 8.893793103448275e-07,
2938
+ "loss": 0.0005,
2939
+ "step": 11800
2940
+ },
2941
+ {
2942
+ "epoch": 695.47,
2943
+ "learning_rate": 8.824827586206897e-07,
2944
+ "loss": 0.0002,
2945
+ "step": 11825
2946
+ },
2947
+ {
2948
+ "epoch": 696.94,
2949
+ "learning_rate": 8.755862068965517e-07,
2950
+ "loss": 0.0002,
2951
+ "step": 11850
2952
+ },
2953
+ {
2954
+ "epoch": 698.41,
2955
+ "learning_rate": 8.686896551724138e-07,
2956
+ "loss": 0.0002,
2957
+ "step": 11875
2958
+ },
2959
+ {
2960
+ "epoch": 699.88,
2961
+ "learning_rate": 8.617931034482758e-07,
2962
+ "loss": 0.0002,
2963
+ "step": 11900
2964
+ },
2965
+ {
2966
+ "epoch": 701.35,
2967
+ "learning_rate": 8.548965517241378e-07,
2968
+ "loss": 0.0003,
2969
+ "step": 11925
2970
+ },
2971
+ {
2972
+ "epoch": 702.82,
2973
+ "learning_rate": 8.48e-07,
2974
+ "loss": 0.0002,
2975
+ "step": 11950
2976
+ },
2977
+ {
2978
+ "epoch": 704.29,
2979
+ "learning_rate": 8.41103448275862e-07,
2980
+ "loss": 0.0002,
2981
+ "step": 11975
2982
+ },
2983
+ {
2984
+ "epoch": 705.76,
2985
+ "learning_rate": 8.34206896551724e-07,
2986
+ "loss": 0.0003,
2987
+ "step": 12000
2988
+ },
2989
+ {
2990
+ "epoch": 705.76,
2991
+ "eval_loss": 0.55615234375,
2992
+ "eval_runtime": 158.1148,
2993
+ "eval_samples_per_second": 1.72,
2994
+ "eval_steps_per_second": 0.108,
2995
+ "eval_wer": 9.9832838038633,
2996
+ "step": 12000
2997
+ },
2998
+ {
2999
+ "epoch": 706.47,
3000
+ "learning_rate": 3.1968e-07,
3001
+ "loss": 0.0002,
3002
+ "step": 12025
3003
+ },
3004
+ {
3005
+ "epoch": 707.94,
3006
+ "learning_rate": 3.1168e-07,
3007
+ "loss": 0.0003,
3008
+ "step": 12050
3009
+ },
3010
+ {
3011
+ "epoch": 709.41,
3012
+ "learning_rate": 3.0368e-07,
3013
+ "loss": 0.0002,
3014
+ "step": 12075
3015
+ },
3016
+ {
3017
+ "epoch": 710.88,
3018
+ "learning_rate": 2.9568e-07,
3019
+ "loss": 0.0002,
3020
+ "step": 12100
3021
+ },
3022
+ {
3023
+ "epoch": 712.35,
3024
+ "learning_rate": 2.8768e-07,
3025
+ "loss": 0.0003,
3026
+ "step": 12125
3027
+ },
3028
+ {
3029
+ "epoch": 713.82,
3030
+ "learning_rate": 2.7968e-07,
3031
+ "loss": 0.0002,
3032
+ "step": 12150
3033
+ },
3034
+ {
3035
+ "epoch": 715.29,
3036
+ "learning_rate": 2.7167999999999996e-07,
3037
+ "loss": 0.0005,
3038
+ "step": 12175
3039
+ },
3040
+ {
3041
+ "epoch": 716.76,
3042
+ "learning_rate": 2.6368e-07,
3043
+ "loss": 0.0002,
3044
+ "step": 12200
3045
+ },
3046
+ {
3047
+ "epoch": 718.24,
3048
+ "learning_rate": 2.5568e-07,
3049
+ "loss": 0.0002,
3050
+ "step": 12225
3051
+ },
3052
+ {
3053
+ "epoch": 719.71,
3054
+ "learning_rate": 2.4768e-07,
3055
+ "loss": 0.0002,
3056
+ "step": 12250
3057
+ },
3058
+ {
3059
+ "epoch": 721.18,
3060
+ "learning_rate": 2.3968e-07,
3061
+ "loss": 0.0003,
3062
+ "step": 12275
3063
+ },
3064
+ {
3065
+ "epoch": 722.65,
3066
+ "learning_rate": 2.3168e-07,
3067
+ "loss": 0.0002,
3068
+ "step": 12300
3069
+ },
3070
+ {
3071
+ "epoch": 724.12,
3072
+ "learning_rate": 2.2367999999999998e-07,
3073
+ "loss": 0.0002,
3074
+ "step": 12325
3075
+ },
3076
+ {
3077
+ "epoch": 725.59,
3078
+ "learning_rate": 2.1568e-07,
3079
+ "loss": 0.0002,
3080
+ "step": 12350
3081
+ },
3082
+ {
3083
+ "epoch": 727.06,
3084
+ "learning_rate": 2.0768e-07,
3085
+ "loss": 0.0001,
3086
+ "step": 12375
3087
+ },
3088
+ {
3089
+ "epoch": 728.53,
3090
+ "learning_rate": 1.9968e-07,
3091
+ "loss": 0.0002,
3092
+ "step": 12400
3093
+ },
3094
+ {
3095
+ "epoch": 730.0,
3096
+ "learning_rate": 1.9167999999999998e-07,
3097
+ "loss": 0.0002,
3098
+ "step": 12425
3099
+ },
3100
+ {
3101
+ "epoch": 731.47,
3102
+ "learning_rate": 1.8432e-07,
3103
+ "loss": 0.0003,
3104
+ "step": 12450
3105
+ },
3106
+ {
3107
+ "epoch": 732.94,
3108
+ "learning_rate": 1.7632e-07,
3109
+ "loss": 0.0002,
3110
+ "step": 12475
3111
+ },
3112
+ {
3113
+ "epoch": 734.41,
3114
+ "learning_rate": 1.6832e-07,
3115
+ "loss": 0.0001,
3116
+ "step": 12500
3117
+ },
3118
+ {
3119
+ "epoch": 735.88,
3120
+ "learning_rate": 1.6032e-07,
3121
+ "loss": 0.0001,
3122
+ "step": 12525
3123
+ },
3124
+ {
3125
+ "epoch": 737.35,
3126
+ "learning_rate": 1.5232e-07,
3127
+ "loss": 0.0001,
3128
+ "step": 12550
3129
+ },
3130
+ {
3131
+ "epoch": 738.82,
3132
+ "learning_rate": 1.4431999999999998e-07,
3133
+ "loss": 0.0002,
3134
+ "step": 12575
3135
+ },
3136
+ {
3137
+ "epoch": 740.29,
3138
+ "learning_rate": 1.3632e-07,
3139
+ "loss": 0.0002,
3140
+ "step": 12600
3141
+ },
3142
+ {
3143
+ "epoch": 741.76,
3144
+ "learning_rate": 1.2831999999999997e-07,
3145
+ "loss": 0.0001,
3146
+ "step": 12625
3147
+ },
3148
+ {
3149
+ "epoch": 743.24,
3150
+ "learning_rate": 1.2031999999999998e-07,
3151
+ "loss": 0.0003,
3152
+ "step": 12650
3153
+ },
3154
+ {
3155
+ "epoch": 744.71,
3156
+ "learning_rate": 1.1232e-07,
3157
+ "loss": 0.0002,
3158
+ "step": 12675
3159
+ },
3160
+ {
3161
+ "epoch": 746.18,
3162
+ "learning_rate": 1.0432e-07,
3163
+ "loss": 0.0002,
3164
+ "step": 12700
3165
+ },
3166
+ {
3167
+ "epoch": 747.65,
3168
+ "learning_rate": 9.632e-08,
3169
+ "loss": 0.0002,
3170
+ "step": 12725
3171
+ },
3172
+ {
3173
+ "epoch": 749.12,
3174
+ "learning_rate": 8.831999999999999e-08,
3175
+ "loss": 0.0002,
3176
+ "step": 12750
3177
+ },
3178
+ {
3179
+ "epoch": 750.59,
3180
+ "learning_rate": 8.032e-08,
3181
+ "loss": 0.0002,
3182
+ "step": 12775
3183
+ },
3184
+ {
3185
+ "epoch": 752.06,
3186
+ "learning_rate": 7.231999999999999e-08,
3187
+ "loss": 0.0002,
3188
+ "step": 12800
3189
+ },
3190
+ {
3191
+ "epoch": 753.53,
3192
+ "learning_rate": 6.432e-08,
3193
+ "loss": 0.0002,
3194
+ "step": 12825
3195
+ },
3196
+ {
3197
+ "epoch": 755.0,
3198
+ "learning_rate": 5.632e-08,
3199
+ "loss": 0.0002,
3200
+ "step": 12850
3201
+ },
3202
+ {
3203
+ "epoch": 756.47,
3204
+ "learning_rate": 4.832e-08,
3205
+ "loss": 0.0002,
3206
+ "step": 12875
3207
+ },
3208
+ {
3209
+ "epoch": 757.94,
3210
+ "learning_rate": 4.032e-08,
3211
+ "loss": 0.0002,
3212
+ "step": 12900
3213
+ },
3214
+ {
3215
+ "epoch": 759.41,
3216
+ "learning_rate": 3.232e-08,
3217
+ "loss": 0.0001,
3218
+ "step": 12925
3219
+ },
3220
+ {
3221
+ "epoch": 760.88,
3222
+ "learning_rate": 2.432e-08,
3223
+ "loss": 0.0002,
3224
+ "step": 12950
3225
+ },
3226
+ {
3227
+ "epoch": 762.35,
3228
+ "learning_rate": 1.632e-08,
3229
+ "loss": 0.0001,
3230
+ "step": 12975
3231
+ },
3232
+ {
3233
+ "epoch": 763.82,
3234
+ "learning_rate": 8.32e-09,
3235
+ "loss": 0.0001,
3236
+ "step": 13000
3237
+ },
3238
+ {
3239
+ "epoch": 763.82,
3240
+ "eval_loss": 0.5546875,
3241
+ "eval_runtime": 156.9741,
3242
+ "eval_samples_per_second": 1.733,
3243
+ "eval_steps_per_second": 0.108,
3244
+ "eval_wer": 9.9925705794948,
3245
+ "step": 13000
3246
+ },
3247
+ {
3248
+ "epoch": 765.47,
3249
+ "learning_rate": 2.965925925925926e-07,
3250
+ "loss": 0.0002,
3251
+ "step": 13025
3252
+ },
3253
+ {
3254
+ "epoch": 766.94,
3255
+ "learning_rate": 2.891851851851852e-07,
3256
+ "loss": 0.0001,
3257
+ "step": 13050
3258
+ },
3259
+ {
3260
+ "epoch": 768.41,
3261
+ "learning_rate": 2.817777777777778e-07,
3262
+ "loss": 0.0002,
3263
+ "step": 13075
3264
+ },
3265
+ {
3266
+ "epoch": 769.88,
3267
+ "learning_rate": 2.7437037037037035e-07,
3268
+ "loss": 0.0001,
3269
+ "step": 13100
3270
+ },
3271
+ {
3272
+ "epoch": 771.35,
3273
+ "learning_rate": 2.6696296296296296e-07,
3274
+ "loss": 0.0001,
3275
+ "step": 13125
3276
+ },
3277
+ {
3278
+ "epoch": 772.82,
3279
+ "learning_rate": 2.595555555555555e-07,
3280
+ "loss": 0.0002,
3281
+ "step": 13150
3282
+ },
3283
+ {
3284
+ "epoch": 774.29,
3285
+ "learning_rate": 2.521481481481481e-07,
3286
+ "loss": 0.0002,
3287
+ "step": 13175
3288
+ },
3289
+ {
3290
+ "epoch": 775.76,
3291
+ "learning_rate": 2.4474074074074073e-07,
3292
+ "loss": 0.0001,
3293
+ "step": 13200
3294
+ },
3295
+ {
3296
+ "epoch": 777.24,
3297
+ "learning_rate": 2.3733333333333334e-07,
3298
+ "loss": 0.0001,
3299
+ "step": 13225
3300
+ },
3301
+ {
3302
+ "epoch": 778.71,
3303
+ "learning_rate": 2.2992592592592592e-07,
3304
+ "loss": 0.0001,
3305
+ "step": 13250
3306
+ },
3307
+ {
3308
+ "epoch": 780.18,
3309
+ "learning_rate": 2.2251851851851853e-07,
3310
+ "loss": 0.0003,
3311
+ "step": 13275
3312
+ },
3313
+ {
3314
+ "epoch": 781.65,
3315
+ "learning_rate": 2.1511111111111111e-07,
3316
+ "loss": 0.0001,
3317
+ "step": 13300
3318
+ },
3319
+ {
3320
+ "epoch": 783.12,
3321
+ "learning_rate": 2.077037037037037e-07,
3322
+ "loss": 0.0001,
3323
+ "step": 13325
3324
+ },
3325
+ {
3326
+ "epoch": 784.59,
3327
+ "learning_rate": 2.002962962962963e-07,
3328
+ "loss": 0.0001,
3329
+ "step": 13350
3330
+ },
3331
+ {
3332
+ "epoch": 786.06,
3333
+ "learning_rate": 1.9288888888888889e-07,
3334
+ "loss": 0.0001,
3335
+ "step": 13375
3336
+ },
3337
+ {
3338
+ "epoch": 787.53,
3339
+ "learning_rate": 1.8548148148148147e-07,
3340
+ "loss": 0.0002,
3341
+ "step": 13400
3342
+ },
3343
+ {
3344
+ "epoch": 789.0,
3345
+ "learning_rate": 1.7807407407407408e-07,
3346
+ "loss": 0.0002,
3347
+ "step": 13425
3348
+ },
3349
+ {
3350
+ "epoch": 790.47,
3351
+ "learning_rate": 1.7066666666666666e-07,
3352
+ "loss": 0.0002,
3353
+ "step": 13450
3354
+ },
3355
+ {
3356
+ "epoch": 791.94,
3357
+ "learning_rate": 1.6385185185185184e-07,
3358
+ "loss": 0.0001,
3359
+ "step": 13475
3360
+ },
3361
+ {
3362
+ "epoch": 793.41,
3363
+ "learning_rate": 1.5644444444444442e-07,
3364
+ "loss": 0.0003,
3365
+ "step": 13500
3366
+ },
3367
+ {
3368
+ "epoch": 794.88,
3369
+ "learning_rate": 1.49037037037037e-07,
3370
+ "loss": 0.0001,
3371
+ "step": 13525
3372
+ },
3373
+ {
3374
+ "epoch": 796.35,
3375
+ "learning_rate": 1.4162962962962962e-07,
3376
+ "loss": 0.0001,
3377
+ "step": 13550
3378
+ },
3379
+ {
3380
+ "epoch": 797.82,
3381
+ "learning_rate": 1.342222222222222e-07,
3382
+ "loss": 0.0001,
3383
+ "step": 13575
3384
+ },
3385
+ {
3386
+ "epoch": 799.29,
3387
+ "learning_rate": 1.268148148148148e-07,
3388
+ "loss": 0.0001,
3389
+ "step": 13600
3390
+ },
3391
+ {
3392
+ "epoch": 800.76,
3393
+ "learning_rate": 1.194074074074074e-07,
3394
+ "loss": 0.0002,
3395
+ "step": 13625
3396
+ },
3397
+ {
3398
+ "epoch": 802.24,
3399
+ "learning_rate": 1.12e-07,
3400
+ "loss": 0.0001,
3401
+ "step": 13650
3402
+ },
3403
+ {
3404
+ "epoch": 803.71,
3405
+ "learning_rate": 1.0459259259259259e-07,
3406
+ "loss": 0.0002,
3407
+ "step": 13675
3408
+ },
3409
+ {
3410
+ "epoch": 805.18,
3411
+ "learning_rate": 9.718518518518517e-08,
3412
+ "loss": 0.0002,
3413
+ "step": 13700
3414
+ },
3415
+ {
3416
+ "epoch": 806.65,
3417
+ "learning_rate": 8.977777777777777e-08,
3418
+ "loss": 0.0002,
3419
+ "step": 13725
3420
+ },
3421
+ {
3422
+ "epoch": 808.12,
3423
+ "learning_rate": 8.237037037037037e-08,
3424
+ "loss": 0.0002,
3425
+ "step": 13750
3426
+ },
3427
+ {
3428
+ "epoch": 809.59,
3429
+ "learning_rate": 7.496296296296296e-08,
3430
+ "loss": 0.0002,
3431
+ "step": 13775
3432
+ },
3433
+ {
3434
+ "epoch": 811.06,
3435
+ "learning_rate": 6.755555555555554e-08,
3436
+ "loss": 0.0001,
3437
+ "step": 13800
3438
+ },
3439
+ {
3440
+ "epoch": 812.53,
3441
+ "learning_rate": 6.014814814814814e-08,
3442
+ "loss": 0.0001,
3443
+ "step": 13825
3444
+ },
3445
+ {
3446
+ "epoch": 814.0,
3447
+ "learning_rate": 5.274074074074074e-08,
3448
+ "loss": 0.0002,
3449
+ "step": 13850
3450
+ },
3451
+ {
3452
+ "epoch": 815.47,
3453
+ "learning_rate": 4.5333333333333336e-08,
3454
+ "loss": 0.0001,
3455
+ "step": 13875
3456
+ },
3457
+ {
3458
+ "epoch": 816.94,
3459
+ "learning_rate": 3.7925925925925924e-08,
3460
+ "loss": 0.0002,
3461
+ "step": 13900
3462
+ },
3463
+ {
3464
+ "epoch": 818.41,
3465
+ "learning_rate": 3.051851851851851e-08,
3466
+ "loss": 0.0001,
3467
+ "step": 13925
3468
+ },
3469
+ {
3470
+ "epoch": 819.88,
3471
+ "learning_rate": 2.311111111111111e-08,
3472
+ "loss": 0.0002,
3473
+ "step": 13950
3474
+ },
3475
+ {
3476
+ "epoch": 821.35,
3477
+ "learning_rate": 1.57037037037037e-08,
3478
+ "loss": 0.0001,
3479
+ "step": 13975
3480
+ },
3481
+ {
3482
+ "epoch": 822.82,
3483
+ "learning_rate": 8.296296296296296e-09,
3484
+ "loss": 0.0001,
3485
+ "step": 14000
3486
+ },
3487
+ {
3488
+ "epoch": 822.82,
3489
+ "eval_loss": 0.5576171875,
3490
+ "eval_runtime": 157.6735,
3491
+ "eval_samples_per_second": 1.725,
3492
+ "eval_steps_per_second": 0.108,
3493
+ "eval_wer": 9.899702823179792,
3494
+ "step": 14000
3495
+ },
3496
+ {
3497
+ "epoch": 824.47,
3498
+ "learning_rate": 0.00012324102564102563,
3499
+ "loss": 7.1148,
3500
+ "step": 14025
3501
+ },
3502
+ {
3503
+ "epoch": 825.94,
3504
+ "learning_rate": 0.00012272820512820512,
3505
+ "loss": 5.3802,
3506
+ "step": 14050
3507
+ },
3508
+ {
3509
+ "epoch": 827.41,
3510
+ "learning_rate": 0.00012221538461538463,
3511
+ "loss": 4.0038,
3512
+ "step": 14075
3513
+ },
3514
+ {
3515
+ "epoch": 828.88,
3516
+ "learning_rate": 0.0001217025641025641,
3517
+ "loss": 3.0771,
3518
+ "step": 14100
3519
+ },
3520
+ {
3521
+ "epoch": 830.35,
3522
+ "learning_rate": 0.00012118974358974359,
3523
+ "loss": 2.4888,
3524
+ "step": 14125
3525
+ },
3526
+ {
3527
+ "epoch": 831.82,
3528
+ "learning_rate": 0.0001206769230769231,
3529
+ "loss": 2.0454,
3530
+ "step": 14150
3531
+ },
3532
+ {
3533
+ "epoch": 833.29,
3534
+ "learning_rate": 0.00012016410256410258,
3535
+ "loss": 1.6123,
3536
+ "step": 14175
3537
+ },
3538
+ {
3539
+ "epoch": 834.76,
3540
+ "learning_rate": 0.00011965128205128207,
3541
+ "loss": 1.1082,
3542
+ "step": 14200
3543
+ },
3544
+ {
3545
+ "epoch": 836.24,
3546
+ "learning_rate": 0.00011913846153846155,
3547
+ "loss": 0.6733,
3548
+ "step": 14225
3549
+ },
3550
+ {
3551
+ "epoch": 837.71,
3552
+ "learning_rate": 0.00011862564102564103,
3553
+ "loss": 0.4108,
3554
+ "step": 14250
3555
+ },
3556
+ {
3557
+ "epoch": 839.18,
3558
+ "learning_rate": 0.00011811282051282051,
3559
+ "loss": 0.2879,
3560
+ "step": 14275
3561
+ },
3562
+ {
3563
+ "epoch": 840.65,
3564
+ "learning_rate": 0.0001176,
3565
+ "loss": 0.2274,
3566
+ "step": 14300
3567
+ },
3568
+ {
3569
+ "epoch": 842.12,
3570
+ "learning_rate": 0.00011708717948717949,
3571
+ "loss": 0.1869,
3572
+ "step": 14325
3573
+ },
3574
+ {
3575
+ "epoch": 843.59,
3576
+ "learning_rate": 0.00011657435897435897,
3577
+ "loss": 0.1548,
3578
+ "step": 14350
3579
+ },
3580
+ {
3581
+ "epoch": 845.06,
3582
+ "learning_rate": 0.00011606153846153847,
3583
+ "loss": 2.892,
3584
+ "step": 14375
3585
+ },
3586
+ {
3587
+ "epoch": 846.53,
3588
+ "learning_rate": 0.00011556923076923078,
3589
+ "loss": 4.4433,
3590
+ "step": 14400
3591
+ },
3592
+ {
3593
+ "epoch": 848.0,
3594
+ "learning_rate": 0.00011505641025641026,
3595
+ "loss": 0.9719,
3596
+ "step": 14425
3597
+ },
3598
+ {
3599
+ "epoch": 849.47,
3600
+ "learning_rate": 0.00011454358974358974,
3601
+ "loss": 0.0969,
3602
+ "step": 14450
3603
+ },
3604
+ {
3605
+ "epoch": 850.94,
3606
+ "learning_rate": 0.00011403076923076923,
3607
+ "loss": 0.0932,
3608
+ "step": 14475
3609
+ },
3610
+ {
3611
+ "epoch": 852.41,
3612
+ "learning_rate": 0.00011351794871794871,
3613
+ "loss": 0.0829,
3614
+ "step": 14500
3615
+ },
3616
+ {
3617
+ "epoch": 853.88,
3618
+ "learning_rate": 0.0001130051282051282,
3619
+ "loss": 0.0785,
3620
+ "step": 14525
3621
+ },
3622
+ {
3623
+ "epoch": 855.35,
3624
+ "learning_rate": 0.0001124923076923077,
3625
+ "loss": 0.0679,
3626
+ "step": 14550
3627
+ },
3628
+ {
3629
+ "epoch": 856.82,
3630
+ "learning_rate": 0.00011197948717948719,
3631
+ "loss": 0.0656,
3632
+ "step": 14575
3633
+ },
3634
+ {
3635
+ "epoch": 858.29,
3636
+ "learning_rate": 0.00011146666666666667,
3637
+ "loss": 0.064,
3638
+ "step": 14600
3639
+ },
3640
+ {
3641
+ "epoch": 859.76,
3642
+ "learning_rate": 0.00011095384615384616,
3643
+ "loss": 0.0614,
3644
+ "step": 14625
3645
+ },
3646
+ {
3647
+ "epoch": 861.24,
3648
+ "learning_rate": 0.00011044102564102565,
3649
+ "loss": 0.0612,
3650
+ "step": 14650
3651
+ },
3652
+ {
3653
+ "epoch": 862.71,
3654
+ "learning_rate": 0.00010992820512820515,
3655
+ "loss": 0.0609,
3656
+ "step": 14675
3657
+ },
3658
+ {
3659
+ "epoch": 864.18,
3660
+ "learning_rate": 0.00010941538461538463,
3661
+ "loss": 0.0586,
3662
+ "step": 14700
3663
+ },
3664
+ {
3665
+ "epoch": 865.65,
3666
+ "learning_rate": 0.0001089025641025641,
3667
+ "loss": 0.0581,
3668
+ "step": 14725
3669
+ },
3670
+ {
3671
+ "epoch": 867.12,
3672
+ "learning_rate": 0.00010838974358974358,
3673
+ "loss": 0.0569,
3674
+ "step": 14750
3675
+ },
3676
+ {
3677
+ "epoch": 868.59,
3678
+ "learning_rate": 0.00010787692307692308,
3679
+ "loss": 0.0573,
3680
+ "step": 14775
3681
+ },
3682
+ {
3683
+ "epoch": 870.06,
3684
+ "learning_rate": 0.00010736410256410257,
3685
+ "loss": 0.0555,
3686
+ "step": 14800
3687
+ },
3688
+ {
3689
+ "epoch": 871.53,
3690
+ "learning_rate": 0.00010685128205128205,
3691
+ "loss": 0.0546,
3692
+ "step": 14825
3693
+ },
3694
+ {
3695
+ "epoch": 873.0,
3696
+ "learning_rate": 0.00010633846153846154,
3697
+ "loss": 0.0548,
3698
+ "step": 14850
3699
+ },
3700
+ {
3701
+ "epoch": 874.47,
3702
+ "learning_rate": 0.00010582564102564103,
3703
+ "loss": 0.0541,
3704
+ "step": 14875
3705
+ },
3706
+ {
3707
+ "epoch": 875.94,
3708
+ "learning_rate": 0.00010531282051282053,
3709
+ "loss": 0.0526,
3710
+ "step": 14900
3711
+ },
3712
+ {
3713
+ "epoch": 877.41,
3714
+ "learning_rate": 0.00010480000000000001,
3715
+ "loss": 0.0521,
3716
+ "step": 14925
3717
+ },
3718
+ {
3719
+ "epoch": 878.88,
3720
+ "learning_rate": 0.0001042871794871795,
3721
+ "loss": 0.0539,
3722
+ "step": 14950
3723
+ },
3724
+ {
3725
+ "epoch": 880.35,
3726
+ "learning_rate": 0.00010377435897435899,
3727
+ "loss": 0.0535,
3728
+ "step": 14975
3729
+ },
3730
+ {
3731
+ "epoch": 881.82,
3732
+ "learning_rate": 0.00010326153846153847,
3733
+ "loss": 0.0538,
3734
+ "step": 15000
3735
+ },
3736
+ {
3737
+ "epoch": 881.82,
3738
+ "eval_loss": 5.33984375,
3739
+ "eval_runtime": 102.1523,
3740
+ "eval_samples_per_second": 2.663,
3741
+ "eval_steps_per_second": 0.166,
3742
+ "eval_wer": 99.87927191679049,
3743
+ "step": 15000
3744
+ },
3745
+ {
3746
+ "epoch": 883.29,
3747
+ "learning_rate": 0.00010274871794871795,
3748
+ "loss": 0.0535,
3749
+ "step": 15025
3750
+ },
3751
+ {
3752
+ "epoch": 884.76,
3753
+ "learning_rate": 0.00010223589743589743,
3754
+ "loss": 0.0516,
3755
+ "step": 15050
3756
+ },
3757
+ {
3758
+ "epoch": 886.24,
3759
+ "learning_rate": 0.00010172307692307692,
3760
+ "loss": 0.0503,
3761
+ "step": 15075
3762
+ },
3763
+ {
3764
+ "epoch": 887.71,
3765
+ "learning_rate": 0.0001012102564102564,
3766
+ "loss": 0.05,
3767
+ "step": 15100
3768
+ },
3769
+ {
3770
+ "epoch": 889.18,
3771
+ "learning_rate": 0.0001006974358974359,
3772
+ "loss": 0.0512,
3773
+ "step": 15125
3774
+ },
3775
+ {
3776
+ "epoch": 890.65,
3777
+ "learning_rate": 0.00010018461538461539,
3778
+ "loss": 0.0503,
3779
+ "step": 15150
3780
+ },
3781
+ {
3782
+ "epoch": 892.12,
3783
+ "learning_rate": 9.967179487179488e-05,
3784
+ "loss": 0.0516,
3785
+ "step": 15175
3786
+ },
3787
+ {
3788
+ "epoch": 893.59,
3789
+ "learning_rate": 9.915897435897436e-05,
3790
+ "loss": 0.0518,
3791
+ "step": 15200
3792
+ },
3793
+ {
3794
+ "epoch": 895.06,
3795
+ "learning_rate": 9.864615384615385e-05,
3796
+ "loss": 0.0521,
3797
+ "step": 15225
3798
+ },
3799
+ {
3800
+ "epoch": 896.53,
3801
+ "learning_rate": 9.813333333333334e-05,
3802
+ "loss": 0.0508,
3803
+ "step": 15250
3804
+ },
3805
+ {
3806
+ "epoch": 898.0,
3807
+ "learning_rate": 9.762051282051282e-05,
3808
+ "loss": 0.0507,
3809
+ "step": 15275
3810
+ },
3811
+ {
3812
+ "epoch": 899.47,
3813
+ "learning_rate": 9.710769230769231e-05,
3814
+ "loss": 0.0506,
3815
+ "step": 15300
3816
+ },
3817
+ {
3818
+ "epoch": 900.94,
3819
+ "learning_rate": 9.65948717948718e-05,
3820
+ "loss": 0.0496,
3821
+ "step": 15325
3822
+ },
3823
+ {
3824
+ "epoch": 902.41,
3825
+ "learning_rate": 9.608205128205128e-05,
3826
+ "loss": 0.052,
3827
+ "step": 15350
3828
+ },
3829
+ {
3830
+ "epoch": 903.88,
3831
+ "learning_rate": 9.556923076923078e-05,
3832
+ "loss": 0.05,
3833
+ "step": 15375
3834
+ },
3835
+ {
3836
+ "epoch": 905.35,
3837
+ "learning_rate": 9.505641025641026e-05,
3838
+ "loss": 0.0498,
3839
+ "step": 15400
3840
+ },
3841
+ {
3842
+ "epoch": 906.82,
3843
+ "learning_rate": 9.454358974358974e-05,
3844
+ "loss": 0.0501,
3845
+ "step": 15425
3846
+ },
3847
+ {
3848
+ "epoch": 908.29,
3849
+ "learning_rate": 9.403076923076923e-05,
3850
+ "loss": 0.0512,
3851
+ "step": 15450
3852
+ },
3853
+ {
3854
+ "epoch": 909.76,
3855
+ "learning_rate": 9.351794871794872e-05,
3856
+ "loss": 0.0499,
3857
+ "step": 15475
3858
+ },
3859
+ {
3860
+ "epoch": 911.24,
3861
+ "learning_rate": 9.300512820512822e-05,
3862
+ "loss": 0.05,
3863
+ "step": 15500
3864
+ },
3865
+ {
3866
+ "epoch": 912.71,
3867
+ "learning_rate": 9.24923076923077e-05,
3868
+ "loss": 0.0516,
3869
+ "step": 15525
3870
+ },
3871
+ {
3872
+ "epoch": 914.18,
3873
+ "learning_rate": 9.197948717948719e-05,
3874
+ "loss": 0.0517,
3875
+ "step": 15550
3876
+ },
3877
+ {
3878
+ "epoch": 915.65,
3879
+ "learning_rate": 9.146666666666666e-05,
3880
+ "loss": 0.0499,
3881
+ "step": 15575
3882
+ },
3883
+ {
3884
+ "epoch": 917.12,
3885
+ "learning_rate": 9.095384615384616e-05,
3886
+ "loss": 0.0531,
3887
+ "step": 15600
3888
+ },
3889
+ {
3890
+ "epoch": 918.59,
3891
+ "learning_rate": 9.044102564102565e-05,
3892
+ "loss": 0.0502,
3893
+ "step": 15625
3894
+ },
3895
+ {
3896
+ "epoch": 920.06,
3897
+ "learning_rate": 8.992820512820514e-05,
3898
+ "loss": 0.0495,
3899
+ "step": 15650
3900
+ },
3901
+ {
3902
+ "epoch": 921.53,
3903
+ "learning_rate": 8.941538461538462e-05,
3904
+ "loss": 0.0499,
3905
+ "step": 15675
3906
+ },
3907
+ {
3908
+ "epoch": 923.0,
3909
+ "learning_rate": 8.890256410256411e-05,
3910
+ "loss": 0.0515,
3911
+ "step": 15700
3912
+ },
3913
+ {
3914
+ "epoch": 924.47,
3915
+ "learning_rate": 8.83897435897436e-05,
3916
+ "loss": 0.0491,
3917
+ "step": 15725
3918
+ },
3919
+ {
3920
+ "epoch": 925.94,
3921
+ "learning_rate": 8.787692307692308e-05,
3922
+ "loss": 0.0491,
3923
+ "step": 15750
3924
+ },
3925
+ {
3926
+ "epoch": 927.41,
3927
+ "learning_rate": 8.736410256410257e-05,
3928
+ "loss": 0.0482,
3929
+ "step": 15775
3930
+ },
3931
+ {
3932
+ "epoch": 928.88,
3933
+ "learning_rate": 8.685128205128206e-05,
3934
+ "loss": 0.0487,
3935
+ "step": 15800
3936
+ },
3937
+ {
3938
+ "epoch": 930.35,
3939
+ "learning_rate": 8.633846153846154e-05,
3940
+ "loss": 0.0494,
3941
+ "step": 15825
3942
+ },
3943
+ {
3944
+ "epoch": 931.82,
3945
+ "learning_rate": 8.582564102564103e-05,
3946
+ "loss": 0.0491,
3947
+ "step": 15850
3948
+ },
3949
+ {
3950
+ "epoch": 933.29,
3951
+ "learning_rate": 8.531282051282051e-05,
3952
+ "loss": 0.0483,
3953
+ "step": 15875
3954
+ },
3955
+ {
3956
+ "epoch": 934.76,
3957
+ "learning_rate": 8.48e-05,
3958
+ "loss": 0.048,
3959
+ "step": 15900
3960
+ },
3961
+ {
3962
+ "epoch": 936.24,
3963
+ "learning_rate": 8.428717948717949e-05,
3964
+ "loss": 0.0488,
3965
+ "step": 15925
3966
+ },
3967
+ {
3968
+ "epoch": 937.71,
3969
+ "learning_rate": 8.377435897435897e-05,
3970
+ "loss": 0.0494,
3971
+ "step": 15950
3972
+ },
3973
+ {
3974
+ "epoch": 939.18,
3975
+ "learning_rate": 8.326153846153847e-05,
3976
+ "loss": 0.0491,
3977
+ "step": 15975
3978
+ },
3979
+ {
3980
+ "epoch": 940.65,
3981
+ "learning_rate": 8.274871794871796e-05,
3982
+ "loss": 0.0482,
3983
+ "step": 16000
3984
+ },
3985
+ {
3986
+ "epoch": 940.65,
3987
+ "eval_loss": 5.62109375,
3988
+ "eval_runtime": 164.5773,
3989
+ "eval_samples_per_second": 1.653,
3990
+ "eval_steps_per_second": 0.103,
3991
+ "eval_wer": 136.06983655274888,
3992
+ "step": 16000
3993
+ },
3994
+ {
3995
+ "epoch": 942.12,
3996
+ "learning_rate": 8.223589743589743e-05,
3997
+ "loss": 0.0492,
3998
+ "step": 16025
3999
+ },
4000
+ {
4001
+ "epoch": 943.59,
4002
+ "learning_rate": 8.172307692307692e-05,
4003
+ "loss": 0.0485,
4004
+ "step": 16050
4005
+ },
4006
+ {
4007
+ "epoch": 945.06,
4008
+ "learning_rate": 8.121025641025641e-05,
4009
+ "loss": 0.0489,
4010
+ "step": 16075
4011
+ },
4012
+ {
4013
+ "epoch": 946.53,
4014
+ "learning_rate": 8.069743589743591e-05,
4015
+ "loss": 0.0494,
4016
+ "step": 16100
4017
+ },
4018
+ {
4019
+ "epoch": 948.0,
4020
+ "learning_rate": 8.01846153846154e-05,
4021
+ "loss": 0.0487,
4022
+ "step": 16125
4023
+ },
4024
+ {
4025
+ "epoch": 949.47,
4026
+ "learning_rate": 7.967179487179488e-05,
4027
+ "loss": 0.0473,
4028
+ "step": 16150
4029
+ },
4030
+ {
4031
+ "epoch": 950.94,
4032
+ "learning_rate": 7.915897435897435e-05,
4033
+ "loss": 0.0489,
4034
+ "step": 16175
4035
+ },
4036
+ {
4037
+ "epoch": 952.41,
4038
+ "learning_rate": 7.864615384615385e-05,
4039
+ "loss": 0.048,
4040
+ "step": 16200
4041
+ },
4042
+ {
4043
+ "epoch": 953.88,
4044
+ "learning_rate": 7.813333333333334e-05,
4045
+ "loss": 0.0479,
4046
+ "step": 16225
4047
+ },
4048
+ {
4049
+ "epoch": 955.35,
4050
+ "learning_rate": 7.762051282051283e-05,
4051
+ "loss": 0.0549,
4052
+ "step": 16250
4053
+ },
4054
+ {
4055
+ "epoch": 956.82,
4056
+ "learning_rate": 7.710769230769231e-05,
4057
+ "loss": 0.0479,
4058
+ "step": 16275
4059
+ },
4060
+ {
4061
+ "epoch": 958.29,
4062
+ "learning_rate": 7.65948717948718e-05,
4063
+ "loss": 0.0468,
4064
+ "step": 16300
4065
+ },
4066
+ {
4067
+ "epoch": 959.76,
4068
+ "learning_rate": 7.608205128205129e-05,
4069
+ "loss": 0.0477,
4070
+ "step": 16325
4071
+ },
4072
+ {
4073
+ "epoch": 961.24,
4074
+ "learning_rate": 7.556923076923077e-05,
4075
+ "loss": 0.0482,
4076
+ "step": 16350
4077
+ },
4078
+ {
4079
+ "epoch": 962.71,
4080
+ "learning_rate": 7.505641025641026e-05,
4081
+ "loss": 0.0493,
4082
+ "step": 16375
4083
+ },
4084
+ {
4085
+ "epoch": 964.18,
4086
+ "learning_rate": 7.454358974358975e-05,
4087
+ "loss": 0.0499,
4088
+ "step": 16400
4089
+ },
4090
+ {
4091
+ "epoch": 965.65,
4092
+ "learning_rate": 7.403076923076923e-05,
4093
+ "loss": 0.0516,
4094
+ "step": 16425
4095
+ },
4096
+ {
4097
+ "epoch": 967.12,
4098
+ "learning_rate": 7.351794871794873e-05,
4099
+ "loss": 0.052,
4100
+ "step": 16450
4101
+ },
4102
+ {
4103
+ "epoch": 968.59,
4104
+ "learning_rate": 7.30051282051282e-05,
4105
+ "loss": 0.0495,
4106
+ "step": 16475
4107
+ },
4108
+ {
4109
+ "epoch": 970.06,
4110
+ "learning_rate": 7.249230769230769e-05,
4111
+ "loss": 0.0495,
4112
+ "step": 16500
4113
+ },
4114
+ {
4115
+ "epoch": 971.53,
4116
+ "learning_rate": 7.197948717948718e-05,
4117
+ "loss": 0.0482,
4118
+ "step": 16525
4119
+ },
4120
+ {
4121
+ "epoch": 973.0,
4122
+ "learning_rate": 7.146666666666666e-05,
4123
+ "loss": 0.0511,
4124
+ "step": 16550
4125
+ },
4126
+ {
4127
+ "epoch": 974.47,
4128
+ "learning_rate": 7.095384615384616e-05,
4129
+ "loss": 0.0487,
4130
+ "step": 16575
4131
+ },
4132
+ {
4133
+ "epoch": 975.94,
4134
+ "learning_rate": 7.044102564102565e-05,
4135
+ "loss": 0.049,
4136
+ "step": 16600
4137
+ },
4138
+ {
4139
+ "epoch": 977.41,
4140
+ "learning_rate": 6.992820512820512e-05,
4141
+ "loss": 0.048,
4142
+ "step": 16625
4143
+ },
4144
+ {
4145
+ "epoch": 978.88,
4146
+ "learning_rate": 6.941538461538461e-05,
4147
+ "loss": 0.0485,
4148
+ "step": 16650
4149
+ },
4150
+ {
4151
+ "epoch": 980.35,
4152
+ "learning_rate": 6.890256410256411e-05,
4153
+ "loss": 0.0525,
4154
+ "step": 16675
4155
+ },
4156
+ {
4157
+ "epoch": 981.82,
4158
+ "learning_rate": 6.83897435897436e-05,
4159
+ "loss": 0.0478,
4160
+ "step": 16700
4161
+ },
4162
+ {
4163
+ "epoch": 983.29,
4164
+ "learning_rate": 6.787692307692308e-05,
4165
+ "loss": 0.0481,
4166
+ "step": 16725
4167
+ },
4168
+ {
4169
+ "epoch": 984.76,
4170
+ "learning_rate": 6.736410256410257e-05,
4171
+ "loss": 0.0494,
4172
+ "step": 16750
4173
+ },
4174
+ {
4175
+ "epoch": 986.24,
4176
+ "learning_rate": 6.685128205128204e-05,
4177
+ "loss": 0.0468,
4178
+ "step": 16775
4179
+ },
4180
+ {
4181
+ "epoch": 987.71,
4182
+ "learning_rate": 6.633846153846154e-05,
4183
+ "loss": 0.0631,
4184
+ "step": 16800
4185
+ },
4186
+ {
4187
+ "epoch": 989.18,
4188
+ "learning_rate": 6.582564102564103e-05,
4189
+ "loss": 0.0468,
4190
+ "step": 16825
4191
+ },
4192
+ {
4193
+ "epoch": 990.65,
4194
+ "learning_rate": 6.531282051282052e-05,
4195
+ "loss": 0.0464,
4196
+ "step": 16850
4197
+ },
4198
+ {
4199
+ "epoch": 992.12,
4200
+ "learning_rate": 6.48e-05,
4201
+ "loss": 0.0625,
4202
+ "step": 16875
4203
+ },
4204
+ {
4205
+ "epoch": 993.59,
4206
+ "learning_rate": 6.428717948717949e-05,
4207
+ "loss": 0.0497,
4208
+ "step": 16900
4209
+ },
4210
+ {
4211
+ "epoch": 995.06,
4212
+ "learning_rate": 6.377435897435898e-05,
4213
+ "loss": 0.0481,
4214
+ "step": 16925
4215
+ },
4216
+ {
4217
+ "epoch": 996.53,
4218
+ "learning_rate": 6.326153846153846e-05,
4219
+ "loss": 0.0484,
4220
+ "step": 16950
4221
+ },
4222
+ {
4223
+ "epoch": 998.0,
4224
+ "learning_rate": 6.274871794871795e-05,
4225
+ "loss": 0.0506,
4226
+ "step": 16975
4227
+ },
4228
+ {
4229
+ "epoch": 999.47,
4230
+ "learning_rate": 6.223589743589744e-05,
4231
+ "loss": 0.0471,
4232
+ "step": 17000
4233
+ },
4234
+ {
4235
+ "epoch": 999.47,
4236
+ "eval_loss": 5.6484375,
4237
+ "eval_runtime": 155.9288,
4238
+ "eval_samples_per_second": 1.744,
4239
+ "eval_steps_per_second": 0.109,
4240
+ "eval_wer": 121.2481426448737,
4241
+ "step": 17000
4242
+ }
4243
+ ],
4244
+ "max_steps": 20000,
4245
+ "num_train_epochs": 1177,
4246
+ "total_flos": 5.251869494196956e+20,
4247
+ "trial_name": null,
4248
+ "trial_params": null
4249
+ }
checkpoint-17000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6e3ac4aeab20cf895e188b7a0ae60077219ad0067d587dfa1da35e123e14fa0
3
+ size 4795
checkpoint-17000/zero_to_fp32.py ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # This script extracts fp32 consolidated weights from ZeRO stage 2 and 3 DeepSpeed checkpoints. It gets
4
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
5
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
6
+ # application.
7
+ #
8
+ # example: python zero_to_fp32.py . pytorch_model.bin
9
+
10
+ import argparse
11
+ import torch
12
+ import glob
13
+ import math
14
+ import os
15
+ import re
16
+ from collections import OrderedDict
17
+
18
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
19
+ # DeepSpeed data structures it has to be available in the current python environment.
20
+ from deepspeed.utils import logger
21
+ from deepspeed.checkpoint.constants import (DS_VERSION,
22
+ OPTIMIZER_STATE_DICT,
23
+ SINGLE_PARTITION_OF_FP32_GROUPS,
24
+ FP32_FLAT_GROUPS,
25
+ ZERO_STAGE,
26
+ PARTITION_COUNT,
27
+ PARAM_SHAPES,
28
+ BUFFER_NAMES)
29
+
30
# Verbosity switch read by the reconstruction helpers; the command-line entry
# point at the bottom of the file overwrites it with the ``--debug`` flag.
debug = 0

# load to cpu: passed as ``map_location`` whenever a checkpoint file is read
device = torch.device("cpu")
34
+
35
+
36
def atoi(text):
    """Return ``int(text)`` when ``text.isdigit()`` is true, otherwise ``text`` unchanged."""
    if text.isdigit():
        return int(text)
    return text
38
+
39
+
40
def natural_keys(text):
    '''
    Sort key for "human" ordering: ``alist.sort(key=natural_keys)``.

    The text is split on runs of digits and each piece goes through ``atoi``,
    so digit runs are compared as integers rather than character by character.

    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    pieces = re.split(r'(\d+)', text)
    return list(map(atoi, pieces))
47
+
48
+
49
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the model states file inside a DeepSpeed checkpoint folder.

    Args:
        - ``checkpoint_dir``: folder that holds the ``*_model_states.pt`` file
        - ``zero_stage``: ZeRO stage of the checkpoint; only 2 and 3 are supported

    Returns:
        - path of the model states file for the given stage

    Raises:
        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
        - ``ValueError``: ``zero_stage`` is neither 2 nor 3
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage == 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # Without this branch ``file`` stays unbound and the existence check
        # below fails with an unrelated UnboundLocalError.
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
63
+
64
+
65
def get_optim_files(checkpoint_dir):
    """
    List the ``*_optim_states.pt`` files found directly in ``checkpoint_dir``.

    Returns:
        - the matching paths, ordered with ``natural_keys``

    Raises:
        - ``FileNotFoundError``: no file matches the pattern
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    pattern = os.path.join(checkpoint_dir, "*_optim_states.pt")
    optim_files = glob.glob(pattern)
    optim_files.sort(key=natural_keys)

    if not optim_files:
        raise FileNotFoundError(
            f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")

    return optim_files
76
+
77
+
78
def parse_model_state(file):
    """
    Load a model states file and pull out what weight reconstruction needs.

    Args:
        - ``file``: path to the model states file

    Returns:
        - ``(buffers, param_shapes, ds_version)``: the entries of the saved
          ``"module"`` dict whose names are listed under ``BUFFER_NAMES`` (each
          passed through ``.float()``), the ``PARAM_SHAPES`` entry, and the
          ``DS_VERSION`` entry or ``None`` when it is absent

    Raises:
        - ``ValueError``: the loaded object has no ``BUFFER_NAMES`` key
    """
    checkpoint = torch.load(file, map_location=device)

    if BUFFER_NAMES not in checkpoint:
        raise ValueError(f"{file} is not a model state checkpoint")

    buffer_names = checkpoint[BUFFER_NAMES]
    if debug:
        print("Found buffers:", buffer_names)

    # recover just the buffers while restoring them to fp32 if they were saved in fp16
    buffers = {}
    for name, tensor in checkpoint["module"].items():
        if name in buffer_names:
            buffers[name] = tensor.float()

    return buffers, checkpoint[PARAM_SHAPES], checkpoint.get(DS_VERSION, None)
98
+
99
+
100
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load every optimizer states file and collect the fp32 partitions they hold.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        - ``(zero_stage, world_size, fp32_flat_groups)`` with one
          ``fp32_flat_groups`` entry per file: for stage 2 the stored list of
          group tensors, for stage 3 those tensors concatenated into one

    Raises:
        - ``ValueError``: the first file has no ``ZERO_STAGE`` entry, the number
          of files differs from the partition count, or the stage is not 2 or 3
    """
    total_files = len(files)
    state_dicts = [torch.load(path, map_location=device) for path in files]

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage == 2:
        fp32_flat_groups = [
            sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts
        ]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [
            torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts
        ]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
151
+
152
+
153
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)

    The optimizer files are parsed first because they carry the ZeRO stage,
    which selects both the model states file and the reconstruction routine.
    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    zero_stage, world_size, fp32_flat_groups = parse_optim_states(
        get_optim_files(ds_checkpoint_dir), ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    buffers, param_shapes, ds_version = parse_model_state(
        get_model_state_file(ds_checkpoint_dir, zero_stage))
    print(f'Parsing checkpoint created by deepspeed=={ds_version}')

    rebuild_args = (world_size, param_shapes, fp32_flat_groups, buffers)
    if zero_stage == 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(*rebuild_args)
    if zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(*rebuild_args)
    # any other stage falls through and yields None, as before
182
+
183
+
184
def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """
    Rebuild a full state dict from ZeRO stage 2 partitions.

    For every param group the per-rank partitions are concatenated into one flat
    vector, which is then sliced in ``param_shapes`` order and viewed with each
    parameter's shape.

    Args:
        - ``world_size``: number of partitions (one per optimizer states file)
        - ``param_shapes``: one name-to-shape mapping per param group
        - ``fp32_flat_groups``: per rank, the list of flat tensors (one per group)
        - ``buffers``: name-to-tensor mapping copied into the result first

    Returns:
        - ``OrderedDict`` holding the buffers followed by the parameters

    Raises:
        - ``ValueError``: the aligned element count consumed for a group differs
          from the aligned size of that group's merged vector
    """
    # NOTE(review): block nesting was reconstructed from a view with indentation
    # stripped; the per-group alignment/sanity check is assumed to live inside
    # the outer loop - confirm against the upstream script.

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for rank in range(world_size):
            for group_idx in range(len(fp32_flat_groups[0])):
                print(
                    f"{FP32_FLAT_GROUPS}[{rank}][{group_idx}].shape={fp32_flat_groups[rank][group_idx].shape}")

    # XXX: memory usage doubles here (zero2)
    group_count = len(fp32_flat_groups[0])
    merged_groups = [
        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
        for group_idx in range(group_count)
    ]
    avail_numel = sum(vector.numel() for vector in merged_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(
            shape.numel() for shapes in param_shapes for shape in shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    state_dict = OrderedDict()

    # buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, flat_vector in zip(param_shapes, merged_groups):
        offset = 0
        avail_numel = flat_vector.numel()
        for name, shape in shapes.items():
            unpartitioned_numel = shape.numel()
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(
                    f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
                )
            param_slice = flat_vector.narrow(0, offset, unpartitioned_numel)
            state_dict[name] = param_slice.view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(
                f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(
        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
    )

    return state_dict
278
+
279
+
280
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Work out how a parameter of ``unpartitioned_numel`` elements splits over ``world_size`` ranks.

    Returns:
        - ``(partitioned_numel, padding_numel)``: elements held per rank (rounded
          up), and how many elements are missing to reach the next multiple of
          ``world_size`` (0 when it already divides evenly)
    """
    leftover = unpartitioned_numel % world_size
    if leftover:
        padding_numel = world_size - leftover
    else:
        padding_numel = 0
    return math.ceil(unpartitioned_numel / world_size), padding_numel
285
+
286
+
287
def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """
    Rebuild a full state dict from ZeRO stage 3 partitions.

    Reconstruction protocol: each rank's flat tensor holds one slice of every
    parameter. For each parameter the per-rank slices at the current offset are
    concatenated, trimmed to the real element count (dropping any padding) and
    viewed with the parameter's shape.

    Args:
        - ``world_size``: number of partitions
        - ``param_shapes``: list of name-to-shape mappings, merged here in order
        - ``fp32_flat_groups``: one flat tensor per rank
        - ``buffers``: name-to-tensor mapping copied into the result first

    Returns:
        - ``OrderedDict`` holding the buffers followed by the parameters

    Raises:
        - ``ValueError``: the consumed element count differs from the number of
          elements available across all ranks
    """
    # NOTE(review): block nesting was reconstructed from a view with indentation
    # stripped; the "Have/Need" summary is assumed to be printed regardless of
    # ``debug`` - confirm against the upstream script.

    avail_numel = fp32_flat_groups[0].numel() * world_size

    # merge list of dicts, preserving order
    merged_shapes = {}
    for group_shapes in param_shapes:
        merged_shapes.update(group_shapes)

    if debug:
        for rank in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{rank}].shape={fp32_flat_groups[rank].shape}")

    wanted_params = len(merged_shapes)
    wanted_numel = sum(shape.numel() for shape in merged_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    print(f"Have {avail_numel} numels to process.")
    print(f"Need {wanted_numel} numels in {wanted_params} params.")

    state_dict = OrderedDict()

    # buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in merged_shapes.items():
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(
            unpartitioned_numel, world_size)

        if debug:
            print(
                f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        rank_slices = tuple(
            fp32_flat_groups[rank].narrow(0, offset, partitioned_numel)
            for rank in range(world_size))
        padded_param = torch.cat(rank_slices, 0)
        state_dict[name] = padded_param.narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset counted elements per rank; scale it to compare with the total
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(
            f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(
        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
    )

    return state_dict
358
+
359
+
360
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, the tag
          is read from the ``latest`` file inside ``checkpoint_dir``, e.g., ``global_step14``

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` is not given and there is no ``latest`` file
        - ``FileNotFoundError``: ``checkpoint_dir/tag`` is not a directory

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
407
+
408
+
409
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, the tag
          is read from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
    """
    consolidated = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    print(f"Saving fp32 state dict to {output_file}")
    torch.save(consolidated, output_file)
423
+
424
+
425
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model (with ``strict=False``)

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, the tag
          is read from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: a key mismatch between the model and the weights does not raise
    model.load_state_dict(fp32_weights, strict=False)

    return model
462
+
463
+
464
if __name__ == "__main__":
    # Command-line entry point:
    #   python zero_to_fp32.py <checkpoint_dir> <output_file> [-d]
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument("output_file",
                        type=str,
                        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # module-level switch consulted by the reconstruction helpers
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
checkpoint-18000/config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "emilios/whisper-medium-el-n2",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50257
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 24,
19
+ "decoder_start_token_id": 50258,
20
+ "dropout": 0.1,
21
+ "encoder_attention_heads": 16,
22
+ "encoder_ffn_dim": 4096,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 24,
25
+ "eos_token_id": 50257,
26
+ "forced_decoder_ids": null,
27
+ "init_std": 0.02,
28
+ "is_encoder_decoder": true,
29
+ "max_length": 448,
30
+ "max_source_positions": 1500,
31
+ "max_target_positions": 448,
32
+ "model_type": "whisper",
33
+ "num_hidden_layers": 24,
34
+ "num_mel_bins": 80,
35
+ "pad_token_id": 50257,
36
+ "scale_embedding": false,
37
+ "torch_dtype": "float16",
38
+ "transformers_version": "4.26.0.dev0",
39
+ "use_cache": false,
40
+ "vocab_size": 51865
41
+ }
checkpoint-18000/global_step18000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:968e33791e5da9e611607f08b7bccc994a655e736b76069a6904307bf45069a4
3
+ size 1527967899
checkpoint-18000/global_step18000/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93e4f29f011651f19d5cb83131fb2ca0707a6560a65e217bb3f45949619ff6c8
3
+ size 9166378846
checkpoint-18000/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step18000
checkpoint-18000/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-18000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0083259db3b58a2a002a48614b01db92f9a0d63a6d02a7aeff5ba6e221b37e9a
3
+ size 1527847357
checkpoint-18000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:caec33bf7a1d27f20d4d423d145811b5b034deea36849823358a452fa528a772
3
+ size 14575
checkpoint-18000/trainer_state.json ADDED
@@ -0,0 +1,4498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 9.778974739970282,
3
+ "best_model_checkpoint": "./checkpoint-9000",
4
+ "epoch": 1058.2941176470588,
5
+ "global_step": 18000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 2.78,
12
+ "learning_rate": 5.0453611334320685e-06,
13
+ "loss": 0.6804,
14
+ "step": 25
15
+ },
16
+ {
17
+ "epoch": 5.56,
18
+ "learning_rate": 6.229195710491767e-06,
19
+ "loss": 0.1847,
20
+ "step": 50
21
+ },
22
+ {
23
+ "epoch": 8.33,
24
+ "learning_rate": 6.903829450223392e-06,
25
+ "loss": 0.0821,
26
+ "step": 75
27
+ },
28
+ {
29
+ "epoch": 11.11,
30
+ "learning_rate": 7.377725845391017e-06,
31
+ "loss": 0.0485,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 13.89,
36
+ "learning_rate": 7.743343231239583e-06,
37
+ "loss": 0.0432,
38
+ "step": 125
39
+ },
40
+ {
41
+ "epoch": 16.67,
42
+ "learning_rate": 8.041073861170494e-06,
43
+ "loss": 0.0328,
44
+ "step": 150
45
+ },
46
+ {
47
+ "epoch": 19.44,
48
+ "learning_rate": 8.292222957399574e-06,
49
+ "loss": 0.0291,
50
+ "step": 175
51
+ },
52
+ {
53
+ "epoch": 22.22,
54
+ "learning_rate": 8.509413541357755e-06,
55
+ "loss": 0.0298,
56
+ "step": 200
57
+ },
58
+ {
59
+ "epoch": 25.0,
60
+ "learning_rate": 8.700744577655557e-06,
61
+ "loss": 0.0269,
62
+ "step": 225
63
+ },
64
+ {
65
+ "epoch": 27.78,
66
+ "learning_rate": 8.871723942761204e-06,
67
+ "loss": 0.0272,
68
+ "step": 250
69
+ },
70
+ {
71
+ "epoch": 30.56,
72
+ "learning_rate": 9.026267958246849e-06,
73
+ "loss": 0.027,
74
+ "step": 275
75
+ },
76
+ {
77
+ "epoch": 33.33,
78
+ "learning_rate": 9.16726106663399e-06,
79
+ "loss": 0.0213,
80
+ "step": 300
81
+ },
82
+ {
83
+ "epoch": 36.11,
84
+ "learning_rate": 9.296889251455016e-06,
85
+ "loss": 0.0215,
86
+ "step": 325
87
+ },
88
+ {
89
+ "epoch": 38.89,
90
+ "learning_rate": 9.416848797368692e-06,
91
+ "loss": 0.0195,
92
+ "step": 350
93
+ },
94
+ {
95
+ "epoch": 41.67,
96
+ "learning_rate": 9.528482449516371e-06,
97
+ "loss": 0.0167,
98
+ "step": 375
99
+ },
100
+ {
101
+ "epoch": 44.44,
102
+ "learning_rate": 9.632871309784314e-06,
103
+ "loss": 0.0184,
104
+ "step": 400
105
+ },
106
+ {
107
+ "epoch": 47.22,
108
+ "learning_rate": 9.73089868785391e-06,
109
+ "loss": 0.0159,
110
+ "step": 425
111
+ },
112
+ {
113
+ "epoch": 50.0,
114
+ "learning_rate": 9.823295589572114e-06,
115
+ "loss": 0.0172,
116
+ "step": 450
117
+ },
118
+ {
119
+ "epoch": 52.78,
120
+ "learning_rate": 9.910673836465484e-06,
121
+ "loss": 0.0123,
122
+ "step": 475
123
+ },
124
+ {
125
+ "epoch": 55.56,
126
+ "learning_rate": 9.993550644973805e-06,
127
+ "loss": 0.0144,
128
+ "step": 500
129
+ },
130
+ {
131
+ "epoch": 58.33,
132
+ "learning_rate": 9.951111111111111e-06,
133
+ "loss": 0.0135,
134
+ "step": 525
135
+ },
136
+ {
137
+ "epoch": 61.11,
138
+ "learning_rate": 9.895555555555557e-06,
139
+ "loss": 0.0128,
140
+ "step": 550
141
+ },
142
+ {
143
+ "epoch": 63.89,
144
+ "learning_rate": 9.84e-06,
145
+ "loss": 0.0115,
146
+ "step": 575
147
+ },
148
+ {
149
+ "epoch": 66.67,
150
+ "learning_rate": 9.784444444444445e-06,
151
+ "loss": 0.0105,
152
+ "step": 600
153
+ },
154
+ {
155
+ "epoch": 69.44,
156
+ "learning_rate": 9.72888888888889e-06,
157
+ "loss": 0.0104,
158
+ "step": 625
159
+ },
160
+ {
161
+ "epoch": 72.22,
162
+ "learning_rate": 9.673333333333334e-06,
163
+ "loss": 0.0087,
164
+ "step": 650
165
+ },
166
+ {
167
+ "epoch": 75.0,
168
+ "learning_rate": 9.617777777777778e-06,
169
+ "loss": 0.0091,
170
+ "step": 675
171
+ },
172
+ {
173
+ "epoch": 77.78,
174
+ "learning_rate": 9.562222222222223e-06,
175
+ "loss": 0.0085,
176
+ "step": 700
177
+ },
178
+ {
179
+ "epoch": 80.56,
180
+ "learning_rate": 9.506666666666667e-06,
181
+ "loss": 0.011,
182
+ "step": 725
183
+ },
184
+ {
185
+ "epoch": 83.33,
186
+ "learning_rate": 9.451111111111112e-06,
187
+ "loss": 0.0117,
188
+ "step": 750
189
+ },
190
+ {
191
+ "epoch": 86.11,
192
+ "learning_rate": 9.395555555555556e-06,
193
+ "loss": 0.0088,
194
+ "step": 775
195
+ },
196
+ {
197
+ "epoch": 88.89,
198
+ "learning_rate": 9.340000000000002e-06,
199
+ "loss": 0.0077,
200
+ "step": 800
201
+ },
202
+ {
203
+ "epoch": 91.67,
204
+ "learning_rate": 9.284444444444444e-06,
205
+ "loss": 0.0091,
206
+ "step": 825
207
+ },
208
+ {
209
+ "epoch": 94.44,
210
+ "learning_rate": 9.22888888888889e-06,
211
+ "loss": 0.0067,
212
+ "step": 850
213
+ },
214
+ {
215
+ "epoch": 97.22,
216
+ "learning_rate": 9.173333333333334e-06,
217
+ "loss": 0.0082,
218
+ "step": 875
219
+ },
220
+ {
221
+ "epoch": 100.0,
222
+ "learning_rate": 9.117777777777778e-06,
223
+ "loss": 0.0055,
224
+ "step": 900
225
+ },
226
+ {
227
+ "epoch": 102.78,
228
+ "learning_rate": 9.062222222222224e-06,
229
+ "loss": 0.0077,
230
+ "step": 925
231
+ },
232
+ {
233
+ "epoch": 105.56,
234
+ "learning_rate": 9.006666666666666e-06,
235
+ "loss": 0.0055,
236
+ "step": 950
237
+ },
238
+ {
239
+ "epoch": 108.33,
240
+ "learning_rate": 8.951111111111112e-06,
241
+ "loss": 0.005,
242
+ "step": 975
243
+ },
244
+ {
245
+ "epoch": 111.11,
246
+ "learning_rate": 8.895555555555556e-06,
247
+ "loss": 0.0066,
248
+ "step": 1000
249
+ },
250
+ {
251
+ "epoch": 111.11,
252
+ "eval_loss": 0.2357177734375,
253
+ "eval_runtime": 64.7785,
254
+ "eval_samples_per_second": 2.022,
255
+ "eval_steps_per_second": 0.139,
256
+ "eval_wer": 23.044096728307252,
257
+ "step": 1000
258
+ },
259
+ {
260
+ "epoch": 113.89,
261
+ "learning_rate": 8.844444444444445e-06,
262
+ "loss": 0.0057,
263
+ "step": 1025
264
+ },
265
+ {
266
+ "epoch": 116.67,
267
+ "learning_rate": 8.788888888888891e-06,
268
+ "loss": 0.0096,
269
+ "step": 1050
270
+ },
271
+ {
272
+ "epoch": 119.44,
273
+ "learning_rate": 8.733333333333333e-06,
274
+ "loss": 0.0063,
275
+ "step": 1075
276
+ },
277
+ {
278
+ "epoch": 122.22,
279
+ "learning_rate": 8.677777777777779e-06,
280
+ "loss": 0.0069,
281
+ "step": 1100
282
+ },
283
+ {
284
+ "epoch": 125.0,
285
+ "learning_rate": 8.622222222222223e-06,
286
+ "loss": 0.0069,
287
+ "step": 1125
288
+ },
289
+ {
290
+ "epoch": 127.78,
291
+ "learning_rate": 8.566666666666667e-06,
292
+ "loss": 0.0046,
293
+ "step": 1150
294
+ },
295
+ {
296
+ "epoch": 130.56,
297
+ "learning_rate": 8.511111111111113e-06,
298
+ "loss": 0.0051,
299
+ "step": 1175
300
+ },
301
+ {
302
+ "epoch": 133.33,
303
+ "learning_rate": 8.455555555555555e-06,
304
+ "loss": 0.0055,
305
+ "step": 1200
306
+ },
307
+ {
308
+ "epoch": 136.11,
309
+ "learning_rate": 8.400000000000001e-06,
310
+ "loss": 0.0042,
311
+ "step": 1225
312
+ },
313
+ {
314
+ "epoch": 138.89,
315
+ "learning_rate": 8.344444444444445e-06,
316
+ "loss": 0.0042,
317
+ "step": 1250
318
+ },
319
+ {
320
+ "epoch": 141.67,
321
+ "learning_rate": 8.288888888888889e-06,
322
+ "loss": 0.005,
323
+ "step": 1275
324
+ },
325
+ {
326
+ "epoch": 144.44,
327
+ "learning_rate": 8.233333333333335e-06,
328
+ "loss": 0.0054,
329
+ "step": 1300
330
+ },
331
+ {
332
+ "epoch": 147.22,
333
+ "learning_rate": 8.177777777777779e-06,
334
+ "loss": 0.0052,
335
+ "step": 1325
336
+ },
337
+ {
338
+ "epoch": 150.0,
339
+ "learning_rate": 8.122222222222223e-06,
340
+ "loss": 0.0057,
341
+ "step": 1350
342
+ },
343
+ {
344
+ "epoch": 152.78,
345
+ "learning_rate": 8.066666666666667e-06,
346
+ "loss": 0.0039,
347
+ "step": 1375
348
+ },
349
+ {
350
+ "epoch": 155.56,
351
+ "learning_rate": 8.011111111111113e-06,
352
+ "loss": 0.0032,
353
+ "step": 1400
354
+ },
355
+ {
356
+ "epoch": 158.33,
357
+ "learning_rate": 7.955555555555557e-06,
358
+ "loss": 0.0034,
359
+ "step": 1425
360
+ },
361
+ {
362
+ "epoch": 161.11,
363
+ "learning_rate": 7.902222222222223e-06,
364
+ "loss": 0.0068,
365
+ "step": 1450
366
+ },
367
+ {
368
+ "epoch": 163.89,
369
+ "learning_rate": 7.846666666666667e-06,
370
+ "loss": 0.0034,
371
+ "step": 1475
372
+ },
373
+ {
374
+ "epoch": 166.67,
375
+ "learning_rate": 7.791111111111111e-06,
376
+ "loss": 0.0026,
377
+ "step": 1500
378
+ },
379
+ {
380
+ "epoch": 169.44,
381
+ "learning_rate": 7.735555555555557e-06,
382
+ "loss": 0.0036,
383
+ "step": 1525
384
+ },
385
+ {
386
+ "epoch": 172.22,
387
+ "learning_rate": 7.680000000000001e-06,
388
+ "loss": 0.0033,
389
+ "step": 1550
390
+ },
391
+ {
392
+ "epoch": 175.0,
393
+ "learning_rate": 7.624444444444445e-06,
394
+ "loss": 0.0021,
395
+ "step": 1575
396
+ },
397
+ {
398
+ "epoch": 177.78,
399
+ "learning_rate": 7.56888888888889e-06,
400
+ "loss": 0.0033,
401
+ "step": 1600
402
+ },
403
+ {
404
+ "epoch": 180.56,
405
+ "learning_rate": 7.513333333333334e-06,
406
+ "loss": 0.0037,
407
+ "step": 1625
408
+ },
409
+ {
410
+ "epoch": 183.33,
411
+ "learning_rate": 7.457777777777778e-06,
412
+ "loss": 0.0032,
413
+ "step": 1650
414
+ },
415
+ {
416
+ "epoch": 186.11,
417
+ "learning_rate": 7.402222222222223e-06,
418
+ "loss": 0.0037,
419
+ "step": 1675
420
+ },
421
+ {
422
+ "epoch": 188.89,
423
+ "learning_rate": 7.346666666666668e-06,
424
+ "loss": 0.0022,
425
+ "step": 1700
426
+ },
427
+ {
428
+ "epoch": 191.67,
429
+ "learning_rate": 7.291111111111112e-06,
430
+ "loss": 0.0024,
431
+ "step": 1725
432
+ },
433
+ {
434
+ "epoch": 194.44,
435
+ "learning_rate": 7.235555555555556e-06,
436
+ "loss": 0.0026,
437
+ "step": 1750
438
+ },
439
+ {
440
+ "epoch": 197.22,
441
+ "learning_rate": 7.180000000000001e-06,
442
+ "loss": 0.0022,
443
+ "step": 1775
444
+ },
445
+ {
446
+ "epoch": 200.0,
447
+ "learning_rate": 7.124444444444445e-06,
448
+ "loss": 0.0026,
449
+ "step": 1800
450
+ },
451
+ {
452
+ "epoch": 202.78,
453
+ "learning_rate": 7.06888888888889e-06,
454
+ "loss": 0.0032,
455
+ "step": 1825
456
+ },
457
+ {
458
+ "epoch": 205.56,
459
+ "learning_rate": 7.0133333333333345e-06,
460
+ "loss": 0.0033,
461
+ "step": 1850
462
+ },
463
+ {
464
+ "epoch": 208.33,
465
+ "learning_rate": 6.9577777777777785e-06,
466
+ "loss": 0.0027,
467
+ "step": 1875
468
+ },
469
+ {
470
+ "epoch": 211.11,
471
+ "learning_rate": 6.902222222222223e-06,
472
+ "loss": 0.0043,
473
+ "step": 1900
474
+ },
475
+ {
476
+ "epoch": 213.89,
477
+ "learning_rate": 6.846666666666667e-06,
478
+ "loss": 0.0028,
479
+ "step": 1925
480
+ },
481
+ {
482
+ "epoch": 216.67,
483
+ "learning_rate": 6.7911111111111115e-06,
484
+ "loss": 0.0012,
485
+ "step": 1950
486
+ },
487
+ {
488
+ "epoch": 219.44,
489
+ "learning_rate": 6.735555555555556e-06,
490
+ "loss": 0.0015,
491
+ "step": 1975
492
+ },
493
+ {
494
+ "epoch": 222.22,
495
+ "learning_rate": 6.680000000000001e-06,
496
+ "loss": 0.0024,
497
+ "step": 2000
498
+ },
499
+ {
500
+ "epoch": 222.22,
501
+ "eval_loss": 0.2607421875,
502
+ "eval_runtime": 57.0802,
503
+ "eval_samples_per_second": 2.295,
504
+ "eval_steps_per_second": 0.158,
505
+ "eval_wer": 19.665718349928877,
506
+ "step": 2000
507
+ },
508
+ {
509
+ "epoch": 225.0,
510
+ "learning_rate": 6.6244444444444445e-06,
511
+ "loss": 0.0029,
512
+ "step": 2025
513
+ },
514
+ {
515
+ "epoch": 227.78,
516
+ "learning_rate": 6.568888888888889e-06,
517
+ "loss": 0.0021,
518
+ "step": 2050
519
+ },
520
+ {
521
+ "epoch": 230.56,
522
+ "learning_rate": 6.513333333333333e-06,
523
+ "loss": 0.0022,
524
+ "step": 2075
525
+ },
526
+ {
527
+ "epoch": 233.33,
528
+ "learning_rate": 6.457777777777778e-06,
529
+ "loss": 0.0022,
530
+ "step": 2100
531
+ },
532
+ {
533
+ "epoch": 236.11,
534
+ "learning_rate": 6.402222222222223e-06,
535
+ "loss": 0.0011,
536
+ "step": 2125
537
+ },
538
+ {
539
+ "epoch": 238.89,
540
+ "learning_rate": 6.346666666666668e-06,
541
+ "loss": 0.0026,
542
+ "step": 2150
543
+ },
544
+ {
545
+ "epoch": 241.67,
546
+ "learning_rate": 6.291111111111111e-06,
547
+ "loss": 0.0021,
548
+ "step": 2175
549
+ },
550
+ {
551
+ "epoch": 244.44,
552
+ "learning_rate": 6.235555555555556e-06,
553
+ "loss": 0.0016,
554
+ "step": 2200
555
+ },
556
+ {
557
+ "epoch": 247.22,
558
+ "learning_rate": 6.18e-06,
559
+ "loss": 0.0024,
560
+ "step": 2225
561
+ },
562
+ {
563
+ "epoch": 250.0,
564
+ "learning_rate": 6.124444444444445e-06,
565
+ "loss": 0.0046,
566
+ "step": 2250
567
+ },
568
+ {
569
+ "epoch": 252.78,
570
+ "learning_rate": 6.06888888888889e-06,
571
+ "loss": 0.0018,
572
+ "step": 2275
573
+ },
574
+ {
575
+ "epoch": 255.56,
576
+ "learning_rate": 6.013333333333335e-06,
577
+ "loss": 0.0012,
578
+ "step": 2300
579
+ },
580
+ {
581
+ "epoch": 258.33,
582
+ "learning_rate": 5.957777777777778e-06,
583
+ "loss": 0.0014,
584
+ "step": 2325
585
+ },
586
+ {
587
+ "epoch": 261.11,
588
+ "learning_rate": 5.902222222222223e-06,
589
+ "loss": 0.0007,
590
+ "step": 2350
591
+ },
592
+ {
593
+ "epoch": 263.89,
594
+ "learning_rate": 5.846666666666667e-06,
595
+ "loss": 0.0014,
596
+ "step": 2375
597
+ },
598
+ {
599
+ "epoch": 266.67,
600
+ "learning_rate": 5.791111111111112e-06,
601
+ "loss": 0.0009,
602
+ "step": 2400
603
+ },
604
+ {
605
+ "epoch": 269.44,
606
+ "learning_rate": 5.735555555555557e-06,
607
+ "loss": 0.0008,
608
+ "step": 2425
609
+ },
610
+ {
611
+ "epoch": 272.22,
612
+ "learning_rate": 5.68e-06,
613
+ "loss": 0.0028,
614
+ "step": 2450
615
+ },
616
+ {
617
+ "epoch": 275.0,
618
+ "learning_rate": 5.624444444444445e-06,
619
+ "loss": 0.002,
620
+ "step": 2475
621
+ },
622
+ {
623
+ "epoch": 277.78,
624
+ "learning_rate": 5.56888888888889e-06,
625
+ "loss": 0.0011,
626
+ "step": 2500
627
+ },
628
+ {
629
+ "epoch": 280.56,
630
+ "learning_rate": 5.513333333333334e-06,
631
+ "loss": 0.001,
632
+ "step": 2525
633
+ },
634
+ {
635
+ "epoch": 283.33,
636
+ "learning_rate": 5.4577777777777785e-06,
637
+ "loss": 0.0007,
638
+ "step": 2550
639
+ },
640
+ {
641
+ "epoch": 286.11,
642
+ "learning_rate": 5.402222222222223e-06,
643
+ "loss": 0.0007,
644
+ "step": 2575
645
+ },
646
+ {
647
+ "epoch": 288.89,
648
+ "learning_rate": 5.346666666666667e-06,
649
+ "loss": 0.0008,
650
+ "step": 2600
651
+ },
652
+ {
653
+ "epoch": 291.67,
654
+ "learning_rate": 5.2911111111111115e-06,
655
+ "loss": 0.0012,
656
+ "step": 2625
657
+ },
658
+ {
659
+ "epoch": 294.44,
660
+ "learning_rate": 5.235555555555556e-06,
661
+ "loss": 0.0016,
662
+ "step": 2650
663
+ },
664
+ {
665
+ "epoch": 297.22,
666
+ "learning_rate": 5.18e-06,
667
+ "loss": 0.0012,
668
+ "step": 2675
669
+ },
670
+ {
671
+ "epoch": 300.0,
672
+ "learning_rate": 5.124444444444445e-06,
673
+ "loss": 0.001,
674
+ "step": 2700
675
+ },
676
+ {
677
+ "epoch": 302.78,
678
+ "learning_rate": 5.06888888888889e-06,
679
+ "loss": 0.0012,
680
+ "step": 2725
681
+ },
682
+ {
683
+ "epoch": 305.56,
684
+ "learning_rate": 5.013333333333333e-06,
685
+ "loss": 0.001,
686
+ "step": 2750
687
+ },
688
+ {
689
+ "epoch": 308.33,
690
+ "learning_rate": 4.957777777777778e-06,
691
+ "loss": 0.0013,
692
+ "step": 2775
693
+ },
694
+ {
695
+ "epoch": 311.11,
696
+ "learning_rate": 4.902222222222222e-06,
697
+ "loss": 0.0015,
698
+ "step": 2800
699
+ },
700
+ {
701
+ "epoch": 313.89,
702
+ "learning_rate": 4.846666666666667e-06,
703
+ "loss": 0.0014,
704
+ "step": 2825
705
+ },
706
+ {
707
+ "epoch": 316.67,
708
+ "learning_rate": 4.791111111111111e-06,
709
+ "loss": 0.0007,
710
+ "step": 2850
711
+ },
712
+ {
713
+ "epoch": 319.44,
714
+ "learning_rate": 4.735555555555556e-06,
715
+ "loss": 0.0009,
716
+ "step": 2875
717
+ },
718
+ {
719
+ "epoch": 322.22,
720
+ "learning_rate": 4.680000000000001e-06,
721
+ "loss": 0.0021,
722
+ "step": 2900
723
+ },
724
+ {
725
+ "epoch": 325.0,
726
+ "learning_rate": 4.624444444444445e-06,
727
+ "loss": 0.0015,
728
+ "step": 2925
729
+ },
730
+ {
731
+ "epoch": 327.78,
732
+ "learning_rate": 4.568888888888889e-06,
733
+ "loss": 0.0012,
734
+ "step": 2950
735
+ },
736
+ {
737
+ "epoch": 330.56,
738
+ "learning_rate": 4.513333333333333e-06,
739
+ "loss": 0.0009,
740
+ "step": 2975
741
+ },
742
+ {
743
+ "epoch": 333.33,
744
+ "learning_rate": 4.457777777777778e-06,
745
+ "loss": 0.0011,
746
+ "step": 3000
747
+ },
748
+ {
749
+ "epoch": 333.33,
750
+ "eval_loss": 0.277099609375,
751
+ "eval_runtime": 58.1634,
752
+ "eval_samples_per_second": 2.252,
753
+ "eval_steps_per_second": 0.155,
754
+ "eval_wer": 20.874822190611663,
755
+ "step": 3000
756
+ },
757
+ {
758
+ "epoch": 177.47,
759
+ "learning_rate": 1.760888888888889e-06,
760
+ "loss": 0.5801,
761
+ "step": 3025
762
+ },
763
+ {
764
+ "epoch": 178.94,
765
+ "learning_rate": 1.7386666666666666e-06,
766
+ "loss": 0.1501,
767
+ "step": 3050
768
+ },
769
+ {
770
+ "epoch": 180.41,
771
+ "learning_rate": 1.7164444444444444e-06,
772
+ "loss": 0.0789,
773
+ "step": 3075
774
+ },
775
+ {
776
+ "epoch": 181.88,
777
+ "learning_rate": 1.6942222222222222e-06,
778
+ "loss": 0.0531,
779
+ "step": 3100
780
+ },
781
+ {
782
+ "epoch": 183.35,
783
+ "learning_rate": 1.6719999999999998e-06,
784
+ "loss": 0.0409,
785
+ "step": 3125
786
+ },
787
+ {
788
+ "epoch": 184.82,
789
+ "learning_rate": 1.6497777777777777e-06,
790
+ "loss": 0.032,
791
+ "step": 3150
792
+ },
793
+ {
794
+ "epoch": 186.29,
795
+ "learning_rate": 1.6275555555555555e-06,
796
+ "loss": 0.0251,
797
+ "step": 3175
798
+ },
799
+ {
800
+ "epoch": 187.76,
801
+ "learning_rate": 1.6053333333333333e-06,
802
+ "loss": 0.0203,
803
+ "step": 3200
804
+ },
805
+ {
806
+ "epoch": 189.24,
807
+ "learning_rate": 1.5831111111111111e-06,
808
+ "loss": 0.0167,
809
+ "step": 3225
810
+ },
811
+ {
812
+ "epoch": 190.71,
813
+ "learning_rate": 1.560888888888889e-06,
814
+ "loss": 0.0159,
815
+ "step": 3250
816
+ },
817
+ {
818
+ "epoch": 192.18,
819
+ "learning_rate": 1.5386666666666666e-06,
820
+ "loss": 0.0137,
821
+ "step": 3275
822
+ },
823
+ {
824
+ "epoch": 193.65,
825
+ "learning_rate": 1.5164444444444444e-06,
826
+ "loss": 0.0122,
827
+ "step": 3300
828
+ },
829
+ {
830
+ "epoch": 195.12,
831
+ "learning_rate": 1.494222222222222e-06,
832
+ "loss": 0.0106,
833
+ "step": 3325
834
+ },
835
+ {
836
+ "epoch": 196.59,
837
+ "learning_rate": 1.4719999999999998e-06,
838
+ "loss": 0.0094,
839
+ "step": 3350
840
+ },
841
+ {
842
+ "epoch": 198.06,
843
+ "learning_rate": 1.4497777777777777e-06,
844
+ "loss": 0.009,
845
+ "step": 3375
846
+ },
847
+ {
848
+ "epoch": 199.53,
849
+ "learning_rate": 1.4275555555555555e-06,
850
+ "loss": 0.0104,
851
+ "step": 3400
852
+ },
853
+ {
854
+ "epoch": 201.0,
855
+ "learning_rate": 1.4053333333333333e-06,
856
+ "loss": 0.0069,
857
+ "step": 3425
858
+ },
859
+ {
860
+ "epoch": 202.47,
861
+ "learning_rate": 1.3848888888888889e-06,
862
+ "loss": 0.0073,
863
+ "step": 3450
864
+ },
865
+ {
866
+ "epoch": 203.94,
867
+ "learning_rate": 1.3626666666666667e-06,
868
+ "loss": 0.0073,
869
+ "step": 3475
870
+ },
871
+ {
872
+ "epoch": 205.41,
873
+ "learning_rate": 1.3404444444444445e-06,
874
+ "loss": 0.0063,
875
+ "step": 3500
876
+ },
877
+ {
878
+ "epoch": 206.88,
879
+ "learning_rate": 1.3182222222222221e-06,
880
+ "loss": 0.007,
881
+ "step": 3525
882
+ },
883
+ {
884
+ "epoch": 208.35,
885
+ "learning_rate": 1.296e-06,
886
+ "loss": 0.0061,
887
+ "step": 3550
888
+ },
889
+ {
890
+ "epoch": 209.82,
891
+ "learning_rate": 1.2737777777777776e-06,
892
+ "loss": 0.0053,
893
+ "step": 3575
894
+ },
895
+ {
896
+ "epoch": 211.29,
897
+ "learning_rate": 1.2515555555555554e-06,
898
+ "loss": 0.0056,
899
+ "step": 3600
900
+ },
901
+ {
902
+ "epoch": 212.76,
903
+ "learning_rate": 1.2293333333333334e-06,
904
+ "loss": 0.005,
905
+ "step": 3625
906
+ },
907
+ {
908
+ "epoch": 214.24,
909
+ "learning_rate": 1.207111111111111e-06,
910
+ "loss": 0.0047,
911
+ "step": 3650
912
+ },
913
+ {
914
+ "epoch": 215.71,
915
+ "learning_rate": 1.1848888888888889e-06,
916
+ "loss": 0.0052,
917
+ "step": 3675
918
+ },
919
+ {
920
+ "epoch": 217.18,
921
+ "learning_rate": 1.1626666666666667e-06,
922
+ "loss": 0.0044,
923
+ "step": 3700
924
+ },
925
+ {
926
+ "epoch": 218.65,
927
+ "learning_rate": 1.1404444444444443e-06,
928
+ "loss": 0.0046,
929
+ "step": 3725
930
+ },
931
+ {
932
+ "epoch": 220.12,
933
+ "learning_rate": 1.1182222222222221e-06,
934
+ "loss": 0.0045,
935
+ "step": 3750
936
+ },
937
+ {
938
+ "epoch": 221.59,
939
+ "learning_rate": 1.096e-06,
940
+ "loss": 0.0041,
941
+ "step": 3775
942
+ },
943
+ {
944
+ "epoch": 223.06,
945
+ "learning_rate": 1.0737777777777776e-06,
946
+ "loss": 0.0054,
947
+ "step": 3800
948
+ },
949
+ {
950
+ "epoch": 224.53,
951
+ "learning_rate": 1.0515555555555556e-06,
952
+ "loss": 0.0038,
953
+ "step": 3825
954
+ },
955
+ {
956
+ "epoch": 226.0,
957
+ "learning_rate": 1.0293333333333334e-06,
958
+ "loss": 0.0038,
959
+ "step": 3850
960
+ },
961
+ {
962
+ "epoch": 227.47,
963
+ "learning_rate": 1.007111111111111e-06,
964
+ "loss": 0.004,
965
+ "step": 3875
966
+ },
967
+ {
968
+ "epoch": 228.94,
969
+ "learning_rate": 9.848888888888889e-07,
970
+ "loss": 0.0036,
971
+ "step": 3900
972
+ },
973
+ {
974
+ "epoch": 230.41,
975
+ "learning_rate": 9.626666666666667e-07,
976
+ "loss": 0.0041,
977
+ "step": 3925
978
+ },
979
+ {
980
+ "epoch": 231.88,
981
+ "learning_rate": 9.404444444444443e-07,
982
+ "loss": 0.0032,
983
+ "step": 3950
984
+ },
985
+ {
986
+ "epoch": 233.35,
987
+ "learning_rate": 9.182222222222223e-07,
988
+ "loss": 0.0038,
989
+ "step": 3975
990
+ },
991
+ {
992
+ "epoch": 234.82,
993
+ "learning_rate": 8.96e-07,
994
+ "loss": 0.0043,
995
+ "step": 4000
996
+ },
997
+ {
998
+ "epoch": 234.82,
999
+ "eval_loss": 0.45361328125,
1000
+ "eval_runtime": 157.593,
1001
+ "eval_samples_per_second": 1.726,
1002
+ "eval_steps_per_second": 0.108,
1003
+ "eval_wer": 10.707652303120357,
1004
+ "step": 4000
1005
+ },
1006
+ {
1007
+ "epoch": 236.29,
1008
+ "learning_rate": 8.737777777777777e-07,
1009
+ "loss": 0.004,
1010
+ "step": 4025
1011
+ },
1012
+ {
1013
+ "epoch": 237.76,
1014
+ "learning_rate": 8.515555555555555e-07,
1015
+ "loss": 0.0029,
1016
+ "step": 4050
1017
+ },
1018
+ {
1019
+ "epoch": 239.24,
1020
+ "learning_rate": 8.293333333333333e-07,
1021
+ "loss": 0.0034,
1022
+ "step": 4075
1023
+ },
1024
+ {
1025
+ "epoch": 240.71,
1026
+ "learning_rate": 8.071111111111111e-07,
1027
+ "loss": 0.0032,
1028
+ "step": 4100
1029
+ },
1030
+ {
1031
+ "epoch": 242.18,
1032
+ "learning_rate": 7.848888888888888e-07,
1033
+ "loss": 0.003,
1034
+ "step": 4125
1035
+ },
1036
+ {
1037
+ "epoch": 243.65,
1038
+ "learning_rate": 7.626666666666667e-07,
1039
+ "loss": 0.0034,
1040
+ "step": 4150
1041
+ },
1042
+ {
1043
+ "epoch": 245.12,
1044
+ "learning_rate": 7.404444444444444e-07,
1045
+ "loss": 0.0032,
1046
+ "step": 4175
1047
+ },
1048
+ {
1049
+ "epoch": 246.59,
1050
+ "learning_rate": 7.182222222222222e-07,
1051
+ "loss": 0.0032,
1052
+ "step": 4200
1053
+ },
1054
+ {
1055
+ "epoch": 248.06,
1056
+ "learning_rate": 6.959999999999999e-07,
1057
+ "loss": 0.0028,
1058
+ "step": 4225
1059
+ },
1060
+ {
1061
+ "epoch": 249.53,
1062
+ "learning_rate": 6.737777777777778e-07,
1063
+ "loss": 0.0028,
1064
+ "step": 4250
1065
+ },
1066
+ {
1067
+ "epoch": 251.0,
1068
+ "learning_rate": 6.515555555555555e-07,
1069
+ "loss": 0.0025,
1070
+ "step": 4275
1071
+ },
1072
+ {
1073
+ "epoch": 252.47,
1074
+ "learning_rate": 6.293333333333333e-07,
1075
+ "loss": 0.0026,
1076
+ "step": 4300
1077
+ },
1078
+ {
1079
+ "epoch": 253.94,
1080
+ "learning_rate": 6.071111111111111e-07,
1081
+ "loss": 0.003,
1082
+ "step": 4325
1083
+ },
1084
+ {
1085
+ "epoch": 255.41,
1086
+ "learning_rate": 5.848888888888889e-07,
1087
+ "loss": 0.0026,
1088
+ "step": 4350
1089
+ },
1090
+ {
1091
+ "epoch": 256.88,
1092
+ "learning_rate": 5.626666666666666e-07,
1093
+ "loss": 0.0027,
1094
+ "step": 4375
1095
+ },
1096
+ {
1097
+ "epoch": 258.35,
1098
+ "learning_rate": 5.404444444444443e-07,
1099
+ "loss": 0.003,
1100
+ "step": 4400
1101
+ },
1102
+ {
1103
+ "epoch": 259.82,
1104
+ "learning_rate": 5.182222222222223e-07,
1105
+ "loss": 0.0027,
1106
+ "step": 4425
1107
+ },
1108
+ {
1109
+ "epoch": 261.29,
1110
+ "learning_rate": 4.977777777777777e-07,
1111
+ "loss": 0.0026,
1112
+ "step": 4450
1113
+ },
1114
+ {
1115
+ "epoch": 262.76,
1116
+ "learning_rate": 4.7555555555555554e-07,
1117
+ "loss": 0.0023,
1118
+ "step": 4475
1119
+ },
1120
+ {
1121
+ "epoch": 264.24,
1122
+ "learning_rate": 4.5333333333333326e-07,
1123
+ "loss": 0.0021,
1124
+ "step": 4500
1125
+ },
1126
+ {
1127
+ "epoch": 265.71,
1128
+ "learning_rate": 4.311111111111111e-07,
1129
+ "loss": 0.0022,
1130
+ "step": 4525
1131
+ },
1132
+ {
1133
+ "epoch": 267.18,
1134
+ "learning_rate": 4.088888888888889e-07,
1135
+ "loss": 0.0034,
1136
+ "step": 4550
1137
+ },
1138
+ {
1139
+ "epoch": 268.65,
1140
+ "learning_rate": 3.8666666666666664e-07,
1141
+ "loss": 0.0023,
1142
+ "step": 4575
1143
+ },
1144
+ {
1145
+ "epoch": 270.12,
1146
+ "learning_rate": 3.6444444444444446e-07,
1147
+ "loss": 0.0022,
1148
+ "step": 4600
1149
+ },
1150
+ {
1151
+ "epoch": 271.59,
1152
+ "learning_rate": 3.422222222222222e-07,
1153
+ "loss": 0.0022,
1154
+ "step": 4625
1155
+ },
1156
+ {
1157
+ "epoch": 273.06,
1158
+ "learning_rate": 3.2e-07,
1159
+ "loss": 0.0024,
1160
+ "step": 4650
1161
+ },
1162
+ {
1163
+ "epoch": 274.53,
1164
+ "learning_rate": 2.9777777777777773e-07,
1165
+ "loss": 0.0031,
1166
+ "step": 4675
1167
+ },
1168
+ {
1169
+ "epoch": 276.0,
1170
+ "learning_rate": 2.7555555555555555e-07,
1171
+ "loss": 0.0022,
1172
+ "step": 4700
1173
+ },
1174
+ {
1175
+ "epoch": 277.47,
1176
+ "learning_rate": 2.533333333333333e-07,
1177
+ "loss": 0.0022,
1178
+ "step": 4725
1179
+ },
1180
+ {
1181
+ "epoch": 278.94,
1182
+ "learning_rate": 2.311111111111111e-07,
1183
+ "loss": 0.0021,
1184
+ "step": 4750
1185
+ },
1186
+ {
1187
+ "epoch": 280.41,
1188
+ "learning_rate": 2.088888888888889e-07,
1189
+ "loss": 0.0023,
1190
+ "step": 4775
1191
+ },
1192
+ {
1193
+ "epoch": 281.88,
1194
+ "learning_rate": 1.8666666666666667e-07,
1195
+ "loss": 0.0025,
1196
+ "step": 4800
1197
+ },
1198
+ {
1199
+ "epoch": 283.35,
1200
+ "learning_rate": 1.6444444444444444e-07,
1201
+ "loss": 0.0022,
1202
+ "step": 4825
1203
+ },
1204
+ {
1205
+ "epoch": 284.82,
1206
+ "learning_rate": 1.4222222222222222e-07,
1207
+ "loss": 0.0022,
1208
+ "step": 4850
1209
+ },
1210
+ {
1211
+ "epoch": 286.29,
1212
+ "learning_rate": 1.2e-07,
1213
+ "loss": 0.0021,
1214
+ "step": 4875
1215
+ },
1216
+ {
1217
+ "epoch": 287.76,
1218
+ "learning_rate": 9.777777777777778e-08,
1219
+ "loss": 0.0023,
1220
+ "step": 4900
1221
+ },
1222
+ {
1223
+ "epoch": 289.24,
1224
+ "learning_rate": 7.555555555555555e-08,
1225
+ "loss": 0.002,
1226
+ "step": 4925
1227
+ },
1228
+ {
1229
+ "epoch": 290.71,
1230
+ "learning_rate": 5.3333333333333334e-08,
1231
+ "loss": 0.0025,
1232
+ "step": 4950
1233
+ },
1234
+ {
1235
+ "epoch": 292.18,
1236
+ "learning_rate": 3.111111111111111e-08,
1237
+ "loss": 0.002,
1238
+ "step": 4975
1239
+ },
1240
+ {
1241
+ "epoch": 293.65,
1242
+ "learning_rate": 8.888888888888889e-09,
1243
+ "loss": 0.0024,
1244
+ "step": 5000
1245
+ },
1246
+ {
1247
+ "epoch": 293.65,
1248
+ "eval_loss": 0.465576171875,
1249
+ "eval_runtime": 158.123,
1250
+ "eval_samples_per_second": 1.72,
1251
+ "eval_steps_per_second": 0.108,
1252
+ "eval_wer": 10.642644873699851,
1253
+ "step": 5000
1254
+ },
1255
+ {
1256
+ "epoch": 295.47,
1257
+ "learning_rate": 2.7544827586206896e-06,
1258
+ "loss": 0.0021,
1259
+ "step": 5025
1260
+ },
1261
+ {
1262
+ "epoch": 296.94,
1263
+ "learning_rate": 2.7475862068965512e-06,
1264
+ "loss": 0.0024,
1265
+ "step": 5050
1266
+ },
1267
+ {
1268
+ "epoch": 298.41,
1269
+ "learning_rate": 2.7406896551724137e-06,
1270
+ "loss": 0.0025,
1271
+ "step": 5075
1272
+ },
1273
+ {
1274
+ "epoch": 299.88,
1275
+ "learning_rate": 2.7337931034482757e-06,
1276
+ "loss": 0.0022,
1277
+ "step": 5100
1278
+ },
1279
+ {
1280
+ "epoch": 301.35,
1281
+ "learning_rate": 2.7268965517241378e-06,
1282
+ "loss": 0.0027,
1283
+ "step": 5125
1284
+ },
1285
+ {
1286
+ "epoch": 302.82,
1287
+ "learning_rate": 2.7200000000000002e-06,
1288
+ "loss": 0.0024,
1289
+ "step": 5150
1290
+ },
1291
+ {
1292
+ "epoch": 304.29,
1293
+ "learning_rate": 2.713103448275862e-06,
1294
+ "loss": 0.0024,
1295
+ "step": 5175
1296
+ },
1297
+ {
1298
+ "epoch": 305.76,
1299
+ "learning_rate": 2.7062068965517243e-06,
1300
+ "loss": 0.0023,
1301
+ "step": 5200
1302
+ },
1303
+ {
1304
+ "epoch": 307.24,
1305
+ "learning_rate": 2.699310344827586e-06,
1306
+ "loss": 0.0027,
1307
+ "step": 5225
1308
+ },
1309
+ {
1310
+ "epoch": 308.71,
1311
+ "learning_rate": 2.6924137931034483e-06,
1312
+ "loss": 0.0023,
1313
+ "step": 5250
1314
+ },
1315
+ {
1316
+ "epoch": 310.18,
1317
+ "learning_rate": 2.68551724137931e-06,
1318
+ "loss": 0.0021,
1319
+ "step": 5275
1320
+ },
1321
+ {
1322
+ "epoch": 311.65,
1323
+ "learning_rate": 2.6786206896551724e-06,
1324
+ "loss": 0.0025,
1325
+ "step": 5300
1326
+ },
1327
+ {
1328
+ "epoch": 313.12,
1329
+ "learning_rate": 2.6717241379310344e-06,
1330
+ "loss": 0.0021,
1331
+ "step": 5325
1332
+ },
1333
+ {
1334
+ "epoch": 314.59,
1335
+ "learning_rate": 2.6648275862068965e-06,
1336
+ "loss": 0.0019,
1337
+ "step": 5350
1338
+ },
1339
+ {
1340
+ "epoch": 316.06,
1341
+ "learning_rate": 2.6579310344827585e-06,
1342
+ "loss": 0.0019,
1343
+ "step": 5375
1344
+ },
1345
+ {
1346
+ "epoch": 317.53,
1347
+ "learning_rate": 2.6510344827586205e-06,
1348
+ "loss": 0.0018,
1349
+ "step": 5400
1350
+ },
1351
+ {
1352
+ "epoch": 319.0,
1353
+ "learning_rate": 2.6441379310344826e-06,
1354
+ "loss": 0.0022,
1355
+ "step": 5425
1356
+ },
1357
+ {
1358
+ "epoch": 320.47,
1359
+ "learning_rate": 2.6377931034482757e-06,
1360
+ "loss": 0.0019,
1361
+ "step": 5450
1362
+ },
1363
+ {
1364
+ "epoch": 321.94,
1365
+ "learning_rate": 2.6308965517241377e-06,
1366
+ "loss": 0.0016,
1367
+ "step": 5475
1368
+ },
1369
+ {
1370
+ "epoch": 323.41,
1371
+ "learning_rate": 2.624e-06,
1372
+ "loss": 0.0013,
1373
+ "step": 5500
1374
+ },
1375
+ {
1376
+ "epoch": 324.88,
1377
+ "learning_rate": 2.617103448275862e-06,
1378
+ "loss": 0.0019,
1379
+ "step": 5525
1380
+ },
1381
+ {
1382
+ "epoch": 326.35,
1383
+ "learning_rate": 2.6102068965517243e-06,
1384
+ "loss": 0.0017,
1385
+ "step": 5550
1386
+ },
1387
+ {
1388
+ "epoch": 327.82,
1389
+ "learning_rate": 2.603310344827586e-06,
1390
+ "loss": 0.0018,
1391
+ "step": 5575
1392
+ },
1393
+ {
1394
+ "epoch": 329.29,
1395
+ "learning_rate": 2.5964137931034483e-06,
1396
+ "loss": 0.0013,
1397
+ "step": 5600
1398
+ },
1399
+ {
1400
+ "epoch": 330.76,
1401
+ "learning_rate": 2.58951724137931e-06,
1402
+ "loss": 0.0016,
1403
+ "step": 5625
1404
+ },
1405
+ {
1406
+ "epoch": 332.24,
1407
+ "learning_rate": 2.5826206896551724e-06,
1408
+ "loss": 0.0013,
1409
+ "step": 5650
1410
+ },
1411
+ {
1412
+ "epoch": 333.71,
1413
+ "learning_rate": 2.575724137931034e-06,
1414
+ "loss": 0.0018,
1415
+ "step": 5675
1416
+ },
1417
+ {
1418
+ "epoch": 335.18,
1419
+ "learning_rate": 2.5688275862068965e-06,
1420
+ "loss": 0.0014,
1421
+ "step": 5700
1422
+ },
1423
+ {
1424
+ "epoch": 336.65,
1425
+ "learning_rate": 2.561931034482759e-06,
1426
+ "loss": 0.0013,
1427
+ "step": 5725
1428
+ },
1429
+ {
1430
+ "epoch": 338.12,
1431
+ "learning_rate": 2.5550344827586205e-06,
1432
+ "loss": 0.0011,
1433
+ "step": 5750
1434
+ },
1435
+ {
1436
+ "epoch": 339.59,
1437
+ "learning_rate": 2.548137931034483e-06,
1438
+ "loss": 0.0018,
1439
+ "step": 5775
1440
+ },
1441
+ {
1442
+ "epoch": 341.06,
1443
+ "learning_rate": 2.5412413793103446e-06,
1444
+ "loss": 0.0013,
1445
+ "step": 5800
1446
+ },
1447
+ {
1448
+ "epoch": 342.53,
1449
+ "learning_rate": 2.534344827586207e-06,
1450
+ "loss": 0.0012,
1451
+ "step": 5825
1452
+ },
1453
+ {
1454
+ "epoch": 344.0,
1455
+ "learning_rate": 2.5274482758620687e-06,
1456
+ "loss": 0.0014,
1457
+ "step": 5850
1458
+ },
1459
+ {
1460
+ "epoch": 345.47,
1461
+ "learning_rate": 2.520551724137931e-06,
1462
+ "loss": 0.001,
1463
+ "step": 5875
1464
+ },
1465
+ {
1466
+ "epoch": 346.94,
1467
+ "learning_rate": 2.5136551724137927e-06,
1468
+ "loss": 0.0012,
1469
+ "step": 5900
1470
+ },
1471
+ {
1472
+ "epoch": 348.41,
1473
+ "learning_rate": 2.506758620689655e-06,
1474
+ "loss": 0.0012,
1475
+ "step": 5925
1476
+ },
1477
+ {
1478
+ "epoch": 349.88,
1479
+ "learning_rate": 2.499862068965517e-06,
1480
+ "loss": 0.0012,
1481
+ "step": 5950
1482
+ },
1483
+ {
1484
+ "epoch": 351.35,
1485
+ "learning_rate": 2.4929655172413792e-06,
1486
+ "loss": 0.0013,
1487
+ "step": 5975
1488
+ },
1489
+ {
1490
+ "epoch": 352.82,
1491
+ "learning_rate": 2.4860689655172413e-06,
1492
+ "loss": 0.0015,
1493
+ "step": 6000
1494
+ },
1495
+ {
1496
+ "epoch": 352.82,
1497
+ "eval_loss": 0.497802734375,
1498
+ "eval_runtime": 156.7207,
1499
+ "eval_samples_per_second": 1.736,
1500
+ "eval_steps_per_second": 0.108,
1501
+ "eval_wer": 10.503343239227341,
1502
+ "step": 6000
1503
+ },
1504
+ {
1505
+ "epoch": 354.29,
1506
+ "learning_rate": 2.4791724137931033e-06,
1507
+ "loss": 0.0013,
1508
+ "step": 6025
1509
+ },
1510
+ {
1511
+ "epoch": 355.76,
1512
+ "learning_rate": 2.4722758620689653e-06,
1513
+ "loss": 0.0012,
1514
+ "step": 6050
1515
+ },
1516
+ {
1517
+ "epoch": 357.24,
1518
+ "learning_rate": 2.4653793103448274e-06,
1519
+ "loss": 0.0011,
1520
+ "step": 6075
1521
+ },
1522
+ {
1523
+ "epoch": 358.71,
1524
+ "learning_rate": 2.4584827586206894e-06,
1525
+ "loss": 0.0008,
1526
+ "step": 6100
1527
+ },
1528
+ {
1529
+ "epoch": 360.18,
1530
+ "learning_rate": 2.4515862068965514e-06,
1531
+ "loss": 0.0008,
1532
+ "step": 6125
1533
+ },
1534
+ {
1535
+ "epoch": 361.65,
1536
+ "learning_rate": 2.444689655172414e-06,
1537
+ "loss": 0.0011,
1538
+ "step": 6150
1539
+ },
1540
+ {
1541
+ "epoch": 363.12,
1542
+ "learning_rate": 2.4377931034482755e-06,
1543
+ "loss": 0.0012,
1544
+ "step": 6175
1545
+ },
1546
+ {
1547
+ "epoch": 364.59,
1548
+ "learning_rate": 2.430896551724138e-06,
1549
+ "loss": 0.0013,
1550
+ "step": 6200
1551
+ },
1552
+ {
1553
+ "epoch": 366.06,
1554
+ "learning_rate": 2.424e-06,
1555
+ "loss": 0.0011,
1556
+ "step": 6225
1557
+ },
1558
+ {
1559
+ "epoch": 367.53,
1560
+ "learning_rate": 2.417103448275862e-06,
1561
+ "loss": 0.0012,
1562
+ "step": 6250
1563
+ },
1564
+ {
1565
+ "epoch": 369.0,
1566
+ "learning_rate": 2.410206896551724e-06,
1567
+ "loss": 0.0011,
1568
+ "step": 6275
1569
+ },
1570
+ {
1571
+ "epoch": 370.47,
1572
+ "learning_rate": 2.403310344827586e-06,
1573
+ "loss": 0.0009,
1574
+ "step": 6300
1575
+ },
1576
+ {
1577
+ "epoch": 371.94,
1578
+ "learning_rate": 2.396413793103448e-06,
1579
+ "loss": 0.0014,
1580
+ "step": 6325
1581
+ },
1582
+ {
1583
+ "epoch": 373.41,
1584
+ "learning_rate": 2.38951724137931e-06,
1585
+ "loss": 0.0018,
1586
+ "step": 6350
1587
+ },
1588
+ {
1589
+ "epoch": 374.88,
1590
+ "learning_rate": 2.382620689655172e-06,
1591
+ "loss": 0.0009,
1592
+ "step": 6375
1593
+ },
1594
+ {
1595
+ "epoch": 376.35,
1596
+ "learning_rate": 2.3757241379310342e-06,
1597
+ "loss": 0.001,
1598
+ "step": 6400
1599
+ },
1600
+ {
1601
+ "epoch": 377.82,
1602
+ "learning_rate": 2.3688275862068963e-06,
1603
+ "loss": 0.0009,
1604
+ "step": 6425
1605
+ },
1606
+ {
1607
+ "epoch": 379.29,
1608
+ "learning_rate": 2.36248275862069e-06,
1609
+ "loss": 0.0008,
1610
+ "step": 6450
1611
+ },
1612
+ {
1613
+ "epoch": 380.76,
1614
+ "learning_rate": 2.3555862068965514e-06,
1615
+ "loss": 0.0009,
1616
+ "step": 6475
1617
+ },
1618
+ {
1619
+ "epoch": 382.24,
1620
+ "learning_rate": 2.348689655172414e-06,
1621
+ "loss": 0.0009,
1622
+ "step": 6500
1623
+ },
1624
+ {
1625
+ "epoch": 383.71,
1626
+ "learning_rate": 2.3417931034482755e-06,
1627
+ "loss": 0.0011,
1628
+ "step": 6525
1629
+ },
1630
+ {
1631
+ "epoch": 385.18,
1632
+ "learning_rate": 2.334896551724138e-06,
1633
+ "loss": 0.0008,
1634
+ "step": 6550
1635
+ },
1636
+ {
1637
+ "epoch": 386.65,
1638
+ "learning_rate": 2.3279999999999996e-06,
1639
+ "loss": 0.0006,
1640
+ "step": 6575
1641
+ },
1642
+ {
1643
+ "epoch": 388.12,
1644
+ "learning_rate": 2.321103448275862e-06,
1645
+ "loss": 0.001,
1646
+ "step": 6600
1647
+ },
1648
+ {
1649
+ "epoch": 389.59,
1650
+ "learning_rate": 2.314206896551724e-06,
1651
+ "loss": 0.0009,
1652
+ "step": 6625
1653
+ },
1654
+ {
1655
+ "epoch": 391.06,
1656
+ "learning_rate": 2.307310344827586e-06,
1657
+ "loss": 0.0008,
1658
+ "step": 6650
1659
+ },
1660
+ {
1661
+ "epoch": 392.53,
1662
+ "learning_rate": 2.300413793103448e-06,
1663
+ "loss": 0.001,
1664
+ "step": 6675
1665
+ },
1666
+ {
1667
+ "epoch": 394.0,
1668
+ "learning_rate": 2.29351724137931e-06,
1669
+ "loss": 0.0009,
1670
+ "step": 6700
1671
+ },
1672
+ {
1673
+ "epoch": 395.47,
1674
+ "learning_rate": 2.2866206896551726e-06,
1675
+ "loss": 0.0011,
1676
+ "step": 6725
1677
+ },
1678
+ {
1679
+ "epoch": 396.94,
1680
+ "learning_rate": 2.2797241379310342e-06,
1681
+ "loss": 0.0008,
1682
+ "step": 6750
1683
+ },
1684
+ {
1685
+ "epoch": 398.41,
1686
+ "learning_rate": 2.2728275862068967e-06,
1687
+ "loss": 0.0007,
1688
+ "step": 6775
1689
+ },
1690
+ {
1691
+ "epoch": 399.88,
1692
+ "learning_rate": 2.2659310344827583e-06,
1693
+ "loss": 0.0006,
1694
+ "step": 6800
1695
+ },
1696
+ {
1697
+ "epoch": 401.35,
1698
+ "learning_rate": 2.2590344827586207e-06,
1699
+ "loss": 0.0007,
1700
+ "step": 6825
1701
+ },
1702
+ {
1703
+ "epoch": 402.82,
1704
+ "learning_rate": 2.2521379310344828e-06,
1705
+ "loss": 0.0011,
1706
+ "step": 6850
1707
+ },
1708
+ {
1709
+ "epoch": 404.29,
1710
+ "learning_rate": 2.245241379310345e-06,
1711
+ "loss": 0.001,
1712
+ "step": 6875
1713
+ },
1714
+ {
1715
+ "epoch": 405.76,
1716
+ "learning_rate": 2.238344827586207e-06,
1717
+ "loss": 0.0007,
1718
+ "step": 6900
1719
+ },
1720
+ {
1721
+ "epoch": 407.24,
1722
+ "learning_rate": 2.231448275862069e-06,
1723
+ "loss": 0.0008,
1724
+ "step": 6925
1725
+ },
1726
+ {
1727
+ "epoch": 408.71,
1728
+ "learning_rate": 2.224551724137931e-06,
1729
+ "loss": 0.0007,
1730
+ "step": 6950
1731
+ },
1732
+ {
1733
+ "epoch": 410.18,
1734
+ "learning_rate": 2.217655172413793e-06,
1735
+ "loss": 0.0008,
1736
+ "step": 6975
1737
+ },
1738
+ {
1739
+ "epoch": 411.65,
1740
+ "learning_rate": 2.210758620689655e-06,
1741
+ "loss": 0.0007,
1742
+ "step": 7000
1743
+ },
1744
+ {
1745
+ "epoch": 411.65,
1746
+ "eval_loss": 0.5146484375,
1747
+ "eval_runtime": 159.9051,
1748
+ "eval_samples_per_second": 1.701,
1749
+ "eval_steps_per_second": 0.106,
1750
+ "eval_wer": 10.057578008915305,
1751
+ "step": 7000
1752
+ },
1753
+ {
1754
+ "epoch": 413.12,
1755
+ "learning_rate": 2.203862068965517e-06,
1756
+ "loss": 0.0007,
1757
+ "step": 7025
1758
+ },
1759
+ {
1760
+ "epoch": 414.59,
1761
+ "learning_rate": 2.196965517241379e-06,
1762
+ "loss": 0.0006,
1763
+ "step": 7050
1764
+ },
1765
+ {
1766
+ "epoch": 416.06,
1767
+ "learning_rate": 2.1900689655172415e-06,
1768
+ "loss": 0.0009,
1769
+ "step": 7075
1770
+ },
1771
+ {
1772
+ "epoch": 417.53,
1773
+ "learning_rate": 2.183172413793103e-06,
1774
+ "loss": 0.0008,
1775
+ "step": 7100
1776
+ },
1777
+ {
1778
+ "epoch": 419.0,
1779
+ "learning_rate": 2.1762758620689656e-06,
1780
+ "loss": 0.0007,
1781
+ "step": 7125
1782
+ },
1783
+ {
1784
+ "epoch": 420.47,
1785
+ "learning_rate": 2.1693793103448276e-06,
1786
+ "loss": 0.0008,
1787
+ "step": 7150
1788
+ },
1789
+ {
1790
+ "epoch": 421.94,
1791
+ "learning_rate": 2.1624827586206896e-06,
1792
+ "loss": 0.0007,
1793
+ "step": 7175
1794
+ },
1795
+ {
1796
+ "epoch": 423.41,
1797
+ "learning_rate": 2.1555862068965517e-06,
1798
+ "loss": 0.0005,
1799
+ "step": 7200
1800
+ },
1801
+ {
1802
+ "epoch": 424.88,
1803
+ "learning_rate": 2.1486896551724137e-06,
1804
+ "loss": 0.0008,
1805
+ "step": 7225
1806
+ },
1807
+ {
1808
+ "epoch": 426.35,
1809
+ "learning_rate": 2.1417931034482757e-06,
1810
+ "loss": 0.0009,
1811
+ "step": 7250
1812
+ },
1813
+ {
1814
+ "epoch": 427.82,
1815
+ "learning_rate": 2.1348965517241378e-06,
1816
+ "loss": 0.0009,
1817
+ "step": 7275
1818
+ },
1819
+ {
1820
+ "epoch": 429.29,
1821
+ "learning_rate": 2.128e-06,
1822
+ "loss": 0.0006,
1823
+ "step": 7300
1824
+ },
1825
+ {
1826
+ "epoch": 430.76,
1827
+ "learning_rate": 2.121103448275862e-06,
1828
+ "loss": 0.0006,
1829
+ "step": 7325
1830
+ },
1831
+ {
1832
+ "epoch": 432.24,
1833
+ "learning_rate": 2.1142068965517243e-06,
1834
+ "loss": 0.0006,
1835
+ "step": 7350
1836
+ },
1837
+ {
1838
+ "epoch": 433.71,
1839
+ "learning_rate": 2.107310344827586e-06,
1840
+ "loss": 0.0006,
1841
+ "step": 7375
1842
+ },
1843
+ {
1844
+ "epoch": 435.18,
1845
+ "learning_rate": 2.1004137931034483e-06,
1846
+ "loss": 0.0007,
1847
+ "step": 7400
1848
+ },
1849
+ {
1850
+ "epoch": 436.65,
1851
+ "learning_rate": 2.09351724137931e-06,
1852
+ "loss": 0.0006,
1853
+ "step": 7425
1854
+ },
1855
+ {
1856
+ "epoch": 438.12,
1857
+ "learning_rate": 2.0871724137931035e-06,
1858
+ "loss": 0.0007,
1859
+ "step": 7450
1860
+ },
1861
+ {
1862
+ "epoch": 439.59,
1863
+ "learning_rate": 2.080275862068965e-06,
1864
+ "loss": 0.0006,
1865
+ "step": 7475
1866
+ },
1867
+ {
1868
+ "epoch": 441.06,
1869
+ "learning_rate": 2.0733793103448276e-06,
1870
+ "loss": 0.0009,
1871
+ "step": 7500
1872
+ },
1873
+ {
1874
+ "epoch": 442.53,
1875
+ "learning_rate": 2.0664827586206896e-06,
1876
+ "loss": 0.0008,
1877
+ "step": 7525
1878
+ },
1879
+ {
1880
+ "epoch": 444.0,
1881
+ "learning_rate": 2.0595862068965516e-06,
1882
+ "loss": 0.0005,
1883
+ "step": 7550
1884
+ },
1885
+ {
1886
+ "epoch": 445.47,
1887
+ "learning_rate": 2.0526896551724137e-06,
1888
+ "loss": 0.0004,
1889
+ "step": 7575
1890
+ },
1891
+ {
1892
+ "epoch": 446.94,
1893
+ "learning_rate": 2.0457931034482757e-06,
1894
+ "loss": 0.0006,
1895
+ "step": 7600
1896
+ },
1897
+ {
1898
+ "epoch": 448.41,
1899
+ "learning_rate": 2.0388965517241377e-06,
1900
+ "loss": 0.0007,
1901
+ "step": 7625
1902
+ },
1903
+ {
1904
+ "epoch": 449.88,
1905
+ "learning_rate": 2.0319999999999998e-06,
1906
+ "loss": 0.0005,
1907
+ "step": 7650
1908
+ },
1909
+ {
1910
+ "epoch": 451.35,
1911
+ "learning_rate": 2.025103448275862e-06,
1912
+ "loss": 0.0005,
1913
+ "step": 7675
1914
+ },
1915
+ {
1916
+ "epoch": 452.82,
1917
+ "learning_rate": 2.018206896551724e-06,
1918
+ "loss": 0.0009,
1919
+ "step": 7700
1920
+ },
1921
+ {
1922
+ "epoch": 454.29,
1923
+ "learning_rate": 2.0113103448275863e-06,
1924
+ "loss": 0.0005,
1925
+ "step": 7725
1926
+ },
1927
+ {
1928
+ "epoch": 455.76,
1929
+ "learning_rate": 2.0044137931034483e-06,
1930
+ "loss": 0.0005,
1931
+ "step": 7750
1932
+ },
1933
+ {
1934
+ "epoch": 457.24,
1935
+ "learning_rate": 1.9975172413793104e-06,
1936
+ "loss": 0.0006,
1937
+ "step": 7775
1938
+ },
1939
+ {
1940
+ "epoch": 458.71,
1941
+ "learning_rate": 1.9906206896551724e-06,
1942
+ "loss": 0.0005,
1943
+ "step": 7800
1944
+ },
1945
+ {
1946
+ "epoch": 460.18,
1947
+ "learning_rate": 1.9837241379310344e-06,
1948
+ "loss": 0.0005,
1949
+ "step": 7825
1950
+ },
1951
+ {
1952
+ "epoch": 461.65,
1953
+ "learning_rate": 1.9768275862068965e-06,
1954
+ "loss": 0.0006,
1955
+ "step": 7850
1956
+ },
1957
+ {
1958
+ "epoch": 463.12,
1959
+ "learning_rate": 1.9699310344827585e-06,
1960
+ "loss": 0.0004,
1961
+ "step": 7875
1962
+ },
1963
+ {
1964
+ "epoch": 464.59,
1965
+ "learning_rate": 1.9630344827586205e-06,
1966
+ "loss": 0.0007,
1967
+ "step": 7900
1968
+ },
1969
+ {
1970
+ "epoch": 466.06,
1971
+ "learning_rate": 1.956137931034483e-06,
1972
+ "loss": 0.0005,
1973
+ "step": 7925
1974
+ },
1975
+ {
1976
+ "epoch": 467.53,
1977
+ "learning_rate": 1.949241379310345e-06,
1978
+ "loss": 0.0006,
1979
+ "step": 7950
1980
+ },
1981
+ {
1982
+ "epoch": 469.0,
1983
+ "learning_rate": 1.942344827586207e-06,
1984
+ "loss": 0.0006,
1985
+ "step": 7975
1986
+ },
1987
+ {
1988
+ "epoch": 470.47,
1989
+ "learning_rate": 1.935448275862069e-06,
1990
+ "loss": 0.0007,
1991
+ "step": 8000
1992
+ },
1993
+ {
1994
+ "epoch": 470.47,
1995
+ "eval_loss": 0.53857421875,
1996
+ "eval_runtime": 158.4391,
1997
+ "eval_samples_per_second": 1.717,
1998
+ "eval_steps_per_second": 0.107,
1999
+ "eval_wer": 10.131872213967311,
2000
+ "step": 8000
2001
+ },
2002
+ {
2003
+ "epoch": 471.94,
2004
+ "learning_rate": 1.928551724137931e-06,
2005
+ "loss": 0.0005,
2006
+ "step": 8025
2007
+ },
2008
+ {
2009
+ "epoch": 473.41,
2010
+ "learning_rate": 1.921655172413793e-06,
2011
+ "loss": 0.0008,
2012
+ "step": 8050
2013
+ },
2014
+ {
2015
+ "epoch": 474.88,
2016
+ "learning_rate": 1.914758620689655e-06,
2017
+ "loss": 0.0005,
2018
+ "step": 8075
2019
+ },
2020
+ {
2021
+ "epoch": 476.35,
2022
+ "learning_rate": 1.907862068965517e-06,
2023
+ "loss": 0.0004,
2024
+ "step": 8100
2025
+ },
2026
+ {
2027
+ "epoch": 477.82,
2028
+ "learning_rate": 1.9009655172413792e-06,
2029
+ "loss": 0.0005,
2030
+ "step": 8125
2031
+ },
2032
+ {
2033
+ "epoch": 479.29,
2034
+ "learning_rate": 1.8940689655172413e-06,
2035
+ "loss": 0.0004,
2036
+ "step": 8150
2037
+ },
2038
+ {
2039
+ "epoch": 480.76,
2040
+ "learning_rate": 1.8871724137931033e-06,
2041
+ "loss": 0.0007,
2042
+ "step": 8175
2043
+ },
2044
+ {
2045
+ "epoch": 482.24,
2046
+ "learning_rate": 1.8802758620689653e-06,
2047
+ "loss": 0.0005,
2048
+ "step": 8200
2049
+ },
2050
+ {
2051
+ "epoch": 483.71,
2052
+ "learning_rate": 1.8733793103448274e-06,
2053
+ "loss": 0.0007,
2054
+ "step": 8225
2055
+ },
2056
+ {
2057
+ "epoch": 485.18,
2058
+ "learning_rate": 1.8664827586206894e-06,
2059
+ "loss": 0.0005,
2060
+ "step": 8250
2061
+ },
2062
+ {
2063
+ "epoch": 486.65,
2064
+ "learning_rate": 1.8595862068965517e-06,
2065
+ "loss": 0.0004,
2066
+ "step": 8275
2067
+ },
2068
+ {
2069
+ "epoch": 488.12,
2070
+ "learning_rate": 1.8526896551724137e-06,
2071
+ "loss": 0.0005,
2072
+ "step": 8300
2073
+ },
2074
+ {
2075
+ "epoch": 489.59,
2076
+ "learning_rate": 1.845793103448276e-06,
2077
+ "loss": 0.0004,
2078
+ "step": 8325
2079
+ },
2080
+ {
2081
+ "epoch": 491.06,
2082
+ "learning_rate": 1.838896551724138e-06,
2083
+ "loss": 0.0004,
2084
+ "step": 8350
2085
+ },
2086
+ {
2087
+ "epoch": 492.53,
2088
+ "learning_rate": 1.832e-06,
2089
+ "loss": 0.0005,
2090
+ "step": 8375
2091
+ },
2092
+ {
2093
+ "epoch": 494.0,
2094
+ "learning_rate": 1.825103448275862e-06,
2095
+ "loss": 0.0004,
2096
+ "step": 8400
2097
+ },
2098
+ {
2099
+ "epoch": 495.47,
2100
+ "learning_rate": 1.818206896551724e-06,
2101
+ "loss": 0.0007,
2102
+ "step": 8425
2103
+ },
2104
+ {
2105
+ "epoch": 496.94,
2106
+ "learning_rate": 1.811862068965517e-06,
2107
+ "loss": 0.0008,
2108
+ "step": 8450
2109
+ },
2110
+ {
2111
+ "epoch": 498.41,
2112
+ "learning_rate": 1.8049655172413792e-06,
2113
+ "loss": 0.0005,
2114
+ "step": 8475
2115
+ },
2116
+ {
2117
+ "epoch": 499.88,
2118
+ "learning_rate": 1.7980689655172413e-06,
2119
+ "loss": 0.0006,
2120
+ "step": 8500
2121
+ },
2122
+ {
2123
+ "epoch": 501.35,
2124
+ "learning_rate": 1.7911724137931035e-06,
2125
+ "loss": 0.0004,
2126
+ "step": 8525
2127
+ },
2128
+ {
2129
+ "epoch": 502.82,
2130
+ "learning_rate": 1.7842758620689655e-06,
2131
+ "loss": 0.0004,
2132
+ "step": 8550
2133
+ },
2134
+ {
2135
+ "epoch": 504.29,
2136
+ "learning_rate": 1.7773793103448276e-06,
2137
+ "loss": 0.0006,
2138
+ "step": 8575
2139
+ },
2140
+ {
2141
+ "epoch": 505.76,
2142
+ "learning_rate": 1.7704827586206896e-06,
2143
+ "loss": 0.0004,
2144
+ "step": 8600
2145
+ },
2146
+ {
2147
+ "epoch": 507.24,
2148
+ "learning_rate": 1.7635862068965516e-06,
2149
+ "loss": 0.0004,
2150
+ "step": 8625
2151
+ },
2152
+ {
2153
+ "epoch": 508.71,
2154
+ "learning_rate": 1.7566896551724137e-06,
2155
+ "loss": 0.0006,
2156
+ "step": 8650
2157
+ },
2158
+ {
2159
+ "epoch": 510.18,
2160
+ "learning_rate": 1.7497931034482757e-06,
2161
+ "loss": 0.0004,
2162
+ "step": 8675
2163
+ },
2164
+ {
2165
+ "epoch": 511.65,
2166
+ "learning_rate": 1.742896551724138e-06,
2167
+ "loss": 0.0005,
2168
+ "step": 8700
2169
+ },
2170
+ {
2171
+ "epoch": 513.12,
2172
+ "learning_rate": 1.736e-06,
2173
+ "loss": 0.0006,
2174
+ "step": 8725
2175
+ },
2176
+ {
2177
+ "epoch": 514.59,
2178
+ "learning_rate": 1.729103448275862e-06,
2179
+ "loss": 0.0006,
2180
+ "step": 8750
2181
+ },
2182
+ {
2183
+ "epoch": 516.06,
2184
+ "learning_rate": 1.722206896551724e-06,
2185
+ "loss": 0.0004,
2186
+ "step": 8775
2187
+ },
2188
+ {
2189
+ "epoch": 517.53,
2190
+ "learning_rate": 1.715310344827586e-06,
2191
+ "loss": 0.0003,
2192
+ "step": 8800
2193
+ },
2194
+ {
2195
+ "epoch": 519.0,
2196
+ "learning_rate": 1.7084137931034481e-06,
2197
+ "loss": 0.0003,
2198
+ "step": 8825
2199
+ },
2200
+ {
2201
+ "epoch": 520.47,
2202
+ "learning_rate": 1.7015172413793101e-06,
2203
+ "loss": 0.0004,
2204
+ "step": 8850
2205
+ },
2206
+ {
2207
+ "epoch": 521.94,
2208
+ "learning_rate": 1.6946206896551722e-06,
2209
+ "loss": 0.0006,
2210
+ "step": 8875
2211
+ },
2212
+ {
2213
+ "epoch": 523.41,
2214
+ "learning_rate": 1.6877241379310342e-06,
2215
+ "loss": 0.0005,
2216
+ "step": 8900
2217
+ },
2218
+ {
2219
+ "epoch": 524.88,
2220
+ "learning_rate": 1.6808275862068967e-06,
2221
+ "loss": 0.0029,
2222
+ "step": 8925
2223
+ },
2224
+ {
2225
+ "epoch": 526.35,
2226
+ "learning_rate": 1.6739310344827587e-06,
2227
+ "loss": 0.0004,
2228
+ "step": 8950
2229
+ },
2230
+ {
2231
+ "epoch": 527.82,
2232
+ "learning_rate": 1.6670344827586207e-06,
2233
+ "loss": 0.0003,
2234
+ "step": 8975
2235
+ },
2236
+ {
2237
+ "epoch": 529.29,
2238
+ "learning_rate": 1.6601379310344828e-06,
2239
+ "loss": 0.0004,
2240
+ "step": 9000
2241
+ },
2242
+ {
2243
+ "epoch": 529.29,
2244
+ "eval_loss": 0.5361328125,
2245
+ "eval_runtime": 156.9399,
2246
+ "eval_samples_per_second": 1.733,
2247
+ "eval_steps_per_second": 0.108,
2248
+ "eval_wer": 9.778974739970282,
2249
+ "step": 9000
2250
+ },
2251
+ {
2252
+ "epoch": 530.76,
2253
+ "learning_rate": 1.6532413793103448e-06,
2254
+ "loss": 0.0006,
2255
+ "step": 9025
2256
+ },
2257
+ {
2258
+ "epoch": 532.24,
2259
+ "learning_rate": 1.6463448275862068e-06,
2260
+ "loss": 0.0003,
2261
+ "step": 9050
2262
+ },
2263
+ {
2264
+ "epoch": 533.71,
2265
+ "learning_rate": 1.6394482758620689e-06,
2266
+ "loss": 0.0003,
2267
+ "step": 9075
2268
+ },
2269
+ {
2270
+ "epoch": 535.18,
2271
+ "learning_rate": 1.632551724137931e-06,
2272
+ "loss": 0.0005,
2273
+ "step": 9100
2274
+ },
2275
+ {
2276
+ "epoch": 536.65,
2277
+ "learning_rate": 1.625655172413793e-06,
2278
+ "loss": 0.0006,
2279
+ "step": 9125
2280
+ },
2281
+ {
2282
+ "epoch": 538.12,
2283
+ "learning_rate": 1.6187586206896552e-06,
2284
+ "loss": 0.0003,
2285
+ "step": 9150
2286
+ },
2287
+ {
2288
+ "epoch": 539.59,
2289
+ "learning_rate": 1.6118620689655172e-06,
2290
+ "loss": 0.0004,
2291
+ "step": 9175
2292
+ },
2293
+ {
2294
+ "epoch": 541.06,
2295
+ "learning_rate": 1.6049655172413792e-06,
2296
+ "loss": 0.0003,
2297
+ "step": 9200
2298
+ },
2299
+ {
2300
+ "epoch": 542.53,
2301
+ "learning_rate": 1.5980689655172413e-06,
2302
+ "loss": 0.0004,
2303
+ "step": 9225
2304
+ },
2305
+ {
2306
+ "epoch": 544.0,
2307
+ "learning_rate": 1.5911724137931033e-06,
2308
+ "loss": 0.0006,
2309
+ "step": 9250
2310
+ },
2311
+ {
2312
+ "epoch": 545.47,
2313
+ "learning_rate": 1.5842758620689653e-06,
2314
+ "loss": 0.0002,
2315
+ "step": 9275
2316
+ },
2317
+ {
2318
+ "epoch": 546.94,
2319
+ "learning_rate": 1.5773793103448274e-06,
2320
+ "loss": 0.0003,
2321
+ "step": 9300
2322
+ },
2323
+ {
2324
+ "epoch": 548.41,
2325
+ "learning_rate": 1.5704827586206896e-06,
2326
+ "loss": 0.0003,
2327
+ "step": 9325
2328
+ },
2329
+ {
2330
+ "epoch": 549.88,
2331
+ "learning_rate": 1.5635862068965516e-06,
2332
+ "loss": 0.0003,
2333
+ "step": 9350
2334
+ },
2335
+ {
2336
+ "epoch": 551.35,
2337
+ "learning_rate": 1.5566896551724139e-06,
2338
+ "loss": 0.0004,
2339
+ "step": 9375
2340
+ },
2341
+ {
2342
+ "epoch": 552.82,
2343
+ "learning_rate": 1.549793103448276e-06,
2344
+ "loss": 0.0004,
2345
+ "step": 9400
2346
+ },
2347
+ {
2348
+ "epoch": 554.29,
2349
+ "learning_rate": 1.542896551724138e-06,
2350
+ "loss": 0.0005,
2351
+ "step": 9425
2352
+ },
2353
+ {
2354
+ "epoch": 555.76,
2355
+ "learning_rate": 1.5365517241379309e-06,
2356
+ "loss": 0.0004,
2357
+ "step": 9450
2358
+ },
2359
+ {
2360
+ "epoch": 557.24,
2361
+ "learning_rate": 1.529655172413793e-06,
2362
+ "loss": 0.0003,
2363
+ "step": 9475
2364
+ },
2365
+ {
2366
+ "epoch": 558.71,
2367
+ "learning_rate": 1.522758620689655e-06,
2368
+ "loss": 0.0003,
2369
+ "step": 9500
2370
+ },
2371
+ {
2372
+ "epoch": 560.18,
2373
+ "learning_rate": 1.5158620689655172e-06,
2374
+ "loss": 0.0003,
2375
+ "step": 9525
2376
+ },
2377
+ {
2378
+ "epoch": 561.65,
2379
+ "learning_rate": 1.5089655172413792e-06,
2380
+ "loss": 0.0005,
2381
+ "step": 9550
2382
+ },
2383
+ {
2384
+ "epoch": 563.12,
2385
+ "learning_rate": 1.5020689655172415e-06,
2386
+ "loss": 0.0004,
2387
+ "step": 9575
2388
+ },
2389
+ {
2390
+ "epoch": 564.59,
2391
+ "learning_rate": 1.4951724137931035e-06,
2392
+ "loss": 0.0004,
2393
+ "step": 9600
2394
+ },
2395
+ {
2396
+ "epoch": 566.06,
2397
+ "learning_rate": 1.4882758620689655e-06,
2398
+ "loss": 0.0003,
2399
+ "step": 9625
2400
+ },
2401
+ {
2402
+ "epoch": 567.53,
2403
+ "learning_rate": 1.4813793103448276e-06,
2404
+ "loss": 0.0005,
2405
+ "step": 9650
2406
+ },
2407
+ {
2408
+ "epoch": 569.0,
2409
+ "learning_rate": 1.4744827586206896e-06,
2410
+ "loss": 0.0003,
2411
+ "step": 9675
2412
+ },
2413
+ {
2414
+ "epoch": 570.47,
2415
+ "learning_rate": 1.4675862068965516e-06,
2416
+ "loss": 0.0003,
2417
+ "step": 9700
2418
+ },
2419
+ {
2420
+ "epoch": 571.94,
2421
+ "learning_rate": 1.4606896551724137e-06,
2422
+ "loss": 0.0003,
2423
+ "step": 9725
2424
+ },
2425
+ {
2426
+ "epoch": 573.41,
2427
+ "learning_rate": 1.4537931034482757e-06,
2428
+ "loss": 0.0002,
2429
+ "step": 9750
2430
+ },
2431
+ {
2432
+ "epoch": 574.88,
2433
+ "learning_rate": 1.4468965517241377e-06,
2434
+ "loss": 0.0002,
2435
+ "step": 9775
2436
+ },
2437
+ {
2438
+ "epoch": 576.35,
2439
+ "learning_rate": 1.44e-06,
2440
+ "loss": 0.0004,
2441
+ "step": 9800
2442
+ },
2443
+ {
2444
+ "epoch": 577.82,
2445
+ "learning_rate": 1.433103448275862e-06,
2446
+ "loss": 0.0002,
2447
+ "step": 9825
2448
+ },
2449
+ {
2450
+ "epoch": 579.29,
2451
+ "learning_rate": 1.426206896551724e-06,
2452
+ "loss": 0.0005,
2453
+ "step": 9850
2454
+ },
2455
+ {
2456
+ "epoch": 580.76,
2457
+ "learning_rate": 1.419310344827586e-06,
2458
+ "loss": 0.0004,
2459
+ "step": 9875
2460
+ },
2461
+ {
2462
+ "epoch": 582.24,
2463
+ "learning_rate": 1.4124137931034481e-06,
2464
+ "loss": 0.0003,
2465
+ "step": 9900
2466
+ },
2467
+ {
2468
+ "epoch": 583.71,
2469
+ "learning_rate": 1.4055172413793104e-06,
2470
+ "loss": 0.0004,
2471
+ "step": 9925
2472
+ },
2473
+ {
2474
+ "epoch": 585.18,
2475
+ "learning_rate": 1.3986206896551724e-06,
2476
+ "loss": 0.0004,
2477
+ "step": 9950
2478
+ },
2479
+ {
2480
+ "epoch": 586.65,
2481
+ "learning_rate": 1.3917241379310344e-06,
2482
+ "loss": 0.0004,
2483
+ "step": 9975
2484
+ },
2485
+ {
2486
+ "epoch": 588.12,
2487
+ "learning_rate": 1.3848275862068965e-06,
2488
+ "loss": 0.0003,
2489
+ "step": 10000
2490
+ },
2491
+ {
2492
+ "epoch": 588.12,
2493
+ "eval_loss": 0.54296875,
2494
+ "eval_runtime": 156.5622,
2495
+ "eval_samples_per_second": 1.737,
2496
+ "eval_steps_per_second": 0.109,
2497
+ "eval_wer": 9.973997028231798,
2498
+ "step": 10000
2499
+ },
2500
+ {
2501
+ "epoch": 589.59,
2502
+ "learning_rate": 1.3779310344827587e-06,
2503
+ "loss": 0.0002,
2504
+ "step": 10025
2505
+ },
2506
+ {
2507
+ "epoch": 591.06,
2508
+ "learning_rate": 1.3710344827586207e-06,
2509
+ "loss": 0.0003,
2510
+ "step": 10050
2511
+ },
2512
+ {
2513
+ "epoch": 592.53,
2514
+ "learning_rate": 1.3641379310344828e-06,
2515
+ "loss": 0.0002,
2516
+ "step": 10075
2517
+ },
2518
+ {
2519
+ "epoch": 594.0,
2520
+ "learning_rate": 1.3572413793103448e-06,
2521
+ "loss": 0.0003,
2522
+ "step": 10100
2523
+ },
2524
+ {
2525
+ "epoch": 595.47,
2526
+ "learning_rate": 1.3503448275862068e-06,
2527
+ "loss": 0.0003,
2528
+ "step": 10125
2529
+ },
2530
+ {
2531
+ "epoch": 596.94,
2532
+ "learning_rate": 1.3434482758620689e-06,
2533
+ "loss": 0.0002,
2534
+ "step": 10150
2535
+ },
2536
+ {
2537
+ "epoch": 598.41,
2538
+ "learning_rate": 1.3365517241379309e-06,
2539
+ "loss": 0.0004,
2540
+ "step": 10175
2541
+ },
2542
+ {
2543
+ "epoch": 599.88,
2544
+ "learning_rate": 1.329655172413793e-06,
2545
+ "loss": 0.0002,
2546
+ "step": 10200
2547
+ },
2548
+ {
2549
+ "epoch": 601.35,
2550
+ "learning_rate": 1.322758620689655e-06,
2551
+ "loss": 0.0003,
2552
+ "step": 10225
2553
+ },
2554
+ {
2555
+ "epoch": 602.82,
2556
+ "learning_rate": 1.3158620689655172e-06,
2557
+ "loss": 0.0003,
2558
+ "step": 10250
2559
+ },
2560
+ {
2561
+ "epoch": 604.29,
2562
+ "learning_rate": 1.3089655172413792e-06,
2563
+ "loss": 0.0002,
2564
+ "step": 10275
2565
+ },
2566
+ {
2567
+ "epoch": 605.76,
2568
+ "learning_rate": 1.3020689655172413e-06,
2569
+ "loss": 0.0002,
2570
+ "step": 10300
2571
+ },
2572
+ {
2573
+ "epoch": 607.24,
2574
+ "learning_rate": 1.2951724137931035e-06,
2575
+ "loss": 0.0003,
2576
+ "step": 10325
2577
+ },
2578
+ {
2579
+ "epoch": 608.71,
2580
+ "learning_rate": 1.2882758620689655e-06,
2581
+ "loss": 0.0002,
2582
+ "step": 10350
2583
+ },
2584
+ {
2585
+ "epoch": 610.18,
2586
+ "learning_rate": 1.2813793103448276e-06,
2587
+ "loss": 0.0003,
2588
+ "step": 10375
2589
+ },
2590
+ {
2591
+ "epoch": 611.65,
2592
+ "learning_rate": 1.2744827586206896e-06,
2593
+ "loss": 0.0003,
2594
+ "step": 10400
2595
+ },
2596
+ {
2597
+ "epoch": 613.12,
2598
+ "learning_rate": 1.2675862068965516e-06,
2599
+ "loss": 0.0003,
2600
+ "step": 10425
2601
+ },
2602
+ {
2603
+ "epoch": 614.59,
2604
+ "learning_rate": 1.2612413793103448e-06,
2605
+ "loss": 0.0005,
2606
+ "step": 10450
2607
+ },
2608
+ {
2609
+ "epoch": 616.06,
2610
+ "learning_rate": 1.2543448275862068e-06,
2611
+ "loss": 0.0003,
2612
+ "step": 10475
2613
+ },
2614
+ {
2615
+ "epoch": 617.53,
2616
+ "learning_rate": 1.2474482758620688e-06,
2617
+ "loss": 0.0003,
2618
+ "step": 10500
2619
+ },
2620
+ {
2621
+ "epoch": 619.0,
2622
+ "learning_rate": 1.240551724137931e-06,
2623
+ "loss": 0.0001,
2624
+ "step": 10525
2625
+ },
2626
+ {
2627
+ "epoch": 620.47,
2628
+ "learning_rate": 1.2336551724137931e-06,
2629
+ "loss": 0.0002,
2630
+ "step": 10550
2631
+ },
2632
+ {
2633
+ "epoch": 621.94,
2634
+ "learning_rate": 1.2267586206896552e-06,
2635
+ "loss": 0.0005,
2636
+ "step": 10575
2637
+ },
2638
+ {
2639
+ "epoch": 623.41,
2640
+ "learning_rate": 1.2198620689655172e-06,
2641
+ "loss": 0.0002,
2642
+ "step": 10600
2643
+ },
2644
+ {
2645
+ "epoch": 624.88,
2646
+ "learning_rate": 1.2129655172413792e-06,
2647
+ "loss": 0.0003,
2648
+ "step": 10625
2649
+ },
2650
+ {
2651
+ "epoch": 626.35,
2652
+ "learning_rate": 1.2060689655172413e-06,
2653
+ "loss": 0.0002,
2654
+ "step": 10650
2655
+ },
2656
+ {
2657
+ "epoch": 627.82,
2658
+ "learning_rate": 1.1991724137931035e-06,
2659
+ "loss": 0.0003,
2660
+ "step": 10675
2661
+ },
2662
+ {
2663
+ "epoch": 629.29,
2664
+ "learning_rate": 1.1922758620689655e-06,
2665
+ "loss": 0.0003,
2666
+ "step": 10700
2667
+ },
2668
+ {
2669
+ "epoch": 630.76,
2670
+ "learning_rate": 1.1853793103448276e-06,
2671
+ "loss": 0.0003,
2672
+ "step": 10725
2673
+ },
2674
+ {
2675
+ "epoch": 632.24,
2676
+ "learning_rate": 1.1784827586206896e-06,
2677
+ "loss": 0.0002,
2678
+ "step": 10750
2679
+ },
2680
+ {
2681
+ "epoch": 633.71,
2682
+ "learning_rate": 1.1715862068965516e-06,
2683
+ "loss": 0.0002,
2684
+ "step": 10775
2685
+ },
2686
+ {
2687
+ "epoch": 635.18,
2688
+ "learning_rate": 1.1646896551724137e-06,
2689
+ "loss": 0.0004,
2690
+ "step": 10800
2691
+ },
2692
+ {
2693
+ "epoch": 636.65,
2694
+ "learning_rate": 1.1577931034482757e-06,
2695
+ "loss": 0.0003,
2696
+ "step": 10825
2697
+ },
2698
+ {
2699
+ "epoch": 638.12,
2700
+ "learning_rate": 1.1508965517241377e-06,
2701
+ "loss": 0.0002,
2702
+ "step": 10850
2703
+ },
2704
+ {
2705
+ "epoch": 639.59,
2706
+ "learning_rate": 1.1439999999999998e-06,
2707
+ "loss": 0.0002,
2708
+ "step": 10875
2709
+ },
2710
+ {
2711
+ "epoch": 641.06,
2712
+ "learning_rate": 1.137103448275862e-06,
2713
+ "loss": 0.0003,
2714
+ "step": 10900
2715
+ },
2716
+ {
2717
+ "epoch": 642.53,
2718
+ "learning_rate": 1.1302068965517243e-06,
2719
+ "loss": 0.0002,
2720
+ "step": 10925
2721
+ },
2722
+ {
2723
+ "epoch": 644.0,
2724
+ "learning_rate": 1.1233103448275863e-06,
2725
+ "loss": 0.0004,
2726
+ "step": 10950
2727
+ },
2728
+ {
2729
+ "epoch": 645.47,
2730
+ "learning_rate": 1.1164137931034483e-06,
2731
+ "loss": 0.0004,
2732
+ "step": 10975
2733
+ },
2734
+ {
2735
+ "epoch": 646.94,
2736
+ "learning_rate": 1.1095172413793103e-06,
2737
+ "loss": 0.0002,
2738
+ "step": 11000
2739
+ },
2740
+ {
2741
+ "epoch": 646.94,
2742
+ "eval_loss": 0.5458984375,
2743
+ "eval_runtime": 157.5866,
2744
+ "eval_samples_per_second": 1.726,
2745
+ "eval_steps_per_second": 0.108,
2746
+ "eval_wer": 9.955423476968797,
2747
+ "step": 11000
2748
+ },
2749
+ {
2750
+ "epoch": 648.41,
2751
+ "learning_rate": 1.1026206896551724e-06,
2752
+ "loss": 0.0003,
2753
+ "step": 11025
2754
+ },
2755
+ {
2756
+ "epoch": 649.88,
2757
+ "learning_rate": 1.0957241379310344e-06,
2758
+ "loss": 0.0002,
2759
+ "step": 11050
2760
+ },
2761
+ {
2762
+ "epoch": 651.35,
2763
+ "learning_rate": 1.0888275862068964e-06,
2764
+ "loss": 0.0002,
2765
+ "step": 11075
2766
+ },
2767
+ {
2768
+ "epoch": 652.82,
2769
+ "learning_rate": 1.0819310344827585e-06,
2770
+ "loss": 0.0003,
2771
+ "step": 11100
2772
+ },
2773
+ {
2774
+ "epoch": 654.29,
2775
+ "learning_rate": 1.0750344827586207e-06,
2776
+ "loss": 0.0002,
2777
+ "step": 11125
2778
+ },
2779
+ {
2780
+ "epoch": 655.76,
2781
+ "learning_rate": 1.0681379310344828e-06,
2782
+ "loss": 0.0003,
2783
+ "step": 11150
2784
+ },
2785
+ {
2786
+ "epoch": 657.24,
2787
+ "learning_rate": 1.0612413793103448e-06,
2788
+ "loss": 0.0003,
2789
+ "step": 11175
2790
+ },
2791
+ {
2792
+ "epoch": 658.71,
2793
+ "learning_rate": 1.0543448275862068e-06,
2794
+ "loss": 0.0005,
2795
+ "step": 11200
2796
+ },
2797
+ {
2798
+ "epoch": 660.18,
2799
+ "learning_rate": 1.0474482758620689e-06,
2800
+ "loss": 0.0002,
2801
+ "step": 11225
2802
+ },
2803
+ {
2804
+ "epoch": 661.65,
2805
+ "learning_rate": 1.0405517241379309e-06,
2806
+ "loss": 0.0002,
2807
+ "step": 11250
2808
+ },
2809
+ {
2810
+ "epoch": 663.12,
2811
+ "learning_rate": 1.033655172413793e-06,
2812
+ "loss": 0.0003,
2813
+ "step": 11275
2814
+ },
2815
+ {
2816
+ "epoch": 664.59,
2817
+ "learning_rate": 1.026758620689655e-06,
2818
+ "loss": 0.0002,
2819
+ "step": 11300
2820
+ },
2821
+ {
2822
+ "epoch": 666.06,
2823
+ "learning_rate": 1.0198620689655172e-06,
2824
+ "loss": 0.0002,
2825
+ "step": 11325
2826
+ },
2827
+ {
2828
+ "epoch": 667.53,
2829
+ "learning_rate": 1.0129655172413794e-06,
2830
+ "loss": 0.0003,
2831
+ "step": 11350
2832
+ },
2833
+ {
2834
+ "epoch": 669.0,
2835
+ "learning_rate": 1.0060689655172415e-06,
2836
+ "loss": 0.0009,
2837
+ "step": 11375
2838
+ },
2839
+ {
2840
+ "epoch": 670.47,
2841
+ "learning_rate": 9.991724137931033e-07,
2842
+ "loss": 0.0002,
2843
+ "step": 11400
2844
+ },
2845
+ {
2846
+ "epoch": 671.94,
2847
+ "learning_rate": 9.922758620689655e-07,
2848
+ "loss": 0.0002,
2849
+ "step": 11425
2850
+ },
2851
+ {
2852
+ "epoch": 673.41,
2853
+ "learning_rate": 9.859310344827587e-07,
2854
+ "loss": 0.0003,
2855
+ "step": 11450
2856
+ },
2857
+ {
2858
+ "epoch": 674.88,
2859
+ "learning_rate": 9.790344827586207e-07,
2860
+ "loss": 0.0002,
2861
+ "step": 11475
2862
+ },
2863
+ {
2864
+ "epoch": 676.35,
2865
+ "learning_rate": 9.721379310344827e-07,
2866
+ "loss": 0.0002,
2867
+ "step": 11500
2868
+ },
2869
+ {
2870
+ "epoch": 677.82,
2871
+ "learning_rate": 9.652413793103448e-07,
2872
+ "loss": 0.0002,
2873
+ "step": 11525
2874
+ },
2875
+ {
2876
+ "epoch": 679.29,
2877
+ "learning_rate": 9.583448275862068e-07,
2878
+ "loss": 0.0003,
2879
+ "step": 11550
2880
+ },
2881
+ {
2882
+ "epoch": 680.76,
2883
+ "learning_rate": 9.514482758620688e-07,
2884
+ "loss": 0.0003,
2885
+ "step": 11575
2886
+ },
2887
+ {
2888
+ "epoch": 682.24,
2889
+ "learning_rate": 9.44551724137931e-07,
2890
+ "loss": 0.0003,
2891
+ "step": 11600
2892
+ },
2893
+ {
2894
+ "epoch": 683.71,
2895
+ "learning_rate": 9.376551724137931e-07,
2896
+ "loss": 0.0002,
2897
+ "step": 11625
2898
+ },
2899
+ {
2900
+ "epoch": 685.18,
2901
+ "learning_rate": 9.307586206896552e-07,
2902
+ "loss": 0.0002,
2903
+ "step": 11650
2904
+ },
2905
+ {
2906
+ "epoch": 686.65,
2907
+ "learning_rate": 9.238620689655172e-07,
2908
+ "loss": 0.0003,
2909
+ "step": 11675
2910
+ },
2911
+ {
2912
+ "epoch": 688.12,
2913
+ "learning_rate": 9.169655172413792e-07,
2914
+ "loss": 0.0003,
2915
+ "step": 11700
2916
+ },
2917
+ {
2918
+ "epoch": 689.59,
2919
+ "learning_rate": 9.100689655172414e-07,
2920
+ "loss": 0.0001,
2921
+ "step": 11725
2922
+ },
2923
+ {
2924
+ "epoch": 691.06,
2925
+ "learning_rate": 9.031724137931034e-07,
2926
+ "loss": 0.0004,
2927
+ "step": 11750
2928
+ },
2929
+ {
2930
+ "epoch": 692.53,
2931
+ "learning_rate": 8.962758620689654e-07,
2932
+ "loss": 0.0003,
2933
+ "step": 11775
2934
+ },
2935
+ {
2936
+ "epoch": 694.0,
2937
+ "learning_rate": 8.893793103448275e-07,
2938
+ "loss": 0.0005,
2939
+ "step": 11800
2940
+ },
2941
+ {
2942
+ "epoch": 695.47,
2943
+ "learning_rate": 8.824827586206897e-07,
2944
+ "loss": 0.0002,
2945
+ "step": 11825
2946
+ },
2947
+ {
2948
+ "epoch": 696.94,
2949
+ "learning_rate": 8.755862068965517e-07,
2950
+ "loss": 0.0002,
2951
+ "step": 11850
2952
+ },
2953
+ {
2954
+ "epoch": 698.41,
2955
+ "learning_rate": 8.686896551724138e-07,
2956
+ "loss": 0.0002,
2957
+ "step": 11875
2958
+ },
2959
+ {
2960
+ "epoch": 699.88,
2961
+ "learning_rate": 8.617931034482758e-07,
2962
+ "loss": 0.0002,
2963
+ "step": 11900
2964
+ },
2965
+ {
2966
+ "epoch": 701.35,
2967
+ "learning_rate": 8.548965517241378e-07,
2968
+ "loss": 0.0003,
2969
+ "step": 11925
2970
+ },
2971
+ {
2972
+ "epoch": 702.82,
2973
+ "learning_rate": 8.48e-07,
2974
+ "loss": 0.0002,
2975
+ "step": 11950
2976
+ },
2977
+ {
2978
+ "epoch": 704.29,
2979
+ "learning_rate": 8.41103448275862e-07,
2980
+ "loss": 0.0002,
2981
+ "step": 11975
2982
+ },
2983
+ {
2984
+ "epoch": 705.76,
2985
+ "learning_rate": 8.34206896551724e-07,
2986
+ "loss": 0.0003,
2987
+ "step": 12000
2988
+ },
2989
+ {
2990
+ "epoch": 705.76,
2991
+ "eval_loss": 0.55615234375,
2992
+ "eval_runtime": 158.1148,
2993
+ "eval_samples_per_second": 1.72,
2994
+ "eval_steps_per_second": 0.108,
2995
+ "eval_wer": 9.9832838038633,
2996
+ "step": 12000
2997
+ },
2998
+ {
2999
+ "epoch": 706.47,
3000
+ "learning_rate": 3.1968e-07,
3001
+ "loss": 0.0002,
3002
+ "step": 12025
3003
+ },
3004
+ {
3005
+ "epoch": 707.94,
3006
+ "learning_rate": 3.1168e-07,
3007
+ "loss": 0.0003,
3008
+ "step": 12050
3009
+ },
3010
+ {
3011
+ "epoch": 709.41,
3012
+ "learning_rate": 3.0368e-07,
3013
+ "loss": 0.0002,
3014
+ "step": 12075
3015
+ },
3016
+ {
3017
+ "epoch": 710.88,
3018
+ "learning_rate": 2.9568e-07,
3019
+ "loss": 0.0002,
3020
+ "step": 12100
3021
+ },
3022
+ {
3023
+ "epoch": 712.35,
3024
+ "learning_rate": 2.8768e-07,
3025
+ "loss": 0.0003,
3026
+ "step": 12125
3027
+ },
3028
+ {
3029
+ "epoch": 713.82,
3030
+ "learning_rate": 2.7968e-07,
3031
+ "loss": 0.0002,
3032
+ "step": 12150
3033
+ },
3034
+ {
3035
+ "epoch": 715.29,
3036
+ "learning_rate": 2.7167999999999996e-07,
3037
+ "loss": 0.0005,
3038
+ "step": 12175
3039
+ },
3040
+ {
3041
+ "epoch": 716.76,
3042
+ "learning_rate": 2.6368e-07,
3043
+ "loss": 0.0002,
3044
+ "step": 12200
3045
+ },
3046
+ {
3047
+ "epoch": 718.24,
3048
+ "learning_rate": 2.5568e-07,
3049
+ "loss": 0.0002,
3050
+ "step": 12225
3051
+ },
3052
+ {
3053
+ "epoch": 719.71,
3054
+ "learning_rate": 2.4768e-07,
3055
+ "loss": 0.0002,
3056
+ "step": 12250
3057
+ },
3058
+ {
3059
+ "epoch": 721.18,
3060
+ "learning_rate": 2.3968e-07,
3061
+ "loss": 0.0003,
3062
+ "step": 12275
3063
+ },
3064
+ {
3065
+ "epoch": 722.65,
3066
+ "learning_rate": 2.3168e-07,
3067
+ "loss": 0.0002,
3068
+ "step": 12300
3069
+ },
3070
+ {
3071
+ "epoch": 724.12,
3072
+ "learning_rate": 2.2367999999999998e-07,
3073
+ "loss": 0.0002,
3074
+ "step": 12325
3075
+ },
3076
+ {
3077
+ "epoch": 725.59,
3078
+ "learning_rate": 2.1568e-07,
3079
+ "loss": 0.0002,
3080
+ "step": 12350
3081
+ },
3082
+ {
3083
+ "epoch": 727.06,
3084
+ "learning_rate": 2.0768e-07,
3085
+ "loss": 0.0001,
3086
+ "step": 12375
3087
+ },
3088
+ {
3089
+ "epoch": 728.53,
3090
+ "learning_rate": 1.9968e-07,
3091
+ "loss": 0.0002,
3092
+ "step": 12400
3093
+ },
3094
+ {
3095
+ "epoch": 730.0,
3096
+ "learning_rate": 1.9167999999999998e-07,
3097
+ "loss": 0.0002,
3098
+ "step": 12425
3099
+ },
3100
+ {
3101
+ "epoch": 731.47,
3102
+ "learning_rate": 1.8432e-07,
3103
+ "loss": 0.0003,
3104
+ "step": 12450
3105
+ },
3106
+ {
3107
+ "epoch": 732.94,
3108
+ "learning_rate": 1.7632e-07,
3109
+ "loss": 0.0002,
3110
+ "step": 12475
3111
+ },
3112
+ {
3113
+ "epoch": 734.41,
3114
+ "learning_rate": 1.6832e-07,
3115
+ "loss": 0.0001,
3116
+ "step": 12500
3117
+ },
3118
+ {
3119
+ "epoch": 735.88,
3120
+ "learning_rate": 1.6032e-07,
3121
+ "loss": 0.0001,
3122
+ "step": 12525
3123
+ },
3124
+ {
3125
+ "epoch": 737.35,
3126
+ "learning_rate": 1.5232e-07,
3127
+ "loss": 0.0001,
3128
+ "step": 12550
3129
+ },
3130
+ {
3131
+ "epoch": 738.82,
3132
+ "learning_rate": 1.4431999999999998e-07,
3133
+ "loss": 0.0002,
3134
+ "step": 12575
3135
+ },
3136
+ {
3137
+ "epoch": 740.29,
3138
+ "learning_rate": 1.3632e-07,
3139
+ "loss": 0.0002,
3140
+ "step": 12600
3141
+ },
3142
+ {
3143
+ "epoch": 741.76,
3144
+ "learning_rate": 1.2831999999999997e-07,
3145
+ "loss": 0.0001,
3146
+ "step": 12625
3147
+ },
3148
+ {
3149
+ "epoch": 743.24,
3150
+ "learning_rate": 1.2031999999999998e-07,
3151
+ "loss": 0.0003,
3152
+ "step": 12650
3153
+ },
3154
+ {
3155
+ "epoch": 744.71,
3156
+ "learning_rate": 1.1232e-07,
3157
+ "loss": 0.0002,
3158
+ "step": 12675
3159
+ },
3160
+ {
3161
+ "epoch": 746.18,
3162
+ "learning_rate": 1.0432e-07,
3163
+ "loss": 0.0002,
3164
+ "step": 12700
3165
+ },
3166
+ {
3167
+ "epoch": 747.65,
3168
+ "learning_rate": 9.632e-08,
3169
+ "loss": 0.0002,
3170
+ "step": 12725
3171
+ },
3172
+ {
3173
+ "epoch": 749.12,
3174
+ "learning_rate": 8.831999999999999e-08,
3175
+ "loss": 0.0002,
3176
+ "step": 12750
3177
+ },
3178
+ {
3179
+ "epoch": 750.59,
3180
+ "learning_rate": 8.032e-08,
3181
+ "loss": 0.0002,
3182
+ "step": 12775
3183
+ },
3184
+ {
3185
+ "epoch": 752.06,
3186
+ "learning_rate": 7.231999999999999e-08,
3187
+ "loss": 0.0002,
3188
+ "step": 12800
3189
+ },
3190
+ {
3191
+ "epoch": 753.53,
3192
+ "learning_rate": 6.432e-08,
3193
+ "loss": 0.0002,
3194
+ "step": 12825
3195
+ },
3196
+ {
3197
+ "epoch": 755.0,
3198
+ "learning_rate": 5.632e-08,
3199
+ "loss": 0.0002,
3200
+ "step": 12850
3201
+ },
3202
+ {
3203
+ "epoch": 756.47,
3204
+ "learning_rate": 4.832e-08,
3205
+ "loss": 0.0002,
3206
+ "step": 12875
3207
+ },
3208
+ {
3209
+ "epoch": 757.94,
3210
+ "learning_rate": 4.032e-08,
3211
+ "loss": 0.0002,
3212
+ "step": 12900
3213
+ },
3214
+ {
3215
+ "epoch": 759.41,
3216
+ "learning_rate": 3.232e-08,
3217
+ "loss": 0.0001,
3218
+ "step": 12925
3219
+ },
3220
+ {
3221
+ "epoch": 760.88,
3222
+ "learning_rate": 2.432e-08,
3223
+ "loss": 0.0002,
3224
+ "step": 12950
3225
+ },
3226
+ {
3227
+ "epoch": 762.35,
3228
+ "learning_rate": 1.632e-08,
3229
+ "loss": 0.0001,
3230
+ "step": 12975
3231
+ },
3232
+ {
3233
+ "epoch": 763.82,
3234
+ "learning_rate": 8.32e-09,
3235
+ "loss": 0.0001,
3236
+ "step": 13000
3237
+ },
3238
+ {
3239
+ "epoch": 763.82,
3240
+ "eval_loss": 0.5546875,
3241
+ "eval_runtime": 156.9741,
3242
+ "eval_samples_per_second": 1.733,
3243
+ "eval_steps_per_second": 0.108,
3244
+ "eval_wer": 9.9925705794948,
3245
+ "step": 13000
3246
+ },
3247
+ {
3248
+ "epoch": 765.47,
3249
+ "learning_rate": 2.965925925925926e-07,
3250
+ "loss": 0.0002,
3251
+ "step": 13025
3252
+ },
3253
+ {
3254
+ "epoch": 766.94,
3255
+ "learning_rate": 2.891851851851852e-07,
3256
+ "loss": 0.0001,
3257
+ "step": 13050
3258
+ },
3259
+ {
3260
+ "epoch": 768.41,
3261
+ "learning_rate": 2.817777777777778e-07,
3262
+ "loss": 0.0002,
3263
+ "step": 13075
3264
+ },
3265
+ {
3266
+ "epoch": 769.88,
3267
+ "learning_rate": 2.7437037037037035e-07,
3268
+ "loss": 0.0001,
3269
+ "step": 13100
3270
+ },
3271
+ {
3272
+ "epoch": 771.35,
3273
+ "learning_rate": 2.6696296296296296e-07,
3274
+ "loss": 0.0001,
3275
+ "step": 13125
3276
+ },
3277
+ {
3278
+ "epoch": 772.82,
3279
+ "learning_rate": 2.595555555555555e-07,
3280
+ "loss": 0.0002,
3281
+ "step": 13150
3282
+ },
3283
+ {
3284
+ "epoch": 774.29,
3285
+ "learning_rate": 2.521481481481481e-07,
3286
+ "loss": 0.0002,
3287
+ "step": 13175
3288
+ },
3289
+ {
3290
+ "epoch": 775.76,
3291
+ "learning_rate": 2.4474074074074073e-07,
3292
+ "loss": 0.0001,
3293
+ "step": 13200
3294
+ },
3295
+ {
3296
+ "epoch": 777.24,
3297
+ "learning_rate": 2.3733333333333334e-07,
3298
+ "loss": 0.0001,
3299
+ "step": 13225
3300
+ },
3301
+ {
3302
+ "epoch": 778.71,
3303
+ "learning_rate": 2.2992592592592592e-07,
3304
+ "loss": 0.0001,
3305
+ "step": 13250
3306
+ },
3307
+ {
3308
+ "epoch": 780.18,
3309
+ "learning_rate": 2.2251851851851853e-07,
3310
+ "loss": 0.0003,
3311
+ "step": 13275
3312
+ },
3313
+ {
3314
+ "epoch": 781.65,
3315
+ "learning_rate": 2.1511111111111111e-07,
3316
+ "loss": 0.0001,
3317
+ "step": 13300
3318
+ },
3319
+ {
3320
+ "epoch": 783.12,
3321
+ "learning_rate": 2.077037037037037e-07,
3322
+ "loss": 0.0001,
3323
+ "step": 13325
3324
+ },
3325
+ {
3326
+ "epoch": 784.59,
3327
+ "learning_rate": 2.002962962962963e-07,
3328
+ "loss": 0.0001,
3329
+ "step": 13350
3330
+ },
3331
+ {
3332
+ "epoch": 786.06,
3333
+ "learning_rate": 1.9288888888888889e-07,
3334
+ "loss": 0.0001,
3335
+ "step": 13375
3336
+ },
3337
+ {
3338
+ "epoch": 787.53,
3339
+ "learning_rate": 1.8548148148148147e-07,
3340
+ "loss": 0.0002,
3341
+ "step": 13400
3342
+ },
3343
+ {
3344
+ "epoch": 789.0,
3345
+ "learning_rate": 1.7807407407407408e-07,
3346
+ "loss": 0.0002,
3347
+ "step": 13425
3348
+ },
3349
+ {
3350
+ "epoch": 790.47,
3351
+ "learning_rate": 1.7066666666666666e-07,
3352
+ "loss": 0.0002,
3353
+ "step": 13450
3354
+ },
3355
+ {
3356
+ "epoch": 791.94,
3357
+ "learning_rate": 1.6385185185185184e-07,
3358
+ "loss": 0.0001,
3359
+ "step": 13475
3360
+ },
3361
+ {
3362
+ "epoch": 793.41,
3363
+ "learning_rate": 1.5644444444444442e-07,
3364
+ "loss": 0.0003,
3365
+ "step": 13500
3366
+ },
3367
+ {
3368
+ "epoch": 794.88,
3369
+ "learning_rate": 1.49037037037037e-07,
3370
+ "loss": 0.0001,
3371
+ "step": 13525
3372
+ },
3373
+ {
3374
+ "epoch": 796.35,
3375
+ "learning_rate": 1.4162962962962962e-07,
3376
+ "loss": 0.0001,
3377
+ "step": 13550
3378
+ },
3379
+ {
3380
+ "epoch": 797.82,
3381
+ "learning_rate": 1.342222222222222e-07,
3382
+ "loss": 0.0001,
3383
+ "step": 13575
3384
+ },
3385
+ {
3386
+ "epoch": 799.29,
3387
+ "learning_rate": 1.268148148148148e-07,
3388
+ "loss": 0.0001,
3389
+ "step": 13600
3390
+ },
3391
+ {
3392
+ "epoch": 800.76,
3393
+ "learning_rate": 1.194074074074074e-07,
3394
+ "loss": 0.0002,
3395
+ "step": 13625
3396
+ },
3397
+ {
3398
+ "epoch": 802.24,
3399
+ "learning_rate": 1.12e-07,
3400
+ "loss": 0.0001,
3401
+ "step": 13650
3402
+ },
3403
+ {
3404
+ "epoch": 803.71,
3405
+ "learning_rate": 1.0459259259259259e-07,
3406
+ "loss": 0.0002,
3407
+ "step": 13675
3408
+ },
3409
+ {
3410
+ "epoch": 805.18,
3411
+ "learning_rate": 9.718518518518517e-08,
3412
+ "loss": 0.0002,
3413
+ "step": 13700
3414
+ },
3415
+ {
3416
+ "epoch": 806.65,
3417
+ "learning_rate": 8.977777777777777e-08,
3418
+ "loss": 0.0002,
3419
+ "step": 13725
3420
+ },
3421
+ {
3422
+ "epoch": 808.12,
3423
+ "learning_rate": 8.237037037037037e-08,
3424
+ "loss": 0.0002,
3425
+ "step": 13750
3426
+ },
3427
+ {
3428
+ "epoch": 809.59,
3429
+ "learning_rate": 7.496296296296296e-08,
3430
+ "loss": 0.0002,
3431
+ "step": 13775
3432
+ },
3433
+ {
3434
+ "epoch": 811.06,
3435
+ "learning_rate": 6.755555555555554e-08,
3436
+ "loss": 0.0001,
3437
+ "step": 13800
3438
+ },
3439
+ {
3440
+ "epoch": 812.53,
3441
+ "learning_rate": 6.014814814814814e-08,
3442
+ "loss": 0.0001,
3443
+ "step": 13825
3444
+ },
3445
+ {
3446
+ "epoch": 814.0,
3447
+ "learning_rate": 5.274074074074074e-08,
3448
+ "loss": 0.0002,
3449
+ "step": 13850
3450
+ },
3451
+ {
3452
+ "epoch": 815.47,
3453
+ "learning_rate": 4.5333333333333336e-08,
3454
+ "loss": 0.0001,
3455
+ "step": 13875
3456
+ },
3457
+ {
3458
+ "epoch": 816.94,
3459
+ "learning_rate": 3.7925925925925924e-08,
3460
+ "loss": 0.0002,
3461
+ "step": 13900
3462
+ },
3463
+ {
3464
+ "epoch": 818.41,
3465
+ "learning_rate": 3.051851851851851e-08,
3466
+ "loss": 0.0001,
3467
+ "step": 13925
3468
+ },
3469
+ {
3470
+ "epoch": 819.88,
3471
+ "learning_rate": 2.311111111111111e-08,
3472
+ "loss": 0.0002,
3473
+ "step": 13950
3474
+ },
3475
+ {
3476
+ "epoch": 821.35,
3477
+ "learning_rate": 1.57037037037037e-08,
3478
+ "loss": 0.0001,
3479
+ "step": 13975
3480
+ },
3481
+ {
3482
+ "epoch": 822.82,
3483
+ "learning_rate": 8.296296296296296e-09,
3484
+ "loss": 0.0001,
3485
+ "step": 14000
3486
+ },
3487
+ {
3488
+ "epoch": 822.82,
3489
+ "eval_loss": 0.5576171875,
3490
+ "eval_runtime": 157.6735,
3491
+ "eval_samples_per_second": 1.725,
3492
+ "eval_steps_per_second": 0.108,
3493
+ "eval_wer": 9.899702823179792,
3494
+ "step": 14000
3495
+ },
3496
+ {
3497
+ "epoch": 824.47,
3498
+ "learning_rate": 0.00012324102564102563,
3499
+ "loss": 7.1148,
3500
+ "step": 14025
3501
+ },
3502
+ {
3503
+ "epoch": 825.94,
3504
+ "learning_rate": 0.00012272820512820512,
3505
+ "loss": 5.3802,
3506
+ "step": 14050
3507
+ },
3508
+ {
3509
+ "epoch": 827.41,
3510
+ "learning_rate": 0.00012221538461538463,
3511
+ "loss": 4.0038,
3512
+ "step": 14075
3513
+ },
3514
+ {
3515
+ "epoch": 828.88,
3516
+ "learning_rate": 0.0001217025641025641,
3517
+ "loss": 3.0771,
3518
+ "step": 14100
3519
+ },
3520
+ {
3521
+ "epoch": 830.35,
3522
+ "learning_rate": 0.00012118974358974359,
3523
+ "loss": 2.4888,
3524
+ "step": 14125
3525
+ },
3526
+ {
3527
+ "epoch": 831.82,
3528
+ "learning_rate": 0.0001206769230769231,
3529
+ "loss": 2.0454,
3530
+ "step": 14150
3531
+ },
3532
+ {
3533
+ "epoch": 833.29,
3534
+ "learning_rate": 0.00012016410256410258,
3535
+ "loss": 1.6123,
3536
+ "step": 14175
3537
+ },
3538
+ {
3539
+ "epoch": 834.76,
3540
+ "learning_rate": 0.00011965128205128207,
3541
+ "loss": 1.1082,
3542
+ "step": 14200
3543
+ },
3544
+ {
3545
+ "epoch": 836.24,
3546
+ "learning_rate": 0.00011913846153846155,
3547
+ "loss": 0.6733,
3548
+ "step": 14225
3549
+ },
3550
+ {
3551
+ "epoch": 837.71,
3552
+ "learning_rate": 0.00011862564102564103,
3553
+ "loss": 0.4108,
3554
+ "step": 14250
3555
+ },
3556
+ {
3557
+ "epoch": 839.18,
3558
+ "learning_rate": 0.00011811282051282051,
3559
+ "loss": 0.2879,
3560
+ "step": 14275
3561
+ },
3562
+ {
3563
+ "epoch": 840.65,
3564
+ "learning_rate": 0.0001176,
3565
+ "loss": 0.2274,
3566
+ "step": 14300
3567
+ },
3568
+ {
3569
+ "epoch": 842.12,
3570
+ "learning_rate": 0.00011708717948717949,
3571
+ "loss": 0.1869,
3572
+ "step": 14325
3573
+ },
3574
+ {
3575
+ "epoch": 843.59,
3576
+ "learning_rate": 0.00011657435897435897,
3577
+ "loss": 0.1548,
3578
+ "step": 14350
3579
+ },
3580
+ {
3581
+ "epoch": 845.06,
3582
+ "learning_rate": 0.00011606153846153847,
3583
+ "loss": 2.892,
3584
+ "step": 14375
3585
+ },
3586
+ {
3587
+ "epoch": 846.53,
3588
+ "learning_rate": 0.00011556923076923078,
3589
+ "loss": 4.4433,
3590
+ "step": 14400
3591
+ },
3592
+ {
3593
+ "epoch": 848.0,
3594
+ "learning_rate": 0.00011505641025641026,
3595
+ "loss": 0.9719,
3596
+ "step": 14425
3597
+ },
3598
+ {
3599
+ "epoch": 849.47,
3600
+ "learning_rate": 0.00011454358974358974,
3601
+ "loss": 0.0969,
3602
+ "step": 14450
3603
+ },
3604
+ {
3605
+ "epoch": 850.94,
3606
+ "learning_rate": 0.00011403076923076923,
3607
+ "loss": 0.0932,
3608
+ "step": 14475
3609
+ },
3610
+ {
3611
+ "epoch": 852.41,
3612
+ "learning_rate": 0.00011351794871794871,
3613
+ "loss": 0.0829,
3614
+ "step": 14500
3615
+ },
3616
+ {
3617
+ "epoch": 853.88,
3618
+ "learning_rate": 0.0001130051282051282,
3619
+ "loss": 0.0785,
3620
+ "step": 14525
3621
+ },
3622
+ {
3623
+ "epoch": 855.35,
3624
+ "learning_rate": 0.0001124923076923077,
3625
+ "loss": 0.0679,
3626
+ "step": 14550
3627
+ },
3628
+ {
3629
+ "epoch": 856.82,
3630
+ "learning_rate": 0.00011197948717948719,
3631
+ "loss": 0.0656,
3632
+ "step": 14575
3633
+ },
3634
+ {
3635
+ "epoch": 858.29,
3636
+ "learning_rate": 0.00011146666666666667,
3637
+ "loss": 0.064,
3638
+ "step": 14600
3639
+ },
3640
+ {
3641
+ "epoch": 859.76,
3642
+ "learning_rate": 0.00011095384615384616,
3643
+ "loss": 0.0614,
3644
+ "step": 14625
3645
+ },
3646
+ {
3647
+ "epoch": 861.24,
3648
+ "learning_rate": 0.00011044102564102565,
3649
+ "loss": 0.0612,
3650
+ "step": 14650
3651
+ },
3652
+ {
3653
+ "epoch": 862.71,
3654
+ "learning_rate": 0.00010992820512820515,
3655
+ "loss": 0.0609,
3656
+ "step": 14675
3657
+ },
3658
+ {
3659
+ "epoch": 864.18,
3660
+ "learning_rate": 0.00010941538461538463,
3661
+ "loss": 0.0586,
3662
+ "step": 14700
3663
+ },
3664
+ {
3665
+ "epoch": 865.65,
3666
+ "learning_rate": 0.0001089025641025641,
3667
+ "loss": 0.0581,
3668
+ "step": 14725
3669
+ },
3670
+ {
3671
+ "epoch": 867.12,
3672
+ "learning_rate": 0.00010838974358974358,
3673
+ "loss": 0.0569,
3674
+ "step": 14750
3675
+ },
3676
+ {
3677
+ "epoch": 868.59,
3678
+ "learning_rate": 0.00010787692307692308,
3679
+ "loss": 0.0573,
3680
+ "step": 14775
3681
+ },
3682
+ {
3683
+ "epoch": 870.06,
3684
+ "learning_rate": 0.00010736410256410257,
3685
+ "loss": 0.0555,
3686
+ "step": 14800
3687
+ },
3688
+ {
3689
+ "epoch": 871.53,
3690
+ "learning_rate": 0.00010685128205128205,
3691
+ "loss": 0.0546,
3692
+ "step": 14825
3693
+ },
3694
+ {
3695
+ "epoch": 873.0,
3696
+ "learning_rate": 0.00010633846153846154,
3697
+ "loss": 0.0548,
3698
+ "step": 14850
3699
+ },
3700
+ {
3701
+ "epoch": 874.47,
3702
+ "learning_rate": 0.00010582564102564103,
3703
+ "loss": 0.0541,
3704
+ "step": 14875
3705
+ },
3706
+ {
3707
+ "epoch": 875.94,
3708
+ "learning_rate": 0.00010531282051282053,
3709
+ "loss": 0.0526,
3710
+ "step": 14900
3711
+ },
3712
+ {
3713
+ "epoch": 877.41,
3714
+ "learning_rate": 0.00010480000000000001,
3715
+ "loss": 0.0521,
3716
+ "step": 14925
3717
+ },
3718
+ {
3719
+ "epoch": 878.88,
3720
+ "learning_rate": 0.0001042871794871795,
3721
+ "loss": 0.0539,
3722
+ "step": 14950
3723
+ },
3724
+ {
3725
+ "epoch": 880.35,
3726
+ "learning_rate": 0.00010377435897435899,
3727
+ "loss": 0.0535,
3728
+ "step": 14975
3729
+ },
3730
+ {
3731
+ "epoch": 881.82,
3732
+ "learning_rate": 0.00010326153846153847,
3733
+ "loss": 0.0538,
3734
+ "step": 15000
3735
+ },
3736
+ {
3737
+ "epoch": 881.82,
3738
+ "eval_loss": 5.33984375,
3739
+ "eval_runtime": 102.1523,
3740
+ "eval_samples_per_second": 2.663,
3741
+ "eval_steps_per_second": 0.166,
3742
+ "eval_wer": 99.87927191679049,
3743
+ "step": 15000
3744
+ },
3745
+ {
3746
+ "epoch": 883.29,
3747
+ "learning_rate": 0.00010274871794871795,
3748
+ "loss": 0.0535,
3749
+ "step": 15025
3750
+ },
3751
+ {
3752
+ "epoch": 884.76,
3753
+ "learning_rate": 0.00010223589743589743,
3754
+ "loss": 0.0516,
3755
+ "step": 15050
3756
+ },
3757
+ {
3758
+ "epoch": 886.24,
3759
+ "learning_rate": 0.00010172307692307692,
3760
+ "loss": 0.0503,
3761
+ "step": 15075
3762
+ },
3763
+ {
3764
+ "epoch": 887.71,
3765
+ "learning_rate": 0.0001012102564102564,
3766
+ "loss": 0.05,
3767
+ "step": 15100
3768
+ },
3769
+ {
3770
+ "epoch": 889.18,
3771
+ "learning_rate": 0.0001006974358974359,
3772
+ "loss": 0.0512,
3773
+ "step": 15125
3774
+ },
3775
+ {
3776
+ "epoch": 890.65,
3777
+ "learning_rate": 0.00010018461538461539,
3778
+ "loss": 0.0503,
3779
+ "step": 15150
3780
+ },
3781
+ {
3782
+ "epoch": 892.12,
3783
+ "learning_rate": 9.967179487179488e-05,
3784
+ "loss": 0.0516,
3785
+ "step": 15175
3786
+ },
3787
+ {
3788
+ "epoch": 893.59,
3789
+ "learning_rate": 9.915897435897436e-05,
3790
+ "loss": 0.0518,
3791
+ "step": 15200
3792
+ },
3793
+ {
3794
+ "epoch": 895.06,
3795
+ "learning_rate": 9.864615384615385e-05,
3796
+ "loss": 0.0521,
3797
+ "step": 15225
3798
+ },
3799
+ {
3800
+ "epoch": 896.53,
3801
+ "learning_rate": 9.813333333333334e-05,
3802
+ "loss": 0.0508,
3803
+ "step": 15250
3804
+ },
3805
+ {
3806
+ "epoch": 898.0,
3807
+ "learning_rate": 9.762051282051282e-05,
3808
+ "loss": 0.0507,
3809
+ "step": 15275
3810
+ },
3811
+ {
3812
+ "epoch": 899.47,
3813
+ "learning_rate": 9.710769230769231e-05,
3814
+ "loss": 0.0506,
3815
+ "step": 15300
3816
+ },
3817
+ {
3818
+ "epoch": 900.94,
3819
+ "learning_rate": 9.65948717948718e-05,
3820
+ "loss": 0.0496,
3821
+ "step": 15325
3822
+ },
3823
+ {
3824
+ "epoch": 902.41,
3825
+ "learning_rate": 9.608205128205128e-05,
3826
+ "loss": 0.052,
3827
+ "step": 15350
3828
+ },
3829
+ {
3830
+ "epoch": 903.88,
3831
+ "learning_rate": 9.556923076923078e-05,
3832
+ "loss": 0.05,
3833
+ "step": 15375
3834
+ },
3835
+ {
3836
+ "epoch": 905.35,
3837
+ "learning_rate": 9.505641025641026e-05,
3838
+ "loss": 0.0498,
3839
+ "step": 15400
3840
+ },
3841
+ {
3842
+ "epoch": 906.82,
3843
+ "learning_rate": 9.454358974358974e-05,
3844
+ "loss": 0.0501,
3845
+ "step": 15425
3846
+ },
3847
+ {
3848
+ "epoch": 908.29,
3849
+ "learning_rate": 9.403076923076923e-05,
3850
+ "loss": 0.0512,
3851
+ "step": 15450
3852
+ },
3853
+ {
3854
+ "epoch": 909.76,
3855
+ "learning_rate": 9.351794871794872e-05,
3856
+ "loss": 0.0499,
3857
+ "step": 15475
3858
+ },
3859
+ {
3860
+ "epoch": 911.24,
3861
+ "learning_rate": 9.300512820512822e-05,
3862
+ "loss": 0.05,
3863
+ "step": 15500
3864
+ },
3865
+ {
3866
+ "epoch": 912.71,
3867
+ "learning_rate": 9.24923076923077e-05,
3868
+ "loss": 0.0516,
3869
+ "step": 15525
3870
+ },
3871
+ {
3872
+ "epoch": 914.18,
3873
+ "learning_rate": 9.197948717948719e-05,
3874
+ "loss": 0.0517,
3875
+ "step": 15550
3876
+ },
3877
+ {
3878
+ "epoch": 915.65,
3879
+ "learning_rate": 9.146666666666666e-05,
3880
+ "loss": 0.0499,
3881
+ "step": 15575
3882
+ },
3883
+ {
3884
+ "epoch": 917.12,
3885
+ "learning_rate": 9.095384615384616e-05,
3886
+ "loss": 0.0531,
3887
+ "step": 15600
3888
+ },
3889
+ {
3890
+ "epoch": 918.59,
3891
+ "learning_rate": 9.044102564102565e-05,
3892
+ "loss": 0.0502,
3893
+ "step": 15625
3894
+ },
3895
+ {
3896
+ "epoch": 920.06,
3897
+ "learning_rate": 8.992820512820514e-05,
3898
+ "loss": 0.0495,
3899
+ "step": 15650
3900
+ },
3901
+ {
3902
+ "epoch": 921.53,
3903
+ "learning_rate": 8.941538461538462e-05,
3904
+ "loss": 0.0499,
3905
+ "step": 15675
3906
+ },
3907
+ {
3908
+ "epoch": 923.0,
3909
+ "learning_rate": 8.890256410256411e-05,
3910
+ "loss": 0.0515,
3911
+ "step": 15700
3912
+ },
3913
+ {
3914
+ "epoch": 924.47,
3915
+ "learning_rate": 8.83897435897436e-05,
3916
+ "loss": 0.0491,
3917
+ "step": 15725
3918
+ },
3919
+ {
3920
+ "epoch": 925.94,
3921
+ "learning_rate": 8.787692307692308e-05,
3922
+ "loss": 0.0491,
3923
+ "step": 15750
3924
+ },
3925
+ {
3926
+ "epoch": 927.41,
3927
+ "learning_rate": 8.736410256410257e-05,
3928
+ "loss": 0.0482,
3929
+ "step": 15775
3930
+ },
3931
+ {
3932
+ "epoch": 928.88,
3933
+ "learning_rate": 8.685128205128206e-05,
3934
+ "loss": 0.0487,
3935
+ "step": 15800
3936
+ },
3937
+ {
3938
+ "epoch": 930.35,
3939
+ "learning_rate": 8.633846153846154e-05,
3940
+ "loss": 0.0494,
3941
+ "step": 15825
3942
+ },
3943
+ {
3944
+ "epoch": 931.82,
3945
+ "learning_rate": 8.582564102564103e-05,
3946
+ "loss": 0.0491,
3947
+ "step": 15850
3948
+ },
3949
+ {
3950
+ "epoch": 933.29,
3951
+ "learning_rate": 8.531282051282051e-05,
3952
+ "loss": 0.0483,
3953
+ "step": 15875
3954
+ },
3955
+ {
3956
+ "epoch": 934.76,
3957
+ "learning_rate": 8.48e-05,
3958
+ "loss": 0.048,
3959
+ "step": 15900
3960
+ },
3961
+ {
3962
+ "epoch": 936.24,
3963
+ "learning_rate": 8.428717948717949e-05,
3964
+ "loss": 0.0488,
3965
+ "step": 15925
3966
+ },
3967
+ {
3968
+ "epoch": 937.71,
3969
+ "learning_rate": 8.377435897435897e-05,
3970
+ "loss": 0.0494,
3971
+ "step": 15950
3972
+ },
3973
+ {
3974
+ "epoch": 939.18,
3975
+ "learning_rate": 8.326153846153847e-05,
3976
+ "loss": 0.0491,
3977
+ "step": 15975
3978
+ },
3979
+ {
3980
+ "epoch": 940.65,
3981
+ "learning_rate": 8.274871794871796e-05,
3982
+ "loss": 0.0482,
3983
+ "step": 16000
3984
+ },
3985
+ {
3986
+ "epoch": 940.65,
3987
+ "eval_loss": 5.62109375,
3988
+ "eval_runtime": 164.5773,
3989
+ "eval_samples_per_second": 1.653,
3990
+ "eval_steps_per_second": 0.103,
3991
+ "eval_wer": 136.06983655274888,
3992
+ "step": 16000
3993
+ },
3994
+ {
3995
+ "epoch": 942.12,
3996
+ "learning_rate": 8.223589743589743e-05,
3997
+ "loss": 0.0492,
3998
+ "step": 16025
3999
+ },
4000
+ {
4001
+ "epoch": 943.59,
4002
+ "learning_rate": 8.172307692307692e-05,
4003
+ "loss": 0.0485,
4004
+ "step": 16050
4005
+ },
4006
+ {
4007
+ "epoch": 945.06,
4008
+ "learning_rate": 8.121025641025641e-05,
4009
+ "loss": 0.0489,
4010
+ "step": 16075
4011
+ },
4012
+ {
4013
+ "epoch": 946.53,
4014
+ "learning_rate": 8.069743589743591e-05,
4015
+ "loss": 0.0494,
4016
+ "step": 16100
4017
+ },
4018
+ {
4019
+ "epoch": 948.0,
4020
+ "learning_rate": 8.01846153846154e-05,
4021
+ "loss": 0.0487,
4022
+ "step": 16125
4023
+ },
4024
+ {
4025
+ "epoch": 949.47,
4026
+ "learning_rate": 7.967179487179488e-05,
4027
+ "loss": 0.0473,
4028
+ "step": 16150
4029
+ },
4030
+ {
4031
+ "epoch": 950.94,
4032
+ "learning_rate": 7.915897435897435e-05,
4033
+ "loss": 0.0489,
4034
+ "step": 16175
4035
+ },
4036
+ {
4037
+ "epoch": 952.41,
4038
+ "learning_rate": 7.864615384615385e-05,
4039
+ "loss": 0.048,
4040
+ "step": 16200
4041
+ },
4042
+ {
4043
+ "epoch": 953.88,
4044
+ "learning_rate": 7.813333333333334e-05,
4045
+ "loss": 0.0479,
4046
+ "step": 16225
4047
+ },
4048
+ {
4049
+ "epoch": 955.35,
4050
+ "learning_rate": 7.762051282051283e-05,
4051
+ "loss": 0.0549,
4052
+ "step": 16250
4053
+ },
4054
+ {
4055
+ "epoch": 956.82,
4056
+ "learning_rate": 7.710769230769231e-05,
4057
+ "loss": 0.0479,
4058
+ "step": 16275
4059
+ },
4060
+ {
4061
+ "epoch": 958.29,
4062
+ "learning_rate": 7.65948717948718e-05,
4063
+ "loss": 0.0468,
4064
+ "step": 16300
4065
+ },
4066
+ {
4067
+ "epoch": 959.76,
4068
+ "learning_rate": 7.608205128205129e-05,
4069
+ "loss": 0.0477,
4070
+ "step": 16325
4071
+ },
4072
+ {
4073
+ "epoch": 961.24,
4074
+ "learning_rate": 7.556923076923077e-05,
4075
+ "loss": 0.0482,
4076
+ "step": 16350
4077
+ },
4078
+ {
4079
+ "epoch": 962.71,
4080
+ "learning_rate": 7.505641025641026e-05,
4081
+ "loss": 0.0493,
4082
+ "step": 16375
4083
+ },
4084
+ {
4085
+ "epoch": 964.18,
4086
+ "learning_rate": 7.454358974358975e-05,
4087
+ "loss": 0.0499,
4088
+ "step": 16400
4089
+ },
4090
+ {
4091
+ "epoch": 965.65,
4092
+ "learning_rate": 7.403076923076923e-05,
4093
+ "loss": 0.0516,
4094
+ "step": 16425
4095
+ },
4096
+ {
4097
+ "epoch": 967.12,
4098
+ "learning_rate": 7.351794871794873e-05,
4099
+ "loss": 0.052,
4100
+ "step": 16450
4101
+ },
4102
+ {
4103
+ "epoch": 968.59,
4104
+ "learning_rate": 7.30051282051282e-05,
4105
+ "loss": 0.0495,
4106
+ "step": 16475
4107
+ },
4108
+ {
4109
+ "epoch": 970.06,
4110
+ "learning_rate": 7.249230769230769e-05,
4111
+ "loss": 0.0495,
4112
+ "step": 16500
4113
+ },
4114
+ {
4115
+ "epoch": 971.53,
4116
+ "learning_rate": 7.197948717948718e-05,
4117
+ "loss": 0.0482,
4118
+ "step": 16525
4119
+ },
4120
+ {
4121
+ "epoch": 973.0,
4122
+ "learning_rate": 7.146666666666666e-05,
4123
+ "loss": 0.0511,
4124
+ "step": 16550
4125
+ },
4126
+ {
4127
+ "epoch": 974.47,
4128
+ "learning_rate": 7.095384615384616e-05,
4129
+ "loss": 0.0487,
4130
+ "step": 16575
4131
+ },
4132
+ {
4133
+ "epoch": 975.94,
4134
+ "learning_rate": 7.044102564102565e-05,
4135
+ "loss": 0.049,
4136
+ "step": 16600
4137
+ },
4138
+ {
4139
+ "epoch": 977.41,
4140
+ "learning_rate": 6.992820512820512e-05,
4141
+ "loss": 0.048,
4142
+ "step": 16625
4143
+ },
4144
+ {
4145
+ "epoch": 978.88,
4146
+ "learning_rate": 6.941538461538461e-05,
4147
+ "loss": 0.0485,
4148
+ "step": 16650
4149
+ },
4150
+ {
4151
+ "epoch": 980.35,
4152
+ "learning_rate": 6.890256410256411e-05,
4153
+ "loss": 0.0525,
4154
+ "step": 16675
4155
+ },
4156
+ {
4157
+ "epoch": 981.82,
4158
+ "learning_rate": 6.83897435897436e-05,
4159
+ "loss": 0.0478,
4160
+ "step": 16700
4161
+ },
4162
+ {
4163
+ "epoch": 983.29,
4164
+ "learning_rate": 6.787692307692308e-05,
4165
+ "loss": 0.0481,
4166
+ "step": 16725
4167
+ },
4168
+ {
4169
+ "epoch": 984.76,
4170
+ "learning_rate": 6.736410256410257e-05,
4171
+ "loss": 0.0494,
4172
+ "step": 16750
4173
+ },
4174
+ {
4175
+ "epoch": 986.24,
4176
+ "learning_rate": 6.685128205128204e-05,
4177
+ "loss": 0.0468,
4178
+ "step": 16775
4179
+ },
4180
+ {
4181
+ "epoch": 987.71,
4182
+ "learning_rate": 6.633846153846154e-05,
4183
+ "loss": 0.0631,
4184
+ "step": 16800
4185
+ },
4186
+ {
4187
+ "epoch": 989.18,
4188
+ "learning_rate": 6.582564102564103e-05,
4189
+ "loss": 0.0468,
4190
+ "step": 16825
4191
+ },
4192
+ {
4193
+ "epoch": 990.65,
4194
+ "learning_rate": 6.531282051282052e-05,
4195
+ "loss": 0.0464,
4196
+ "step": 16850
4197
+ },
4198
+ {
4199
+ "epoch": 992.12,
4200
+ "learning_rate": 6.48e-05,
4201
+ "loss": 0.0625,
4202
+ "step": 16875
4203
+ },
4204
+ {
4205
+ "epoch": 993.59,
4206
+ "learning_rate": 6.428717948717949e-05,
4207
+ "loss": 0.0497,
4208
+ "step": 16900
4209
+ },
4210
+ {
4211
+ "epoch": 995.06,
4212
+ "learning_rate": 6.377435897435898e-05,
4213
+ "loss": 0.0481,
4214
+ "step": 16925
4215
+ },
4216
+ {
4217
+ "epoch": 996.53,
4218
+ "learning_rate": 6.326153846153846e-05,
4219
+ "loss": 0.0484,
4220
+ "step": 16950
4221
+ },
4222
+ {
4223
+ "epoch": 998.0,
4224
+ "learning_rate": 6.274871794871795e-05,
4225
+ "loss": 0.0506,
4226
+ "step": 16975
4227
+ },
4228
+ {
4229
+ "epoch": 999.47,
4230
+ "learning_rate": 6.223589743589744e-05,
4231
+ "loss": 0.0471,
4232
+ "step": 17000
4233
+ },
4234
+ {
4235
+ "epoch": 999.47,
4236
+ "eval_loss": 5.6484375,
4237
+ "eval_runtime": 155.9288,
4238
+ "eval_samples_per_second": 1.744,
4239
+ "eval_steps_per_second": 0.109,
4240
+ "eval_wer": 121.2481426448737,
4241
+ "step": 17000
4242
+ },
4243
+ {
4244
+ "epoch": 1000.94,
4245
+ "learning_rate": 6.172307692307692e-05,
4246
+ "loss": 0.0499,
4247
+ "step": 17025
4248
+ },
4249
+ {
4250
+ "epoch": 1002.41,
4251
+ "learning_rate": 6.121025641025642e-05,
4252
+ "loss": 0.0476,
4253
+ "step": 17050
4254
+ },
4255
+ {
4256
+ "epoch": 1003.88,
4257
+ "learning_rate": 6.069743589743591e-05,
4258
+ "loss": 0.0482,
4259
+ "step": 17075
4260
+ },
4261
+ {
4262
+ "epoch": 1005.35,
4263
+ "learning_rate": 6.018461538461538e-05,
4264
+ "loss": 0.0471,
4265
+ "step": 17100
4266
+ },
4267
+ {
4268
+ "epoch": 1006.82,
4269
+ "learning_rate": 5.9671794871794875e-05,
4270
+ "loss": 0.0461,
4271
+ "step": 17125
4272
+ },
4273
+ {
4274
+ "epoch": 1008.29,
4275
+ "learning_rate": 5.915897435897436e-05,
4276
+ "loss": 0.046,
4277
+ "step": 17150
4278
+ },
4279
+ {
4280
+ "epoch": 1009.76,
4281
+ "learning_rate": 5.864615384615385e-05,
4282
+ "loss": 0.0466,
4283
+ "step": 17175
4284
+ },
4285
+ {
4286
+ "epoch": 1011.24,
4287
+ "learning_rate": 5.813333333333334e-05,
4288
+ "loss": 0.0462,
4289
+ "step": 17200
4290
+ },
4291
+ {
4292
+ "epoch": 1012.71,
4293
+ "learning_rate": 5.762051282051283e-05,
4294
+ "loss": 0.0468,
4295
+ "step": 17225
4296
+ },
4297
+ {
4298
+ "epoch": 1014.18,
4299
+ "learning_rate": 5.710769230769231e-05,
4300
+ "loss": 0.0463,
4301
+ "step": 17250
4302
+ },
4303
+ {
4304
+ "epoch": 1015.65,
4305
+ "learning_rate": 5.6594871794871794e-05,
4306
+ "loss": 0.0463,
4307
+ "step": 17275
4308
+ },
4309
+ {
4310
+ "epoch": 1017.12,
4311
+ "learning_rate": 5.608205128205129e-05,
4312
+ "loss": 0.0451,
4313
+ "step": 17300
4314
+ },
4315
+ {
4316
+ "epoch": 1018.59,
4317
+ "learning_rate": 5.5569230769230774e-05,
4318
+ "loss": 0.0465,
4319
+ "step": 17325
4320
+ },
4321
+ {
4322
+ "epoch": 1020.06,
4323
+ "learning_rate": 5.505641025641026e-05,
4324
+ "loss": 0.0473,
4325
+ "step": 17350
4326
+ },
4327
+ {
4328
+ "epoch": 1021.53,
4329
+ "learning_rate": 5.4543589743589754e-05,
4330
+ "loss": 0.0456,
4331
+ "step": 17375
4332
+ },
4333
+ {
4334
+ "epoch": 1023.0,
4335
+ "learning_rate": 5.403076923076923e-05,
4336
+ "loss": 0.0466,
4337
+ "step": 17400
4338
+ },
4339
+ {
4340
+ "epoch": 1024.47,
4341
+ "learning_rate": 5.351794871794872e-05,
4342
+ "loss": 0.046,
4343
+ "step": 17425
4344
+ },
4345
+ {
4346
+ "epoch": 1025.94,
4347
+ "learning_rate": 5.300512820512821e-05,
4348
+ "loss": 0.0475,
4349
+ "step": 17450
4350
+ },
4351
+ {
4352
+ "epoch": 1027.41,
4353
+ "learning_rate": 5.249230769230769e-05,
4354
+ "loss": 0.0461,
4355
+ "step": 17475
4356
+ },
4357
+ {
4358
+ "epoch": 1028.88,
4359
+ "learning_rate": 5.1979487179487187e-05,
4360
+ "loss": 0.0467,
4361
+ "step": 17500
4362
+ },
4363
+ {
4364
+ "epoch": 1030.35,
4365
+ "learning_rate": 5.146666666666667e-05,
4366
+ "loss": 0.0457,
4367
+ "step": 17525
4368
+ },
4369
+ {
4370
+ "epoch": 1031.82,
4371
+ "learning_rate": 5.095384615384615e-05,
4372
+ "loss": 0.0454,
4373
+ "step": 17550
4374
+ },
4375
+ {
4376
+ "epoch": 1033.29,
4377
+ "learning_rate": 5.044102564102564e-05,
4378
+ "loss": 0.0458,
4379
+ "step": 17575
4380
+ },
4381
+ {
4382
+ "epoch": 1034.76,
4383
+ "learning_rate": 4.992820512820513e-05,
4384
+ "loss": 0.0445,
4385
+ "step": 17600
4386
+ },
4387
+ {
4388
+ "epoch": 1036.24,
4389
+ "learning_rate": 4.941538461538462e-05,
4390
+ "loss": 0.0456,
4391
+ "step": 17625
4392
+ },
4393
+ {
4394
+ "epoch": 1037.71,
4395
+ "learning_rate": 4.8902564102564106e-05,
4396
+ "loss": 0.0442,
4397
+ "step": 17650
4398
+ },
4399
+ {
4400
+ "epoch": 1039.18,
4401
+ "learning_rate": 4.838974358974359e-05,
4402
+ "loss": 0.0438,
4403
+ "step": 17675
4404
+ },
4405
+ {
4406
+ "epoch": 1040.65,
4407
+ "learning_rate": 4.787692307692308e-05,
4408
+ "loss": 0.0443,
4409
+ "step": 17700
4410
+ },
4411
+ {
4412
+ "epoch": 1042.12,
4413
+ "learning_rate": 4.7364102564102565e-05,
4414
+ "loss": 0.0439,
4415
+ "step": 17725
4416
+ },
4417
+ {
4418
+ "epoch": 1043.59,
4419
+ "learning_rate": 4.685128205128205e-05,
4420
+ "loss": 0.0428,
4421
+ "step": 17750
4422
+ },
4423
+ {
4424
+ "epoch": 1045.06,
4425
+ "learning_rate": 4.633846153846154e-05,
4426
+ "loss": 0.0427,
4427
+ "step": 17775
4428
+ },
4429
+ {
4430
+ "epoch": 1046.53,
4431
+ "learning_rate": 4.5825641025641025e-05,
4432
+ "loss": 0.0418,
4433
+ "step": 17800
4434
+ },
4435
+ {
4436
+ "epoch": 1048.0,
4437
+ "learning_rate": 4.531282051282051e-05,
4438
+ "loss": 0.0416,
4439
+ "step": 17825
4440
+ },
4441
+ {
4442
+ "epoch": 1049.47,
4443
+ "learning_rate": 4.4800000000000005e-05,
4444
+ "loss": 0.0417,
4445
+ "step": 17850
4446
+ },
4447
+ {
4448
+ "epoch": 1050.94,
4449
+ "learning_rate": 4.428717948717949e-05,
4450
+ "loss": 0.0418,
4451
+ "step": 17875
4452
+ },
4453
+ {
4454
+ "epoch": 1052.41,
4455
+ "learning_rate": 4.377435897435898e-05,
4456
+ "loss": 0.0408,
4457
+ "step": 17900
4458
+ },
4459
+ {
4460
+ "epoch": 1053.88,
4461
+ "learning_rate": 4.3261538461538464e-05,
4462
+ "loss": 0.0412,
4463
+ "step": 17925
4464
+ },
4465
+ {
4466
+ "epoch": 1055.35,
4467
+ "learning_rate": 4.274871794871795e-05,
4468
+ "loss": 0.0407,
4469
+ "step": 17950
4470
+ },
4471
+ {
4472
+ "epoch": 1056.82,
4473
+ "learning_rate": 4.223589743589744e-05,
4474
+ "loss": 0.0408,
4475
+ "step": 17975
4476
+ },
4477
+ {
4478
+ "epoch": 1058.29,
4479
+ "learning_rate": 4.1723076923076924e-05,
4480
+ "loss": 0.0405,
4481
+ "step": 18000
4482
+ },
4483
+ {
4484
+ "epoch": 1058.29,
4485
+ "eval_loss": 5.7265625,
4486
+ "eval_runtime": 164.1653,
4487
+ "eval_samples_per_second": 1.657,
4488
+ "eval_steps_per_second": 0.104,
4489
+ "eval_wer": 119.38150074294205,
4490
+ "step": 18000
4491
+ }
4492
+ ],
4493
+ "max_steps": 20000,
4494
+ "num_train_epochs": 1177,
4495
+ "total_flos": 5.559796194543938e+20,
4496
+ "trial_name": null,
4497
+ "trial_params": null
4498
+ }
checkpoint-18000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6e3ac4aeab20cf895e188b7a0ae60077219ad0067d587dfa1da35e123e14fa0
3
+ size 4795
checkpoint-18000/zero_to_fp32.py ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # This script extracts fp32 consolidated weights from ZeRO stage 2 and 3 DeepSpeed checkpoints. It gets
4
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
5
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
6
+ # application.
7
+ #
8
+ # example: python zero_to_fp32.py . pytorch_model.bin
9
+
10
+ import argparse
11
+ import torch
12
+ import glob
13
+ import math
14
+ import os
15
+ import re
16
+ from collections import OrderedDict
17
+
18
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
19
+ # DeepSpeed data structures it has to be available in the current python environment.
20
+ from deepspeed.utils import logger
21
+ from deepspeed.checkpoint.constants import (DS_VERSION,
22
+ OPTIMIZER_STATE_DICT,
23
+ SINGLE_PARTITION_OF_FP32_GROUPS,
24
+ FP32_FLAT_GROUPS,
25
+ ZERO_STAGE,
26
+ PARTITION_COUNT,
27
+ PARAM_SHAPES,
28
+ BUFFER_NAMES)
29
+
30
+ debug = 0
31
+
32
+ # load to cpu
33
+ device = torch.device('cpu')
34
+
35
+
36
def atoi(text):
    """
    Convert a text fragment to ``int`` when it is made only of decimal digits.

    Args:
        - ``text``: a string fragment, e.g. one piece of a file name split on its digit runs

    Returns:
        - ``int(text)`` if every character is a decimal digit, otherwise ``text`` unchanged
          (an empty string is returned as is)
    """
    # ``str.isdigit()`` is also true for characters such as superscripts ("²"), which ``int()``
    # rejects with ``ValueError``. ``str.isdecimal()`` only accepts what ``int()`` can parse.
    return int(text) if text.isdecimal() else text
38
+
39
+
40
def natural_keys(text):
    '''
    Sort key producing human ordering, i.e. ``alist.sort(key=natural_keys)`` puts "rank_2"
    before "rank_10".
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    # the capturing group keeps the digit runs in the split result, so ``atoi`` can turn them
    # into integers that compare numerically
    chunks = re.split(r'(\d+)', text)
    return list(map(atoi, chunks))
47
+
48
+
49
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Locate the model states file inside a DeepSpeed checkpoint folder.

    Args:
        - ``checkpoint_dir``: path to the deepspeed checkpoint folder
        - ``zero_stage``: ZeRO stage of the checkpoint (2 or 3), it determines the file name

    Returns:
        - path to the model states file

    Raises:
        - ``FileNotFoundError``: if the folder or the expected file doesn't exist
        - ``ValueError``: if ``zero_stage`` is neither 2 nor 3
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage == 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch ``file`` would be unbound below and the caller would get an
        # UnboundLocalError instead of a meaningful message
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
63
+
64
+
65
def get_optim_files(checkpoint_dir):
    """
    Return the ``*_optim_states.pt`` files found in ``checkpoint_dir``, sorted with ``natural_keys``.

    Args:
        - ``checkpoint_dir``: path to the deepspeed checkpoint folder

    Raises:
        - ``FileNotFoundError``: if no such file is found
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    pattern = os.path.join(checkpoint_dir, "*_optim_states.pt")
    optim_files = glob.glob(pattern)
    optim_files.sort(key=natural_keys)

    if not optim_files:
        raise FileNotFoundError(
            f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")

    return optim_files
76
+
77
+
78
def parse_model_state(file):
    """
    Load a model states file and extract what is needed to rebuild the fp32 weights.

    Args:
        - ``file``: path to the model states file

    Returns:
        - tuple ``(buffers, param_shapes, ds_version)``: the entries of the ``"module"`` dict that are
          listed under ``BUFFER_NAMES`` converted with ``.float()``, the value stored under
          ``PARAM_SHAPES``, and the value stored under ``DS_VERSION`` (``None`` if absent)

    Raises:
        - ``ValueError``: if the loaded file has no ``BUFFER_NAMES`` entry
    """
    checkpoint = torch.load(file, map_location=device)

    if BUFFER_NAMES not in checkpoint:
        raise ValueError(f"{file} is not a model state checkpoint")
    wanted_buffers = checkpoint[BUFFER_NAMES]
    if debug:
        print("Found buffers:", wanted_buffers)

    # recover just the buffers while restoring them to fp32 if they were saved in fp16
    buffers = {}
    for key, tensor in checkpoint["module"].items():
        if key in wanted_buffers:
            buffers[key] = tensor.float()

    return buffers, checkpoint[PARAM_SHAPES], checkpoint.get(DS_VERSION, None)
98
+
99
+
100
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load every optimizer states file and extract the fp32 partitions.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files
        - ``ds_checkpoint_dir``: checkpoint folder, only used in the error message

    Returns:
        - tuple ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry
          per file: for stage 2 the value stored under ``SINGLE_PARTITION_OF_FP32_GROUPS``, for stage 3
          the tensors stored under ``FP32_FLAT_GROUPS`` concatenated into a single tensor

    Raises:
        - ``ValueError``: if the first file has no ``ZERO_STAGE`` entry, if the number of files doesn't
          match the partition count, or if the stage is neither 2 nor 3
    """
    total_files = len(files)
    state_dicts = [torch.load(f, map_location=device) for f in files]

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage == 2:
        fp32_flat_groups = [
            sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts
        ]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [
            torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts
        ]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
151
+
152
+
153
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
    """
    Rebuild the fp32 state_dict from the files of a single ds checkpoint folder.

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)

    Returns:
        - the state_dict built by the stage 2 or stage 3 reconstruction function

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    zero_stage, world_size, fp32_flat_groups = parse_optim_states(
        get_optim_files(ds_checkpoint_dir), ds_checkpoint_dir)
    print(
        f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    buffers, param_shapes, ds_version = parse_model_state(
        get_model_state_file(ds_checkpoint_dir, zero_stage))
    print(f'Parsing checkpoint created by deepspeed=={ds_version}')

    # both reconstruction functions take the same arguments in the same order
    rebuild_args = (world_size, param_shapes, fp32_flat_groups, buffers)
    if zero_stage == 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(*rebuild_args)
    if zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(*rebuild_args)
182
+
183
+
184
def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """
    Rebuild the fp32 state_dict of a zero stage 2 checkpoint.

    For every param group the partitions of all ranks are concatenated into one flat vector, then
    each param is cut out of that vector at a running offset and viewed with its shape.

    Args:
        - ``world_size``: number of partitions
        - ``param_shapes``: one ``name -> shape`` mapping per param group
        - ``fp32_flat_groups``: per rank, the list of flat fp32 tensors (one per param group)
        - ``buffers``: ``name -> tensor`` mapping copied into the result first

    Returns:
        - ``OrderedDict`` holding the buffers followed by the reconstructed params

    Raises:
        - ``ValueError``: if, after alignment, the consumed numel of a group differs from its available numel
    """

    # Reconstruction protocol:
    #
    # XXX: document this

    num_param_groups = len(fp32_flat_groups[0])

    if debug:
        for rank in range(world_size):
            for group in range(num_param_groups):
                print(
                    f"{FP32_FLAT_GROUPS}[{rank}][{group}].shape={fp32_flat_groups[rank][group].shape}")

    # XXX: memory usage doubles here (zero2)
    merged_groups = [
        torch.cat([rank_groups[group] for rank_groups in fp32_flat_groups], 0)
        for group in range(num_param_groups)
    ]
    avail_numel = sum(flat_vector.numel() for flat_vector in merged_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(
            sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # buffers
    state_dict = OrderedDict()
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, flat_vector in zip(param_shapes, merged_groups):
        offset = 0
        avail_numel = flat_vector.numel()
        for name, shape in shapes.items():
            unpartitioned_numel = shape.numel()
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(
                    f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
                )
            param_slice = flat_vector.narrow(0, offset, unpartitioned_numel)
            state_dict[name] = param_slice.view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(
                f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(
        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
    )

    return state_dict
278
+
279
+
280
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Compute how a param of ``unpartitioned_numel`` elements is split over ``world_size`` partitions.

    Returns:
        - tuple ``(partitioned_numel, padding_numel)``: the per-partition element count (rounded up)
          and the number of elements needed to reach the next multiple of ``world_size``
    """
    leftover = unpartitioned_numel % world_size
    if leftover:
        padding_numel = world_size - leftover
    else:
        padding_numel = 0
    return math.ceil(unpartitioned_numel / world_size), padding_numel
285
+
286
+
287
def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """
    Rebuild the fp32 state_dict of a zero stage 3 checkpoint.

    Args:
        - ``world_size``: number of partitions
        - ``param_shapes``: list of ``name -> shape`` mappings, merged into one mapping here
        - ``fp32_flat_groups``: one flat fp32 tensor per rank
        - ``buffers``: ``name -> tensor`` mapping copied into the result first

    Returns:
        - ``OrderedDict`` holding the buffers followed by the reconstructed params

    Raises:
        - ``ValueError``: if the consumed numel differs from the available numel
    """

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    avail_numel = fp32_flat_groups[0].numel() * world_size
    # merge list of dicts, preserving order
    param_shapes = {name: shape for group in param_shapes for name, shape in group.items()}

    if debug:
        for rank in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{rank}].shape={fp32_flat_groups[rank].shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # buffers
    state_dict = OrderedDict()
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # take this param's slice from every rank, glue the slices together and drop the
        # trailing padding before restoring the shape
        rank_slices = tuple(
            fp32_flat_groups[rank].narrow(0, offset, partitioned_numel)
            for rank in range(world_size))
        padded_param = torch.cat(rank_slices, 0)
        state_dict[name] = padded_param.narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # ``offset`` counted elements per rank so far, scale it to all ranks for the check
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(
            f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(
        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
    )

    return state_dict
358
+
359
+
360
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: if ``tag`` is not given and there is no 'latest' file in ``checkpoint_dir``
        - ``FileNotFoundError``: if the ``checkpoint_dir/tag`` folder doesn't exist

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        # fall back to the tag recorded in the 'latest' file
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if os.path.isdir(ds_checkpoint_dir):
        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)

    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
407
+
408
+
409
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """
    Rebuild the fp32 consolidated ``state_dict`` of a ZeRO 2 or 3 checkpoint and write it to a file
    that can be loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training
    without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
    """

    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(fp32_weights, output_file)
423
+
424
+
425
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # ``strict=False`` tolerates key mismatches between the model and the checkpoint, so report
    # them instead of dropping them silently. ``getattr`` keeps this working should
    # ``load_state_dict`` return an object without these fields.
    load_result = model.load_state_dict(state_dict, strict=False)
    missing_keys = getattr(load_result, "missing_keys", None)
    unexpected_keys = getattr(load_result, "unexpected_keys", None)
    if missing_keys:
        logger.warning(f"Model keys not found in the checkpoint: {missing_keys}")
    if unexpected_keys:
        logger.warning(f"Checkpoint keys not used by the model: {unexpected_keys}")

    return model
462
+
463
+
464
if __name__ == "__main__":

    # command line: zero_to_fp32.py [-d] checkpoint_dir output_file
    cli = argparse.ArgumentParser()
    cli.add_argument("checkpoint_dir",
                     type=str,
                     help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    cli.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    cli.add_argument("-d", "--debug", action='store_true', help="enable debug")
    options = cli.parse_args()

    # this runs at module level, so it rebinds the global ``debug`` flag read by the functions above
    debug = options.debug

    convert_zero_checkpoint_to_fp32_state_dict(options.checkpoint_dir, options.output_file)
checkpoint-19000/config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "emilios/whisper-medium-el-n2",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50257
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 24,
19
+ "decoder_start_token_id": 50258,
20
+ "dropout": 0.1,
21
+ "encoder_attention_heads": 16,
22
+ "encoder_ffn_dim": 4096,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 24,
25
+ "eos_token_id": 50257,
26
+ "forced_decoder_ids": null,
27
+ "init_std": 0.02,
28
+ "is_encoder_decoder": true,
29
+ "max_length": 448,
30
+ "max_source_positions": 1500,
31
+ "max_target_positions": 448,
32
+ "model_type": "whisper",
33
+ "num_hidden_layers": 24,
34
+ "num_mel_bins": 80,
35
+ "pad_token_id": 50257,
36
+ "scale_embedding": false,
37
+ "torch_dtype": "float16",
38
+ "transformers_version": "4.26.0.dev0",
39
+ "use_cache": false,
40
+ "vocab_size": 51865
41
+ }
checkpoint-19000/global_step19000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1f2be49b925a3d3428deee50bbd236f1d87683eaae35ea61d83d0224b3efd8b
3
+ size 1527967899
checkpoint-19000/global_step19000/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fef809917e6d1d45bf95028fa0c8ceaa29a86ad364079f9addbf82b613073dce
3
+ size 9166378846
checkpoint-19000/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step19000
checkpoint-19000/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-19000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3587abd7721170295163f03d9eef91f3c21b17801e7b32024f49b33cbda1966a
3
+ size 1527847357
checkpoint-19000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0bd9c357a54e16eb97a9c6322c553eeab7612d1e7683e1e97d776bf49546c54
3
+ size 14575
checkpoint-19000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-19000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4970c744c02fa43cf558652daa2e94e99eacce59b5a2466f72e429e0ad7414e6
3
+ size 4795