nadahlberg commited on
Commit
c8b4fc0
1 Parent(s): 9f35832

Training in progress, step 194000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b819f02a5a8e8b4c04f840f21141141896aead6b814f5a9e49f2e4d1eccc11a5
3
  size 325690872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a89c437e50b0f8ca4fd9af67e3083993a3fdc03fe2a27a76b7b04ce1d97eabf5
3
  size 325690872
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:118e793fba1469f81a23583a6d60b59e1cab1cb86c092182d47a52fc238b48d8
3
  size 651550778
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97b0a64aecfff2c26bdd5120e954a1d5857e9b29b7d91dd33d19b696968af0e1
3
  size 651550778
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:64ab7d05c5118dde6db80867cc2f5370cc515df45961a8e889cf725e044c5702
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e074e22eba76ab6e9b544774e027528ef7c10076e885ecdf1f0ce9d0a4ffb058
3
  size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dcc5e36dc2398935b28e41223e5c2b3b94afff9495dd3b230f8aea8fb5c5ba7d
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:522379038ff9f72eee4f16fc13c7c527fd89328d593b292e47a71417cbf520bb
3
  size 15920
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4eb9f1ff0aa12cc784aa4b7dcfb98fbb5f2137d1ea9a51668c5ebbbbcd927316
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73b398f0dddef58a9282d66a49fd8e220898de17f6d838d7ffa52e9fffff8e6e
3
  size 15920
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4b89514ea98ea1c35f3af47c3ade107ba32a3c3250dc39c14ee01bb39c61afe
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dcd63f04d0d7f5e8660b206f8c68db860f40c06e41516d296fd35ca06bcd87f
3
  size 15920
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aeffb5474b5a368c5c99b5eafd0259d77819c563140eb7e86f7baa3ee0e80850
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afe9ba037bd933543b4aeb0ca749e02c10800870c577fe4d6537ec92694a8a38
3
  size 15920
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:729d1d4435962983cff0ac4574b7851e6db614278d0f948a75e3e98967526142
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfd5271ec811e7349df93e4abc5b00c0534fac3f75454b6fc73f3a9b47df954c
3
  size 15920
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1fbb6646d1a6981b0b753a025194028df05b8a68ba6e6c857c43a4c6843740f
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07489b2fd53c594e60aa0037945803fe0cdb07239fa8fae63abd5b6720b1edf9
3
  size 15920
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d60636e7659209459be516328913c44af3ee3085a0af6c1b190c97355024e22
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4819284f798dc8aad3d5a2e6f2bfc72c0dffbf7156d8c82abd86685ea2d9c9df
3
  size 15920
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d6c495c228be690a0720034f9d881ff3236631a1636ecab1d775dfbf2f4cd28
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81927fcd203392c94aa10d8605dd1de0e335a27d0e6da83feec012c338917f03
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.96,
5
  "eval_steps": 2000,
6
- "global_step": 192000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -135175,6 +135175,1414 @@
135175
  "eval_samples_per_second": 55.2,
135176
  "eval_steps_per_second": 0.11,
135177
  "step": 192000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135178
  }
135179
  ],
135180
  "logging_steps": 10,
@@ -135194,7 +136602,7 @@
135194
  "attributes": {}
135195
  }
135196
  },
135197
- "total_flos": 5.077814850512486e+18,
135198
  "train_batch_size": 64,
135199
  "trial_name": null,
135200
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.97,
5
  "eval_steps": 2000,
6
+ "global_step": 194000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
135175
  "eval_samples_per_second": 55.2,
135176
  "eval_steps_per_second": 0.11,
135177
  "step": 192000
135178
+ },
135179
+ {
135180
+ "epoch": 0.96005,
135181
+ "grad_norm": 0.76171875,
135182
+ "learning_rate": 0.00012045226130653266,
135183
+ "loss": 2.0404,
135184
+ "step": 192010
135185
+ },
135186
+ {
135187
+ "epoch": 0.9601,
135188
+ "grad_norm": 0.5703125,
135189
+ "learning_rate": 0.00012030150753768844,
135190
+ "loss": 2.1006,
135191
+ "step": 192020
135192
+ },
135193
+ {
135194
+ "epoch": 0.96015,
135195
+ "grad_norm": 0.61328125,
135196
+ "learning_rate": 0.00012015075376884422,
135197
+ "loss": 2.0222,
135198
+ "step": 192030
135199
+ },
135200
+ {
135201
+ "epoch": 0.9602,
135202
+ "grad_norm": 0.671875,
135203
+ "learning_rate": 0.00012,
135204
+ "loss": 2.1103,
135205
+ "step": 192040
135206
+ },
135207
+ {
135208
+ "epoch": 0.96025,
135209
+ "grad_norm": 0.5703125,
135210
+ "learning_rate": 0.00011984924623115578,
135211
+ "loss": 2.0192,
135212
+ "step": 192050
135213
+ },
135214
+ {
135215
+ "epoch": 0.9603,
135216
+ "grad_norm": 0.609375,
135217
+ "learning_rate": 0.00011969849246231156,
135218
+ "loss": 2.0935,
135219
+ "step": 192060
135220
+ },
135221
+ {
135222
+ "epoch": 0.96035,
135223
+ "grad_norm": 0.57421875,
135224
+ "learning_rate": 0.00011954773869346734,
135225
+ "loss": 2.0174,
135226
+ "step": 192070
135227
+ },
135228
+ {
135229
+ "epoch": 0.9604,
135230
+ "grad_norm": 0.57421875,
135231
+ "learning_rate": 0.00011939698492462312,
135232
+ "loss": 2.0971,
135233
+ "step": 192080
135234
+ },
135235
+ {
135236
+ "epoch": 0.96045,
135237
+ "grad_norm": 0.6640625,
135238
+ "learning_rate": 0.0001192462311557789,
135239
+ "loss": 2.0244,
135240
+ "step": 192090
135241
+ },
135242
+ {
135243
+ "epoch": 0.9605,
135244
+ "grad_norm": 0.57421875,
135245
+ "learning_rate": 0.00011909547738693468,
135246
+ "loss": 2.0763,
135247
+ "step": 192100
135248
+ },
135249
+ {
135250
+ "epoch": 0.96055,
135251
+ "grad_norm": 0.57421875,
135252
+ "learning_rate": 0.00011894472361809046,
135253
+ "loss": 2.0698,
135254
+ "step": 192110
135255
+ },
135256
+ {
135257
+ "epoch": 0.9606,
135258
+ "grad_norm": 0.62109375,
135259
+ "learning_rate": 0.00011879396984924624,
135260
+ "loss": 2.0831,
135261
+ "step": 192120
135262
+ },
135263
+ {
135264
+ "epoch": 0.96065,
135265
+ "grad_norm": 0.6171875,
135266
+ "learning_rate": 0.00011864321608040202,
135267
+ "loss": 2.0726,
135268
+ "step": 192130
135269
+ },
135270
+ {
135271
+ "epoch": 0.9607,
135272
+ "grad_norm": 0.625,
135273
+ "learning_rate": 0.0001184924623115578,
135274
+ "loss": 2.0523,
135275
+ "step": 192140
135276
+ },
135277
+ {
135278
+ "epoch": 0.96075,
135279
+ "grad_norm": 0.63671875,
135280
+ "learning_rate": 0.00011834170854271358,
135281
+ "loss": 2.1123,
135282
+ "step": 192150
135283
+ },
135284
+ {
135285
+ "epoch": 0.9608,
135286
+ "grad_norm": 0.640625,
135287
+ "learning_rate": 0.00011819095477386936,
135288
+ "loss": 2.0639,
135289
+ "step": 192160
135290
+ },
135291
+ {
135292
+ "epoch": 0.96085,
135293
+ "grad_norm": 0.6015625,
135294
+ "learning_rate": 0.00011804020100502514,
135295
+ "loss": 2.0743,
135296
+ "step": 192170
135297
+ },
135298
+ {
135299
+ "epoch": 0.9609,
135300
+ "grad_norm": 0.6328125,
135301
+ "learning_rate": 0.0001178894472361809,
135302
+ "loss": 2.0775,
135303
+ "step": 192180
135304
+ },
135305
+ {
135306
+ "epoch": 0.96095,
135307
+ "grad_norm": 0.546875,
135308
+ "learning_rate": 0.00011773869346733669,
135309
+ "loss": 2.1002,
135310
+ "step": 192190
135311
+ },
135312
+ {
135313
+ "epoch": 0.961,
135314
+ "grad_norm": 0.6015625,
135315
+ "learning_rate": 0.00011758793969849247,
135316
+ "loss": 2.0077,
135317
+ "step": 192200
135318
+ },
135319
+ {
135320
+ "epoch": 0.96105,
135321
+ "grad_norm": 0.6171875,
135322
+ "learning_rate": 0.00011743718592964824,
135323
+ "loss": 2.053,
135324
+ "step": 192210
135325
+ },
135326
+ {
135327
+ "epoch": 0.9611,
135328
+ "grad_norm": 0.62890625,
135329
+ "learning_rate": 0.00011728643216080402,
135330
+ "loss": 2.0919,
135331
+ "step": 192220
135332
+ },
135333
+ {
135334
+ "epoch": 0.96115,
135335
+ "grad_norm": 0.56640625,
135336
+ "learning_rate": 0.0001171356783919598,
135337
+ "loss": 2.025,
135338
+ "step": 192230
135339
+ },
135340
+ {
135341
+ "epoch": 0.9612,
135342
+ "grad_norm": 0.62890625,
135343
+ "learning_rate": 0.00011698492462311558,
135344
+ "loss": 2.1223,
135345
+ "step": 192240
135346
+ },
135347
+ {
135348
+ "epoch": 0.96125,
135349
+ "grad_norm": 0.69140625,
135350
+ "learning_rate": 0.00011683417085427136,
135351
+ "loss": 2.0455,
135352
+ "step": 192250
135353
+ },
135354
+ {
135355
+ "epoch": 0.9613,
135356
+ "grad_norm": 0.69140625,
135357
+ "learning_rate": 0.00011668341708542714,
135358
+ "loss": 2.0944,
135359
+ "step": 192260
135360
+ },
135361
+ {
135362
+ "epoch": 0.96135,
135363
+ "grad_norm": 0.7265625,
135364
+ "learning_rate": 0.00011653266331658292,
135365
+ "loss": 2.0621,
135366
+ "step": 192270
135367
+ },
135368
+ {
135369
+ "epoch": 0.9614,
135370
+ "grad_norm": 0.61328125,
135371
+ "learning_rate": 0.0001163819095477387,
135372
+ "loss": 2.0937,
135373
+ "step": 192280
135374
+ },
135375
+ {
135376
+ "epoch": 0.96145,
135377
+ "grad_norm": 0.578125,
135378
+ "learning_rate": 0.00011623115577889448,
135379
+ "loss": 2.0469,
135380
+ "step": 192290
135381
+ },
135382
+ {
135383
+ "epoch": 0.9615,
135384
+ "grad_norm": 0.65234375,
135385
+ "learning_rate": 0.00011608040201005026,
135386
+ "loss": 2.0901,
135387
+ "step": 192300
135388
+ },
135389
+ {
135390
+ "epoch": 0.96155,
135391
+ "grad_norm": 0.60546875,
135392
+ "learning_rate": 0.00011592964824120604,
135393
+ "loss": 2.0236,
135394
+ "step": 192310
135395
+ },
135396
+ {
135397
+ "epoch": 0.9616,
135398
+ "grad_norm": 0.5703125,
135399
+ "learning_rate": 0.00011577889447236182,
135400
+ "loss": 2.0918,
135401
+ "step": 192320
135402
+ },
135403
+ {
135404
+ "epoch": 0.96165,
135405
+ "grad_norm": 0.58203125,
135406
+ "learning_rate": 0.0001156281407035176,
135407
+ "loss": 2.0807,
135408
+ "step": 192330
135409
+ },
135410
+ {
135411
+ "epoch": 0.9617,
135412
+ "grad_norm": 0.58984375,
135413
+ "learning_rate": 0.00011547738693467338,
135414
+ "loss": 2.0503,
135415
+ "step": 192340
135416
+ },
135417
+ {
135418
+ "epoch": 0.96175,
135419
+ "grad_norm": 0.59765625,
135420
+ "learning_rate": 0.00011532663316582916,
135421
+ "loss": 2.1287,
135422
+ "step": 192350
135423
+ },
135424
+ {
135425
+ "epoch": 0.9618,
135426
+ "grad_norm": 0.63671875,
135427
+ "learning_rate": 0.00011517587939698494,
135428
+ "loss": 2.1066,
135429
+ "step": 192360
135430
+ },
135431
+ {
135432
+ "epoch": 0.96185,
135433
+ "grad_norm": 0.63671875,
135434
+ "learning_rate": 0.00011502512562814072,
135435
+ "loss": 2.1153,
135436
+ "step": 192370
135437
+ },
135438
+ {
135439
+ "epoch": 0.9619,
135440
+ "grad_norm": 0.609375,
135441
+ "learning_rate": 0.0001148743718592965,
135442
+ "loss": 2.0449,
135443
+ "step": 192380
135444
+ },
135445
+ {
135446
+ "epoch": 0.96195,
135447
+ "grad_norm": 0.5859375,
135448
+ "learning_rate": 0.00011472361809045227,
135449
+ "loss": 2.0976,
135450
+ "step": 192390
135451
+ },
135452
+ {
135453
+ "epoch": 0.962,
135454
+ "grad_norm": 0.6484375,
135455
+ "learning_rate": 0.00011457286432160805,
135456
+ "loss": 2.0822,
135457
+ "step": 192400
135458
+ },
135459
+ {
135460
+ "epoch": 0.96205,
135461
+ "grad_norm": 0.6171875,
135462
+ "learning_rate": 0.00011442211055276383,
135463
+ "loss": 2.0671,
135464
+ "step": 192410
135465
+ },
135466
+ {
135467
+ "epoch": 0.9621,
135468
+ "grad_norm": 0.60546875,
135469
+ "learning_rate": 0.0001142713567839196,
135470
+ "loss": 2.0809,
135471
+ "step": 192420
135472
+ },
135473
+ {
135474
+ "epoch": 0.96215,
135475
+ "grad_norm": 0.5703125,
135476
+ "learning_rate": 0.00011412060301507539,
135477
+ "loss": 2.0569,
135478
+ "step": 192430
135479
+ },
135480
+ {
135481
+ "epoch": 0.9622,
135482
+ "grad_norm": 0.671875,
135483
+ "learning_rate": 0.00011396984924623116,
135484
+ "loss": 2.085,
135485
+ "step": 192440
135486
+ },
135487
+ {
135488
+ "epoch": 0.96225,
135489
+ "grad_norm": 0.59375,
135490
+ "learning_rate": 0.00011381909547738694,
135491
+ "loss": 2.0742,
135492
+ "step": 192450
135493
+ },
135494
+ {
135495
+ "epoch": 0.9623,
135496
+ "grad_norm": 0.59765625,
135497
+ "learning_rate": 0.00011366834170854272,
135498
+ "loss": 2.0884,
135499
+ "step": 192460
135500
+ },
135501
+ {
135502
+ "epoch": 0.96235,
135503
+ "grad_norm": 0.625,
135504
+ "learning_rate": 0.0001135175879396985,
135505
+ "loss": 2.0131,
135506
+ "step": 192470
135507
+ },
135508
+ {
135509
+ "epoch": 0.9624,
135510
+ "grad_norm": 0.640625,
135511
+ "learning_rate": 0.00011336683417085426,
135512
+ "loss": 2.081,
135513
+ "step": 192480
135514
+ },
135515
+ {
135516
+ "epoch": 0.96245,
135517
+ "grad_norm": 0.546875,
135518
+ "learning_rate": 0.00011321608040201004,
135519
+ "loss": 2.0745,
135520
+ "step": 192490
135521
+ },
135522
+ {
135523
+ "epoch": 0.9625,
135524
+ "grad_norm": 0.5546875,
135525
+ "learning_rate": 0.00011306532663316582,
135526
+ "loss": 2.079,
135527
+ "step": 192500
135528
+ },
135529
+ {
135530
+ "epoch": 0.96255,
135531
+ "grad_norm": 0.71875,
135532
+ "learning_rate": 0.0001129145728643216,
135533
+ "loss": 2.0357,
135534
+ "step": 192510
135535
+ },
135536
+ {
135537
+ "epoch": 0.9626,
135538
+ "grad_norm": 0.62890625,
135539
+ "learning_rate": 0.00011276381909547738,
135540
+ "loss": 2.099,
135541
+ "step": 192520
135542
+ },
135543
+ {
135544
+ "epoch": 0.96265,
135545
+ "grad_norm": 0.70703125,
135546
+ "learning_rate": 0.00011261306532663316,
135547
+ "loss": 2.0364,
135548
+ "step": 192530
135549
+ },
135550
+ {
135551
+ "epoch": 0.9627,
135552
+ "grad_norm": 0.60546875,
135553
+ "learning_rate": 0.00011246231155778894,
135554
+ "loss": 2.0981,
135555
+ "step": 192540
135556
+ },
135557
+ {
135558
+ "epoch": 0.96275,
135559
+ "grad_norm": 0.63671875,
135560
+ "learning_rate": 0.00011231155778894471,
135561
+ "loss": 2.0898,
135562
+ "step": 192550
135563
+ },
135564
+ {
135565
+ "epoch": 0.9628,
135566
+ "grad_norm": 0.5703125,
135567
+ "learning_rate": 0.0001121608040201005,
135568
+ "loss": 2.0789,
135569
+ "step": 192560
135570
+ },
135571
+ {
135572
+ "epoch": 0.96285,
135573
+ "grad_norm": 0.59765625,
135574
+ "learning_rate": 0.00011201005025125627,
135575
+ "loss": 2.1286,
135576
+ "step": 192570
135577
+ },
135578
+ {
135579
+ "epoch": 0.9629,
135580
+ "grad_norm": 0.609375,
135581
+ "learning_rate": 0.00011185929648241205,
135582
+ "loss": 2.0632,
135583
+ "step": 192580
135584
+ },
135585
+ {
135586
+ "epoch": 0.96295,
135587
+ "grad_norm": 0.55078125,
135588
+ "learning_rate": 0.00011170854271356783,
135589
+ "loss": 2.0861,
135590
+ "step": 192590
135591
+ },
135592
+ {
135593
+ "epoch": 0.963,
135594
+ "grad_norm": 0.5703125,
135595
+ "learning_rate": 0.00011155778894472361,
135596
+ "loss": 2.0272,
135597
+ "step": 192600
135598
+ },
135599
+ {
135600
+ "epoch": 0.96305,
135601
+ "grad_norm": 0.66015625,
135602
+ "learning_rate": 0.00011140703517587939,
135603
+ "loss": 2.1351,
135604
+ "step": 192610
135605
+ },
135606
+ {
135607
+ "epoch": 0.9631,
135608
+ "grad_norm": 0.5703125,
135609
+ "learning_rate": 0.00011125628140703517,
135610
+ "loss": 2.1006,
135611
+ "step": 192620
135612
+ },
135613
+ {
135614
+ "epoch": 0.96315,
135615
+ "grad_norm": 0.6328125,
135616
+ "learning_rate": 0.00011110552763819095,
135617
+ "loss": 2.0411,
135618
+ "step": 192630
135619
+ },
135620
+ {
135621
+ "epoch": 0.9632,
135622
+ "grad_norm": 0.6171875,
135623
+ "learning_rate": 0.00011095477386934673,
135624
+ "loss": 2.0426,
135625
+ "step": 192640
135626
+ },
135627
+ {
135628
+ "epoch": 0.96325,
135629
+ "grad_norm": 0.578125,
135630
+ "learning_rate": 0.00011080402010050251,
135631
+ "loss": 2.0275,
135632
+ "step": 192650
135633
+ },
135634
+ {
135635
+ "epoch": 0.9633,
135636
+ "grad_norm": 0.59765625,
135637
+ "learning_rate": 0.00011065326633165829,
135638
+ "loss": 2.0903,
135639
+ "step": 192660
135640
+ },
135641
+ {
135642
+ "epoch": 0.96335,
135643
+ "grad_norm": 0.63671875,
135644
+ "learning_rate": 0.00011050251256281407,
135645
+ "loss": 2.0704,
135646
+ "step": 192670
135647
+ },
135648
+ {
135649
+ "epoch": 0.9634,
135650
+ "grad_norm": 0.6640625,
135651
+ "learning_rate": 0.00011035175879396985,
135652
+ "loss": 2.0627,
135653
+ "step": 192680
135654
+ },
135655
+ {
135656
+ "epoch": 0.96345,
135657
+ "grad_norm": 0.6484375,
135658
+ "learning_rate": 0.00011020100502512562,
135659
+ "loss": 2.0467,
135660
+ "step": 192690
135661
+ },
135662
+ {
135663
+ "epoch": 0.9635,
135664
+ "grad_norm": 0.5859375,
135665
+ "learning_rate": 0.0001100502512562814,
135666
+ "loss": 2.134,
135667
+ "step": 192700
135668
+ },
135669
+ {
135670
+ "epoch": 0.96355,
135671
+ "grad_norm": 0.65625,
135672
+ "learning_rate": 0.00010989949748743718,
135673
+ "loss": 2.0142,
135674
+ "step": 192710
135675
+ },
135676
+ {
135677
+ "epoch": 0.9636,
135678
+ "grad_norm": 0.6015625,
135679
+ "learning_rate": 0.00010974874371859296,
135680
+ "loss": 2.1523,
135681
+ "step": 192720
135682
+ },
135683
+ {
135684
+ "epoch": 0.96365,
135685
+ "grad_norm": 0.60546875,
135686
+ "learning_rate": 0.00010959798994974874,
135687
+ "loss": 2.0758,
135688
+ "step": 192730
135689
+ },
135690
+ {
135691
+ "epoch": 0.9637,
135692
+ "grad_norm": 0.55859375,
135693
+ "learning_rate": 0.00010944723618090452,
135694
+ "loss": 2.0931,
135695
+ "step": 192740
135696
+ },
135697
+ {
135698
+ "epoch": 0.96375,
135699
+ "grad_norm": 0.5859375,
135700
+ "learning_rate": 0.0001092964824120603,
135701
+ "loss": 2.0748,
135702
+ "step": 192750
135703
+ },
135704
+ {
135705
+ "epoch": 0.9638,
135706
+ "grad_norm": 0.6171875,
135707
+ "learning_rate": 0.00010914572864321608,
135708
+ "loss": 2.089,
135709
+ "step": 192760
135710
+ },
135711
+ {
135712
+ "epoch": 0.96385,
135713
+ "grad_norm": 0.5703125,
135714
+ "learning_rate": 0.00010899497487437186,
135715
+ "loss": 2.09,
135716
+ "step": 192770
135717
+ },
135718
+ {
135719
+ "epoch": 0.9639,
135720
+ "grad_norm": 0.625,
135721
+ "learning_rate": 0.00010884422110552763,
135722
+ "loss": 2.0736,
135723
+ "step": 192780
135724
+ },
135725
+ {
135726
+ "epoch": 0.96395,
135727
+ "grad_norm": 0.58203125,
135728
+ "learning_rate": 0.00010869346733668341,
135729
+ "loss": 2.0489,
135730
+ "step": 192790
135731
+ },
135732
+ {
135733
+ "epoch": 0.964,
135734
+ "grad_norm": 0.64453125,
135735
+ "learning_rate": 0.0001085427135678392,
135736
+ "loss": 2.056,
135737
+ "step": 192800
135738
+ },
135739
+ {
135740
+ "epoch": 0.96405,
135741
+ "grad_norm": 0.61328125,
135742
+ "learning_rate": 0.00010839195979899497,
135743
+ "loss": 2.1303,
135744
+ "step": 192810
135745
+ },
135746
+ {
135747
+ "epoch": 0.9641,
135748
+ "grad_norm": 0.53515625,
135749
+ "learning_rate": 0.00010824120603015075,
135750
+ "loss": 2.0366,
135751
+ "step": 192820
135752
+ },
135753
+ {
135754
+ "epoch": 0.96415,
135755
+ "grad_norm": 0.6640625,
135756
+ "learning_rate": 0.00010809045226130653,
135757
+ "loss": 2.1007,
135758
+ "step": 192830
135759
+ },
135760
+ {
135761
+ "epoch": 0.9642,
135762
+ "grad_norm": 0.64453125,
135763
+ "learning_rate": 0.00010793969849246231,
135764
+ "loss": 2.0316,
135765
+ "step": 192840
135766
+ },
135767
+ {
135768
+ "epoch": 0.96425,
135769
+ "grad_norm": 0.55859375,
135770
+ "learning_rate": 0.00010778894472361809,
135771
+ "loss": 2.0614,
135772
+ "step": 192850
135773
+ },
135774
+ {
135775
+ "epoch": 0.9643,
135776
+ "grad_norm": 0.60546875,
135777
+ "learning_rate": 0.00010763819095477387,
135778
+ "loss": 2.087,
135779
+ "step": 192860
135780
+ },
135781
+ {
135782
+ "epoch": 0.96435,
135783
+ "grad_norm": 0.62109375,
135784
+ "learning_rate": 0.00010748743718592965,
135785
+ "loss": 2.0492,
135786
+ "step": 192870
135787
+ },
135788
+ {
135789
+ "epoch": 0.9644,
135790
+ "grad_norm": 0.66796875,
135791
+ "learning_rate": 0.00010733668341708543,
135792
+ "loss": 2.103,
135793
+ "step": 192880
135794
+ },
135795
+ {
135796
+ "epoch": 0.96445,
135797
+ "grad_norm": 0.62890625,
135798
+ "learning_rate": 0.00010718592964824121,
135799
+ "loss": 2.0502,
135800
+ "step": 192890
135801
+ },
135802
+ {
135803
+ "epoch": 0.9645,
135804
+ "grad_norm": 0.61328125,
135805
+ "learning_rate": 0.00010703517587939698,
135806
+ "loss": 2.0828,
135807
+ "step": 192900
135808
+ },
135809
+ {
135810
+ "epoch": 0.96455,
135811
+ "grad_norm": 0.6484375,
135812
+ "learning_rate": 0.00010688442211055276,
135813
+ "loss": 2.0552,
135814
+ "step": 192910
135815
+ },
135816
+ {
135817
+ "epoch": 0.9646,
135818
+ "grad_norm": 0.62890625,
135819
+ "learning_rate": 0.00010673366834170854,
135820
+ "loss": 2.0825,
135821
+ "step": 192920
135822
+ },
135823
+ {
135824
+ "epoch": 0.96465,
135825
+ "grad_norm": 0.66796875,
135826
+ "learning_rate": 0.00010658291457286432,
135827
+ "loss": 2.0672,
135828
+ "step": 192930
135829
+ },
135830
+ {
135831
+ "epoch": 0.9647,
135832
+ "grad_norm": 0.609375,
135833
+ "learning_rate": 0.0001064321608040201,
135834
+ "loss": 2.0596,
135835
+ "step": 192940
135836
+ },
135837
+ {
135838
+ "epoch": 0.96475,
135839
+ "grad_norm": 0.640625,
135840
+ "learning_rate": 0.00010628140703517588,
135841
+ "loss": 2.0168,
135842
+ "step": 192950
135843
+ },
135844
+ {
135845
+ "epoch": 0.9648,
135846
+ "grad_norm": 0.5859375,
135847
+ "learning_rate": 0.00010613065326633166,
135848
+ "loss": 2.0733,
135849
+ "step": 192960
135850
+ },
135851
+ {
135852
+ "epoch": 0.96485,
135853
+ "grad_norm": 0.61328125,
135854
+ "learning_rate": 0.00010597989949748744,
135855
+ "loss": 2.1119,
135856
+ "step": 192970
135857
+ },
135858
+ {
135859
+ "epoch": 0.9649,
135860
+ "grad_norm": 0.5703125,
135861
+ "learning_rate": 0.00010582914572864322,
135862
+ "loss": 2.0478,
135863
+ "step": 192980
135864
+ },
135865
+ {
135866
+ "epoch": 0.96495,
135867
+ "grad_norm": 0.61328125,
135868
+ "learning_rate": 0.000105678391959799,
135869
+ "loss": 2.0958,
135870
+ "step": 192990
135871
+ },
135872
+ {
135873
+ "epoch": 0.965,
135874
+ "grad_norm": 0.578125,
135875
+ "learning_rate": 0.00010552763819095478,
135876
+ "loss": 2.044,
135877
+ "step": 193000
135878
+ },
135879
+ {
135880
+ "epoch": 0.96505,
135881
+ "grad_norm": 0.58984375,
135882
+ "learning_rate": 0.00010537688442211056,
135883
+ "loss": 2.0735,
135884
+ "step": 193010
135885
+ },
135886
+ {
135887
+ "epoch": 0.9651,
135888
+ "grad_norm": 0.6640625,
135889
+ "learning_rate": 0.00010522613065326633,
135890
+ "loss": 2.013,
135891
+ "step": 193020
135892
+ },
135893
+ {
135894
+ "epoch": 0.96515,
135895
+ "grad_norm": 0.6328125,
135896
+ "learning_rate": 0.00010507537688442211,
135897
+ "loss": 2.0933,
135898
+ "step": 193030
135899
+ },
135900
+ {
135901
+ "epoch": 0.9652,
135902
+ "grad_norm": 0.6015625,
135903
+ "learning_rate": 0.0001049246231155779,
135904
+ "loss": 2.0616,
135905
+ "step": 193040
135906
+ },
135907
+ {
135908
+ "epoch": 0.96525,
135909
+ "grad_norm": 0.5390625,
135910
+ "learning_rate": 0.00010477386934673367,
135911
+ "loss": 2.0499,
135912
+ "step": 193050
135913
+ },
135914
+ {
135915
+ "epoch": 0.9653,
135916
+ "grad_norm": 0.65625,
135917
+ "learning_rate": 0.00010462311557788945,
135918
+ "loss": 2.1086,
135919
+ "step": 193060
135920
+ },
135921
+ {
135922
+ "epoch": 0.96535,
135923
+ "grad_norm": 0.609375,
135924
+ "learning_rate": 0.00010447236180904523,
135925
+ "loss": 2.0909,
135926
+ "step": 193070
135927
+ },
135928
+ {
135929
+ "epoch": 0.9654,
135930
+ "grad_norm": 0.6796875,
135931
+ "learning_rate": 0.00010432160804020101,
135932
+ "loss": 2.0897,
135933
+ "step": 193080
135934
+ },
135935
+ {
135936
+ "epoch": 0.96545,
135937
+ "grad_norm": 0.60546875,
135938
+ "learning_rate": 0.00010417085427135679,
135939
+ "loss": 2.0741,
135940
+ "step": 193090
135941
+ },
135942
+ {
135943
+ "epoch": 0.9655,
135944
+ "grad_norm": 0.6484375,
135945
+ "learning_rate": 0.00010402010050251256,
135946
+ "loss": 2.128,
135947
+ "step": 193100
135948
+ },
135949
+ {
135950
+ "epoch": 0.96555,
135951
+ "grad_norm": 0.61328125,
135952
+ "learning_rate": 0.00010386934673366834,
135953
+ "loss": 2.0221,
135954
+ "step": 193110
135955
+ },
135956
+ {
135957
+ "epoch": 0.9656,
135958
+ "grad_norm": 0.625,
135959
+ "learning_rate": 0.00010371859296482412,
135960
+ "loss": 2.1525,
135961
+ "step": 193120
135962
+ },
135963
+ {
135964
+ "epoch": 0.96565,
135965
+ "grad_norm": 0.69140625,
135966
+ "learning_rate": 0.0001035678391959799,
135967
+ "loss": 2.0668,
135968
+ "step": 193130
135969
+ },
135970
+ {
135971
+ "epoch": 0.9657,
135972
+ "grad_norm": 0.65625,
135973
+ "learning_rate": 0.00010341708542713568,
135974
+ "loss": 2.133,
135975
+ "step": 193140
135976
+ },
135977
+ {
135978
+ "epoch": 0.96575,
135979
+ "grad_norm": 0.59765625,
135980
+ "learning_rate": 0.00010326633165829146,
135981
+ "loss": 2.0543,
135982
+ "step": 193150
135983
+ },
135984
+ {
135985
+ "epoch": 0.9658,
135986
+ "grad_norm": 0.5859375,
135987
+ "learning_rate": 0.00010311557788944724,
135988
+ "loss": 2.0828,
135989
+ "step": 193160
135990
+ },
135991
+ {
135992
+ "epoch": 0.96585,
135993
+ "grad_norm": 0.5859375,
135994
+ "learning_rate": 0.00010296482412060302,
135995
+ "loss": 2.1479,
135996
+ "step": 193170
135997
+ },
135998
+ {
135999
+ "epoch": 0.9659,
136000
+ "grad_norm": 0.5625,
136001
+ "learning_rate": 0.0001028140703517588,
136002
+ "loss": 2.0801,
136003
+ "step": 193180
136004
+ },
136005
+ {
136006
+ "epoch": 0.96595,
136007
+ "grad_norm": 0.6171875,
136008
+ "learning_rate": 0.00010266331658291458,
136009
+ "loss": 2.1114,
136010
+ "step": 193190
136011
+ },
136012
+ {
136013
+ "epoch": 0.966,
136014
+ "grad_norm": 0.66796875,
136015
+ "learning_rate": 0.00010251256281407036,
136016
+ "loss": 2.0793,
136017
+ "step": 193200
136018
+ },
136019
+ {
136020
+ "epoch": 0.96605,
136021
+ "grad_norm": 0.63671875,
136022
+ "learning_rate": 0.00010236180904522614,
136023
+ "loss": 2.1357,
136024
+ "step": 193210
136025
+ },
136026
+ {
136027
+ "epoch": 0.9661,
136028
+ "grad_norm": 0.63671875,
136029
+ "learning_rate": 0.00010221105527638192,
136030
+ "loss": 2.0579,
136031
+ "step": 193220
136032
+ },
136033
+ {
136034
+ "epoch": 0.96615,
136035
+ "grad_norm": 0.65234375,
136036
+ "learning_rate": 0.0001020603015075377,
136037
+ "loss": 2.1389,
136038
+ "step": 193230
136039
+ },
136040
+ {
136041
+ "epoch": 0.9662,
136042
+ "grad_norm": 0.55859375,
136043
+ "learning_rate": 0.00010190954773869348,
136044
+ "loss": 2.082,
136045
+ "step": 193240
136046
+ },
136047
+ {
136048
+ "epoch": 0.96625,
136049
+ "grad_norm": 0.57421875,
136050
+ "learning_rate": 0.00010175879396984925,
136051
+ "loss": 2.0782,
136052
+ "step": 193250
136053
+ },
136054
+ {
136055
+ "epoch": 0.9663,
136056
+ "grad_norm": 0.62109375,
136057
+ "learning_rate": 0.00010160804020100503,
136058
+ "loss": 2.0719,
136059
+ "step": 193260
136060
+ },
136061
+ {
136062
+ "epoch": 0.96635,
136063
+ "grad_norm": 0.59375,
136064
+ "learning_rate": 0.00010145728643216081,
136065
+ "loss": 2.1004,
136066
+ "step": 193270
136067
+ },
136068
+ {
136069
+ "epoch": 0.9664,
136070
+ "grad_norm": 0.58984375,
136071
+ "learning_rate": 0.0001013065326633166,
136072
+ "loss": 2.0673,
136073
+ "step": 193280
136074
+ },
136075
+ {
136076
+ "epoch": 0.96645,
136077
+ "grad_norm": 0.6171875,
136078
+ "learning_rate": 0.00010115577889447237,
136079
+ "loss": 2.0743,
136080
+ "step": 193290
136081
+ },
136082
+ {
136083
+ "epoch": 0.9665,
136084
+ "grad_norm": 0.73828125,
136085
+ "learning_rate": 0.00010100502512562815,
136086
+ "loss": 2.0939,
136087
+ "step": 193300
136088
+ },
136089
+ {
136090
+ "epoch": 0.96655,
136091
+ "grad_norm": 0.5859375,
136092
+ "learning_rate": 0.00010085427135678392,
136093
+ "loss": 2.0247,
136094
+ "step": 193310
136095
+ },
136096
+ {
136097
+ "epoch": 0.9666,
136098
+ "grad_norm": 0.58203125,
136099
+ "learning_rate": 0.0001007035175879397,
136100
+ "loss": 2.1287,
136101
+ "step": 193320
136102
+ },
136103
+ {
136104
+ "epoch": 0.96665,
136105
+ "grad_norm": 0.6875,
136106
+ "learning_rate": 0.00010055276381909548,
136107
+ "loss": 2.0658,
136108
+ "step": 193330
136109
+ },
136110
+ {
136111
+ "epoch": 0.9667,
136112
+ "grad_norm": 0.58203125,
136113
+ "learning_rate": 0.00010040201005025126,
136114
+ "loss": 2.1019,
136115
+ "step": 193340
136116
+ },
136117
+ {
136118
+ "epoch": 0.96675,
136119
+ "grad_norm": 0.640625,
136120
+ "learning_rate": 0.00010025125628140704,
136121
+ "loss": 2.0826,
136122
+ "step": 193350
136123
+ },
136124
+ {
136125
+ "epoch": 0.9668,
136126
+ "grad_norm": 0.6328125,
136127
+ "learning_rate": 0.00010010050251256282,
136128
+ "loss": 2.0846,
136129
+ "step": 193360
136130
+ },
136131
+ {
136132
+ "epoch": 0.96685,
136133
+ "grad_norm": 0.68359375,
136134
+ "learning_rate": 9.99497487437186e-05,
136135
+ "loss": 2.1111,
136136
+ "step": 193370
136137
+ },
136138
+ {
136139
+ "epoch": 0.9669,
136140
+ "grad_norm": 0.60546875,
136141
+ "learning_rate": 9.979899497487438e-05,
136142
+ "loss": 2.0623,
136143
+ "step": 193380
136144
+ },
136145
+ {
136146
+ "epoch": 0.96695,
136147
+ "grad_norm": 0.62890625,
136148
+ "learning_rate": 9.964824120603016e-05,
136149
+ "loss": 2.1013,
136150
+ "step": 193390
136151
+ },
136152
+ {
136153
+ "epoch": 0.967,
136154
+ "grad_norm": 0.58984375,
136155
+ "learning_rate": 9.949748743718594e-05,
136156
+ "loss": 2.069,
136157
+ "step": 193400
136158
+ },
136159
+ {
136160
+ "epoch": 0.96705,
136161
+ "grad_norm": 0.64453125,
136162
+ "learning_rate": 9.934673366834172e-05,
136163
+ "loss": 2.1121,
136164
+ "step": 193410
136165
+ },
136166
+ {
136167
+ "epoch": 0.9671,
136168
+ "grad_norm": 0.6328125,
136169
+ "learning_rate": 9.91959798994975e-05,
136170
+ "loss": 2.0587,
136171
+ "step": 193420
136172
+ },
136173
+ {
136174
+ "epoch": 0.96715,
136175
+ "grad_norm": 0.64453125,
136176
+ "learning_rate": 9.904522613065328e-05,
136177
+ "loss": 2.0443,
136178
+ "step": 193430
136179
+ },
136180
+ {
136181
+ "epoch": 0.9672,
136182
+ "grad_norm": 0.6171875,
136183
+ "learning_rate": 9.889447236180906e-05,
136184
+ "loss": 2.0459,
136185
+ "step": 193440
136186
+ },
136187
+ {
136188
+ "epoch": 0.96725,
136189
+ "grad_norm": 0.60546875,
136190
+ "learning_rate": 9.874371859296484e-05,
136191
+ "loss": 2.0208,
136192
+ "step": 193450
136193
+ },
136194
+ {
136195
+ "epoch": 0.9673,
136196
+ "grad_norm": 0.640625,
136197
+ "learning_rate": 9.859296482412062e-05,
136198
+ "loss": 2.1135,
136199
+ "step": 193460
136200
+ },
136201
+ {
136202
+ "epoch": 0.96735,
136203
+ "grad_norm": 0.59375,
136204
+ "learning_rate": 9.84422110552764e-05,
136205
+ "loss": 2.0626,
136206
+ "step": 193470
136207
+ },
136208
+ {
136209
+ "epoch": 0.9674,
136210
+ "grad_norm": 0.6484375,
136211
+ "learning_rate": 9.829145728643218e-05,
136212
+ "loss": 2.1517,
136213
+ "step": 193480
136214
+ },
136215
+ {
136216
+ "epoch": 0.96745,
136217
+ "grad_norm": 0.56640625,
136218
+ "learning_rate": 9.814070351758795e-05,
136219
+ "loss": 2.0249,
136220
+ "step": 193490
136221
+ },
136222
+ {
136223
+ "epoch": 0.9675,
136224
+ "grad_norm": 0.56640625,
136225
+ "learning_rate": 9.798994974874373e-05,
136226
+ "loss": 2.116,
136227
+ "step": 193500
136228
+ },
136229
+ {
136230
+ "epoch": 0.96755,
136231
+ "grad_norm": 0.64453125,
136232
+ "learning_rate": 9.783919597989951e-05,
136233
+ "loss": 2.0581,
136234
+ "step": 193510
136235
+ },
136236
+ {
136237
+ "epoch": 0.9676,
136238
+ "grad_norm": 0.6328125,
136239
+ "learning_rate": 9.768844221105528e-05,
136240
+ "loss": 2.1323,
136241
+ "step": 193520
136242
+ },
136243
+ {
136244
+ "epoch": 0.96765,
136245
+ "grad_norm": 0.67578125,
136246
+ "learning_rate": 9.753768844221106e-05,
136247
+ "loss": 2.0611,
136248
+ "step": 193530
136249
+ },
136250
+ {
136251
+ "epoch": 0.9677,
136252
+ "grad_norm": 0.64453125,
136253
+ "learning_rate": 9.738693467336684e-05,
136254
+ "loss": 2.0753,
136255
+ "step": 193540
136256
+ },
136257
+ {
136258
+ "epoch": 0.96775,
136259
+ "grad_norm": 0.640625,
136260
+ "learning_rate": 9.723618090452262e-05,
136261
+ "loss": 2.0894,
136262
+ "step": 193550
136263
+ },
136264
+ {
136265
+ "epoch": 0.9678,
136266
+ "grad_norm": 0.578125,
136267
+ "learning_rate": 9.70854271356784e-05,
136268
+ "loss": 2.0414,
136269
+ "step": 193560
136270
+ },
136271
+ {
136272
+ "epoch": 0.96785,
136273
+ "grad_norm": 0.69921875,
136274
+ "learning_rate": 9.693467336683418e-05,
136275
+ "loss": 2.0803,
136276
+ "step": 193570
136277
+ },
136278
+ {
136279
+ "epoch": 0.9679,
136280
+ "grad_norm": 0.56640625,
136281
+ "learning_rate": 9.678391959798996e-05,
136282
+ "loss": 2.0275,
136283
+ "step": 193580
136284
+ },
136285
+ {
136286
+ "epoch": 0.96795,
136287
+ "grad_norm": 0.60546875,
136288
+ "learning_rate": 9.663316582914574e-05,
136289
+ "loss": 2.1239,
136290
+ "step": 193590
136291
+ },
136292
+ {
136293
+ "epoch": 0.968,
136294
+ "grad_norm": 0.5859375,
136295
+ "learning_rate": 9.648241206030152e-05,
136296
+ "loss": 2.0337,
136297
+ "step": 193600
136298
+ },
136299
+ {
136300
+ "epoch": 0.96805,
136301
+ "grad_norm": 0.62109375,
136302
+ "learning_rate": 9.63316582914573e-05,
136303
+ "loss": 2.0741,
136304
+ "step": 193610
136305
+ },
136306
+ {
136307
+ "epoch": 0.9681,
136308
+ "grad_norm": 0.66015625,
136309
+ "learning_rate": 9.618090452261308e-05,
136310
+ "loss": 2.0599,
136311
+ "step": 193620
136312
+ },
136313
+ {
136314
+ "epoch": 0.96815,
136315
+ "grad_norm": 0.62890625,
136316
+ "learning_rate": 9.603015075376886e-05,
136317
+ "loss": 2.0559,
136318
+ "step": 193630
136319
+ },
136320
+ {
136321
+ "epoch": 0.9682,
136322
+ "grad_norm": 0.640625,
136323
+ "learning_rate": 9.587939698492461e-05,
136324
+ "loss": 2.1606,
136325
+ "step": 193640
136326
+ },
136327
+ {
136328
+ "epoch": 0.96825,
136329
+ "grad_norm": 0.62109375,
136330
+ "learning_rate": 9.572864321608039e-05,
136331
+ "loss": 2.0655,
136332
+ "step": 193650
136333
+ },
136334
+ {
136335
+ "epoch": 0.9683,
136336
+ "grad_norm": 0.58203125,
136337
+ "learning_rate": 9.557788944723617e-05,
136338
+ "loss": 2.0956,
136339
+ "step": 193660
136340
+ },
136341
+ {
136342
+ "epoch": 0.96835,
136343
+ "grad_norm": 0.6328125,
136344
+ "learning_rate": 9.542713567839195e-05,
136345
+ "loss": 2.0487,
136346
+ "step": 193670
136347
+ },
136348
+ {
136349
+ "epoch": 0.9684,
136350
+ "grad_norm": 0.60546875,
136351
+ "learning_rate": 9.527638190954773e-05,
136352
+ "loss": 2.0721,
136353
+ "step": 193680
136354
+ },
136355
+ {
136356
+ "epoch": 0.96845,
136357
+ "grad_norm": 0.62109375,
136358
+ "learning_rate": 9.512562814070351e-05,
136359
+ "loss": 2.108,
136360
+ "step": 193690
136361
+ },
136362
+ {
136363
+ "epoch": 0.9685,
136364
+ "grad_norm": 0.6484375,
136365
+ "learning_rate": 9.497487437185929e-05,
136366
+ "loss": 2.09,
136367
+ "step": 193700
136368
+ },
136369
+ {
136370
+ "epoch": 0.96855,
136371
+ "grad_norm": 0.61328125,
136372
+ "learning_rate": 9.482412060301507e-05,
136373
+ "loss": 2.0447,
136374
+ "step": 193710
136375
+ },
136376
+ {
136377
+ "epoch": 0.9686,
136378
+ "grad_norm": 0.578125,
136379
+ "learning_rate": 9.467336683417085e-05,
136380
+ "loss": 2.0791,
136381
+ "step": 193720
136382
+ },
136383
+ {
136384
+ "epoch": 0.96865,
136385
+ "grad_norm": 0.5859375,
136386
+ "learning_rate": 9.452261306532663e-05,
136387
+ "loss": 2.1149,
136388
+ "step": 193730
136389
+ },
136390
+ {
136391
+ "epoch": 0.9687,
136392
+ "grad_norm": 0.59765625,
136393
+ "learning_rate": 9.437185929648241e-05,
136394
+ "loss": 2.0379,
136395
+ "step": 193740
136396
+ },
136397
+ {
136398
+ "epoch": 0.96875,
136399
+ "grad_norm": 0.53125,
136400
+ "learning_rate": 9.422110552763819e-05,
136401
+ "loss": 2.0959,
136402
+ "step": 193750
136403
+ },
136404
+ {
136405
+ "epoch": 0.9688,
136406
+ "grad_norm": 0.58984375,
136407
+ "learning_rate": 9.407035175879397e-05,
136408
+ "loss": 2.0678,
136409
+ "step": 193760
136410
+ },
136411
+ {
136412
+ "epoch": 0.96885,
136413
+ "grad_norm": 0.66796875,
136414
+ "learning_rate": 9.391959798994975e-05,
136415
+ "loss": 2.1388,
136416
+ "step": 193770
136417
+ },
136418
+ {
136419
+ "epoch": 0.9689,
136420
+ "grad_norm": 0.60546875,
136421
+ "learning_rate": 9.376884422110553e-05,
136422
+ "loss": 2.09,
136423
+ "step": 193780
136424
+ },
136425
+ {
136426
+ "epoch": 0.96895,
136427
+ "grad_norm": 0.60546875,
136428
+ "learning_rate": 9.36180904522613e-05,
136429
+ "loss": 2.1365,
136430
+ "step": 193790
136431
+ },
136432
+ {
136433
+ "epoch": 0.969,
136434
+ "grad_norm": 0.609375,
136435
+ "learning_rate": 9.346733668341709e-05,
136436
+ "loss": 2.0655,
136437
+ "step": 193800
136438
+ },
136439
+ {
136440
+ "epoch": 0.96905,
136441
+ "grad_norm": 0.57421875,
136442
+ "learning_rate": 9.331658291457287e-05,
136443
+ "loss": 2.0588,
136444
+ "step": 193810
136445
+ },
136446
+ {
136447
+ "epoch": 0.9691,
136448
+ "grad_norm": 0.6640625,
136449
+ "learning_rate": 9.316582914572864e-05,
136450
+ "loss": 2.1514,
136451
+ "step": 193820
136452
+ },
136453
+ {
136454
+ "epoch": 0.96915,
136455
+ "grad_norm": 0.67578125,
136456
+ "learning_rate": 9.301507537688442e-05,
136457
+ "loss": 2.0114,
136458
+ "step": 193830
136459
+ },
136460
+ {
136461
+ "epoch": 0.9692,
136462
+ "grad_norm": 0.56640625,
136463
+ "learning_rate": 9.28643216080402e-05,
136464
+ "loss": 2.0999,
136465
+ "step": 193840
136466
+ },
136467
+ {
136468
+ "epoch": 0.96925,
136469
+ "grad_norm": 0.6328125,
136470
+ "learning_rate": 9.271356783919598e-05,
136471
+ "loss": 2.0517,
136472
+ "step": 193850
136473
+ },
136474
+ {
136475
+ "epoch": 0.9693,
136476
+ "grad_norm": 0.6796875,
136477
+ "learning_rate": 9.256281407035176e-05,
136478
+ "loss": 2.1083,
136479
+ "step": 193860
136480
+ },
136481
+ {
136482
+ "epoch": 0.96935,
136483
+ "grad_norm": 0.58984375,
136484
+ "learning_rate": 9.241206030150754e-05,
136485
+ "loss": 2.0611,
136486
+ "step": 193870
136487
+ },
136488
+ {
136489
+ "epoch": 0.9694,
136490
+ "grad_norm": 0.625,
136491
+ "learning_rate": 9.226130653266332e-05,
136492
+ "loss": 2.0811,
136493
+ "step": 193880
136494
+ },
136495
+ {
136496
+ "epoch": 0.96945,
136497
+ "grad_norm": 0.58984375,
136498
+ "learning_rate": 9.21105527638191e-05,
136499
+ "loss": 2.0413,
136500
+ "step": 193890
136501
+ },
136502
+ {
136503
+ "epoch": 0.9695,
136504
+ "grad_norm": 0.625,
136505
+ "learning_rate": 9.195979899497488e-05,
136506
+ "loss": 2.0435,
136507
+ "step": 193900
136508
+ },
136509
+ {
136510
+ "epoch": 0.96955,
136511
+ "grad_norm": 0.625,
136512
+ "learning_rate": 9.180904522613066e-05,
136513
+ "loss": 2.1016,
136514
+ "step": 193910
136515
+ },
136516
+ {
136517
+ "epoch": 0.9696,
136518
+ "grad_norm": 0.58203125,
136519
+ "learning_rate": 9.165829145728644e-05,
136520
+ "loss": 2.1049,
136521
+ "step": 193920
136522
+ },
136523
+ {
136524
+ "epoch": 0.96965,
136525
+ "grad_norm": 0.609375,
136526
+ "learning_rate": 9.150753768844221e-05,
136527
+ "loss": 2.0746,
136528
+ "step": 193930
136529
+ },
136530
+ {
136531
+ "epoch": 0.9697,
136532
+ "grad_norm": 0.66796875,
136533
+ "learning_rate": 9.135678391959799e-05,
136534
+ "loss": 2.0927,
136535
+ "step": 193940
136536
+ },
136537
+ {
136538
+ "epoch": 0.96975,
136539
+ "grad_norm": 0.63671875,
136540
+ "learning_rate": 9.120603015075377e-05,
136541
+ "loss": 2.109,
136542
+ "step": 193950
136543
+ },
136544
+ {
136545
+ "epoch": 0.9698,
136546
+ "grad_norm": 0.66796875,
136547
+ "learning_rate": 9.105527638190955e-05,
136548
+ "loss": 2.0889,
136549
+ "step": 193960
136550
+ },
136551
+ {
136552
+ "epoch": 0.96985,
136553
+ "grad_norm": 0.60546875,
136554
+ "learning_rate": 9.090452261306533e-05,
136555
+ "loss": 2.0564,
136556
+ "step": 193970
136557
+ },
136558
+ {
136559
+ "epoch": 0.9699,
136560
+ "grad_norm": 0.7265625,
136561
+ "learning_rate": 9.075376884422111e-05,
136562
+ "loss": 2.0829,
136563
+ "step": 193980
136564
+ },
136565
+ {
136566
+ "epoch": 0.96995,
136567
+ "grad_norm": 0.6953125,
136568
+ "learning_rate": 9.060301507537689e-05,
136569
+ "loss": 2.0556,
136570
+ "step": 193990
136571
+ },
136572
+ {
136573
+ "epoch": 0.97,
136574
+ "grad_norm": 0.6328125,
136575
+ "learning_rate": 9.045226130653267e-05,
136576
+ "loss": 2.1009,
136577
+ "step": 194000
136578
+ },
136579
+ {
136580
+ "epoch": 0.97,
136581
+ "eval_loss": 2.07601261138916,
136582
+ "eval_runtime": 47.1258,
136583
+ "eval_samples_per_second": 53.049,
136584
+ "eval_steps_per_second": 0.106,
136585
+ "step": 194000
136586
  }
136587
  ],
136588
  "logging_steps": 10,
 
136602
  "attributes": {}
136603
  }
136604
  },
136605
+ "total_flos": 5.130708755205325e+18,
136606
  "train_batch_size": 64,
136607
  "trial_name": null,
136608
  "trial_params": null