nadahlberg commited on
Commit
1553183
1 Parent(s): 1098d3f

Training in progress, step 102000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a9fb22dfbf1adc73d11a6527c8ebd09e9de9f8d2ce94bcf0e2e38ad8bc008f2
3
  size 2997015224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7821e784728fa25fcfdc9d782f516d1e1f9b338613feccdf922f1d690b910a85
3
  size 2997015224
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:049690e98946a6570607b09ecfddfbeb54d74eb978af5867bb8c0b94765e8bb2
3
  size 5994123294
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f7fc10eaccc24db892ba90a3c868629f6f74d56d4af8eec371960b9460a18fd
3
  size 5994123294
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d318fd59de9c15420b24bad3ee415d0f58efeb8ddbe8cbe9f07769fc11bdce52
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cf990d87a7056e9e4b0a252156f6b30795db2a686df2fb1c99f6e3a11d9838f
3
  size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:790a8d0aecd000e37864a66ba30b275a932ba9b1ab4c5ac19b0c081fb9d48df5
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76db20779ac73e63d7d686b30d532d98ecbc31c9d6b323e5308a241a5f958a43
3
  size 15920
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:961c56df575cbd2196a1b125a5e8e464c8417937fea7d4404dce35fc1bdf6cf7
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce61d981d04468713b611e91a2f0d4a82eab3612f8080a73371b4286cb63626d
3
  size 15920
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38e45d5c754d89cd11b6d7de61e88e5c67ed61b63c9e7a49b2da51d151a01209
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f071c6bab394ba16aa2e81723b39d293147108a946add4b7cb9e8995f7a2eda6
3
  size 15920
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f50b3d7706395c48d98a75996e20753150133fc58e7321aecf42b14e73d0885b
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7365d7bb904d56209b9f093837e31cc34422778fa119f818fd18ab35415e1bdc
3
  size 15920
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f7404f15132697d1111f9ca68ee917a35328c8bafc9ee6e791f19ea3a1ec103
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9adc39521416cf27c3bd1677c4843d13bc25278518dfb7775d844fc53cf3a78f
3
  size 15920
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4413be3df8baa9fd0aeb101ad71b6f8098bd0ac078b146677c9dba8828916897
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4647715470bd8450f77c0b9ab529748ba4914076bb6ec38c8c5c5b31cf8666dd
3
  size 15920
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71b2337ad94cacffcb46150617e0a3c81cdaa147f8da41669c78a811b7520787
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7904bd2bef78f87b42aca6356cfdbb66809fe1ebc3b4a680d13a9faa13505e0
3
  size 15920
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e020f36fd4e9aca8f3b879030144c70997cf3f0bc868594a1ced58b8eb197b0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a01d2518bf51c6d331e2a2fd21d75544661ef5a07c1d1563fa31a405709da4a5
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5,
5
  "eval_steps": 2000,
6
- "global_step": 100000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -70407,6 +70407,1414 @@
70407
  "eval_samples_per_second": 27.748,
70408
  "eval_steps_per_second": 0.444,
70409
  "step": 100000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70410
  }
70411
  ],
70412
  "logging_steps": 10,
@@ -70426,7 +71834,7 @@
70426
  "attributes": {}
70427
  }
70428
  },
70429
- "total_flos": 2.4227419614590534e+19,
70430
  "train_batch_size": 8,
70431
  "trial_name": null,
70432
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.51,
5
  "eval_steps": 2000,
6
+ "global_step": 102000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
70407
  "eval_samples_per_second": 27.748,
70408
  "eval_steps_per_second": 0.444,
70409
  "step": 100000
70410
+ },
70411
+ {
70412
+ "epoch": 0.50005,
70413
+ "grad_norm": 2.046875,
70414
+ "learning_rate": 0.00015073869346733667,
70415
+ "loss": 2.0577,
70416
+ "step": 100010
70417
+ },
70418
+ {
70419
+ "epoch": 0.5001,
70420
+ "grad_norm": 1.96875,
70421
+ "learning_rate": 0.00015072361809045226,
70422
+ "loss": 2.04,
70423
+ "step": 100020
70424
+ },
70425
+ {
70426
+ "epoch": 0.50015,
70427
+ "grad_norm": 2.046875,
70428
+ "learning_rate": 0.00015070854271356782,
70429
+ "loss": 2.0467,
70430
+ "step": 100030
70431
+ },
70432
+ {
70433
+ "epoch": 0.5002,
70434
+ "grad_norm": 2.078125,
70435
+ "learning_rate": 0.0001506934673366834,
70436
+ "loss": 2.0113,
70437
+ "step": 100040
70438
+ },
70439
+ {
70440
+ "epoch": 0.50025,
70441
+ "grad_norm": 2.078125,
70442
+ "learning_rate": 0.000150678391959799,
70443
+ "loss": 2.0461,
70444
+ "step": 100050
70445
+ },
70446
+ {
70447
+ "epoch": 0.5003,
70448
+ "grad_norm": 1.859375,
70449
+ "learning_rate": 0.00015066331658291455,
70450
+ "loss": 1.9994,
70451
+ "step": 100060
70452
+ },
70453
+ {
70454
+ "epoch": 0.50035,
70455
+ "grad_norm": 2.0625,
70456
+ "learning_rate": 0.00015064824120603016,
70457
+ "loss": 2.0283,
70458
+ "step": 100070
70459
+ },
70460
+ {
70461
+ "epoch": 0.5004,
70462
+ "grad_norm": 2.0625,
70463
+ "learning_rate": 0.00015063316582914572,
70464
+ "loss": 2.0629,
70465
+ "step": 100080
70466
+ },
70467
+ {
70468
+ "epoch": 0.50045,
70469
+ "grad_norm": 2.09375,
70470
+ "learning_rate": 0.00015061809045226128,
70471
+ "loss": 2.0144,
70472
+ "step": 100090
70473
+ },
70474
+ {
70475
+ "epoch": 0.5005,
70476
+ "grad_norm": 1.828125,
70477
+ "learning_rate": 0.0001506030150753769,
70478
+ "loss": 2.082,
70479
+ "step": 100100
70480
+ },
70481
+ {
70482
+ "epoch": 0.50055,
70483
+ "grad_norm": 2.140625,
70484
+ "learning_rate": 0.00015058793969849245,
70485
+ "loss": 2.0549,
70486
+ "step": 100110
70487
+ },
70488
+ {
70489
+ "epoch": 0.5006,
70490
+ "grad_norm": 1.9296875,
70491
+ "learning_rate": 0.000150572864321608,
70492
+ "loss": 2.0591,
70493
+ "step": 100120
70494
+ },
70495
+ {
70496
+ "epoch": 0.50065,
70497
+ "grad_norm": 1.9296875,
70498
+ "learning_rate": 0.0001505577889447236,
70499
+ "loss": 1.9906,
70500
+ "step": 100130
70501
+ },
70502
+ {
70503
+ "epoch": 0.5007,
70504
+ "grad_norm": 1.9453125,
70505
+ "learning_rate": 0.00015054271356783919,
70506
+ "loss": 2.0959,
70507
+ "step": 100140
70508
+ },
70509
+ {
70510
+ "epoch": 0.50075,
70511
+ "grad_norm": 1.875,
70512
+ "learning_rate": 0.00015052763819095477,
70513
+ "loss": 2.0214,
70514
+ "step": 100150
70515
+ },
70516
+ {
70517
+ "epoch": 0.5008,
70518
+ "grad_norm": 2.0625,
70519
+ "learning_rate": 0.00015051256281407033,
70520
+ "loss": 2.0448,
70521
+ "step": 100160
70522
+ },
70523
+ {
70524
+ "epoch": 0.50085,
70525
+ "grad_norm": 1.96875,
70526
+ "learning_rate": 0.00015049748743718592,
70527
+ "loss": 2.05,
70528
+ "step": 100170
70529
+ },
70530
+ {
70531
+ "epoch": 0.5009,
70532
+ "grad_norm": 2.21875,
70533
+ "learning_rate": 0.0001504824120603015,
70534
+ "loss": 2.014,
70535
+ "step": 100180
70536
+ },
70537
+ {
70538
+ "epoch": 0.50095,
70539
+ "grad_norm": 1.7734375,
70540
+ "learning_rate": 0.00015046733668341706,
70541
+ "loss": 1.9987,
70542
+ "step": 100190
70543
+ },
70544
+ {
70545
+ "epoch": 0.501,
70546
+ "grad_norm": 2.0,
70547
+ "learning_rate": 0.00015045226130653267,
70548
+ "loss": 2.0231,
70549
+ "step": 100200
70550
+ },
70551
+ {
70552
+ "epoch": 0.50105,
70553
+ "grad_norm": 1.859375,
70554
+ "learning_rate": 0.00015043718592964823,
70555
+ "loss": 2.0519,
70556
+ "step": 100210
70557
+ },
70558
+ {
70559
+ "epoch": 0.5011,
70560
+ "grad_norm": 2.015625,
70561
+ "learning_rate": 0.0001504221105527638,
70562
+ "loss": 2.0005,
70563
+ "step": 100220
70564
+ },
70565
+ {
70566
+ "epoch": 0.50115,
70567
+ "grad_norm": 1.9296875,
70568
+ "learning_rate": 0.0001504070351758794,
70569
+ "loss": 2.0114,
70570
+ "step": 100230
70571
+ },
70572
+ {
70573
+ "epoch": 0.5012,
70574
+ "grad_norm": 1.8359375,
70575
+ "learning_rate": 0.00015039195979899496,
70576
+ "loss": 2.0219,
70577
+ "step": 100240
70578
+ },
70579
+ {
70580
+ "epoch": 0.50125,
70581
+ "grad_norm": 2.140625,
70582
+ "learning_rate": 0.00015037688442211052,
70583
+ "loss": 2.0079,
70584
+ "step": 100250
70585
+ },
70586
+ {
70587
+ "epoch": 0.5013,
70588
+ "grad_norm": 1.9453125,
70589
+ "learning_rate": 0.00015036180904522614,
70590
+ "loss": 2.0236,
70591
+ "step": 100260
70592
+ },
70593
+ {
70594
+ "epoch": 0.50135,
70595
+ "grad_norm": 2.09375,
70596
+ "learning_rate": 0.0001503467336683417,
70597
+ "loss": 2.0025,
70598
+ "step": 100270
70599
+ },
70600
+ {
70601
+ "epoch": 0.5014,
70602
+ "grad_norm": 1.90625,
70603
+ "learning_rate": 0.00015033165829145726,
70604
+ "loss": 2.0407,
70605
+ "step": 100280
70606
+ },
70607
+ {
70608
+ "epoch": 0.50145,
70609
+ "grad_norm": 2.296875,
70610
+ "learning_rate": 0.00015031658291457284,
70611
+ "loss": 2.0013,
70612
+ "step": 100290
70613
+ },
70614
+ {
70615
+ "epoch": 0.5015,
70616
+ "grad_norm": 2.046875,
70617
+ "learning_rate": 0.00015030150753768843,
70618
+ "loss": 1.9921,
70619
+ "step": 100300
70620
+ },
70621
+ {
70622
+ "epoch": 0.50155,
70623
+ "grad_norm": 1.9453125,
70624
+ "learning_rate": 0.000150286432160804,
70625
+ "loss": 2.0336,
70626
+ "step": 100310
70627
+ },
70628
+ {
70629
+ "epoch": 0.5016,
70630
+ "grad_norm": 2.171875,
70631
+ "learning_rate": 0.00015027135678391957,
70632
+ "loss": 2.0329,
70633
+ "step": 100320
70634
+ },
70635
+ {
70636
+ "epoch": 0.50165,
70637
+ "grad_norm": 1.9921875,
70638
+ "learning_rate": 0.00015025628140703519,
70639
+ "loss": 2.0327,
70640
+ "step": 100330
70641
+ },
70642
+ {
70643
+ "epoch": 0.5017,
70644
+ "grad_norm": 1.78125,
70645
+ "learning_rate": 0.00015024120603015074,
70646
+ "loss": 2.0726,
70647
+ "step": 100340
70648
+ },
70649
+ {
70650
+ "epoch": 0.50175,
70651
+ "grad_norm": 1.828125,
70652
+ "learning_rate": 0.0001502261306532663,
70653
+ "loss": 2.0025,
70654
+ "step": 100350
70655
+ },
70656
+ {
70657
+ "epoch": 0.5018,
70658
+ "grad_norm": 1.9140625,
70659
+ "learning_rate": 0.00015021105527638192,
70660
+ "loss": 2.0309,
70661
+ "step": 100360
70662
+ },
70663
+ {
70664
+ "epoch": 0.50185,
70665
+ "grad_norm": 1.9609375,
70666
+ "learning_rate": 0.00015019597989949748,
70667
+ "loss": 2.0078,
70668
+ "step": 100370
70669
+ },
70670
+ {
70671
+ "epoch": 0.5019,
70672
+ "grad_norm": 1.921875,
70673
+ "learning_rate": 0.00015018090452261303,
70674
+ "loss": 2.0121,
70675
+ "step": 100380
70676
+ },
70677
+ {
70678
+ "epoch": 0.50195,
70679
+ "grad_norm": 1.8984375,
70680
+ "learning_rate": 0.00015016582914572865,
70681
+ "loss": 2.0177,
70682
+ "step": 100390
70683
+ },
70684
+ {
70685
+ "epoch": 0.502,
70686
+ "grad_norm": 1.8515625,
70687
+ "learning_rate": 0.0001501507537688442,
70688
+ "loss": 2.012,
70689
+ "step": 100400
70690
+ },
70691
+ {
70692
+ "epoch": 0.50205,
70693
+ "grad_norm": 1.9453125,
70694
+ "learning_rate": 0.00015013567839195977,
70695
+ "loss": 2.0336,
70696
+ "step": 100410
70697
+ },
70698
+ {
70699
+ "epoch": 0.5021,
70700
+ "grad_norm": 2.03125,
70701
+ "learning_rate": 0.00015012060301507538,
70702
+ "loss": 2.0294,
70703
+ "step": 100420
70704
+ },
70705
+ {
70706
+ "epoch": 0.50215,
70707
+ "grad_norm": 1.875,
70708
+ "learning_rate": 0.00015010552763819094,
70709
+ "loss": 2.0701,
70710
+ "step": 100430
70711
+ },
70712
+ {
70713
+ "epoch": 0.5022,
70714
+ "grad_norm": 2.140625,
70715
+ "learning_rate": 0.00015009045226130652,
70716
+ "loss": 2.042,
70717
+ "step": 100440
70718
+ },
70719
+ {
70720
+ "epoch": 0.50225,
70721
+ "grad_norm": 2.15625,
70722
+ "learning_rate": 0.00015007537688442208,
70723
+ "loss": 2.0562,
70724
+ "step": 100450
70725
+ },
70726
+ {
70727
+ "epoch": 0.5023,
70728
+ "grad_norm": 1.71875,
70729
+ "learning_rate": 0.0001500603015075377,
70730
+ "loss": 1.9848,
70731
+ "step": 100460
70732
+ },
70733
+ {
70734
+ "epoch": 0.50235,
70735
+ "grad_norm": 2.03125,
70736
+ "learning_rate": 0.00015004522613065326,
70737
+ "loss": 2.0385,
70738
+ "step": 100470
70739
+ },
70740
+ {
70741
+ "epoch": 0.5024,
70742
+ "grad_norm": 1.8515625,
70743
+ "learning_rate": 0.00015003015075376881,
70744
+ "loss": 2.0441,
70745
+ "step": 100480
70746
+ },
70747
+ {
70748
+ "epoch": 0.50245,
70749
+ "grad_norm": 1.8828125,
70750
+ "learning_rate": 0.00015001507537688443,
70751
+ "loss": 1.9888,
70752
+ "step": 100490
70753
+ },
70754
+ {
70755
+ "epoch": 0.5025,
70756
+ "grad_norm": 2.109375,
70757
+ "learning_rate": 0.00015,
70758
+ "loss": 2.0933,
70759
+ "step": 100500
70760
+ },
70761
+ {
70762
+ "epoch": 0.50255,
70763
+ "grad_norm": 1.921875,
70764
+ "learning_rate": 0.00014998492462311557,
70765
+ "loss": 1.9895,
70766
+ "step": 100510
70767
+ },
70768
+ {
70769
+ "epoch": 0.5026,
70770
+ "grad_norm": 2.0625,
70771
+ "learning_rate": 0.00014996984924623113,
70772
+ "loss": 2.0241,
70773
+ "step": 100520
70774
+ },
70775
+ {
70776
+ "epoch": 0.50265,
70777
+ "grad_norm": 1.9296875,
70778
+ "learning_rate": 0.00014995477386934672,
70779
+ "loss": 2.0868,
70780
+ "step": 100530
70781
+ },
70782
+ {
70783
+ "epoch": 0.5027,
70784
+ "grad_norm": 1.90625,
70785
+ "learning_rate": 0.00014993969849246228,
70786
+ "loss": 1.9649,
70787
+ "step": 100540
70788
+ },
70789
+ {
70790
+ "epoch": 0.50275,
70791
+ "grad_norm": 2.09375,
70792
+ "learning_rate": 0.00014992462311557786,
70793
+ "loss": 2.0709,
70794
+ "step": 100550
70795
+ },
70796
+ {
70797
+ "epoch": 0.5028,
70798
+ "grad_norm": 2.046875,
70799
+ "learning_rate": 0.00014990954773869345,
70800
+ "loss": 2.0065,
70801
+ "step": 100560
70802
+ },
70803
+ {
70804
+ "epoch": 0.50285,
70805
+ "grad_norm": 1.828125,
70806
+ "learning_rate": 0.00014989447236180904,
70807
+ "loss": 2.0105,
70808
+ "step": 100570
70809
+ },
70810
+ {
70811
+ "epoch": 0.5029,
70812
+ "grad_norm": 2.09375,
70813
+ "learning_rate": 0.00014987939698492462,
70814
+ "loss": 2.004,
70815
+ "step": 100580
70816
+ },
70817
+ {
70818
+ "epoch": 0.50295,
70819
+ "grad_norm": 1.9921875,
70820
+ "learning_rate": 0.0001498643216080402,
70821
+ "loss": 2.0418,
70822
+ "step": 100590
70823
+ },
70824
+ {
70825
+ "epoch": 0.503,
70826
+ "grad_norm": 1.921875,
70827
+ "learning_rate": 0.00014984924623115577,
70828
+ "loss": 1.9932,
70829
+ "step": 100600
70830
+ },
70831
+ {
70832
+ "epoch": 0.50305,
70833
+ "grad_norm": 2.03125,
70834
+ "learning_rate": 0.00014983417085427135,
70835
+ "loss": 2.0319,
70836
+ "step": 100610
70837
+ },
70838
+ {
70839
+ "epoch": 0.5031,
70840
+ "grad_norm": 2.09375,
70841
+ "learning_rate": 0.0001498190954773869,
70842
+ "loss": 2.0594,
70843
+ "step": 100620
70844
+ },
70845
+ {
70846
+ "epoch": 0.50315,
70847
+ "grad_norm": 1.875,
70848
+ "learning_rate": 0.0001498040201005025,
70849
+ "loss": 2.0186,
70850
+ "step": 100630
70851
+ },
70852
+ {
70853
+ "epoch": 0.5032,
70854
+ "grad_norm": 1.875,
70855
+ "learning_rate": 0.00014978894472361808,
70856
+ "loss": 2.0526,
70857
+ "step": 100640
70858
+ },
70859
+ {
70860
+ "epoch": 0.50325,
70861
+ "grad_norm": 2.265625,
70862
+ "learning_rate": 0.00014977386934673364,
70863
+ "loss": 1.9934,
70864
+ "step": 100650
70865
+ },
70866
+ {
70867
+ "epoch": 0.5033,
70868
+ "grad_norm": 1.9765625,
70869
+ "learning_rate": 0.00014975879396984923,
70870
+ "loss": 1.9944,
70871
+ "step": 100660
70872
+ },
70873
+ {
70874
+ "epoch": 0.50335,
70875
+ "grad_norm": 1.890625,
70876
+ "learning_rate": 0.00014974371859296482,
70877
+ "loss": 2.0696,
70878
+ "step": 100670
70879
+ },
70880
+ {
70881
+ "epoch": 0.5034,
70882
+ "grad_norm": 1.9609375,
70883
+ "learning_rate": 0.00014972864321608037,
70884
+ "loss": 2.0373,
70885
+ "step": 100680
70886
+ },
70887
+ {
70888
+ "epoch": 0.50345,
70889
+ "grad_norm": 2.046875,
70890
+ "learning_rate": 0.00014971356783919596,
70891
+ "loss": 2.0697,
70892
+ "step": 100690
70893
+ },
70894
+ {
70895
+ "epoch": 0.5035,
70896
+ "grad_norm": 2.078125,
70897
+ "learning_rate": 0.00014969849246231155,
70898
+ "loss": 2.015,
70899
+ "step": 100700
70900
+ },
70901
+ {
70902
+ "epoch": 0.50355,
70903
+ "grad_norm": 2.015625,
70904
+ "learning_rate": 0.00014968341708542713,
70905
+ "loss": 2.026,
70906
+ "step": 100710
70907
+ },
70908
+ {
70909
+ "epoch": 0.5036,
70910
+ "grad_norm": 1.859375,
70911
+ "learning_rate": 0.00014966834170854272,
70912
+ "loss": 1.9736,
70913
+ "step": 100720
70914
+ },
70915
+ {
70916
+ "epoch": 0.50365,
70917
+ "grad_norm": 2.109375,
70918
+ "learning_rate": 0.00014965326633165828,
70919
+ "loss": 1.9758,
70920
+ "step": 100730
70921
+ },
70922
+ {
70923
+ "epoch": 0.5037,
70924
+ "grad_norm": 2.03125,
70925
+ "learning_rate": 0.00014963819095477386,
70926
+ "loss": 2.0381,
70927
+ "step": 100740
70928
+ },
70929
+ {
70930
+ "epoch": 0.50375,
70931
+ "grad_norm": 1.984375,
70932
+ "learning_rate": 0.00014962311557788945,
70933
+ "loss": 2.005,
70934
+ "step": 100750
70935
+ },
70936
+ {
70937
+ "epoch": 0.5038,
70938
+ "grad_norm": 2.1875,
70939
+ "learning_rate": 0.000149608040201005,
70940
+ "loss": 2.0392,
70941
+ "step": 100760
70942
+ },
70943
+ {
70944
+ "epoch": 0.50385,
70945
+ "grad_norm": 1.921875,
70946
+ "learning_rate": 0.0001495929648241206,
70947
+ "loss": 1.9888,
70948
+ "step": 100770
70949
+ },
70950
+ {
70951
+ "epoch": 0.5039,
70952
+ "grad_norm": 2.0,
70953
+ "learning_rate": 0.00014957788944723615,
70954
+ "loss": 2.0717,
70955
+ "step": 100780
70956
+ },
70957
+ {
70958
+ "epoch": 0.50395,
70959
+ "grad_norm": 2.09375,
70960
+ "learning_rate": 0.00014956281407035174,
70961
+ "loss": 2.0205,
70962
+ "step": 100790
70963
+ },
70964
+ {
70965
+ "epoch": 0.504,
70966
+ "grad_norm": 2.421875,
70967
+ "learning_rate": 0.00014954773869346733,
70968
+ "loss": 2.0285,
70969
+ "step": 100800
70970
+ },
70971
+ {
70972
+ "epoch": 0.50405,
70973
+ "grad_norm": 1.9453125,
70974
+ "learning_rate": 0.00014953266331658289,
70975
+ "loss": 2.086,
70976
+ "step": 100810
70977
+ },
70978
+ {
70979
+ "epoch": 0.5041,
70980
+ "grad_norm": 2.015625,
70981
+ "learning_rate": 0.00014951758793969847,
70982
+ "loss": 1.9678,
70983
+ "step": 100820
70984
+ },
70985
+ {
70986
+ "epoch": 0.50415,
70987
+ "grad_norm": 1.828125,
70988
+ "learning_rate": 0.00014950251256281406,
70989
+ "loss": 2.1181,
70990
+ "step": 100830
70991
+ },
70992
+ {
70993
+ "epoch": 0.5042,
70994
+ "grad_norm": 1.765625,
70995
+ "learning_rate": 0.00014948743718592964,
70996
+ "loss": 2.0051,
70997
+ "step": 100840
70998
+ },
70999
+ {
71000
+ "epoch": 0.50425,
71001
+ "grad_norm": 1.875,
71002
+ "learning_rate": 0.0001494723618090452,
71003
+ "loss": 2.0193,
71004
+ "step": 100850
71005
+ },
71006
+ {
71007
+ "epoch": 0.5043,
71008
+ "grad_norm": 1.875,
71009
+ "learning_rate": 0.0001494572864321608,
71010
+ "loss": 2.0467,
71011
+ "step": 100860
71012
+ },
71013
+ {
71014
+ "epoch": 0.50435,
71015
+ "grad_norm": 2.09375,
71016
+ "learning_rate": 0.00014944221105527637,
71017
+ "loss": 2.0017,
71018
+ "step": 100870
71019
+ },
71020
+ {
71021
+ "epoch": 0.5044,
71022
+ "grad_norm": 1.90625,
71023
+ "learning_rate": 0.00014942713567839196,
71024
+ "loss": 2.0517,
71025
+ "step": 100880
71026
+ },
71027
+ {
71028
+ "epoch": 0.50445,
71029
+ "grad_norm": 2.078125,
71030
+ "learning_rate": 0.00014941206030150752,
71031
+ "loss": 1.9835,
71032
+ "step": 100890
71033
+ },
71034
+ {
71035
+ "epoch": 0.5045,
71036
+ "grad_norm": 1.8203125,
71037
+ "learning_rate": 0.0001493969849246231,
71038
+ "loss": 2.0614,
71039
+ "step": 100900
71040
+ },
71041
+ {
71042
+ "epoch": 0.50455,
71043
+ "grad_norm": 1.953125,
71044
+ "learning_rate": 0.0001493819095477387,
71045
+ "loss": 2.0655,
71046
+ "step": 100910
71047
+ },
71048
+ {
71049
+ "epoch": 0.5046,
71050
+ "grad_norm": 2.109375,
71051
+ "learning_rate": 0.00014936683417085425,
71052
+ "loss": 1.9778,
71053
+ "step": 100920
71054
+ },
71055
+ {
71056
+ "epoch": 0.50465,
71057
+ "grad_norm": 1.9296875,
71058
+ "learning_rate": 0.00014935175879396984,
71059
+ "loss": 2.0143,
71060
+ "step": 100930
71061
+ },
71062
+ {
71063
+ "epoch": 0.5047,
71064
+ "grad_norm": 2.0625,
71065
+ "learning_rate": 0.0001493366834170854,
71066
+ "loss": 1.9989,
71067
+ "step": 100940
71068
+ },
71069
+ {
71070
+ "epoch": 0.50475,
71071
+ "grad_norm": 2.03125,
71072
+ "learning_rate": 0.00014932160804020098,
71073
+ "loss": 2.0735,
71074
+ "step": 100950
71075
+ },
71076
+ {
71077
+ "epoch": 0.5048,
71078
+ "grad_norm": 1.984375,
71079
+ "learning_rate": 0.00014930653266331657,
71080
+ "loss": 2.0113,
71081
+ "step": 100960
71082
+ },
71083
+ {
71084
+ "epoch": 0.50485,
71085
+ "grad_norm": 1.765625,
71086
+ "learning_rate": 0.00014929145728643215,
71087
+ "loss": 2.046,
71088
+ "step": 100970
71089
+ },
71090
+ {
71091
+ "epoch": 0.5049,
71092
+ "grad_norm": 2.015625,
71093
+ "learning_rate": 0.0001492763819095477,
71094
+ "loss": 1.9569,
71095
+ "step": 100980
71096
+ },
71097
+ {
71098
+ "epoch": 0.50495,
71099
+ "grad_norm": 2.0625,
71100
+ "learning_rate": 0.0001492613065326633,
71101
+ "loss": 2.0329,
71102
+ "step": 100990
71103
+ },
71104
+ {
71105
+ "epoch": 0.505,
71106
+ "grad_norm": 1.96875,
71107
+ "learning_rate": 0.00014924623115577889,
71108
+ "loss": 1.9891,
71109
+ "step": 101000
71110
+ },
71111
+ {
71112
+ "epoch": 0.50505,
71113
+ "grad_norm": 2.015625,
71114
+ "learning_rate": 0.00014923115577889447,
71115
+ "loss": 1.9908,
71116
+ "step": 101010
71117
+ },
71118
+ {
71119
+ "epoch": 0.5051,
71120
+ "grad_norm": 1.7421875,
71121
+ "learning_rate": 0.00014921608040201003,
71122
+ "loss": 2.0593,
71123
+ "step": 101020
71124
+ },
71125
+ {
71126
+ "epoch": 0.50515,
71127
+ "grad_norm": 1.9609375,
71128
+ "learning_rate": 0.00014920100502512562,
71129
+ "loss": 1.9822,
71130
+ "step": 101030
71131
+ },
71132
+ {
71133
+ "epoch": 0.5052,
71134
+ "grad_norm": 1.7734375,
71135
+ "learning_rate": 0.0001491859296482412,
71136
+ "loss": 2.0216,
71137
+ "step": 101040
71138
+ },
71139
+ {
71140
+ "epoch": 0.50525,
71141
+ "grad_norm": 1.8359375,
71142
+ "learning_rate": 0.00014917085427135676,
71143
+ "loss": 1.9841,
71144
+ "step": 101050
71145
+ },
71146
+ {
71147
+ "epoch": 0.5053,
71148
+ "grad_norm": 1.9296875,
71149
+ "learning_rate": 0.00014915577889447235,
71150
+ "loss": 1.9997,
71151
+ "step": 101060
71152
+ },
71153
+ {
71154
+ "epoch": 0.50535,
71155
+ "grad_norm": 1.984375,
71156
+ "learning_rate": 0.00014914070351758793,
71157
+ "loss": 2.044,
71158
+ "step": 101070
71159
+ },
71160
+ {
71161
+ "epoch": 0.5054,
71162
+ "grad_norm": 2.03125,
71163
+ "learning_rate": 0.0001491256281407035,
71164
+ "loss": 2.0443,
71165
+ "step": 101080
71166
+ },
71167
+ {
71168
+ "epoch": 0.50545,
71169
+ "grad_norm": 1.9453125,
71170
+ "learning_rate": 0.00014911055276381908,
71171
+ "loss": 2.0243,
71172
+ "step": 101090
71173
+ },
71174
+ {
71175
+ "epoch": 0.5055,
71176
+ "grad_norm": 1.8359375,
71177
+ "learning_rate": 0.00014909547738693467,
71178
+ "loss": 2.0287,
71179
+ "step": 101100
71180
+ },
71181
+ {
71182
+ "epoch": 0.50555,
71183
+ "grad_norm": 1.8046875,
71184
+ "learning_rate": 0.00014908040201005022,
71185
+ "loss": 2.036,
71186
+ "step": 101110
71187
+ },
71188
+ {
71189
+ "epoch": 0.5056,
71190
+ "grad_norm": 2.046875,
71191
+ "learning_rate": 0.0001490653266331658,
71192
+ "loss": 2.0173,
71193
+ "step": 101120
71194
+ },
71195
+ {
71196
+ "epoch": 0.50565,
71197
+ "grad_norm": 1.921875,
71198
+ "learning_rate": 0.0001490502512562814,
71199
+ "loss": 1.9824,
71200
+ "step": 101130
71201
+ },
71202
+ {
71203
+ "epoch": 0.5057,
71204
+ "grad_norm": 1.765625,
71205
+ "learning_rate": 0.00014903517587939698,
71206
+ "loss": 2.0331,
71207
+ "step": 101140
71208
+ },
71209
+ {
71210
+ "epoch": 0.50575,
71211
+ "grad_norm": 2.21875,
71212
+ "learning_rate": 0.00014902010050251257,
71213
+ "loss": 2.0081,
71214
+ "step": 101150
71215
+ },
71216
+ {
71217
+ "epoch": 0.5058,
71218
+ "grad_norm": 1.9921875,
71219
+ "learning_rate": 0.00014900502512562813,
71220
+ "loss": 2.0302,
71221
+ "step": 101160
71222
+ },
71223
+ {
71224
+ "epoch": 0.50585,
71225
+ "grad_norm": 1.8203125,
71226
+ "learning_rate": 0.00014898994974874371,
71227
+ "loss": 2.0202,
71228
+ "step": 101170
71229
+ },
71230
+ {
71231
+ "epoch": 0.5059,
71232
+ "grad_norm": 2.125,
71233
+ "learning_rate": 0.00014897487437185927,
71234
+ "loss": 2.0086,
71235
+ "step": 101180
71236
+ },
71237
+ {
71238
+ "epoch": 0.50595,
71239
+ "grad_norm": 1.96875,
71240
+ "learning_rate": 0.00014895979899497486,
71241
+ "loss": 2.0516,
71242
+ "step": 101190
71243
+ },
71244
+ {
71245
+ "epoch": 0.506,
71246
+ "grad_norm": 1.9453125,
71247
+ "learning_rate": 0.00014894472361809044,
71248
+ "loss": 2.0198,
71249
+ "step": 101200
71250
+ },
71251
+ {
71252
+ "epoch": 0.50605,
71253
+ "grad_norm": 1.8984375,
71254
+ "learning_rate": 0.000148929648241206,
71255
+ "loss": 2.068,
71256
+ "step": 101210
71257
+ },
71258
+ {
71259
+ "epoch": 0.5061,
71260
+ "grad_norm": 1.90625,
71261
+ "learning_rate": 0.0001489145728643216,
71262
+ "loss": 1.9807,
71263
+ "step": 101220
71264
+ },
71265
+ {
71266
+ "epoch": 0.50615,
71267
+ "grad_norm": 1.8828125,
71268
+ "learning_rate": 0.00014889949748743718,
71269
+ "loss": 2.0408,
71270
+ "step": 101230
71271
+ },
71272
+ {
71273
+ "epoch": 0.5062,
71274
+ "grad_norm": 1.9609375,
71275
+ "learning_rate": 0.00014888442211055274,
71276
+ "loss": 2.0004,
71277
+ "step": 101240
71278
+ },
71279
+ {
71280
+ "epoch": 0.50625,
71281
+ "grad_norm": 1.8515625,
71282
+ "learning_rate": 0.00014886934673366832,
71283
+ "loss": 2.0351,
71284
+ "step": 101250
71285
+ },
71286
+ {
71287
+ "epoch": 0.5063,
71288
+ "grad_norm": 2.203125,
71289
+ "learning_rate": 0.0001488542713567839,
71290
+ "loss": 2.0074,
71291
+ "step": 101260
71292
+ },
71293
+ {
71294
+ "epoch": 0.50635,
71295
+ "grad_norm": 1.9296875,
71296
+ "learning_rate": 0.0001488391959798995,
71297
+ "loss": 2.0361,
71298
+ "step": 101270
71299
+ },
71300
+ {
71301
+ "epoch": 0.5064,
71302
+ "grad_norm": 1.8984375,
71303
+ "learning_rate": 0.00014882412060301508,
71304
+ "loss": 2.0219,
71305
+ "step": 101280
71306
+ },
71307
+ {
71308
+ "epoch": 0.50645,
71309
+ "grad_norm": 2.0,
71310
+ "learning_rate": 0.00014880904522613064,
71311
+ "loss": 2.0133,
71312
+ "step": 101290
71313
+ },
71314
+ {
71315
+ "epoch": 0.5065,
71316
+ "grad_norm": 1.796875,
71317
+ "learning_rate": 0.00014879396984924622,
71318
+ "loss": 1.9861,
71319
+ "step": 101300
71320
+ },
71321
+ {
71322
+ "epoch": 0.50655,
71323
+ "grad_norm": 1.9375,
71324
+ "learning_rate": 0.0001487788944723618,
71325
+ "loss": 2.0125,
71326
+ "step": 101310
71327
+ },
71328
+ {
71329
+ "epoch": 0.5066,
71330
+ "grad_norm": 1.7265625,
71331
+ "learning_rate": 0.00014876381909547737,
71332
+ "loss": 2.0377,
71333
+ "step": 101320
71334
+ },
71335
+ {
71336
+ "epoch": 0.50665,
71337
+ "grad_norm": 2.0,
71338
+ "learning_rate": 0.00014874874371859296,
71339
+ "loss": 2.0377,
71340
+ "step": 101330
71341
+ },
71342
+ {
71343
+ "epoch": 0.5067,
71344
+ "grad_norm": 1.921875,
71345
+ "learning_rate": 0.00014873366834170851,
71346
+ "loss": 2.0038,
71347
+ "step": 101340
71348
+ },
71349
+ {
71350
+ "epoch": 0.50675,
71351
+ "grad_norm": 1.828125,
71352
+ "learning_rate": 0.0001487185929648241,
71353
+ "loss": 2.0242,
71354
+ "step": 101350
71355
+ },
71356
+ {
71357
+ "epoch": 0.5068,
71358
+ "grad_norm": 1.8984375,
71359
+ "learning_rate": 0.0001487035175879397,
71360
+ "loss": 2.0106,
71361
+ "step": 101360
71362
+ },
71363
+ {
71364
+ "epoch": 0.50685,
71365
+ "grad_norm": 1.8125,
71366
+ "learning_rate": 0.00014868844221105525,
71367
+ "loss": 2.0362,
71368
+ "step": 101370
71369
+ },
71370
+ {
71371
+ "epoch": 0.5069,
71372
+ "grad_norm": 2.078125,
71373
+ "learning_rate": 0.00014867336683417083,
71374
+ "loss": 2.0324,
71375
+ "step": 101380
71376
+ },
71377
+ {
71378
+ "epoch": 0.50695,
71379
+ "grad_norm": 1.921875,
71380
+ "learning_rate": 0.00014865829145728642,
71381
+ "loss": 2.0529,
71382
+ "step": 101390
71383
+ },
71384
+ {
71385
+ "epoch": 0.507,
71386
+ "grad_norm": 1.9765625,
71387
+ "learning_rate": 0.000148643216080402,
71388
+ "loss": 2.0232,
71389
+ "step": 101400
71390
+ },
71391
+ {
71392
+ "epoch": 0.50705,
71393
+ "grad_norm": 1.9296875,
71394
+ "learning_rate": 0.0001486281407035176,
71395
+ "loss": 2.0135,
71396
+ "step": 101410
71397
+ },
71398
+ {
71399
+ "epoch": 0.5071,
71400
+ "grad_norm": 1.875,
71401
+ "learning_rate": 0.00014861306532663315,
71402
+ "loss": 1.9847,
71403
+ "step": 101420
71404
+ },
71405
+ {
71406
+ "epoch": 0.50715,
71407
+ "grad_norm": 2.046875,
71408
+ "learning_rate": 0.00014859798994974874,
71409
+ "loss": 2.0022,
71410
+ "step": 101430
71411
+ },
71412
+ {
71413
+ "epoch": 0.5072,
71414
+ "grad_norm": 2.09375,
71415
+ "learning_rate": 0.00014858291457286432,
71416
+ "loss": 2.0359,
71417
+ "step": 101440
71418
+ },
71419
+ {
71420
+ "epoch": 0.50725,
71421
+ "grad_norm": 2.0,
71422
+ "learning_rate": 0.00014856783919597988,
71423
+ "loss": 1.9862,
71424
+ "step": 101450
71425
+ },
71426
+ {
71427
+ "epoch": 0.5073,
71428
+ "grad_norm": 2.15625,
71429
+ "learning_rate": 0.00014855276381909547,
71430
+ "loss": 2.0993,
71431
+ "step": 101460
71432
+ },
71433
+ {
71434
+ "epoch": 0.50735,
71435
+ "grad_norm": 1.84375,
71436
+ "learning_rate": 0.00014853768844221105,
71437
+ "loss": 1.9999,
71438
+ "step": 101470
71439
+ },
71440
+ {
71441
+ "epoch": 0.5074,
71442
+ "grad_norm": 2.21875,
71443
+ "learning_rate": 0.0001485226130653266,
71444
+ "loss": 2.0576,
71445
+ "step": 101480
71446
+ },
71447
+ {
71448
+ "epoch": 0.50745,
71449
+ "grad_norm": 1.96875,
71450
+ "learning_rate": 0.0001485075376884422,
71451
+ "loss": 2.0463,
71452
+ "step": 101490
71453
+ },
71454
+ {
71455
+ "epoch": 0.5075,
71456
+ "grad_norm": 2.046875,
71457
+ "learning_rate": 0.00014849246231155776,
71458
+ "loss": 1.9991,
71459
+ "step": 101500
71460
+ },
71461
+ {
71462
+ "epoch": 0.50755,
71463
+ "grad_norm": 2.015625,
71464
+ "learning_rate": 0.00014847738693467334,
71465
+ "loss": 2.0225,
71466
+ "step": 101510
71467
+ },
71468
+ {
71469
+ "epoch": 0.5076,
71470
+ "grad_norm": 2.0,
71471
+ "learning_rate": 0.00014846231155778893,
71472
+ "loss": 1.9933,
71473
+ "step": 101520
71474
+ },
71475
+ {
71476
+ "epoch": 0.50765,
71477
+ "grad_norm": 2.03125,
71478
+ "learning_rate": 0.00014844723618090452,
71479
+ "loss": 2.068,
71480
+ "step": 101530
71481
+ },
71482
+ {
71483
+ "epoch": 0.5077,
71484
+ "grad_norm": 2.140625,
71485
+ "learning_rate": 0.0001484321608040201,
71486
+ "loss": 2.0123,
71487
+ "step": 101540
71488
+ },
71489
+ {
71490
+ "epoch": 0.50775,
71491
+ "grad_norm": 2.046875,
71492
+ "learning_rate": 0.00014841708542713566,
71493
+ "loss": 2.0615,
71494
+ "step": 101550
71495
+ },
71496
+ {
71497
+ "epoch": 0.5078,
71498
+ "grad_norm": 1.78125,
71499
+ "learning_rate": 0.00014840201005025125,
71500
+ "loss": 2.0239,
71501
+ "step": 101560
71502
+ },
71503
+ {
71504
+ "epoch": 0.50785,
71505
+ "grad_norm": 1.9140625,
71506
+ "learning_rate": 0.00014838693467336683,
71507
+ "loss": 2.0751,
71508
+ "step": 101570
71509
+ },
71510
+ {
71511
+ "epoch": 0.5079,
71512
+ "grad_norm": 2.0625,
71513
+ "learning_rate": 0.0001483718592964824,
71514
+ "loss": 2.0723,
71515
+ "step": 101580
71516
+ },
71517
+ {
71518
+ "epoch": 0.50795,
71519
+ "grad_norm": 2.265625,
71520
+ "learning_rate": 0.00014835678391959798,
71521
+ "loss": 1.9995,
71522
+ "step": 101590
71523
+ },
71524
+ {
71525
+ "epoch": 0.508,
71526
+ "grad_norm": 1.9765625,
71527
+ "learning_rate": 0.00014834170854271356,
71528
+ "loss": 1.9753,
71529
+ "step": 101600
71530
+ },
71531
+ {
71532
+ "epoch": 0.50805,
71533
+ "grad_norm": 2.046875,
71534
+ "learning_rate": 0.00014832663316582912,
71535
+ "loss": 1.977,
71536
+ "step": 101610
71537
+ },
71538
+ {
71539
+ "epoch": 0.5081,
71540
+ "grad_norm": 2.09375,
71541
+ "learning_rate": 0.0001483115577889447,
71542
+ "loss": 2.0894,
71543
+ "step": 101620
71544
+ },
71545
+ {
71546
+ "epoch": 0.50815,
71547
+ "grad_norm": 1.9140625,
71548
+ "learning_rate": 0.0001482964824120603,
71549
+ "loss": 1.9872,
71550
+ "step": 101630
71551
+ },
71552
+ {
71553
+ "epoch": 0.5082,
71554
+ "grad_norm": 2.03125,
71555
+ "learning_rate": 0.00014828140703517585,
71556
+ "loss": 2.0307,
71557
+ "step": 101640
71558
+ },
71559
+ {
71560
+ "epoch": 0.50825,
71561
+ "grad_norm": 1.7109375,
71562
+ "learning_rate": 0.00014826633165829144,
71563
+ "loss": 2.0058,
71564
+ "step": 101650
71565
+ },
71566
+ {
71567
+ "epoch": 0.5083,
71568
+ "grad_norm": 1.8828125,
71569
+ "learning_rate": 0.00014825125628140703,
71570
+ "loss": 2.0163,
71571
+ "step": 101660
71572
+ },
71573
+ {
71574
+ "epoch": 0.50835,
71575
+ "grad_norm": 1.7734375,
71576
+ "learning_rate": 0.0001482361809045226,
71577
+ "loss": 2.0325,
71578
+ "step": 101670
71579
+ },
71580
+ {
71581
+ "epoch": 0.5084,
71582
+ "grad_norm": 1.9296875,
71583
+ "learning_rate": 0.00014822110552763817,
71584
+ "loss": 2.0308,
71585
+ "step": 101680
71586
+ },
71587
+ {
71588
+ "epoch": 0.50845,
71589
+ "grad_norm": 1.9921875,
71590
+ "learning_rate": 0.00014820603015075376,
71591
+ "loss": 2.0286,
71592
+ "step": 101690
71593
+ },
71594
+ {
71595
+ "epoch": 0.5085,
71596
+ "grad_norm": 2.015625,
71597
+ "learning_rate": 0.00014819095477386934,
71598
+ "loss": 2.0159,
71599
+ "step": 101700
71600
+ },
71601
+ {
71602
+ "epoch": 0.50855,
71603
+ "grad_norm": 1.890625,
71604
+ "learning_rate": 0.0001481758793969849,
71605
+ "loss": 2.04,
71606
+ "step": 101710
71607
+ },
71608
+ {
71609
+ "epoch": 0.5086,
71610
+ "grad_norm": 1.9296875,
71611
+ "learning_rate": 0.0001481608040201005,
71612
+ "loss": 2.0119,
71613
+ "step": 101720
71614
+ },
71615
+ {
71616
+ "epoch": 0.50865,
71617
+ "grad_norm": 2.0,
71618
+ "learning_rate": 0.00014814572864321607,
71619
+ "loss": 1.9724,
71620
+ "step": 101730
71621
+ },
71622
+ {
71623
+ "epoch": 0.5087,
71624
+ "grad_norm": 2.078125,
71625
+ "learning_rate": 0.00014813065326633163,
71626
+ "loss": 2.0856,
71627
+ "step": 101740
71628
+ },
71629
+ {
71630
+ "epoch": 0.50875,
71631
+ "grad_norm": 1.9375,
71632
+ "learning_rate": 0.00014811557788944722,
71633
+ "loss": 2.027,
71634
+ "step": 101750
71635
+ },
71636
+ {
71637
+ "epoch": 0.5088,
71638
+ "grad_norm": 2.015625,
71639
+ "learning_rate": 0.0001481005025125628,
71640
+ "loss": 2.0297,
71641
+ "step": 101760
71642
+ },
71643
+ {
71644
+ "epoch": 0.50885,
71645
+ "grad_norm": 1.9375,
71646
+ "learning_rate": 0.00014808542713567836,
71647
+ "loss": 2.0425,
71648
+ "step": 101770
71649
+ },
71650
+ {
71651
+ "epoch": 0.5089,
71652
+ "grad_norm": 2.40625,
71653
+ "learning_rate": 0.00014807035175879395,
71654
+ "loss": 2.0579,
71655
+ "step": 101780
71656
+ },
71657
+ {
71658
+ "epoch": 0.50895,
71659
+ "grad_norm": 1.9140625,
71660
+ "learning_rate": 0.00014805527638190954,
71661
+ "loss": 1.9863,
71662
+ "step": 101790
71663
+ },
71664
+ {
71665
+ "epoch": 0.509,
71666
+ "grad_norm": 2.0625,
71667
+ "learning_rate": 0.00014804020100502512,
71668
+ "loss": 1.9923,
71669
+ "step": 101800
71670
+ },
71671
+ {
71672
+ "epoch": 0.50905,
71673
+ "grad_norm": 2.109375,
71674
+ "learning_rate": 0.00014802512562814068,
71675
+ "loss": 2.0216,
71676
+ "step": 101810
71677
+ },
71678
+ {
71679
+ "epoch": 0.5091,
71680
+ "grad_norm": 2.140625,
71681
+ "learning_rate": 0.00014801005025125627,
71682
+ "loss": 2.0225,
71683
+ "step": 101820
71684
+ },
71685
+ {
71686
+ "epoch": 0.50915,
71687
+ "grad_norm": 1.921875,
71688
+ "learning_rate": 0.00014799497487437185,
71689
+ "loss": 1.9862,
71690
+ "step": 101830
71691
+ },
71692
+ {
71693
+ "epoch": 0.5092,
71694
+ "grad_norm": 1.984375,
71695
+ "learning_rate": 0.00014797989949748744,
71696
+ "loss": 1.9726,
71697
+ "step": 101840
71698
+ },
71699
+ {
71700
+ "epoch": 0.50925,
71701
+ "grad_norm": 1.8359375,
71702
+ "learning_rate": 0.000147964824120603,
71703
+ "loss": 2.0245,
71704
+ "step": 101850
71705
+ },
71706
+ {
71707
+ "epoch": 0.5093,
71708
+ "grad_norm": 1.890625,
71709
+ "learning_rate": 0.00014794974874371859,
71710
+ "loss": 2.0225,
71711
+ "step": 101860
71712
+ },
71713
+ {
71714
+ "epoch": 0.50935,
71715
+ "grad_norm": 1.9609375,
71716
+ "learning_rate": 0.00014793467336683414,
71717
+ "loss": 2.0797,
71718
+ "step": 101870
71719
+ },
71720
+ {
71721
+ "epoch": 0.5094,
71722
+ "grad_norm": 1.9765625,
71723
+ "learning_rate": 0.00014791959798994973,
71724
+ "loss": 2.0775,
71725
+ "step": 101880
71726
+ },
71727
+ {
71728
+ "epoch": 0.50945,
71729
+ "grad_norm": 2.03125,
71730
+ "learning_rate": 0.00014790452261306532,
71731
+ "loss": 2.0884,
71732
+ "step": 101890
71733
+ },
71734
+ {
71735
+ "epoch": 0.5095,
71736
+ "grad_norm": 1.71875,
71737
+ "learning_rate": 0.00014788944723618088,
71738
+ "loss": 2.0254,
71739
+ "step": 101900
71740
+ },
71741
+ {
71742
+ "epoch": 0.50955,
71743
+ "grad_norm": 1.90625,
71744
+ "learning_rate": 0.00014787437185929646,
71745
+ "loss": 1.9893,
71746
+ "step": 101910
71747
+ },
71748
+ {
71749
+ "epoch": 0.5096,
71750
+ "grad_norm": 1.9296875,
71751
+ "learning_rate": 0.00014785929648241205,
71752
+ "loss": 2.0477,
71753
+ "step": 101920
71754
+ },
71755
+ {
71756
+ "epoch": 0.50965,
71757
+ "grad_norm": 2.125,
71758
+ "learning_rate": 0.00014784422110552763,
71759
+ "loss": 2.0333,
71760
+ "step": 101930
71761
+ },
71762
+ {
71763
+ "epoch": 0.5097,
71764
+ "grad_norm": 1.828125,
71765
+ "learning_rate": 0.0001478291457286432,
71766
+ "loss": 2.009,
71767
+ "step": 101940
71768
+ },
71769
+ {
71770
+ "epoch": 0.50975,
71771
+ "grad_norm": 2.046875,
71772
+ "learning_rate": 0.00014781407035175878,
71773
+ "loss": 2.0322,
71774
+ "step": 101950
71775
+ },
71776
+ {
71777
+ "epoch": 0.5098,
71778
+ "grad_norm": 2.03125,
71779
+ "learning_rate": 0.00014779899497487437,
71780
+ "loss": 1.9683,
71781
+ "step": 101960
71782
+ },
71783
+ {
71784
+ "epoch": 0.50985,
71785
+ "grad_norm": 1.828125,
71786
+ "learning_rate": 0.00014778391959798995,
71787
+ "loss": 2.0255,
71788
+ "step": 101970
71789
+ },
71790
+ {
71791
+ "epoch": 0.5099,
71792
+ "grad_norm": 2.09375,
71793
+ "learning_rate": 0.0001477688442211055,
71794
+ "loss": 2.016,
71795
+ "step": 101980
71796
+ },
71797
+ {
71798
+ "epoch": 0.50995,
71799
+ "grad_norm": 1.8515625,
71800
+ "learning_rate": 0.0001477537688442211,
71801
+ "loss": 2.0768,
71802
+ "step": 101990
71803
+ },
71804
+ {
71805
+ "epoch": 0.51,
71806
+ "grad_norm": 2.109375,
71807
+ "learning_rate": 0.00014773869346733668,
71808
+ "loss": 1.9938,
71809
+ "step": 102000
71810
+ },
71811
+ {
71812
+ "epoch": 0.51,
71813
+ "eval_loss": 2.0225799083709717,
71814
+ "eval_runtime": 90.2278,
71815
+ "eval_samples_per_second": 27.708,
71816
+ "eval_steps_per_second": 0.443,
71817
+ "step": 102000
71818
  }
71819
  ],
71820
  "logging_steps": 10,
 
71834
  "attributes": {}
71835
  }
71836
  },
71837
+ "total_flos": 2.471196266234852e+19,
71838
  "train_batch_size": 8,
71839
  "trial_name": null,
71840
  "trial_params": null