nadahlberg commited on
Commit
8620392
1 Parent(s): 494d95c

Training in progress, step 196000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a89c437e50b0f8ca4fd9af67e3083993a3fdc03fe2a27a76b7b04ce1d97eabf5
3
  size 325690872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9464f6573ad9fd0fe2e6b58a6ea1736b5ba18805ac93432a1bb8af13c8d7174
3
  size 325690872
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97b0a64aecfff2c26bdd5120e954a1d5857e9b29b7d91dd33d19b696968af0e1
3
  size 651550778
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:654add82a11ead5fc284c78b157de8301575e0da3004fc15763bfafcf3e21317
3
  size 651550778
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e074e22eba76ab6e9b544774e027528ef7c10076e885ecdf1f0ce9d0a4ffb058
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b0c6e8b9d02c7e6dd2be7b0f7ff77b7a7ace2a942f56de98bb049da9978609d
3
  size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:522379038ff9f72eee4f16fc13c7c527fd89328d593b292e47a71417cbf520bb
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:858fad4a57c20e3bc6f87991b2d51b447444dda687b4fac3b65e93012d0b92cd
3
  size 15920
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73b398f0dddef58a9282d66a49fd8e220898de17f6d838d7ffa52e9fffff8e6e
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76f6e9796e8b0ab01b947c4c554b7dfb713baa7446393d60bcd9dc2794008bb7
3
  size 15920
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8dcd63f04d0d7f5e8660b206f8c68db860f40c06e41516d296fd35ca06bcd87f
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1c1cfbbca38bba0f754721006c8a7b6bd2f8ef74bde2398b49043aeae06b4a9
3
  size 15920
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:afe9ba037bd933543b4aeb0ca749e02c10800870c577fe4d6537ec92694a8a38
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3fde9f19aef78c9d1abf97fb0d1ff70877dbd6c047afe3d95a62bc334beb08d
3
  size 15920
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bfd5271ec811e7349df93e4abc5b00c0534fac3f75454b6fc73f3a9b47df954c
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bba636fe487fa05c4f12884126f347d92998e10436b775c61ed62a0845edaee6
3
  size 15920
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:07489b2fd53c594e60aa0037945803fe0cdb07239fa8fae63abd5b6720b1edf9
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f6eac7b111b6403d7a0511e9ed7b6c204f504a39330847af53892c04cfc0c5e
3
  size 15920
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4819284f798dc8aad3d5a2e6f2bfc72c0dffbf7156d8c82abd86685ea2d9c9df
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29d7b0fd82697b0b10cee8d8f5187a1d86168f8dae62bf20c5fcfda6edc689db
3
  size 15920
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81927fcd203392c94aa10d8605dd1de0e335a27d0e6da83feec012c338917f03
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3168476b794df8c7f905068d035500e7de63c824f43b21661ba9cef3202443fe
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.97,
5
  "eval_steps": 2000,
6
- "global_step": 194000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -136583,6 +136583,1414 @@
136583
  "eval_samples_per_second": 53.049,
136584
  "eval_steps_per_second": 0.106,
136585
  "step": 194000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136586
  }
136587
  ],
136588
  "logging_steps": 10,
@@ -136602,7 +138010,7 @@
136602
  "attributes": {}
136603
  }
136604
  },
136605
- "total_flos": 5.130708755205325e+18,
136606
  "train_batch_size": 64,
136607
  "trial_name": null,
136608
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.98,
5
  "eval_steps": 2000,
6
+ "global_step": 196000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
136583
  "eval_samples_per_second": 53.049,
136584
  "eval_steps_per_second": 0.106,
136585
  "step": 194000
136586
+ },
136587
+ {
136588
+ "epoch": 0.97005,
136589
+ "grad_norm": 0.62890625,
136590
+ "learning_rate": 9.030150753768845e-05,
136591
+ "loss": 2.0384,
136592
+ "step": 194010
136593
+ },
136594
+ {
136595
+ "epoch": 0.9701,
136596
+ "grad_norm": 0.6796875,
136597
+ "learning_rate": 9.015075376884423e-05,
136598
+ "loss": 2.0706,
136599
+ "step": 194020
136600
+ },
136601
+ {
136602
+ "epoch": 0.97015,
136603
+ "grad_norm": 0.53125,
136604
+ "learning_rate": 8.999999999999999e-05,
136605
+ "loss": 2.0821,
136606
+ "step": 194030
136607
+ },
136608
+ {
136609
+ "epoch": 0.9702,
136610
+ "grad_norm": 0.578125,
136611
+ "learning_rate": 8.984924623115577e-05,
136612
+ "loss": 2.1075,
136613
+ "step": 194040
136614
+ },
136615
+ {
136616
+ "epoch": 0.97025,
136617
+ "grad_norm": 0.6328125,
136618
+ "learning_rate": 8.969849246231155e-05,
136619
+ "loss": 2.0734,
136620
+ "step": 194050
136621
+ },
136622
+ {
136623
+ "epoch": 0.9703,
136624
+ "grad_norm": 0.62109375,
136625
+ "learning_rate": 8.954773869346733e-05,
136626
+ "loss": 2.0926,
136627
+ "step": 194060
136628
+ },
136629
+ {
136630
+ "epoch": 0.97035,
136631
+ "grad_norm": 0.65625,
136632
+ "learning_rate": 8.939698492462311e-05,
136633
+ "loss": 2.1112,
136634
+ "step": 194070
136635
+ },
136636
+ {
136637
+ "epoch": 0.9704,
136638
+ "grad_norm": 0.671875,
136639
+ "learning_rate": 8.924623115577889e-05,
136640
+ "loss": 2.0451,
136641
+ "step": 194080
136642
+ },
136643
+ {
136644
+ "epoch": 0.97045,
136645
+ "grad_norm": 0.59375,
136646
+ "learning_rate": 8.909547738693467e-05,
136647
+ "loss": 2.0717,
136648
+ "step": 194090
136649
+ },
136650
+ {
136651
+ "epoch": 0.9705,
136652
+ "grad_norm": 0.6796875,
136653
+ "learning_rate": 8.894472361809045e-05,
136654
+ "loss": 2.0684,
136655
+ "step": 194100
136656
+ },
136657
+ {
136658
+ "epoch": 0.97055,
136659
+ "grad_norm": 0.61328125,
136660
+ "learning_rate": 8.879396984924623e-05,
136661
+ "loss": 2.1276,
136662
+ "step": 194110
136663
+ },
136664
+ {
136665
+ "epoch": 0.9706,
136666
+ "grad_norm": 0.74609375,
136667
+ "learning_rate": 8.864321608040201e-05,
136668
+ "loss": 2.0602,
136669
+ "step": 194120
136670
+ },
136671
+ {
136672
+ "epoch": 0.97065,
136673
+ "grad_norm": 0.61328125,
136674
+ "learning_rate": 8.849246231155779e-05,
136675
+ "loss": 2.1114,
136676
+ "step": 194130
136677
+ },
136678
+ {
136679
+ "epoch": 0.9707,
136680
+ "grad_norm": 0.609375,
136681
+ "learning_rate": 8.834170854271357e-05,
136682
+ "loss": 2.0791,
136683
+ "step": 194140
136684
+ },
136685
+ {
136686
+ "epoch": 0.97075,
136687
+ "grad_norm": 0.59765625,
136688
+ "learning_rate": 8.819095477386935e-05,
136689
+ "loss": 2.0548,
136690
+ "step": 194150
136691
+ },
136692
+ {
136693
+ "epoch": 0.9708,
136694
+ "grad_norm": 0.55859375,
136695
+ "learning_rate": 8.804020100502513e-05,
136696
+ "loss": 2.0775,
136697
+ "step": 194160
136698
+ },
136699
+ {
136700
+ "epoch": 0.97085,
136701
+ "grad_norm": 0.68359375,
136702
+ "learning_rate": 8.788944723618091e-05,
136703
+ "loss": 2.0738,
136704
+ "step": 194170
136705
+ },
136706
+ {
136707
+ "epoch": 0.9709,
136708
+ "grad_norm": 0.61328125,
136709
+ "learning_rate": 8.773869346733669e-05,
136710
+ "loss": 2.1551,
136711
+ "step": 194180
136712
+ },
136713
+ {
136714
+ "epoch": 0.97095,
136715
+ "grad_norm": 0.6328125,
136716
+ "learning_rate": 8.758793969849247e-05,
136717
+ "loss": 2.0596,
136718
+ "step": 194190
136719
+ },
136720
+ {
136721
+ "epoch": 0.971,
136722
+ "grad_norm": 0.609375,
136723
+ "learning_rate": 8.743718592964825e-05,
136724
+ "loss": 2.0845,
136725
+ "step": 194200
136726
+ },
136727
+ {
136728
+ "epoch": 0.97105,
136729
+ "grad_norm": 0.671875,
136730
+ "learning_rate": 8.728643216080403e-05,
136731
+ "loss": 2.0366,
136732
+ "step": 194210
136733
+ },
136734
+ {
136735
+ "epoch": 0.9711,
136736
+ "grad_norm": 0.63671875,
136737
+ "learning_rate": 8.713567839195981e-05,
136738
+ "loss": 2.088,
136739
+ "step": 194220
136740
+ },
136741
+ {
136742
+ "epoch": 0.97115,
136743
+ "grad_norm": 0.66015625,
136744
+ "learning_rate": 8.698492462311559e-05,
136745
+ "loss": 2.0873,
136746
+ "step": 194230
136747
+ },
136748
+ {
136749
+ "epoch": 0.9712,
136750
+ "grad_norm": 0.5625,
136751
+ "learning_rate": 8.683417085427135e-05,
136752
+ "loss": 2.0475,
136753
+ "step": 194240
136754
+ },
136755
+ {
136756
+ "epoch": 0.97125,
136757
+ "grad_norm": 0.64453125,
136758
+ "learning_rate": 8.668341708542713e-05,
136759
+ "loss": 2.066,
136760
+ "step": 194250
136761
+ },
136762
+ {
136763
+ "epoch": 0.9713,
136764
+ "grad_norm": 0.625,
136765
+ "learning_rate": 8.653266331658291e-05,
136766
+ "loss": 2.0245,
136767
+ "step": 194260
136768
+ },
136769
+ {
136770
+ "epoch": 0.97135,
136771
+ "grad_norm": 0.56640625,
136772
+ "learning_rate": 8.638190954773869e-05,
136773
+ "loss": 2.1035,
136774
+ "step": 194270
136775
+ },
136776
+ {
136777
+ "epoch": 0.9714,
136778
+ "grad_norm": 0.63671875,
136779
+ "learning_rate": 8.623115577889447e-05,
136780
+ "loss": 2.0527,
136781
+ "step": 194280
136782
+ },
136783
+ {
136784
+ "epoch": 0.97145,
136785
+ "grad_norm": 0.671875,
136786
+ "learning_rate": 8.608040201005025e-05,
136787
+ "loss": 2.063,
136788
+ "step": 194290
136789
+ },
136790
+ {
136791
+ "epoch": 0.9715,
136792
+ "grad_norm": 0.671875,
136793
+ "learning_rate": 8.592964824120603e-05,
136794
+ "loss": 2.0161,
136795
+ "step": 194300
136796
+ },
136797
+ {
136798
+ "epoch": 0.97155,
136799
+ "grad_norm": 0.61328125,
136800
+ "learning_rate": 8.577889447236181e-05,
136801
+ "loss": 2.1606,
136802
+ "step": 194310
136803
+ },
136804
+ {
136805
+ "epoch": 0.9716,
136806
+ "grad_norm": 0.640625,
136807
+ "learning_rate": 8.562814070351759e-05,
136808
+ "loss": 2.0762,
136809
+ "step": 194320
136810
+ },
136811
+ {
136812
+ "epoch": 0.97165,
136813
+ "grad_norm": 0.58984375,
136814
+ "learning_rate": 8.547738693467337e-05,
136815
+ "loss": 2.0266,
136816
+ "step": 194330
136817
+ },
136818
+ {
136819
+ "epoch": 0.9717,
136820
+ "grad_norm": 0.6171875,
136821
+ "learning_rate": 8.532663316582915e-05,
136822
+ "loss": 2.1165,
136823
+ "step": 194340
136824
+ },
136825
+ {
136826
+ "epoch": 0.97175,
136827
+ "grad_norm": 0.609375,
136828
+ "learning_rate": 8.517587939698493e-05,
136829
+ "loss": 2.0723,
136830
+ "step": 194350
136831
+ },
136832
+ {
136833
+ "epoch": 0.9718,
136834
+ "grad_norm": 0.6171875,
136835
+ "learning_rate": 8.502512562814071e-05,
136836
+ "loss": 2.0379,
136837
+ "step": 194360
136838
+ },
136839
+ {
136840
+ "epoch": 0.97185,
136841
+ "grad_norm": 0.64453125,
136842
+ "learning_rate": 8.487437185929649e-05,
136843
+ "loss": 2.0648,
136844
+ "step": 194370
136845
+ },
136846
+ {
136847
+ "epoch": 0.9719,
136848
+ "grad_norm": 0.60546875,
136849
+ "learning_rate": 8.472361809045227e-05,
136850
+ "loss": 2.0479,
136851
+ "step": 194380
136852
+ },
136853
+ {
136854
+ "epoch": 0.97195,
136855
+ "grad_norm": 0.5859375,
136856
+ "learning_rate": 8.457286432160805e-05,
136857
+ "loss": 2.0829,
136858
+ "step": 194390
136859
+ },
136860
+ {
136861
+ "epoch": 0.972,
136862
+ "grad_norm": 0.57421875,
136863
+ "learning_rate": 8.442211055276383e-05,
136864
+ "loss": 2.0655,
136865
+ "step": 194400
136866
+ },
136867
+ {
136868
+ "epoch": 0.97205,
136869
+ "grad_norm": 0.63671875,
136870
+ "learning_rate": 8.427135678391961e-05,
136871
+ "loss": 2.1013,
136872
+ "step": 194410
136873
+ },
136874
+ {
136875
+ "epoch": 0.9721,
136876
+ "grad_norm": 0.66796875,
136877
+ "learning_rate": 8.412060301507539e-05,
136878
+ "loss": 2.0431,
136879
+ "step": 194420
136880
+ },
136881
+ {
136882
+ "epoch": 0.97215,
136883
+ "grad_norm": 0.62890625,
136884
+ "learning_rate": 8.396984924623117e-05,
136885
+ "loss": 2.0636,
136886
+ "step": 194430
136887
+ },
136888
+ {
136889
+ "epoch": 0.9722,
136890
+ "grad_norm": 0.58203125,
136891
+ "learning_rate": 8.381909547738693e-05,
136892
+ "loss": 2.0263,
136893
+ "step": 194440
136894
+ },
136895
+ {
136896
+ "epoch": 0.97225,
136897
+ "grad_norm": 0.60546875,
136898
+ "learning_rate": 8.366834170854271e-05,
136899
+ "loss": 2.1033,
136900
+ "step": 194450
136901
+ },
136902
+ {
136903
+ "epoch": 0.9723,
136904
+ "grad_norm": 0.62890625,
136905
+ "learning_rate": 8.351758793969849e-05,
136906
+ "loss": 2.0468,
136907
+ "step": 194460
136908
+ },
136909
+ {
136910
+ "epoch": 0.97235,
136911
+ "grad_norm": 0.5625,
136912
+ "learning_rate": 8.336683417085427e-05,
136913
+ "loss": 2.0634,
136914
+ "step": 194470
136915
+ },
136916
+ {
136917
+ "epoch": 0.9724,
136918
+ "grad_norm": 0.609375,
136919
+ "learning_rate": 8.321608040201005e-05,
136920
+ "loss": 2.0786,
136921
+ "step": 194480
136922
+ },
136923
+ {
136924
+ "epoch": 0.97245,
136925
+ "grad_norm": 0.625,
136926
+ "learning_rate": 8.306532663316583e-05,
136927
+ "loss": 2.0466,
136928
+ "step": 194490
136929
+ },
136930
+ {
136931
+ "epoch": 0.9725,
136932
+ "grad_norm": 0.65625,
136933
+ "learning_rate": 8.291457286432161e-05,
136934
+ "loss": 2.1189,
136935
+ "step": 194500
136936
+ },
136937
+ {
136938
+ "epoch": 0.97255,
136939
+ "grad_norm": 0.63671875,
136940
+ "learning_rate": 8.276381909547738e-05,
136941
+ "loss": 2.0692,
136942
+ "step": 194510
136943
+ },
136944
+ {
136945
+ "epoch": 0.9726,
136946
+ "grad_norm": 0.58984375,
136947
+ "learning_rate": 8.261306532663316e-05,
136948
+ "loss": 2.1415,
136949
+ "step": 194520
136950
+ },
136951
+ {
136952
+ "epoch": 0.97265,
136953
+ "grad_norm": 0.65625,
136954
+ "learning_rate": 8.246231155778894e-05,
136955
+ "loss": 2.0286,
136956
+ "step": 194530
136957
+ },
136958
+ {
136959
+ "epoch": 0.9727,
136960
+ "grad_norm": 0.5546875,
136961
+ "learning_rate": 8.231155778894472e-05,
136962
+ "loss": 2.1371,
136963
+ "step": 194540
136964
+ },
136965
+ {
136966
+ "epoch": 0.97275,
136967
+ "grad_norm": 0.61328125,
136968
+ "learning_rate": 8.21608040201005e-05,
136969
+ "loss": 2.071,
136970
+ "step": 194550
136971
+ },
136972
+ {
136973
+ "epoch": 0.9728,
136974
+ "grad_norm": 0.53515625,
136975
+ "learning_rate": 8.201005025125628e-05,
136976
+ "loss": 2.0851,
136977
+ "step": 194560
136978
+ },
136979
+ {
136980
+ "epoch": 0.97285,
136981
+ "grad_norm": 0.57421875,
136982
+ "learning_rate": 8.185929648241206e-05,
136983
+ "loss": 2.0778,
136984
+ "step": 194570
136985
+ },
136986
+ {
136987
+ "epoch": 0.9729,
136988
+ "grad_norm": 0.66796875,
136989
+ "learning_rate": 8.170854271356784e-05,
136990
+ "loss": 2.0634,
136991
+ "step": 194580
136992
+ },
136993
+ {
136994
+ "epoch": 0.97295,
136995
+ "grad_norm": 0.62109375,
136996
+ "learning_rate": 8.155778894472362e-05,
136997
+ "loss": 2.1477,
136998
+ "step": 194590
136999
+ },
137000
+ {
137001
+ "epoch": 0.973,
137002
+ "grad_norm": 0.6953125,
137003
+ "learning_rate": 8.14070351758794e-05,
137004
+ "loss": 2.0247,
137005
+ "step": 194600
137006
+ },
137007
+ {
137008
+ "epoch": 0.97305,
137009
+ "grad_norm": 0.60546875,
137010
+ "learning_rate": 8.125628140703518e-05,
137011
+ "loss": 2.1606,
137012
+ "step": 194610
137013
+ },
137014
+ {
137015
+ "epoch": 0.9731,
137016
+ "grad_norm": 0.6328125,
137017
+ "learning_rate": 8.110552763819096e-05,
137018
+ "loss": 2.0624,
137019
+ "step": 194620
137020
+ },
137021
+ {
137022
+ "epoch": 0.97315,
137023
+ "grad_norm": 0.80859375,
137024
+ "learning_rate": 8.095477386934673e-05,
137025
+ "loss": 2.068,
137026
+ "step": 194630
137027
+ },
137028
+ {
137029
+ "epoch": 0.9732,
137030
+ "grad_norm": 0.5859375,
137031
+ "learning_rate": 8.080402010050251e-05,
137032
+ "loss": 2.1179,
137033
+ "step": 194640
137034
+ },
137035
+ {
137036
+ "epoch": 0.97325,
137037
+ "grad_norm": 0.5546875,
137038
+ "learning_rate": 8.06532663316583e-05,
137039
+ "loss": 2.0413,
137040
+ "step": 194650
137041
+ },
137042
+ {
137043
+ "epoch": 0.9733,
137044
+ "grad_norm": 0.6484375,
137045
+ "learning_rate": 8.050251256281407e-05,
137046
+ "loss": 2.0631,
137047
+ "step": 194660
137048
+ },
137049
+ {
137050
+ "epoch": 0.97335,
137051
+ "grad_norm": 0.64453125,
137052
+ "learning_rate": 8.035175879396985e-05,
137053
+ "loss": 2.0582,
137054
+ "step": 194670
137055
+ },
137056
+ {
137057
+ "epoch": 0.9734,
137058
+ "grad_norm": 0.703125,
137059
+ "learning_rate": 8.020100502512563e-05,
137060
+ "loss": 2.1336,
137061
+ "step": 194680
137062
+ },
137063
+ {
137064
+ "epoch": 0.97345,
137065
+ "grad_norm": 0.59765625,
137066
+ "learning_rate": 8.005025125628141e-05,
137067
+ "loss": 2.0992,
137068
+ "step": 194690
137069
+ },
137070
+ {
137071
+ "epoch": 0.9735,
137072
+ "grad_norm": 0.61328125,
137073
+ "learning_rate": 7.989949748743718e-05,
137074
+ "loss": 2.1031,
137075
+ "step": 194700
137076
+ },
137077
+ {
137078
+ "epoch": 0.97355,
137079
+ "grad_norm": 0.6171875,
137080
+ "learning_rate": 7.974874371859296e-05,
137081
+ "loss": 2.102,
137082
+ "step": 194710
137083
+ },
137084
+ {
137085
+ "epoch": 0.9736,
137086
+ "grad_norm": 0.6171875,
137087
+ "learning_rate": 7.959798994974874e-05,
137088
+ "loss": 2.0675,
137089
+ "step": 194720
137090
+ },
137091
+ {
137092
+ "epoch": 0.97365,
137093
+ "grad_norm": 0.58984375,
137094
+ "learning_rate": 7.944723618090452e-05,
137095
+ "loss": 2.0498,
137096
+ "step": 194730
137097
+ },
137098
+ {
137099
+ "epoch": 0.9737,
137100
+ "grad_norm": 0.64453125,
137101
+ "learning_rate": 7.92964824120603e-05,
137102
+ "loss": 2.0321,
137103
+ "step": 194740
137104
+ },
137105
+ {
137106
+ "epoch": 0.97375,
137107
+ "grad_norm": 0.625,
137108
+ "learning_rate": 7.914572864321608e-05,
137109
+ "loss": 2.0695,
137110
+ "step": 194750
137111
+ },
137112
+ {
137113
+ "epoch": 0.9738,
137114
+ "grad_norm": 0.671875,
137115
+ "learning_rate": 7.899497487437186e-05,
137116
+ "loss": 2.0564,
137117
+ "step": 194760
137118
+ },
137119
+ {
137120
+ "epoch": 0.97385,
137121
+ "grad_norm": 0.5390625,
137122
+ "learning_rate": 7.884422110552764e-05,
137123
+ "loss": 2.1664,
137124
+ "step": 194770
137125
+ },
137126
+ {
137127
+ "epoch": 0.9739,
137128
+ "grad_norm": 0.6484375,
137129
+ "learning_rate": 7.869346733668342e-05,
137130
+ "loss": 2.0693,
137131
+ "step": 194780
137132
+ },
137133
+ {
137134
+ "epoch": 0.97395,
137135
+ "grad_norm": 0.69140625,
137136
+ "learning_rate": 7.85427135678392e-05,
137137
+ "loss": 2.0349,
137138
+ "step": 194790
137139
+ },
137140
+ {
137141
+ "epoch": 0.974,
137142
+ "grad_norm": 0.640625,
137143
+ "learning_rate": 7.839195979899498e-05,
137144
+ "loss": 2.0468,
137145
+ "step": 194800
137146
+ },
137147
+ {
137148
+ "epoch": 0.97405,
137149
+ "grad_norm": 0.69921875,
137150
+ "learning_rate": 7.824120603015076e-05,
137151
+ "loss": 2.1135,
137152
+ "step": 194810
137153
+ },
137154
+ {
137155
+ "epoch": 0.9741,
137156
+ "grad_norm": 0.546875,
137157
+ "learning_rate": 7.809045226130654e-05,
137158
+ "loss": 2.1618,
137159
+ "step": 194820
137160
+ },
137161
+ {
137162
+ "epoch": 0.97415,
137163
+ "grad_norm": 0.61328125,
137164
+ "learning_rate": 7.793969849246232e-05,
137165
+ "loss": 2.06,
137166
+ "step": 194830
137167
+ },
137168
+ {
137169
+ "epoch": 0.9742,
137170
+ "grad_norm": 0.609375,
137171
+ "learning_rate": 7.77889447236181e-05,
137172
+ "loss": 2.1369,
137173
+ "step": 194840
137174
+ },
137175
+ {
137176
+ "epoch": 0.97425,
137177
+ "grad_norm": 0.5703125,
137178
+ "learning_rate": 7.763819095477388e-05,
137179
+ "loss": 2.1368,
137180
+ "step": 194850
137181
+ },
137182
+ {
137183
+ "epoch": 0.9743,
137184
+ "grad_norm": 0.59375,
137185
+ "learning_rate": 7.748743718592966e-05,
137186
+ "loss": 2.0634,
137187
+ "step": 194860
137188
+ },
137189
+ {
137190
+ "epoch": 0.97435,
137191
+ "grad_norm": 0.58984375,
137192
+ "learning_rate": 7.733668341708543e-05,
137193
+ "loss": 2.075,
137194
+ "step": 194870
137195
+ },
137196
+ {
137197
+ "epoch": 0.9744,
137198
+ "grad_norm": 0.546875,
137199
+ "learning_rate": 7.718592964824121e-05,
137200
+ "loss": 2.0687,
137201
+ "step": 194880
137202
+ },
137203
+ {
137204
+ "epoch": 0.97445,
137205
+ "grad_norm": 0.6484375,
137206
+ "learning_rate": 7.7035175879397e-05,
137207
+ "loss": 2.0964,
137208
+ "step": 194890
137209
+ },
137210
+ {
137211
+ "epoch": 0.9745,
137212
+ "grad_norm": 0.58203125,
137213
+ "learning_rate": 7.688442211055277e-05,
137214
+ "loss": 2.0518,
137215
+ "step": 194900
137216
+ },
137217
+ {
137218
+ "epoch": 0.97455,
137219
+ "grad_norm": 0.59765625,
137220
+ "learning_rate": 7.673366834170854e-05,
137221
+ "loss": 2.1192,
137222
+ "step": 194910
137223
+ },
137224
+ {
137225
+ "epoch": 0.9746,
137226
+ "grad_norm": 0.62109375,
137227
+ "learning_rate": 7.658291457286432e-05,
137228
+ "loss": 2.0555,
137229
+ "step": 194920
137230
+ },
137231
+ {
137232
+ "epoch": 0.97465,
137233
+ "grad_norm": 0.59765625,
137234
+ "learning_rate": 7.64321608040201e-05,
137235
+ "loss": 2.1182,
137236
+ "step": 194930
137237
+ },
137238
+ {
137239
+ "epoch": 0.9747,
137240
+ "grad_norm": 0.6171875,
137241
+ "learning_rate": 7.628140703517588e-05,
137242
+ "loss": 2.0265,
137243
+ "step": 194940
137244
+ },
137245
+ {
137246
+ "epoch": 0.97475,
137247
+ "grad_norm": 0.58203125,
137248
+ "learning_rate": 7.613065326633166e-05,
137249
+ "loss": 2.1029,
137250
+ "step": 194950
137251
+ },
137252
+ {
137253
+ "epoch": 0.9748,
137254
+ "grad_norm": 0.63671875,
137255
+ "learning_rate": 7.597989949748744e-05,
137256
+ "loss": 2.0365,
137257
+ "step": 194960
137258
+ },
137259
+ {
137260
+ "epoch": 0.97485,
137261
+ "grad_norm": 0.640625,
137262
+ "learning_rate": 7.582914572864322e-05,
137263
+ "loss": 2.0575,
137264
+ "step": 194970
137265
+ },
137266
+ {
137267
+ "epoch": 0.9749,
137268
+ "grad_norm": 0.5859375,
137269
+ "learning_rate": 7.5678391959799e-05,
137270
+ "loss": 2.09,
137271
+ "step": 194980
137272
+ },
137273
+ {
137274
+ "epoch": 0.97495,
137275
+ "grad_norm": 0.59765625,
137276
+ "learning_rate": 7.552763819095478e-05,
137277
+ "loss": 2.0609,
137278
+ "step": 194990
137279
+ },
137280
+ {
137281
+ "epoch": 0.975,
137282
+ "grad_norm": 0.5703125,
137283
+ "learning_rate": 7.537688442211056e-05,
137284
+ "loss": 2.1511,
137285
+ "step": 195000
137286
+ },
137287
+ {
137288
+ "epoch": 0.97505,
137289
+ "grad_norm": 0.57421875,
137290
+ "learning_rate": 7.522613065326634e-05,
137291
+ "loss": 2.0456,
137292
+ "step": 195010
137293
+ },
137294
+ {
137295
+ "epoch": 0.9751,
137296
+ "grad_norm": 0.57421875,
137297
+ "learning_rate": 7.507537688442212e-05,
137298
+ "loss": 2.0705,
137299
+ "step": 195020
137300
+ },
137301
+ {
137302
+ "epoch": 0.97515,
137303
+ "grad_norm": 0.59765625,
137304
+ "learning_rate": 7.49246231155779e-05,
137305
+ "loss": 2.0374,
137306
+ "step": 195030
137307
+ },
137308
+ {
137309
+ "epoch": 0.9752,
137310
+ "grad_norm": 0.56640625,
137311
+ "learning_rate": 7.477386934673368e-05,
137312
+ "loss": 2.1065,
137313
+ "step": 195040
137314
+ },
137315
+ {
137316
+ "epoch": 0.97525,
137317
+ "grad_norm": 0.58203125,
137318
+ "learning_rate": 7.462311557788946e-05,
137319
+ "loss": 2.0951,
137320
+ "step": 195050
137321
+ },
137322
+ {
137323
+ "epoch": 0.9753,
137324
+ "grad_norm": 0.7265625,
137325
+ "learning_rate": 7.447236180904524e-05,
137326
+ "loss": 2.0473,
137327
+ "step": 195060
137328
+ },
137329
+ {
137330
+ "epoch": 0.97535,
137331
+ "grad_norm": 0.69140625,
137332
+ "learning_rate": 7.432160804020102e-05,
137333
+ "loss": 2.121,
137334
+ "step": 195070
137335
+ },
137336
+ {
137337
+ "epoch": 0.9754,
137338
+ "grad_norm": 0.62890625,
137339
+ "learning_rate": 7.41708542713568e-05,
137340
+ "loss": 2.0394,
137341
+ "step": 195080
137342
+ },
137343
+ {
137344
+ "epoch": 0.97545,
137345
+ "grad_norm": 0.6875,
137346
+ "learning_rate": 7.402010050251256e-05,
137347
+ "loss": 2.094,
137348
+ "step": 195090
137349
+ },
137350
+ {
137351
+ "epoch": 0.9755,
137352
+ "grad_norm": 0.5859375,
137353
+ "learning_rate": 7.386934673366834e-05,
137354
+ "loss": 2.0743,
137355
+ "step": 195100
137356
+ },
137357
+ {
137358
+ "epoch": 0.97555,
137359
+ "grad_norm": 0.58984375,
137360
+ "learning_rate": 7.371859296482412e-05,
137361
+ "loss": 2.0448,
137362
+ "step": 195110
137363
+ },
137364
+ {
137365
+ "epoch": 0.9756,
137366
+ "grad_norm": 0.59765625,
137367
+ "learning_rate": 7.35678391959799e-05,
137368
+ "loss": 2.1157,
137369
+ "step": 195120
137370
+ },
137371
+ {
137372
+ "epoch": 0.97565,
137373
+ "grad_norm": 0.625,
137374
+ "learning_rate": 7.341708542713568e-05,
137375
+ "loss": 2.0819,
137376
+ "step": 195130
137377
+ },
137378
+ {
137379
+ "epoch": 0.9757,
137380
+ "grad_norm": 0.5625,
137381
+ "learning_rate": 7.326633165829146e-05,
137382
+ "loss": 2.0982,
137383
+ "step": 195140
137384
+ },
137385
+ {
137386
+ "epoch": 0.97575,
137387
+ "grad_norm": 0.72265625,
137388
+ "learning_rate": 7.311557788944724e-05,
137389
+ "loss": 1.9995,
137390
+ "step": 195150
137391
+ },
137392
+ {
137393
+ "epoch": 0.9758,
137394
+ "grad_norm": 0.63671875,
137395
+ "learning_rate": 7.2964824120603e-05,
137396
+ "loss": 2.0914,
137397
+ "step": 195160
137398
+ },
137399
+ {
137400
+ "epoch": 0.97585,
137401
+ "grad_norm": 0.58203125,
137402
+ "learning_rate": 7.281407035175879e-05,
137403
+ "loss": 2.1213,
137404
+ "step": 195170
137405
+ },
137406
+ {
137407
+ "epoch": 0.9759,
137408
+ "grad_norm": 0.5625,
137409
+ "learning_rate": 7.266331658291457e-05,
137410
+ "loss": 2.0208,
137411
+ "step": 195180
137412
+ },
137413
+ {
137414
+ "epoch": 0.97595,
137415
+ "grad_norm": 0.5859375,
137416
+ "learning_rate": 7.251256281407035e-05,
137417
+ "loss": 2.1003,
137418
+ "step": 195190
137419
+ },
137420
+ {
137421
+ "epoch": 0.976,
137422
+ "grad_norm": 0.7578125,
137423
+ "learning_rate": 7.236180904522613e-05,
137424
+ "loss": 2.0308,
137425
+ "step": 195200
137426
+ },
137427
+ {
137428
+ "epoch": 0.97605,
137429
+ "grad_norm": 0.671875,
137430
+ "learning_rate": 7.22110552763819e-05,
137431
+ "loss": 2.107,
137432
+ "step": 195210
137433
+ },
137434
+ {
137435
+ "epoch": 0.9761,
137436
+ "grad_norm": 0.62890625,
137437
+ "learning_rate": 7.206030150753768e-05,
137438
+ "loss": 2.0508,
137439
+ "step": 195220
137440
+ },
137441
+ {
137442
+ "epoch": 0.97615,
137443
+ "grad_norm": 0.58984375,
137444
+ "learning_rate": 7.190954773869346e-05,
137445
+ "loss": 2.089,
137446
+ "step": 195230
137447
+ },
137448
+ {
137449
+ "epoch": 0.9762,
137450
+ "grad_norm": 0.6171875,
137451
+ "learning_rate": 7.175879396984924e-05,
137452
+ "loss": 2.0705,
137453
+ "step": 195240
137454
+ },
137455
+ {
137456
+ "epoch": 0.97625,
137457
+ "grad_norm": 0.6328125,
137458
+ "learning_rate": 7.160804020100502e-05,
137459
+ "loss": 2.0752,
137460
+ "step": 195250
137461
+ },
137462
+ {
137463
+ "epoch": 0.9763,
137464
+ "grad_norm": 0.625,
137465
+ "learning_rate": 7.14572864321608e-05,
137466
+ "loss": 2.0629,
137467
+ "step": 195260
137468
+ },
137469
+ {
137470
+ "epoch": 0.97635,
137471
+ "grad_norm": 0.71484375,
137472
+ "learning_rate": 7.130653266331658e-05,
137473
+ "loss": 2.044,
137474
+ "step": 195270
137475
+ },
137476
+ {
137477
+ "epoch": 0.9764,
137478
+ "grad_norm": 0.625,
137479
+ "learning_rate": 7.115577889447236e-05,
137480
+ "loss": 2.1158,
137481
+ "step": 195280
137482
+ },
137483
+ {
137484
+ "epoch": 0.97645,
137485
+ "grad_norm": 0.734375,
137486
+ "learning_rate": 7.100502512562814e-05,
137487
+ "loss": 2.0761,
137488
+ "step": 195290
137489
+ },
137490
+ {
137491
+ "epoch": 0.9765,
137492
+ "grad_norm": 0.60546875,
137493
+ "learning_rate": 7.085427135678392e-05,
137494
+ "loss": 2.0954,
137495
+ "step": 195300
137496
+ },
137497
+ {
137498
+ "epoch": 0.97655,
137499
+ "grad_norm": 0.6171875,
137500
+ "learning_rate": 7.07035175879397e-05,
137501
+ "loss": 2.0375,
137502
+ "step": 195310
137503
+ },
137504
+ {
137505
+ "epoch": 0.9766,
137506
+ "grad_norm": 0.65625,
137507
+ "learning_rate": 7.055276381909548e-05,
137508
+ "loss": 2.0542,
137509
+ "step": 195320
137510
+ },
137511
+ {
137512
+ "epoch": 0.97665,
137513
+ "grad_norm": 0.59375,
137514
+ "learning_rate": 7.040201005025126e-05,
137515
+ "loss": 2.0643,
137516
+ "step": 195330
137517
+ },
137518
+ {
137519
+ "epoch": 0.9767,
137520
+ "grad_norm": 0.69140625,
137521
+ "learning_rate": 7.025125628140704e-05,
137522
+ "loss": 2.0963,
137523
+ "step": 195340
137524
+ },
137525
+ {
137526
+ "epoch": 0.97675,
137527
+ "grad_norm": 0.7421875,
137528
+ "learning_rate": 7.010050251256282e-05,
137529
+ "loss": 2.0833,
137530
+ "step": 195350
137531
+ },
137532
+ {
137533
+ "epoch": 0.9768,
137534
+ "grad_norm": 0.62890625,
137535
+ "learning_rate": 6.99497487437186e-05,
137536
+ "loss": 2.0813,
137537
+ "step": 195360
137538
+ },
137539
+ {
137540
+ "epoch": 0.97685,
137541
+ "grad_norm": 0.59375,
137542
+ "learning_rate": 6.979899497487437e-05,
137543
+ "loss": 2.0895,
137544
+ "step": 195370
137545
+ },
137546
+ {
137547
+ "epoch": 0.9769,
137548
+ "grad_norm": 0.7109375,
137549
+ "learning_rate": 6.964824120603015e-05,
137550
+ "loss": 2.0825,
137551
+ "step": 195380
137552
+ },
137553
+ {
137554
+ "epoch": 0.97695,
137555
+ "grad_norm": 0.5625,
137556
+ "learning_rate": 6.949748743718593e-05,
137557
+ "loss": 2.0591,
137558
+ "step": 195390
137559
+ },
137560
+ {
137561
+ "epoch": 0.977,
137562
+ "grad_norm": 0.5859375,
137563
+ "learning_rate": 6.93467336683417e-05,
137564
+ "loss": 2.0933,
137565
+ "step": 195400
137566
+ },
137567
+ {
137568
+ "epoch": 0.97705,
137569
+ "grad_norm": 0.6484375,
137570
+ "learning_rate": 6.919597989949749e-05,
137571
+ "loss": 2.0985,
137572
+ "step": 195410
137573
+ },
137574
+ {
137575
+ "epoch": 0.9771,
137576
+ "grad_norm": 0.58984375,
137577
+ "learning_rate": 6.904522613065327e-05,
137578
+ "loss": 2.1114,
137579
+ "step": 195420
137580
+ },
137581
+ {
137582
+ "epoch": 0.97715,
137583
+ "grad_norm": 0.703125,
137584
+ "learning_rate": 6.889447236180905e-05,
137585
+ "loss": 2.0591,
137586
+ "step": 195430
137587
+ },
137588
+ {
137589
+ "epoch": 0.9772,
137590
+ "grad_norm": 0.6015625,
137591
+ "learning_rate": 6.874371859296482e-05,
137592
+ "loss": 2.1221,
137593
+ "step": 195440
137594
+ },
137595
+ {
137596
+ "epoch": 0.97725,
137597
+ "grad_norm": 0.640625,
137598
+ "learning_rate": 6.85929648241206e-05,
137599
+ "loss": 2.0875,
137600
+ "step": 195450
137601
+ },
137602
+ {
137603
+ "epoch": 0.9773,
137604
+ "grad_norm": 0.59765625,
137605
+ "learning_rate": 6.844221105527638e-05,
137606
+ "loss": 2.0736,
137607
+ "step": 195460
137608
+ },
137609
+ {
137610
+ "epoch": 0.97735,
137611
+ "grad_norm": 0.61328125,
137612
+ "learning_rate": 6.829145728643216e-05,
137613
+ "loss": 2.115,
137614
+ "step": 195470
137615
+ },
137616
+ {
137617
+ "epoch": 0.9774,
137618
+ "grad_norm": 0.63671875,
137619
+ "learning_rate": 6.814070351758794e-05,
137620
+ "loss": 2.056,
137621
+ "step": 195480
137622
+ },
137623
+ {
137624
+ "epoch": 0.97745,
137625
+ "grad_norm": 0.5546875,
137626
+ "learning_rate": 6.798994974874372e-05,
137627
+ "loss": 2.1237,
137628
+ "step": 195490
137629
+ },
137630
+ {
137631
+ "epoch": 0.9775,
137632
+ "grad_norm": 0.62890625,
137633
+ "learning_rate": 6.78391959798995e-05,
137634
+ "loss": 2.0352,
137635
+ "step": 195500
137636
+ },
137637
+ {
137638
+ "epoch": 0.97755,
137639
+ "grad_norm": 0.69921875,
137640
+ "learning_rate": 6.768844221105528e-05,
137641
+ "loss": 2.1308,
137642
+ "step": 195510
137643
+ },
137644
+ {
137645
+ "epoch": 0.9776,
137646
+ "grad_norm": 0.62109375,
137647
+ "learning_rate": 6.753768844221106e-05,
137648
+ "loss": 2.0726,
137649
+ "step": 195520
137650
+ },
137651
+ {
137652
+ "epoch": 0.97765,
137653
+ "grad_norm": 0.6171875,
137654
+ "learning_rate": 6.738693467336684e-05,
137655
+ "loss": 2.0971,
137656
+ "step": 195530
137657
+ },
137658
+ {
137659
+ "epoch": 0.9777,
137660
+ "grad_norm": 0.66015625,
137661
+ "learning_rate": 6.723618090452262e-05,
137662
+ "loss": 2.074,
137663
+ "step": 195540
137664
+ },
137665
+ {
137666
+ "epoch": 0.97775,
137667
+ "grad_norm": 0.60546875,
137668
+ "learning_rate": 6.70854271356784e-05,
137669
+ "loss": 2.0329,
137670
+ "step": 195550
137671
+ },
137672
+ {
137673
+ "epoch": 0.9778,
137674
+ "grad_norm": 0.63671875,
137675
+ "learning_rate": 6.693467336683418e-05,
137676
+ "loss": 2.1586,
137677
+ "step": 195560
137678
+ },
137679
+ {
137680
+ "epoch": 0.97785,
137681
+ "grad_norm": 0.5859375,
137682
+ "learning_rate": 6.678391959798996e-05,
137683
+ "loss": 2.0702,
137684
+ "step": 195570
137685
+ },
137686
+ {
137687
+ "epoch": 0.9779,
137688
+ "grad_norm": 0.6015625,
137689
+ "learning_rate": 6.663316582914573e-05,
137690
+ "loss": 2.1121,
137691
+ "step": 195580
137692
+ },
137693
+ {
137694
+ "epoch": 0.97795,
137695
+ "grad_norm": 0.625,
137696
+ "learning_rate": 6.648241206030151e-05,
137697
+ "loss": 2.0714,
137698
+ "step": 195590
137699
+ },
137700
+ {
137701
+ "epoch": 0.978,
137702
+ "grad_norm": 0.640625,
137703
+ "learning_rate": 6.633165829145729e-05,
137704
+ "loss": 2.119,
137705
+ "step": 195600
137706
+ },
137707
+ {
137708
+ "epoch": 0.97805,
137709
+ "grad_norm": 0.671875,
137710
+ "learning_rate": 6.618090452261307e-05,
137711
+ "loss": 2.0327,
137712
+ "step": 195610
137713
+ },
137714
+ {
137715
+ "epoch": 0.9781,
137716
+ "grad_norm": 0.5859375,
137717
+ "learning_rate": 6.603015075376885e-05,
137718
+ "loss": 2.1342,
137719
+ "step": 195620
137720
+ },
137721
+ {
137722
+ "epoch": 0.97815,
137723
+ "grad_norm": 0.578125,
137724
+ "learning_rate": 6.587939698492463e-05,
137725
+ "loss": 2.0827,
137726
+ "step": 195630
137727
+ },
137728
+ {
137729
+ "epoch": 0.9782,
137730
+ "grad_norm": 0.875,
137731
+ "learning_rate": 6.57286432160804e-05,
137732
+ "loss": 2.0745,
137733
+ "step": 195640
137734
+ },
137735
+ {
137736
+ "epoch": 0.97825,
137737
+ "grad_norm": 0.56640625,
137738
+ "learning_rate": 6.557788944723619e-05,
137739
+ "loss": 2.0922,
137740
+ "step": 195650
137741
+ },
137742
+ {
137743
+ "epoch": 0.9783,
137744
+ "grad_norm": 0.625,
137745
+ "learning_rate": 6.542713567839197e-05,
137746
+ "loss": 2.05,
137747
+ "step": 195660
137748
+ },
137749
+ {
137750
+ "epoch": 0.97835,
137751
+ "grad_norm": 0.5703125,
137752
+ "learning_rate": 6.527638190954773e-05,
137753
+ "loss": 2.0529,
137754
+ "step": 195670
137755
+ },
137756
+ {
137757
+ "epoch": 0.9784,
137758
+ "grad_norm": 0.62890625,
137759
+ "learning_rate": 6.512562814070351e-05,
137760
+ "loss": 2.1066,
137761
+ "step": 195680
137762
+ },
137763
+ {
137764
+ "epoch": 0.97845,
137765
+ "grad_norm": 0.57421875,
137766
+ "learning_rate": 6.497487437185929e-05,
137767
+ "loss": 2.1088,
137768
+ "step": 195690
137769
+ },
137770
+ {
137771
+ "epoch": 0.9785,
137772
+ "grad_norm": 0.62109375,
137773
+ "learning_rate": 6.482412060301507e-05,
137774
+ "loss": 2.0692,
137775
+ "step": 195700
137776
+ },
137777
+ {
137778
+ "epoch": 0.97855,
137779
+ "grad_norm": 0.6640625,
137780
+ "learning_rate": 6.467336683417085e-05,
137781
+ "loss": 2.0633,
137782
+ "step": 195710
137783
+ },
137784
+ {
137785
+ "epoch": 0.9786,
137786
+ "grad_norm": 0.53125,
137787
+ "learning_rate": 6.452261306532663e-05,
137788
+ "loss": 2.0881,
137789
+ "step": 195720
137790
+ },
137791
+ {
137792
+ "epoch": 0.97865,
137793
+ "grad_norm": 0.6484375,
137794
+ "learning_rate": 6.437185929648241e-05,
137795
+ "loss": 2.0878,
137796
+ "step": 195730
137797
+ },
137798
+ {
137799
+ "epoch": 0.9787,
137800
+ "grad_norm": 0.6484375,
137801
+ "learning_rate": 6.422110552763819e-05,
137802
+ "loss": 2.0973,
137803
+ "step": 195740
137804
+ },
137805
+ {
137806
+ "epoch": 0.97875,
137807
+ "grad_norm": 0.60546875,
137808
+ "learning_rate": 6.407035175879397e-05,
137809
+ "loss": 2.0905,
137810
+ "step": 195750
137811
+ },
137812
+ {
137813
+ "epoch": 0.9788,
137814
+ "grad_norm": 0.55859375,
137815
+ "learning_rate": 6.391959798994975e-05,
137816
+ "loss": 2.0879,
137817
+ "step": 195760
137818
+ },
137819
+ {
137820
+ "epoch": 0.97885,
137821
+ "grad_norm": 0.63671875,
137822
+ "learning_rate": 6.376884422110553e-05,
137823
+ "loss": 2.0914,
137824
+ "step": 195770
137825
+ },
137826
+ {
137827
+ "epoch": 0.9789,
137828
+ "grad_norm": 0.65625,
137829
+ "learning_rate": 6.361809045226131e-05,
137830
+ "loss": 2.0636,
137831
+ "step": 195780
137832
+ },
137833
+ {
137834
+ "epoch": 0.97895,
137835
+ "grad_norm": 0.578125,
137836
+ "learning_rate": 6.346733668341709e-05,
137837
+ "loss": 2.1091,
137838
+ "step": 195790
137839
+ },
137840
+ {
137841
+ "epoch": 0.979,
137842
+ "grad_norm": 0.58984375,
137843
+ "learning_rate": 6.331658291457287e-05,
137844
+ "loss": 2.0525,
137845
+ "step": 195800
137846
+ },
137847
+ {
137848
+ "epoch": 0.97905,
137849
+ "grad_norm": 0.5859375,
137850
+ "learning_rate": 6.316582914572865e-05,
137851
+ "loss": 2.0604,
137852
+ "step": 195810
137853
+ },
137854
+ {
137855
+ "epoch": 0.9791,
137856
+ "grad_norm": 0.609375,
137857
+ "learning_rate": 6.301507537688443e-05,
137858
+ "loss": 2.0758,
137859
+ "step": 195820
137860
+ },
137861
+ {
137862
+ "epoch": 0.97915,
137863
+ "grad_norm": 0.61328125,
137864
+ "learning_rate": 6.28643216080402e-05,
137865
+ "loss": 2.0839,
137866
+ "step": 195830
137867
+ },
137868
+ {
137869
+ "epoch": 0.9792,
137870
+ "grad_norm": 0.58203125,
137871
+ "learning_rate": 6.271356783919597e-05,
137872
+ "loss": 2.1228,
137873
+ "step": 195840
137874
+ },
137875
+ {
137876
+ "epoch": 0.97925,
137877
+ "grad_norm": 0.6171875,
137878
+ "learning_rate": 6.256281407035175e-05,
137879
+ "loss": 2.0687,
137880
+ "step": 195850
137881
+ },
137882
+ {
137883
+ "epoch": 0.9793,
137884
+ "grad_norm": 0.671875,
137885
+ "learning_rate": 6.241206030150753e-05,
137886
+ "loss": 2.0924,
137887
+ "step": 195860
137888
+ },
137889
+ {
137890
+ "epoch": 0.97935,
137891
+ "grad_norm": 0.67578125,
137892
+ "learning_rate": 6.226130653266331e-05,
137893
+ "loss": 2.0714,
137894
+ "step": 195870
137895
+ },
137896
+ {
137897
+ "epoch": 0.9794,
137898
+ "grad_norm": 0.5625,
137899
+ "learning_rate": 6.211055276381909e-05,
137900
+ "loss": 2.0957,
137901
+ "step": 195880
137902
+ },
137903
+ {
137904
+ "epoch": 0.97945,
137905
+ "grad_norm": 0.609375,
137906
+ "learning_rate": 6.195979899497487e-05,
137907
+ "loss": 2.0787,
137908
+ "step": 195890
137909
+ },
137910
+ {
137911
+ "epoch": 0.9795,
137912
+ "grad_norm": 0.625,
137913
+ "learning_rate": 6.180904522613065e-05,
137914
+ "loss": 2.053,
137915
+ "step": 195900
137916
+ },
137917
+ {
137918
+ "epoch": 0.97955,
137919
+ "grad_norm": 0.6796875,
137920
+ "learning_rate": 6.165829145728643e-05,
137921
+ "loss": 2.1153,
137922
+ "step": 195910
137923
+ },
137924
+ {
137925
+ "epoch": 0.9796,
137926
+ "grad_norm": 0.65234375,
137927
+ "learning_rate": 6.150753768844221e-05,
137928
+ "loss": 2.1045,
137929
+ "step": 195920
137930
+ },
137931
+ {
137932
+ "epoch": 0.97965,
137933
+ "grad_norm": 0.6640625,
137934
+ "learning_rate": 6.135678391959799e-05,
137935
+ "loss": 2.1306,
137936
+ "step": 195930
137937
+ },
137938
+ {
137939
+ "epoch": 0.9797,
137940
+ "grad_norm": 0.63671875,
137941
+ "learning_rate": 6.120603015075377e-05,
137942
+ "loss": 2.0628,
137943
+ "step": 195940
137944
+ },
137945
+ {
137946
+ "epoch": 0.97975,
137947
+ "grad_norm": 0.62109375,
137948
+ "learning_rate": 6.105527638190955e-05,
137949
+ "loss": 2.1002,
137950
+ "step": 195950
137951
+ },
137952
+ {
137953
+ "epoch": 0.9798,
137954
+ "grad_norm": 0.58984375,
137955
+ "learning_rate": 6.090452261306533e-05,
137956
+ "loss": 2.024,
137957
+ "step": 195960
137958
+ },
137959
+ {
137960
+ "epoch": 0.97985,
137961
+ "grad_norm": 0.63671875,
137962
+ "learning_rate": 6.075376884422111e-05,
137963
+ "loss": 2.0726,
137964
+ "step": 195970
137965
+ },
137966
+ {
137967
+ "epoch": 0.9799,
137968
+ "grad_norm": 0.66015625,
137969
+ "learning_rate": 6.060301507537689e-05,
137970
+ "loss": 2.121,
137971
+ "step": 195980
137972
+ },
137973
+ {
137974
+ "epoch": 0.97995,
137975
+ "grad_norm": 0.60546875,
137976
+ "learning_rate": 6.045226130653266e-05,
137977
+ "loss": 2.1008,
137978
+ "step": 195990
137979
+ },
137980
+ {
137981
+ "epoch": 0.98,
137982
+ "grad_norm": 0.59765625,
137983
+ "learning_rate": 6.030150753768844e-05,
137984
+ "loss": 2.0839,
137985
+ "step": 196000
137986
+ },
137987
+ {
137988
+ "epoch": 0.98,
137989
+ "eval_loss": 2.0759713649749756,
137990
+ "eval_runtime": 47.6126,
137991
+ "eval_samples_per_second": 52.507,
137992
+ "eval_steps_per_second": 0.105,
137993
+ "step": 196000
137994
  }
137995
  ],
137996
  "logging_steps": 10,
 
138010
  "attributes": {}
138011
  }
138012
  },
138013
+ "total_flos": 5.183602659898163e+18,
138014
  "train_batch_size": 64,
138015
  "trial_name": null,
138016
  "trial_params": null