nadahlberg commited on
Commit
e80a0af
1 Parent(s): 0c19e85

Training in progress, step 6000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cf600ae1df186bc12ad3f96ab795fffe1d66fb2f392cc0b1e0da4a22ab0d96e
3
  size 325690872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a9324f1d5bbfda5c5ef464f5ae538c67609b9809f4bb9e7b5ff3d7efd2a04d0
3
  size 325690872
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf93cee440ae490532b023d8471daf2338941414d800be16e32ded6a64f93d7f
3
  size 651550778
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:801f28773de7283c3987b1e383caa6f1e78eb2516d24634a1b704f529ee08071
3
  size 651550778
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec18774ecda9939f8782edcd4f5f8fce6da7592595690e14c24eef50baf2c0c3
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ff22c72cb50d4b6353cf336b950d2c54115c739606544de9a8d3b0fab0ef188
3
  size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4c0ec63d1c0dc11680449892ab126d12019d874afeee380ecc374206240ea42
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c253ad05d3ffa98dc1e5291ec640e6158218602254206e6fb97ea82185040775
3
  size 15920
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e28126ecfc631abb4fc9da373d1b2e371ee08e1afb70dd1bd6a8843915e342cb
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edcdbb53b869133655de3ceb14325bb161e7aa238640244125393ff4afcb7363
3
  size 15920
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:292543bb4b1f482ad1860a0e2ebb75a7e8e43947f98454fed5488300f0d60c86
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b095e9807a468e05241a1d389fd326265bee89778655d3c00298c2f6abcee791
3
  size 15920
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7b5880ad30944d87faa9b06efddb79f74890cde9a944a1f2d363f811c9239c9
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4da3805c54028ef89f9cce32b640836c54b432b251fae2dea3d0182fa96f403
3
  size 15920
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5901ea95a1be5a3543b7ef8f079f4d1d55e4eaa10458b9caaaea3d13255968dd
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08bb3c8d2c1053a6b93a1342acb589ea4695b3fdb4dfeaf3675191c7e3390d3d
3
  size 15920
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8644f98d8bbe4f93e3d4a185b369b37291d0939398a71417839a2fba27c71b48
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cfebda0f5a673869a40a8a82d7e308c843f9b55ec287861a6a0670b4c43c9d4
3
  size 15920
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9357709bce658e95370c490cc69898a91229da20c6acf9633f86ce87659701ac
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34bc6aae481e0bf889f93cc019a7692fbdaba7925f341ec202dd711aa1d2a724
3
  size 15920
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dca225c673bc177646dfd17cf5a5bd690b36b06f0cd38f8cfb81d73abe0eb794
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aacbaeb2fb166ed7abcf1ad1b0f61705e2164e64d30abdff36a13f020686241
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.02,
5
  "eval_steps": 2000,
6
- "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2823,6 +2823,1414 @@
2823
  "eval_samples_per_second": 28.059,
2824
  "eval_steps_per_second": 0.449,
2825
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2826
  }
2827
  ],
2828
  "logging_steps": 10,
@@ -2842,7 +4250,7 @@
2842
  "attributes": {}
2843
  }
2844
  },
2845
- "total_flos": 1.586817140785152e+17,
2846
  "train_batch_size": 64,
2847
  "trial_name": null,
2848
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.03,
5
  "eval_steps": 2000,
6
+ "global_step": 6000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2823
  "eval_samples_per_second": 28.059,
2824
  "eval_steps_per_second": 0.449,
2825
  "step": 4000
2826
+ },
2827
+ {
2828
+ "epoch": 0.02005,
2829
+ "grad_norm": 31.375,
2830
+ "learning_rate": 4.9243718592964825e-05,
2831
+ "loss": 7.56,
2832
+ "step": 4010
2833
+ },
2834
+ {
2835
+ "epoch": 0.0201,
2836
+ "grad_norm": 1.28125,
2837
+ "learning_rate": 4.9241206030150756e-05,
2838
+ "loss": 7.5483,
2839
+ "step": 4020
2840
+ },
2841
+ {
2842
+ "epoch": 0.02015,
2843
+ "grad_norm": 7.59375,
2844
+ "learning_rate": 4.923869346733669e-05,
2845
+ "loss": 7.5583,
2846
+ "step": 4030
2847
+ },
2848
+ {
2849
+ "epoch": 0.0202,
2850
+ "grad_norm": 3.21875,
2851
+ "learning_rate": 4.923618090452262e-05,
2852
+ "loss": 7.5831,
2853
+ "step": 4040
2854
+ },
2855
+ {
2856
+ "epoch": 0.02025,
2857
+ "grad_norm": 3.125,
2858
+ "learning_rate": 4.923366834170854e-05,
2859
+ "loss": 7.5884,
2860
+ "step": 4050
2861
+ },
2862
+ {
2863
+ "epoch": 0.0203,
2864
+ "grad_norm": 1.625,
2865
+ "learning_rate": 4.9231155778894474e-05,
2866
+ "loss": 7.5728,
2867
+ "step": 4060
2868
+ },
2869
+ {
2870
+ "epoch": 0.02035,
2871
+ "grad_norm": 73.5,
2872
+ "learning_rate": 4.9228643216080405e-05,
2873
+ "loss": 7.5865,
2874
+ "step": 4070
2875
+ },
2876
+ {
2877
+ "epoch": 0.0204,
2878
+ "grad_norm": 15.0,
2879
+ "learning_rate": 4.9226130653266336e-05,
2880
+ "loss": 7.5648,
2881
+ "step": 4080
2882
+ },
2883
+ {
2884
+ "epoch": 0.02045,
2885
+ "grad_norm": 1.359375,
2886
+ "learning_rate": 4.922361809045226e-05,
2887
+ "loss": 7.5471,
2888
+ "step": 4090
2889
+ },
2890
+ {
2891
+ "epoch": 0.0205,
2892
+ "grad_norm": 56.25,
2893
+ "learning_rate": 4.92211055276382e-05,
2894
+ "loss": 7.564,
2895
+ "step": 4100
2896
+ },
2897
+ {
2898
+ "epoch": 0.02055,
2899
+ "grad_norm": 50.75,
2900
+ "learning_rate": 4.921859296482412e-05,
2901
+ "loss": 7.5555,
2902
+ "step": 4110
2903
+ },
2904
+ {
2905
+ "epoch": 0.0206,
2906
+ "grad_norm": 1.7890625,
2907
+ "learning_rate": 4.921608040201005e-05,
2908
+ "loss": 7.5561,
2909
+ "step": 4120
2910
+ },
2911
+ {
2912
+ "epoch": 0.02065,
2913
+ "grad_norm": 2.21875,
2914
+ "learning_rate": 4.9213567839195984e-05,
2915
+ "loss": 7.5605,
2916
+ "step": 4130
2917
+ },
2918
+ {
2919
+ "epoch": 0.0207,
2920
+ "grad_norm": 2.71875,
2921
+ "learning_rate": 4.9211055276381915e-05,
2922
+ "loss": 7.5899,
2923
+ "step": 4140
2924
+ },
2925
+ {
2926
+ "epoch": 0.02075,
2927
+ "grad_norm": 2.25,
2928
+ "learning_rate": 4.920854271356784e-05,
2929
+ "loss": 7.5548,
2930
+ "step": 4150
2931
+ },
2932
+ {
2933
+ "epoch": 0.0208,
2934
+ "grad_norm": 14.5625,
2935
+ "learning_rate": 4.920603015075377e-05,
2936
+ "loss": 7.625,
2937
+ "step": 4160
2938
+ },
2939
+ {
2940
+ "epoch": 0.02085,
2941
+ "grad_norm": 1.765625,
2942
+ "learning_rate": 4.92035175879397e-05,
2943
+ "loss": 7.5754,
2944
+ "step": 4170
2945
+ },
2946
+ {
2947
+ "epoch": 0.0209,
2948
+ "grad_norm": 1.390625,
2949
+ "learning_rate": 4.920100502512563e-05,
2950
+ "loss": 7.601,
2951
+ "step": 4180
2952
+ },
2953
+ {
2954
+ "epoch": 0.02095,
2955
+ "grad_norm": 13.5,
2956
+ "learning_rate": 4.919849246231156e-05,
2957
+ "loss": 7.5853,
2958
+ "step": 4190
2959
+ },
2960
+ {
2961
+ "epoch": 0.021,
2962
+ "grad_norm": 3.421875,
2963
+ "learning_rate": 4.919597989949749e-05,
2964
+ "loss": 7.5461,
2965
+ "step": 4200
2966
+ },
2967
+ {
2968
+ "epoch": 0.02105,
2969
+ "grad_norm": 10.6875,
2970
+ "learning_rate": 4.919346733668342e-05,
2971
+ "loss": 7.5906,
2972
+ "step": 4210
2973
+ },
2974
+ {
2975
+ "epoch": 0.0211,
2976
+ "grad_norm": 2.140625,
2977
+ "learning_rate": 4.919095477386935e-05,
2978
+ "loss": 7.5771,
2979
+ "step": 4220
2980
+ },
2981
+ {
2982
+ "epoch": 0.02115,
2983
+ "grad_norm": 4.09375,
2984
+ "learning_rate": 4.918844221105528e-05,
2985
+ "loss": 7.6011,
2986
+ "step": 4230
2987
+ },
2988
+ {
2989
+ "epoch": 0.0212,
2990
+ "grad_norm": 64.0,
2991
+ "learning_rate": 4.9185929648241205e-05,
2992
+ "loss": 7.5948,
2993
+ "step": 4240
2994
+ },
2995
+ {
2996
+ "epoch": 0.02125,
2997
+ "grad_norm": 2.21875,
2998
+ "learning_rate": 4.9183417085427136e-05,
2999
+ "loss": 7.5682,
3000
+ "step": 4250
3001
+ },
3002
+ {
3003
+ "epoch": 0.0213,
3004
+ "grad_norm": 28.625,
3005
+ "learning_rate": 4.918090452261307e-05,
3006
+ "loss": 7.5859,
3007
+ "step": 4260
3008
+ },
3009
+ {
3010
+ "epoch": 0.02135,
3011
+ "grad_norm": 5.6875,
3012
+ "learning_rate": 4.9178391959799e-05,
3013
+ "loss": 7.591,
3014
+ "step": 4270
3015
+ },
3016
+ {
3017
+ "epoch": 0.0214,
3018
+ "grad_norm": 60.25,
3019
+ "learning_rate": 4.917587939698492e-05,
3020
+ "loss": 7.5652,
3021
+ "step": 4280
3022
+ },
3023
+ {
3024
+ "epoch": 0.02145,
3025
+ "grad_norm": 1.75,
3026
+ "learning_rate": 4.917336683417086e-05,
3027
+ "loss": 7.5578,
3028
+ "step": 4290
3029
+ },
3030
+ {
3031
+ "epoch": 0.0215,
3032
+ "grad_norm": 1.734375,
3033
+ "learning_rate": 4.9170854271356784e-05,
3034
+ "loss": 7.6033,
3035
+ "step": 4300
3036
+ },
3037
+ {
3038
+ "epoch": 0.02155,
3039
+ "grad_norm": 7.75,
3040
+ "learning_rate": 4.9168341708542715e-05,
3041
+ "loss": 7.5929,
3042
+ "step": 4310
3043
+ },
3044
+ {
3045
+ "epoch": 0.0216,
3046
+ "grad_norm": 29.5,
3047
+ "learning_rate": 4.9165829145728646e-05,
3048
+ "loss": 7.5736,
3049
+ "step": 4320
3050
+ },
3051
+ {
3052
+ "epoch": 0.02165,
3053
+ "grad_norm": 11.6875,
3054
+ "learning_rate": 4.916331658291458e-05,
3055
+ "loss": 7.5885,
3056
+ "step": 4330
3057
+ },
3058
+ {
3059
+ "epoch": 0.0217,
3060
+ "grad_norm": 1.1640625,
3061
+ "learning_rate": 4.91608040201005e-05,
3062
+ "loss": 7.6,
3063
+ "step": 4340
3064
+ },
3065
+ {
3066
+ "epoch": 0.02175,
3067
+ "grad_norm": 80.0,
3068
+ "learning_rate": 4.915829145728644e-05,
3069
+ "loss": 7.5922,
3070
+ "step": 4350
3071
+ },
3072
+ {
3073
+ "epoch": 0.0218,
3074
+ "grad_norm": 2.0625,
3075
+ "learning_rate": 4.9155778894472363e-05,
3076
+ "loss": 7.5828,
3077
+ "step": 4360
3078
+ },
3079
+ {
3080
+ "epoch": 0.02185,
3081
+ "grad_norm": 16.5,
3082
+ "learning_rate": 4.9153266331658294e-05,
3083
+ "loss": 7.5455,
3084
+ "step": 4370
3085
+ },
3086
+ {
3087
+ "epoch": 0.0219,
3088
+ "grad_norm": 1.8984375,
3089
+ "learning_rate": 4.915075376884422e-05,
3090
+ "loss": 7.5869,
3091
+ "step": 4380
3092
+ },
3093
+ {
3094
+ "epoch": 0.02195,
3095
+ "grad_norm": 9.25,
3096
+ "learning_rate": 4.9148241206030156e-05,
3097
+ "loss": 7.57,
3098
+ "step": 4390
3099
+ },
3100
+ {
3101
+ "epoch": 0.022,
3102
+ "grad_norm": 1.234375,
3103
+ "learning_rate": 4.914572864321608e-05,
3104
+ "loss": 7.5779,
3105
+ "step": 4400
3106
+ },
3107
+ {
3108
+ "epoch": 0.02205,
3109
+ "grad_norm": 42.5,
3110
+ "learning_rate": 4.914321608040201e-05,
3111
+ "loss": 7.5756,
3112
+ "step": 4410
3113
+ },
3114
+ {
3115
+ "epoch": 0.0221,
3116
+ "grad_norm": 2.40625,
3117
+ "learning_rate": 4.914070351758794e-05,
3118
+ "loss": 7.5708,
3119
+ "step": 4420
3120
+ },
3121
+ {
3122
+ "epoch": 0.02215,
3123
+ "grad_norm": 4.8125,
3124
+ "learning_rate": 4.9138190954773874e-05,
3125
+ "loss": 7.5854,
3126
+ "step": 4430
3127
+ },
3128
+ {
3129
+ "epoch": 0.0222,
3130
+ "grad_norm": 1.296875,
3131
+ "learning_rate": 4.91356783919598e-05,
3132
+ "loss": 7.6228,
3133
+ "step": 4440
3134
+ },
3135
+ {
3136
+ "epoch": 0.02225,
3137
+ "grad_norm": 1.6015625,
3138
+ "learning_rate": 4.9133165829145736e-05,
3139
+ "loss": 7.56,
3140
+ "step": 4450
3141
+ },
3142
+ {
3143
+ "epoch": 0.0223,
3144
+ "grad_norm": 9.375,
3145
+ "learning_rate": 4.913065326633166e-05,
3146
+ "loss": 7.5837,
3147
+ "step": 4460
3148
+ },
3149
+ {
3150
+ "epoch": 0.02235,
3151
+ "grad_norm": 26.5,
3152
+ "learning_rate": 4.912814070351759e-05,
3153
+ "loss": 7.5659,
3154
+ "step": 4470
3155
+ },
3156
+ {
3157
+ "epoch": 0.0224,
3158
+ "grad_norm": 1.5859375,
3159
+ "learning_rate": 4.912562814070352e-05,
3160
+ "loss": 7.622,
3161
+ "step": 4480
3162
+ },
3163
+ {
3164
+ "epoch": 0.02245,
3165
+ "grad_norm": 3.6875,
3166
+ "learning_rate": 4.912311557788945e-05,
3167
+ "loss": 7.5823,
3168
+ "step": 4490
3169
+ },
3170
+ {
3171
+ "epoch": 0.0225,
3172
+ "grad_norm": 76.5,
3173
+ "learning_rate": 4.912060301507538e-05,
3174
+ "loss": 7.5724,
3175
+ "step": 4500
3176
+ },
3177
+ {
3178
+ "epoch": 0.02255,
3179
+ "grad_norm": 14.25,
3180
+ "learning_rate": 4.911809045226131e-05,
3181
+ "loss": 7.5786,
3182
+ "step": 4510
3183
+ },
3184
+ {
3185
+ "epoch": 0.0226,
3186
+ "grad_norm": 1.53125,
3187
+ "learning_rate": 4.911557788944724e-05,
3188
+ "loss": 7.5411,
3189
+ "step": 4520
3190
+ },
3191
+ {
3192
+ "epoch": 0.02265,
3193
+ "grad_norm": 28.125,
3194
+ "learning_rate": 4.911306532663317e-05,
3195
+ "loss": 7.5757,
3196
+ "step": 4530
3197
+ },
3198
+ {
3199
+ "epoch": 0.0227,
3200
+ "grad_norm": 1.4609375,
3201
+ "learning_rate": 4.9110552763819095e-05,
3202
+ "loss": 7.5391,
3203
+ "step": 4540
3204
+ },
3205
+ {
3206
+ "epoch": 0.02275,
3207
+ "grad_norm": 1.546875,
3208
+ "learning_rate": 4.9108040201005026e-05,
3209
+ "loss": 7.6018,
3210
+ "step": 4550
3211
+ },
3212
+ {
3213
+ "epoch": 0.0228,
3214
+ "grad_norm": 3.890625,
3215
+ "learning_rate": 4.910552763819096e-05,
3216
+ "loss": 7.5935,
3217
+ "step": 4560
3218
+ },
3219
+ {
3220
+ "epoch": 0.02285,
3221
+ "grad_norm": 2.28125,
3222
+ "learning_rate": 4.910301507537689e-05,
3223
+ "loss": 7.5827,
3224
+ "step": 4570
3225
+ },
3226
+ {
3227
+ "epoch": 0.0229,
3228
+ "grad_norm": 8.0,
3229
+ "learning_rate": 4.910050251256282e-05,
3230
+ "loss": 7.5591,
3231
+ "step": 4580
3232
+ },
3233
+ {
3234
+ "epoch": 0.02295,
3235
+ "grad_norm": 3.328125,
3236
+ "learning_rate": 4.909798994974874e-05,
3237
+ "loss": 7.563,
3238
+ "step": 4590
3239
+ },
3240
+ {
3241
+ "epoch": 0.023,
3242
+ "grad_norm": 10.875,
3243
+ "learning_rate": 4.9095477386934674e-05,
3244
+ "loss": 7.542,
3245
+ "step": 4600
3246
+ },
3247
+ {
3248
+ "epoch": 0.02305,
3249
+ "grad_norm": 8.25,
3250
+ "learning_rate": 4.9092964824120605e-05,
3251
+ "loss": 7.5628,
3252
+ "step": 4610
3253
+ },
3254
+ {
3255
+ "epoch": 0.0231,
3256
+ "grad_norm": 29.25,
3257
+ "learning_rate": 4.9090452261306536e-05,
3258
+ "loss": 7.5309,
3259
+ "step": 4620
3260
+ },
3261
+ {
3262
+ "epoch": 0.02315,
3263
+ "grad_norm": 2.0,
3264
+ "learning_rate": 4.908793969849246e-05,
3265
+ "loss": 7.554,
3266
+ "step": 4630
3267
+ },
3268
+ {
3269
+ "epoch": 0.0232,
3270
+ "grad_norm": 74.0,
3271
+ "learning_rate": 4.90854271356784e-05,
3272
+ "loss": 7.5653,
3273
+ "step": 4640
3274
+ },
3275
+ {
3276
+ "epoch": 0.02325,
3277
+ "grad_norm": 27.25,
3278
+ "learning_rate": 4.908291457286432e-05,
3279
+ "loss": 7.5813,
3280
+ "step": 4650
3281
+ },
3282
+ {
3283
+ "epoch": 0.0233,
3284
+ "grad_norm": 15.6875,
3285
+ "learning_rate": 4.908040201005025e-05,
3286
+ "loss": 7.5811,
3287
+ "step": 4660
3288
+ },
3289
+ {
3290
+ "epoch": 0.02335,
3291
+ "grad_norm": 4.65625,
3292
+ "learning_rate": 4.907788944723618e-05,
3293
+ "loss": 7.5954,
3294
+ "step": 4670
3295
+ },
3296
+ {
3297
+ "epoch": 0.0234,
3298
+ "grad_norm": 1.6875,
3299
+ "learning_rate": 4.9075376884422115e-05,
3300
+ "loss": 7.5495,
3301
+ "step": 4680
3302
+ },
3303
+ {
3304
+ "epoch": 0.02345,
3305
+ "grad_norm": 138.0,
3306
+ "learning_rate": 4.907286432160804e-05,
3307
+ "loss": 7.5763,
3308
+ "step": 4690
3309
+ },
3310
+ {
3311
+ "epoch": 0.0235,
3312
+ "grad_norm": 3.234375,
3313
+ "learning_rate": 4.907035175879397e-05,
3314
+ "loss": 7.5582,
3315
+ "step": 4700
3316
+ },
3317
+ {
3318
+ "epoch": 0.02355,
3319
+ "grad_norm": 10.4375,
3320
+ "learning_rate": 4.90678391959799e-05,
3321
+ "loss": 7.562,
3322
+ "step": 4710
3323
+ },
3324
+ {
3325
+ "epoch": 0.0236,
3326
+ "grad_norm": 46.25,
3327
+ "learning_rate": 4.906532663316583e-05,
3328
+ "loss": 7.5865,
3329
+ "step": 4720
3330
+ },
3331
+ {
3332
+ "epoch": 0.02365,
3333
+ "grad_norm": 9.5625,
3334
+ "learning_rate": 4.906281407035176e-05,
3335
+ "loss": 7.5701,
3336
+ "step": 4730
3337
+ },
3338
+ {
3339
+ "epoch": 0.0237,
3340
+ "grad_norm": 5.3125,
3341
+ "learning_rate": 4.9060301507537695e-05,
3342
+ "loss": 7.608,
3343
+ "step": 4740
3344
+ },
3345
+ {
3346
+ "epoch": 0.02375,
3347
+ "grad_norm": 9.75,
3348
+ "learning_rate": 4.905778894472362e-05,
3349
+ "loss": 7.5944,
3350
+ "step": 4750
3351
+ },
3352
+ {
3353
+ "epoch": 0.0238,
3354
+ "grad_norm": 8.125,
3355
+ "learning_rate": 4.905527638190955e-05,
3356
+ "loss": 7.6044,
3357
+ "step": 4760
3358
+ },
3359
+ {
3360
+ "epoch": 0.02385,
3361
+ "grad_norm": 1.5546875,
3362
+ "learning_rate": 4.905276381909548e-05,
3363
+ "loss": 7.594,
3364
+ "step": 4770
3365
+ },
3366
+ {
3367
+ "epoch": 0.0239,
3368
+ "grad_norm": 5.1875,
3369
+ "learning_rate": 4.905025125628141e-05,
3370
+ "loss": 7.5989,
3371
+ "step": 4780
3372
+ },
3373
+ {
3374
+ "epoch": 0.02395,
3375
+ "grad_norm": 1.5390625,
3376
+ "learning_rate": 4.9047738693467336e-05,
3377
+ "loss": 7.5679,
3378
+ "step": 4790
3379
+ },
3380
+ {
3381
+ "epoch": 0.024,
3382
+ "grad_norm": 16.875,
3383
+ "learning_rate": 4.9045226130653274e-05,
3384
+ "loss": 7.5629,
3385
+ "step": 4800
3386
+ },
3387
+ {
3388
+ "epoch": 0.02405,
3389
+ "grad_norm": 1.671875,
3390
+ "learning_rate": 4.90427135678392e-05,
3391
+ "loss": 7.5778,
3392
+ "step": 4810
3393
+ },
3394
+ {
3395
+ "epoch": 0.0241,
3396
+ "grad_norm": 1.78125,
3397
+ "learning_rate": 4.904020100502513e-05,
3398
+ "loss": 7.5648,
3399
+ "step": 4820
3400
+ },
3401
+ {
3402
+ "epoch": 0.02415,
3403
+ "grad_norm": 5.84375,
3404
+ "learning_rate": 4.9037688442211053e-05,
3405
+ "loss": 7.5534,
3406
+ "step": 4830
3407
+ },
3408
+ {
3409
+ "epoch": 0.0242,
3410
+ "grad_norm": 1.640625,
3411
+ "learning_rate": 4.903517587939699e-05,
3412
+ "loss": 7.6181,
3413
+ "step": 4840
3414
+ },
3415
+ {
3416
+ "epoch": 0.02425,
3417
+ "grad_norm": 52.25,
3418
+ "learning_rate": 4.9032663316582916e-05,
3419
+ "loss": 7.5699,
3420
+ "step": 4850
3421
+ },
3422
+ {
3423
+ "epoch": 0.0243,
3424
+ "grad_norm": 1.9453125,
3425
+ "learning_rate": 4.9030150753768847e-05,
3426
+ "loss": 7.6262,
3427
+ "step": 4860
3428
+ },
3429
+ {
3430
+ "epoch": 0.02435,
3431
+ "grad_norm": 1.625,
3432
+ "learning_rate": 4.902763819095478e-05,
3433
+ "loss": 7.5909,
3434
+ "step": 4870
3435
+ },
3436
+ {
3437
+ "epoch": 0.0244,
3438
+ "grad_norm": 12.9375,
3439
+ "learning_rate": 4.902512562814071e-05,
3440
+ "loss": 7.5804,
3441
+ "step": 4880
3442
+ },
3443
+ {
3444
+ "epoch": 0.02445,
3445
+ "grad_norm": 1.84375,
3446
+ "learning_rate": 4.902261306532663e-05,
3447
+ "loss": 7.5537,
3448
+ "step": 4890
3449
+ },
3450
+ {
3451
+ "epoch": 0.0245,
3452
+ "grad_norm": 9.0,
3453
+ "learning_rate": 4.9020100502512564e-05,
3454
+ "loss": 7.5913,
3455
+ "step": 4900
3456
+ },
3457
+ {
3458
+ "epoch": 0.02455,
3459
+ "grad_norm": 9.3125,
3460
+ "learning_rate": 4.9017587939698495e-05,
3461
+ "loss": 7.5744,
3462
+ "step": 4910
3463
+ },
3464
+ {
3465
+ "epoch": 0.0246,
3466
+ "grad_norm": 1.5234375,
3467
+ "learning_rate": 4.9015075376884426e-05,
3468
+ "loss": 7.6023,
3469
+ "step": 4920
3470
+ },
3471
+ {
3472
+ "epoch": 0.02465,
3473
+ "grad_norm": 1.8046875,
3474
+ "learning_rate": 4.901256281407036e-05,
3475
+ "loss": 7.575,
3476
+ "step": 4930
3477
+ },
3478
+ {
3479
+ "epoch": 0.0247,
3480
+ "grad_norm": 17.875,
3481
+ "learning_rate": 4.901005025125628e-05,
3482
+ "loss": 7.5894,
3483
+ "step": 4940
3484
+ },
3485
+ {
3486
+ "epoch": 0.02475,
3487
+ "grad_norm": 33.25,
3488
+ "learning_rate": 4.900753768844221e-05,
3489
+ "loss": 7.6012,
3490
+ "step": 4950
3491
+ },
3492
+ {
3493
+ "epoch": 0.0248,
3494
+ "grad_norm": 2.53125,
3495
+ "learning_rate": 4.900502512562814e-05,
3496
+ "loss": 7.5584,
3497
+ "step": 4960
3498
+ },
3499
+ {
3500
+ "epoch": 0.02485,
3501
+ "grad_norm": 2.078125,
3502
+ "learning_rate": 4.9002512562814074e-05,
3503
+ "loss": 7.5945,
3504
+ "step": 4970
3505
+ },
3506
+ {
3507
+ "epoch": 0.0249,
3508
+ "grad_norm": 3.0625,
3509
+ "learning_rate": 4.9e-05,
3510
+ "loss": 7.5507,
3511
+ "step": 4980
3512
+ },
3513
+ {
3514
+ "epoch": 0.02495,
3515
+ "grad_norm": 2.859375,
3516
+ "learning_rate": 4.899748743718593e-05,
3517
+ "loss": 7.585,
3518
+ "step": 4990
3519
+ },
3520
+ {
3521
+ "epoch": 0.025,
3522
+ "grad_norm": 7.1875,
3523
+ "learning_rate": 4.899497487437186e-05,
3524
+ "loss": 7.5395,
3525
+ "step": 5000
3526
+ },
3527
+ {
3528
+ "epoch": 0.02505,
3529
+ "grad_norm": 1.2109375,
3530
+ "learning_rate": 4.899246231155779e-05,
3531
+ "loss": 7.5543,
3532
+ "step": 5010
3533
+ },
3534
+ {
3535
+ "epoch": 0.0251,
3536
+ "grad_norm": 104.0,
3537
+ "learning_rate": 4.8989949748743716e-05,
3538
+ "loss": 7.5663,
3539
+ "step": 5020
3540
+ },
3541
+ {
3542
+ "epoch": 0.02515,
3543
+ "grad_norm": 35.25,
3544
+ "learning_rate": 4.8987437185929654e-05,
3545
+ "loss": 7.5931,
3546
+ "step": 5030
3547
+ },
3548
+ {
3549
+ "epoch": 0.0252,
3550
+ "grad_norm": 1.8046875,
3551
+ "learning_rate": 4.898492462311558e-05,
3552
+ "loss": 7.5548,
3553
+ "step": 5040
3554
+ },
3555
+ {
3556
+ "epoch": 0.02525,
3557
+ "grad_norm": 29.5,
3558
+ "learning_rate": 4.898241206030151e-05,
3559
+ "loss": 7.5911,
3560
+ "step": 5050
3561
+ },
3562
+ {
3563
+ "epoch": 0.0253,
3564
+ "grad_norm": 4.34375,
3565
+ "learning_rate": 4.897989949748744e-05,
3566
+ "loss": 7.5819,
3567
+ "step": 5060
3568
+ },
3569
+ {
3570
+ "epoch": 0.02535,
3571
+ "grad_norm": 1.5234375,
3572
+ "learning_rate": 4.897738693467337e-05,
3573
+ "loss": 7.6094,
3574
+ "step": 5070
3575
+ },
3576
+ {
3577
+ "epoch": 0.0254,
3578
+ "grad_norm": 49.25,
3579
+ "learning_rate": 4.8974874371859295e-05,
3580
+ "loss": 7.5936,
3581
+ "step": 5080
3582
+ },
3583
+ {
3584
+ "epoch": 0.02545,
3585
+ "grad_norm": 2.671875,
3586
+ "learning_rate": 4.897236180904523e-05,
3587
+ "loss": 7.5785,
3588
+ "step": 5090
3589
+ },
3590
+ {
3591
+ "epoch": 0.0255,
3592
+ "grad_norm": 4.1875,
3593
+ "learning_rate": 4.896984924623116e-05,
3594
+ "loss": 7.5704,
3595
+ "step": 5100
3596
+ },
3597
+ {
3598
+ "epoch": 0.02555,
3599
+ "grad_norm": 1.4296875,
3600
+ "learning_rate": 4.896733668341709e-05,
3601
+ "loss": 7.5756,
3602
+ "step": 5110
3603
+ },
3604
+ {
3605
+ "epoch": 0.0256,
3606
+ "grad_norm": 51.75,
3607
+ "learning_rate": 4.896482412060302e-05,
3608
+ "loss": 7.594,
3609
+ "step": 5120
3610
+ },
3611
+ {
3612
+ "epoch": 0.02565,
3613
+ "grad_norm": 3.484375,
3614
+ "learning_rate": 4.896231155778895e-05,
3615
+ "loss": 7.5914,
3616
+ "step": 5130
3617
+ },
3618
+ {
3619
+ "epoch": 0.0257,
3620
+ "grad_norm": 6.625,
3621
+ "learning_rate": 4.8959798994974874e-05,
3622
+ "loss": 7.5849,
3623
+ "step": 5140
3624
+ },
3625
+ {
3626
+ "epoch": 0.02575,
3627
+ "grad_norm": 1.765625,
3628
+ "learning_rate": 4.8957286432160805e-05,
3629
+ "loss": 7.592,
3630
+ "step": 5150
3631
+ },
3632
+ {
3633
+ "epoch": 0.0258,
3634
+ "grad_norm": 1.765625,
3635
+ "learning_rate": 4.8954773869346736e-05,
3636
+ "loss": 7.5583,
3637
+ "step": 5160
3638
+ },
3639
+ {
3640
+ "epoch": 0.02585,
3641
+ "grad_norm": 1.5625,
3642
+ "learning_rate": 4.895226130653267e-05,
3643
+ "loss": 7.6054,
3644
+ "step": 5170
3645
+ },
3646
+ {
3647
+ "epoch": 0.0259,
3648
+ "grad_norm": 1.6796875,
3649
+ "learning_rate": 4.894974874371859e-05,
3650
+ "loss": 7.5911,
3651
+ "step": 5180
3652
+ },
3653
+ {
3654
+ "epoch": 0.02595,
3655
+ "grad_norm": 2.921875,
3656
+ "learning_rate": 4.894723618090453e-05,
3657
+ "loss": 7.5316,
3658
+ "step": 5190
3659
+ },
3660
+ {
3661
+ "epoch": 0.026,
3662
+ "grad_norm": 18.125,
3663
+ "learning_rate": 4.8944723618090454e-05,
3664
+ "loss": 7.6079,
3665
+ "step": 5200
3666
+ },
3667
+ {
3668
+ "epoch": 0.02605,
3669
+ "grad_norm": 2.6875,
3670
+ "learning_rate": 4.8942211055276385e-05,
3671
+ "loss": 7.5652,
3672
+ "step": 5210
3673
+ },
3674
+ {
3675
+ "epoch": 0.0261,
3676
+ "grad_norm": 13.1875,
3677
+ "learning_rate": 4.8939698492462316e-05,
3678
+ "loss": 7.593,
3679
+ "step": 5220
3680
+ },
3681
+ {
3682
+ "epoch": 0.02615,
3683
+ "grad_norm": 1.734375,
3684
+ "learning_rate": 4.893718592964825e-05,
3685
+ "loss": 7.5381,
3686
+ "step": 5230
3687
+ },
3688
+ {
3689
+ "epoch": 0.0262,
3690
+ "grad_norm": 1.390625,
3691
+ "learning_rate": 4.893467336683417e-05,
3692
+ "loss": 7.5803,
3693
+ "step": 5240
3694
+ },
3695
+ {
3696
+ "epoch": 0.02625,
3697
+ "grad_norm": 32.75,
3698
+ "learning_rate": 4.89321608040201e-05,
3699
+ "loss": 7.5725,
3700
+ "step": 5250
3701
+ },
3702
+ {
3703
+ "epoch": 0.0263,
3704
+ "grad_norm": 27.375,
3705
+ "learning_rate": 4.892964824120603e-05,
3706
+ "loss": 7.5863,
3707
+ "step": 5260
3708
+ },
3709
+ {
3710
+ "epoch": 0.02635,
3711
+ "grad_norm": 1.8046875,
3712
+ "learning_rate": 4.8927135678391964e-05,
3713
+ "loss": 7.5594,
3714
+ "step": 5270
3715
+ },
3716
+ {
3717
+ "epoch": 0.0264,
3718
+ "grad_norm": 84.0,
3719
+ "learning_rate": 4.8924623115577895e-05,
3720
+ "loss": 7.5623,
3721
+ "step": 5280
3722
+ },
3723
+ {
3724
+ "epoch": 0.02645,
3725
+ "grad_norm": 1.8203125,
3726
+ "learning_rate": 4.892211055276382e-05,
3727
+ "loss": 7.5728,
3728
+ "step": 5290
3729
+ },
3730
+ {
3731
+ "epoch": 0.0265,
3732
+ "grad_norm": 3.1875,
3733
+ "learning_rate": 4.891959798994975e-05,
3734
+ "loss": 7.5856,
3735
+ "step": 5300
3736
+ },
3737
+ {
3738
+ "epoch": 0.02655,
3739
+ "grad_norm": 53.5,
3740
+ "learning_rate": 4.891708542713568e-05,
3741
+ "loss": 7.577,
3742
+ "step": 5310
3743
+ },
3744
+ {
3745
+ "epoch": 0.0266,
3746
+ "grad_norm": 1.7265625,
3747
+ "learning_rate": 4.891457286432161e-05,
3748
+ "loss": 7.591,
3749
+ "step": 5320
3750
+ },
3751
+ {
3752
+ "epoch": 0.02665,
3753
+ "grad_norm": 19.375,
3754
+ "learning_rate": 4.8912060301507537e-05,
3755
+ "loss": 7.5813,
3756
+ "step": 5330
3757
+ },
3758
+ {
3759
+ "epoch": 0.0267,
3760
+ "grad_norm": 37.75,
3761
+ "learning_rate": 4.890954773869347e-05,
3762
+ "loss": 7.5753,
3763
+ "step": 5340
3764
+ },
3765
+ {
3766
+ "epoch": 0.02675,
3767
+ "grad_norm": 4.0,
3768
+ "learning_rate": 4.89070351758794e-05,
3769
+ "loss": 7.5711,
3770
+ "step": 5350
3771
+ },
3772
+ {
3773
+ "epoch": 0.0268,
3774
+ "grad_norm": 7.84375,
3775
+ "learning_rate": 4.890452261306533e-05,
3776
+ "loss": 7.5626,
3777
+ "step": 5360
3778
+ },
3779
+ {
3780
+ "epoch": 0.02685,
3781
+ "grad_norm": 13.0625,
3782
+ "learning_rate": 4.8902010050251254e-05,
3783
+ "loss": 7.6023,
3784
+ "step": 5370
3785
+ },
3786
+ {
3787
+ "epoch": 0.0269,
3788
+ "grad_norm": 33.0,
3789
+ "learning_rate": 4.889949748743719e-05,
3790
+ "loss": 7.5754,
3791
+ "step": 5380
3792
+ },
3793
+ {
3794
+ "epoch": 0.02695,
3795
+ "grad_norm": 8.3125,
3796
+ "learning_rate": 4.8896984924623116e-05,
3797
+ "loss": 7.5874,
3798
+ "step": 5390
3799
+ },
3800
+ {
3801
+ "epoch": 0.027,
3802
+ "grad_norm": 2.046875,
3803
+ "learning_rate": 4.889447236180905e-05,
3804
+ "loss": 7.5645,
3805
+ "step": 5400
3806
+ },
3807
+ {
3808
+ "epoch": 0.02705,
3809
+ "grad_norm": 5.5,
3810
+ "learning_rate": 4.889195979899498e-05,
3811
+ "loss": 7.5483,
3812
+ "step": 5410
3813
+ },
3814
+ {
3815
+ "epoch": 0.0271,
3816
+ "grad_norm": 20.875,
3817
+ "learning_rate": 4.888944723618091e-05,
3818
+ "loss": 7.5784,
3819
+ "step": 5420
3820
+ },
3821
+ {
3822
+ "epoch": 0.02715,
3823
+ "grad_norm": 1.6796875,
3824
+ "learning_rate": 4.888693467336683e-05,
3825
+ "loss": 7.5691,
3826
+ "step": 5430
3827
+ },
3828
+ {
3829
+ "epoch": 0.0272,
3830
+ "grad_norm": 3.828125,
3831
+ "learning_rate": 4.888442211055277e-05,
3832
+ "loss": 7.5517,
3833
+ "step": 5440
3834
+ },
3835
+ {
3836
+ "epoch": 0.02725,
3837
+ "grad_norm": 2.265625,
3838
+ "learning_rate": 4.8881909547738695e-05,
3839
+ "loss": 7.6301,
3840
+ "step": 5450
3841
+ },
3842
+ {
3843
+ "epoch": 0.0273,
3844
+ "grad_norm": 2.390625,
3845
+ "learning_rate": 4.8879396984924626e-05,
3846
+ "loss": 7.6033,
3847
+ "step": 5460
3848
+ },
3849
+ {
3850
+ "epoch": 0.02735,
3851
+ "grad_norm": 2.578125,
3852
+ "learning_rate": 4.887688442211055e-05,
3853
+ "loss": 7.5836,
3854
+ "step": 5470
3855
+ },
3856
+ {
3857
+ "epoch": 0.0274,
3858
+ "grad_norm": 3.609375,
3859
+ "learning_rate": 4.887437185929649e-05,
3860
+ "loss": 7.5596,
3861
+ "step": 5480
3862
+ },
3863
+ {
3864
+ "epoch": 0.02745,
3865
+ "grad_norm": 12.25,
3866
+ "learning_rate": 4.887185929648241e-05,
3867
+ "loss": 7.5735,
3868
+ "step": 5490
3869
+ },
3870
+ {
3871
+ "epoch": 0.0275,
3872
+ "grad_norm": 1.8046875,
3873
+ "learning_rate": 4.8869346733668344e-05,
3874
+ "loss": 7.5858,
3875
+ "step": 5500
3876
+ },
3877
+ {
3878
+ "epoch": 0.02755,
3879
+ "grad_norm": 12.4375,
3880
+ "learning_rate": 4.8866834170854275e-05,
3881
+ "loss": 7.6097,
3882
+ "step": 5510
3883
+ },
3884
+ {
3885
+ "epoch": 0.0276,
3886
+ "grad_norm": 1.40625,
3887
+ "learning_rate": 4.8864321608040206e-05,
3888
+ "loss": 7.5665,
3889
+ "step": 5520
3890
+ },
3891
+ {
3892
+ "epoch": 0.02765,
3893
+ "grad_norm": 2.875,
3894
+ "learning_rate": 4.886180904522613e-05,
3895
+ "loss": 7.6013,
3896
+ "step": 5530
3897
+ },
3898
+ {
3899
+ "epoch": 0.0277,
3900
+ "grad_norm": 5.40625,
3901
+ "learning_rate": 4.885929648241207e-05,
3902
+ "loss": 7.5832,
3903
+ "step": 5540
3904
+ },
3905
+ {
3906
+ "epoch": 0.02775,
3907
+ "grad_norm": 1.2578125,
3908
+ "learning_rate": 4.885678391959799e-05,
3909
+ "loss": 7.5888,
3910
+ "step": 5550
3911
+ },
3912
+ {
3913
+ "epoch": 0.0278,
3914
+ "grad_norm": 4.65625,
3915
+ "learning_rate": 4.885427135678392e-05,
3916
+ "loss": 7.5835,
3917
+ "step": 5560
3918
+ },
3919
+ {
3920
+ "epoch": 0.02785,
3921
+ "grad_norm": 3.140625,
3922
+ "learning_rate": 4.8851758793969854e-05,
3923
+ "loss": 7.5709,
3924
+ "step": 5570
3925
+ },
3926
+ {
3927
+ "epoch": 0.0279,
3928
+ "grad_norm": 2.28125,
3929
+ "learning_rate": 4.8849246231155785e-05,
3930
+ "loss": 7.5727,
3931
+ "step": 5580
3932
+ },
3933
+ {
3934
+ "epoch": 0.02795,
3935
+ "grad_norm": 3.890625,
3936
+ "learning_rate": 4.884673366834171e-05,
3937
+ "loss": 7.572,
3938
+ "step": 5590
3939
+ },
3940
+ {
3941
+ "epoch": 0.028,
3942
+ "grad_norm": 2.421875,
3943
+ "learning_rate": 4.884422110552764e-05,
3944
+ "loss": 7.5692,
3945
+ "step": 5600
3946
+ },
3947
+ {
3948
+ "epoch": 0.02805,
3949
+ "grad_norm": 2.109375,
3950
+ "learning_rate": 4.884170854271357e-05,
3951
+ "loss": 7.6236,
3952
+ "step": 5610
3953
+ },
3954
+ {
3955
+ "epoch": 0.0281,
3956
+ "grad_norm": 1.46875,
3957
+ "learning_rate": 4.88391959798995e-05,
3958
+ "loss": 7.5949,
3959
+ "step": 5620
3960
+ },
3961
+ {
3962
+ "epoch": 0.02815,
3963
+ "grad_norm": 3.625,
3964
+ "learning_rate": 4.8836683417085426e-05,
3965
+ "loss": 7.5883,
3966
+ "step": 5630
3967
+ },
3968
+ {
3969
+ "epoch": 0.0282,
3970
+ "grad_norm": 1.484375,
3971
+ "learning_rate": 4.883417085427136e-05,
3972
+ "loss": 7.5879,
3973
+ "step": 5640
3974
+ },
3975
+ {
3976
+ "epoch": 0.02825,
3977
+ "grad_norm": 1.3359375,
3978
+ "learning_rate": 4.883165829145729e-05,
3979
+ "loss": 7.5975,
3980
+ "step": 5650
3981
+ },
3982
+ {
3983
+ "epoch": 0.0283,
3984
+ "grad_norm": 3.75,
3985
+ "learning_rate": 4.882914572864322e-05,
3986
+ "loss": 7.5806,
3987
+ "step": 5660
3988
+ },
3989
+ {
3990
+ "epoch": 0.02835,
3991
+ "grad_norm": 1.421875,
3992
+ "learning_rate": 4.882663316582915e-05,
3993
+ "loss": 7.5494,
3994
+ "step": 5670
3995
+ },
3996
+ {
3997
+ "epoch": 0.0284,
3998
+ "grad_norm": 1.796875,
3999
+ "learning_rate": 4.8824120603015075e-05,
4000
+ "loss": 7.5537,
4001
+ "step": 5680
4002
+ },
4003
+ {
4004
+ "epoch": 0.02845,
4005
+ "grad_norm": 5.6875,
4006
+ "learning_rate": 4.8821608040201006e-05,
4007
+ "loss": 7.5803,
4008
+ "step": 5690
4009
+ },
4010
+ {
4011
+ "epoch": 0.0285,
4012
+ "grad_norm": 1.6171875,
4013
+ "learning_rate": 4.881909547738694e-05,
4014
+ "loss": 7.6241,
4015
+ "step": 5700
4016
+ },
4017
+ {
4018
+ "epoch": 0.02855,
4019
+ "grad_norm": 25.125,
4020
+ "learning_rate": 4.881658291457287e-05,
4021
+ "loss": 7.5613,
4022
+ "step": 5710
4023
+ },
4024
+ {
4025
+ "epoch": 0.0286,
4026
+ "grad_norm": 5.65625,
4027
+ "learning_rate": 4.881407035175879e-05,
4028
+ "loss": 7.6088,
4029
+ "step": 5720
4030
+ },
4031
+ {
4032
+ "epoch": 0.02865,
4033
+ "grad_norm": 60.25,
4034
+ "learning_rate": 4.881155778894473e-05,
4035
+ "loss": 7.5541,
4036
+ "step": 5730
4037
+ },
4038
+ {
4039
+ "epoch": 0.0287,
4040
+ "grad_norm": 1.2890625,
4041
+ "learning_rate": 4.8809045226130654e-05,
4042
+ "loss": 7.5828,
4043
+ "step": 5740
4044
+ },
4045
+ {
4046
+ "epoch": 0.02875,
4047
+ "grad_norm": 21.375,
4048
+ "learning_rate": 4.8806532663316585e-05,
4049
+ "loss": 7.5942,
4050
+ "step": 5750
4051
+ },
4052
+ {
4053
+ "epoch": 0.0288,
4054
+ "grad_norm": 102.0,
4055
+ "learning_rate": 4.880402010050251e-05,
4056
+ "loss": 7.5712,
4057
+ "step": 5760
4058
+ },
4059
+ {
4060
+ "epoch": 0.02885,
4061
+ "grad_norm": 6.78125,
4062
+ "learning_rate": 4.880150753768845e-05,
4063
+ "loss": 7.5548,
4064
+ "step": 5770
4065
+ },
4066
+ {
4067
+ "epoch": 0.0289,
4068
+ "grad_norm": 1.6328125,
4069
+ "learning_rate": 4.879899497487437e-05,
4070
+ "loss": 7.5928,
4071
+ "step": 5780
4072
+ },
4073
+ {
4074
+ "epoch": 0.02895,
4075
+ "grad_norm": 1.71875,
4076
+ "learning_rate": 4.87964824120603e-05,
4077
+ "loss": 7.5883,
4078
+ "step": 5790
4079
+ },
4080
+ {
4081
+ "epoch": 0.029,
4082
+ "grad_norm": 24.125,
4083
+ "learning_rate": 4.8793969849246233e-05,
4084
+ "loss": 7.5893,
4085
+ "step": 5800
4086
+ },
4087
+ {
4088
+ "epoch": 0.02905,
4089
+ "grad_norm": 20.875,
4090
+ "learning_rate": 4.8791457286432164e-05,
4091
+ "loss": 7.5664,
4092
+ "step": 5810
4093
+ },
4094
+ {
4095
+ "epoch": 0.0291,
4096
+ "grad_norm": 51.0,
4097
+ "learning_rate": 4.878894472361809e-05,
4098
+ "loss": 7.5849,
4099
+ "step": 5820
4100
+ },
4101
+ {
4102
+ "epoch": 0.02915,
4103
+ "grad_norm": 12.25,
4104
+ "learning_rate": 4.8786432160804026e-05,
4105
+ "loss": 7.5563,
4106
+ "step": 5830
4107
+ },
4108
+ {
4109
+ "epoch": 0.0292,
4110
+ "grad_norm": 1.9921875,
4111
+ "learning_rate": 4.878391959798995e-05,
4112
+ "loss": 7.5565,
4113
+ "step": 5840
4114
+ },
4115
+ {
4116
+ "epoch": 0.02925,
4117
+ "grad_norm": 11.75,
4118
+ "learning_rate": 4.878140703517588e-05,
4119
+ "loss": 7.5521,
4120
+ "step": 5850
4121
+ },
4122
+ {
4123
+ "epoch": 0.0293,
4124
+ "grad_norm": 15.3125,
4125
+ "learning_rate": 4.877889447236181e-05,
4126
+ "loss": 7.6012,
4127
+ "step": 5860
4128
+ },
4129
+ {
4130
+ "epoch": 0.02935,
4131
+ "grad_norm": 45.25,
4132
+ "learning_rate": 4.8776381909547744e-05,
4133
+ "loss": 7.557,
4134
+ "step": 5870
4135
+ },
4136
+ {
4137
+ "epoch": 0.0294,
4138
+ "grad_norm": 1.8359375,
4139
+ "learning_rate": 4.877386934673367e-05,
4140
+ "loss": 7.5793,
4141
+ "step": 5880
4142
+ },
4143
+ {
4144
+ "epoch": 0.02945,
4145
+ "grad_norm": 6.96875,
4146
+ "learning_rate": 4.8771356783919606e-05,
4147
+ "loss": 7.5896,
4148
+ "step": 5890
4149
+ },
4150
+ {
4151
+ "epoch": 0.0295,
4152
+ "grad_norm": 1.6953125,
4153
+ "learning_rate": 4.876884422110553e-05,
4154
+ "loss": 7.5965,
4155
+ "step": 5900
4156
+ },
4157
+ {
4158
+ "epoch": 0.02955,
4159
+ "grad_norm": 2.046875,
4160
+ "learning_rate": 4.876633165829146e-05,
4161
+ "loss": 7.6091,
4162
+ "step": 5910
4163
+ },
4164
+ {
4165
+ "epoch": 0.0296,
4166
+ "grad_norm": 1.953125,
4167
+ "learning_rate": 4.8763819095477385e-05,
4168
+ "loss": 7.577,
4169
+ "step": 5920
4170
+ },
4171
+ {
4172
+ "epoch": 0.02965,
4173
+ "grad_norm": 1.53125,
4174
+ "learning_rate": 4.876130653266332e-05,
4175
+ "loss": 7.6014,
4176
+ "step": 5930
4177
+ },
4178
+ {
4179
+ "epoch": 0.0297,
4180
+ "grad_norm": 2.546875,
4181
+ "learning_rate": 4.875879396984925e-05,
4182
+ "loss": 7.6076,
4183
+ "step": 5940
4184
+ },
4185
+ {
4186
+ "epoch": 0.02975,
4187
+ "grad_norm": 48.5,
4188
+ "learning_rate": 4.875628140703518e-05,
4189
+ "loss": 7.5915,
4190
+ "step": 5950
4191
+ },
4192
+ {
4193
+ "epoch": 0.0298,
4194
+ "grad_norm": 16.375,
4195
+ "learning_rate": 4.875376884422111e-05,
4196
+ "loss": 7.5711,
4197
+ "step": 5960
4198
+ },
4199
+ {
4200
+ "epoch": 0.02985,
4201
+ "grad_norm": 2.140625,
4202
+ "learning_rate": 4.875125628140704e-05,
4203
+ "loss": 7.5665,
4204
+ "step": 5970
4205
+ },
4206
+ {
4207
+ "epoch": 0.0299,
4208
+ "grad_norm": 1.8125,
4209
+ "learning_rate": 4.8748743718592965e-05,
4210
+ "loss": 7.5443,
4211
+ "step": 5980
4212
+ },
4213
+ {
4214
+ "epoch": 0.02995,
4215
+ "grad_norm": 9.125,
4216
+ "learning_rate": 4.8746231155778896e-05,
4217
+ "loss": 7.5808,
4218
+ "step": 5990
4219
+ },
4220
+ {
4221
+ "epoch": 0.03,
4222
+ "grad_norm": 1.5703125,
4223
+ "learning_rate": 4.874371859296483e-05,
4224
+ "loss": 7.5749,
4225
+ "step": 6000
4226
+ },
4227
+ {
4228
+ "epoch": 0.03,
4229
+ "eval_loss": 7.596262454986572,
4230
+ "eval_runtime": 84.3987,
4231
+ "eval_samples_per_second": 29.621,
4232
+ "eval_steps_per_second": 0.474,
4233
+ "step": 6000
4234
  }
4235
  ],
4236
  "logging_steps": 10,
 
4250
  "attributes": {}
4251
  }
4252
  },
4253
+ "total_flos": 2.380225711177728e+17,
4254
  "train_batch_size": 64,
4255
  "trial_name": null,
4256
  "trial_params": null