diff --git a/all_results.json b/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e7cee57f317119bc7dce107069e74104d4f6d5e4
--- /dev/null
+++ b/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 49.02,
+    "eval_accuracy": 0.8875,
+    "eval_loss": 0.4278368353843689,
+    "eval_runtime": 1123.0215,
+    "eval_samples_per_second": 0.57,
+    "eval_steps_per_second": 0.071
+}
\ No newline at end of file
diff --git a/runs/Apr24_20-05-06_blackhorse/events.out.tfevents.1714244811.blackhorse.9061.1 b/runs/Apr24_20-05-06_blackhorse/events.out.tfevents.1714244811.blackhorse.9061.1
index b5bc02fa96ed1bf18d795c489b310dd2bf997378..e2b00eb7fda540bd19c35dbd50bd7692c54ada1b 100644
--- a/runs/Apr24_20-05-06_blackhorse/events.out.tfevents.1714244811.blackhorse.9061.1
+++ b/runs/Apr24_20-05-06_blackhorse/events.out.tfevents.1714244811.blackhorse.9061.1
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d58156d37c20e78424840356fe1b1b1f834d116f1a8653284b358a053ea5a8de
-size 411
+oid sha256:83d67358f4de06b288aa9d32aa3642fd3b42e99f24fcdcab8e824f7aaeb18629
+size 734
diff --git a/test_results.json b/test_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e7cee57f317119bc7dce107069e74104d4f6d5e4
--- /dev/null
+++ b/test_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 49.02,
+    "eval_accuracy": 0.8875,
+    "eval_loss": 0.4278368353843689,
+    "eval_runtime": 1123.0215,
+    "eval_samples_per_second": 0.57,
+    "eval_steps_per_second": 0.071
+}
\ No newline at end of file
diff --git a/trainer_state.json b/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7a9b4c31d59c8f7468d5fd303f94028a2954438
--- /dev/null
+++ b/trainer_state.json
@@ -0,0 +1,11558 @@
+{
+  "best_metric": 0.9769673704414588,
+  "best_model_checkpoint": "videomae-base-finetuned-isl-numbers-alphabet-nouns/checkpoint-13904",
+  "epoch": 49.02,
+  "eval_steps": 500,
+  "global_step": 15800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+ { + "epoch": 0.0006329113924050633, + "grad_norm": 7.342864990234375, + "learning_rate": 3.1645569620253163e-07, + "loss": 5.0266, + "step": 10 + }, + { + "epoch": 0.0012658227848101266, + "grad_norm": 7.16239070892334, + "learning_rate": 6.329113924050633e-07, + "loss": 5.0833, + "step": 20 + }, + { + "epoch": 0.0018987341772151898, + "grad_norm": 7.36508846282959, + "learning_rate": 9.493670886075951e-07, + "loss": 5.0852, + "step": 30 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 7.174118995666504, + "learning_rate": 1.2658227848101265e-06, + "loss": 5.1223, + "step": 40 + }, + { + "epoch": 0.0031645569620253164, + "grad_norm": 7.650392055511475, + "learning_rate": 1.5822784810126583e-06, + "loss": 5.0765, + "step": 50 + }, + { + "epoch": 0.0037974683544303796, + "grad_norm": 7.764711856842041, + "learning_rate": 1.8987341772151901e-06, + "loss": 5.0422, + "step": 60 + }, + { + "epoch": 0.004430379746835443, + "grad_norm": 11.08586311340332, + "learning_rate": 2.2151898734177215e-06, + "loss": 5.0661, + "step": 70 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 7.217874050140381, + "learning_rate": 2.531645569620253e-06, + "loss": 5.0943, + "step": 80 + }, + { + "epoch": 0.00569620253164557, + "grad_norm": 7.349112510681152, + "learning_rate": 2.848101265822785e-06, + "loss": 5.1201, + "step": 90 + }, + { + "epoch": 0.006329113924050633, + "grad_norm": 7.260085582733154, + "learning_rate": 3.1645569620253167e-06, + "loss": 5.0797, +
"step": 100 + }, + { + "epoch": 0.006962025316455696, + "grad_norm": 7.140063762664795, + "learning_rate": 3.4810126582278482e-06, + "loss": 5.0803, + "step": 110 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 8.30470085144043, + "learning_rate": 3.7974683544303802e-06, + "loss": 5.061, + "step": 120 + }, + { + "epoch": 0.008227848101265823, + "grad_norm": 8.53849983215332, + "learning_rate": 4.113924050632911e-06, + "loss": 5.0407, + "step": 130 + }, + { + "epoch": 0.008860759493670886, + "grad_norm": 7.535670757293701, + "learning_rate": 4.430379746835443e-06, + "loss": 5.0505, + "step": 140 + }, + { + "epoch": 0.00949367088607595, + "grad_norm": 8.011333465576172, + "learning_rate": 4.746835443037975e-06, + "loss": 5.0643, + "step": 150 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 8.219789505004883, + "learning_rate": 5.063291139240506e-06, + "loss": 4.9714, + "step": 160 + }, + { + "epoch": 0.010759493670886076, + "grad_norm": 8.42809009552002, + "learning_rate": 5.379746835443038e-06, + "loss": 4.9477, + "step": 170 + }, + { + "epoch": 0.01139240506329114, + "grad_norm": 11.367712020874023, + "learning_rate": 5.69620253164557e-06, + "loss": 4.9509, + "step": 180 + }, + { + "epoch": 0.012025316455696202, + "grad_norm": 9.428906440734863, + "learning_rate": 6.012658227848101e-06, + "loss": 4.9267, + "step": 190 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 8.949156761169434, + "learning_rate": 6.329113924050633e-06, + "loss": 4.8931, + "step": 200 + }, + { + "epoch": 0.013291139240506329, + "grad_norm": 8.922019004821777, + "learning_rate": 6.6455696202531645e-06, + "loss": 4.921, + "step": 210 + }, + { + "epoch": 0.013924050632911392, + "grad_norm": 10.950883865356445, + "learning_rate": 6.9620253164556965e-06, + "loss": 4.781, + "step": 220 + }, + { + "epoch": 0.014556962025316455, + "grad_norm": 10.333183288574219, + "learning_rate": 7.2784810126582285e-06, + "loss": 4.8136, + "step": 230 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 11.363033294677734, + "learning_rate": 7.5949367088607605e-06, + "loss": 4.8154, + "step": 240 + }, + { + "epoch": 0.015822784810126583, + "grad_norm": 10.919289588928223, + "learning_rate": 7.911392405063292e-06, + "loss": 4.8229, + "step": 250 + }, + { + "epoch": 0.016455696202531647, + "grad_norm": 13.473514556884766, + "learning_rate": 8.227848101265822e-06, + "loss": 4.6251, + "step": 260 + }, + { + "epoch": 0.01708860759493671, + "grad_norm": 13.167325973510742, + "learning_rate": 8.544303797468354e-06, + "loss": 4.6354, + "step": 270 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 12.691937446594238, + "learning_rate": 8.860759493670886e-06, + "loss": 4.6029, + "step": 280 + }, + { + "epoch": 0.018354430379746836, + "grad_norm": 13.479981422424316, + "learning_rate": 9.177215189873418e-06, + "loss": 4.621, + "step": 290 + }, + { + "epoch": 0.0189873417721519, + "grad_norm": 12.953887939453125, + "learning_rate": 9.49367088607595e-06, + "loss": 4.5616, + "step": 300 + }, + { + "epoch": 0.019620253164556962, + "grad_norm": 13.775838851928711, + "learning_rate": 9.81012658227848e-06, + "loss": 4.5228, + "step": 310 + }, + { + "epoch": 0.02, + "eval_accuracy": 0.2533589251439539, + "eval_loss": 4.3514204025268555, + "eval_runtime": 834.4919, + "eval_samples_per_second": 0.624, + "eval_steps_per_second": 0.079, + "step": 316 + }, + { + "epoch": 1.0002531645569621, + "grad_norm": 12.405595779418945, + "learning_rate": 1.0126582278481012e-05, + "loss": 4.3858, + "step": 320 + }, + { + "epoch": 
1.0008860759493672, + "grad_norm": 14.15708065032959, + "learning_rate": 1.0443037974683544e-05, + "loss": 4.3131, + "step": 330 + }, + { + "epoch": 1.0015189873417722, + "grad_norm": 12.798722267150879, + "learning_rate": 1.0759493670886076e-05, + "loss": 4.3399, + "step": 340 + }, + { + "epoch": 1.0021518987341773, + "grad_norm": 15.9908447265625, + "learning_rate": 1.1075949367088608e-05, + "loss": 4.2376, + "step": 350 + }, + { + "epoch": 1.0027848101265824, + "grad_norm": 12.808874130249023, + "learning_rate": 1.139240506329114e-05, + "loss": 4.1385, + "step": 360 + }, + { + "epoch": 1.0034177215189874, + "grad_norm": 15.553874969482422, + "learning_rate": 1.170886075949367e-05, + "loss": 4.202, + "step": 370 + }, + { + "epoch": 1.0040506329113925, + "grad_norm": 12.774052619934082, + "learning_rate": 1.2025316455696203e-05, + "loss": 4.0002, + "step": 380 + }, + { + "epoch": 1.0046835443037974, + "grad_norm": 15.000925064086914, + "learning_rate": 1.2341772151898735e-05, + "loss": 4.0083, + "step": 390 + }, + { + "epoch": 1.0053164556962024, + "grad_norm": 13.524576187133789, + "learning_rate": 1.2658227848101267e-05, + "loss": 4.086, + "step": 400 + }, + { + "epoch": 1.0059493670886075, + "grad_norm": 14.740311622619629, + "learning_rate": 1.2974683544303799e-05, + "loss": 4.0018, + "step": 410 + }, + { + "epoch": 1.0065822784810126, + "grad_norm": 13.781682968139648, + "learning_rate": 1.3291139240506329e-05, + "loss": 3.9788, + "step": 420 + }, + { + "epoch": 1.0072151898734176, + "grad_norm": 17.60359764099121, + "learning_rate": 1.3607594936708861e-05, + "loss": 3.7957, + "step": 430 + }, + { + "epoch": 1.0078481012658227, + "grad_norm": 15.860666275024414, + "learning_rate": 1.3924050632911393e-05, + "loss": 3.8316, + "step": 440 + }, + { + "epoch": 1.0084810126582278, + "grad_norm": 18.55211639404297, + "learning_rate": 1.4240506329113925e-05, + "loss": 3.7908, + "step": 450 + }, + { + "epoch": 1.0091139240506328, + "grad_norm": 12.959016799926758, + "learning_rate": 1.4556962025316457e-05, + "loss": 3.6121, + "step": 460 + }, + { + "epoch": 1.009746835443038, + "grad_norm": 14.276836395263672, + "learning_rate": 1.4873417721518987e-05, + "loss": 3.6771, + "step": 470 + }, + { + "epoch": 1.010379746835443, + "grad_norm": 15.222319602966309, + "learning_rate": 1.5189873417721521e-05, + "loss": 3.695, + "step": 480 + }, + { + "epoch": 1.011012658227848, + "grad_norm": 12.566011428833008, + "learning_rate": 1.550632911392405e-05, + "loss": 3.6071, + "step": 490 + }, + { + "epoch": 1.011645569620253, + "grad_norm": 13.210318565368652, + "learning_rate": 1.5822784810126583e-05, + "loss": 3.4652, + "step": 500 + }, + { + "epoch": 1.0122784810126582, + "grad_norm": 14.671469688415527, + "learning_rate": 1.6139240506329115e-05, + "loss": 3.457, + "step": 510 + }, + { + "epoch": 1.0129113924050632, + "grad_norm": 12.707144737243652, + "learning_rate": 1.6455696202531644e-05, + "loss": 3.5506, + "step": 520 + }, + { + "epoch": 1.0135443037974683, + "grad_norm": 15.84700870513916, + "learning_rate": 1.677215189873418e-05, + "loss": 3.433, + "step": 530 + }, + { + "epoch": 1.0141772151898734, + "grad_norm": 13.466840744018555, + "learning_rate": 1.7088607594936708e-05, + "loss": 3.4244, + "step": 540 + }, + { + "epoch": 1.0148101265822784, + "grad_norm": 12.980550765991211, + "learning_rate": 1.7405063291139243e-05, + "loss": 3.3659, + "step": 550 + }, + { + "epoch": 1.0154430379746835, + "grad_norm": 12.236871719360352, + "learning_rate": 1.7721518987341772e-05, + "loss": 3.3406, + 
"step": 560 + }, + { + "epoch": 1.0160759493670886, + "grad_norm": 14.126879692077637, + "learning_rate": 1.8037974683544304e-05, + "loss": 3.2676, + "step": 570 + }, + { + "epoch": 1.0167088607594936, + "grad_norm": 14.927568435668945, + "learning_rate": 1.8354430379746836e-05, + "loss": 3.1862, + "step": 580 + }, + { + "epoch": 1.0173417721518987, + "grad_norm": 14.192325592041016, + "learning_rate": 1.8670886075949368e-05, + "loss": 3.2144, + "step": 590 + }, + { + "epoch": 1.0179746835443038, + "grad_norm": 12.938292503356934, + "learning_rate": 1.89873417721519e-05, + "loss": 3.1199, + "step": 600 + }, + { + "epoch": 1.0186075949367088, + "grad_norm": 15.724618911743164, + "learning_rate": 1.9303797468354432e-05, + "loss": 3.1761, + "step": 610 + }, + { + "epoch": 1.019240506329114, + "grad_norm": 12.795076370239258, + "learning_rate": 1.962025316455696e-05, + "loss": 3.0823, + "step": 620 + }, + { + "epoch": 1.019873417721519, + "grad_norm": 16.50122833251953, + "learning_rate": 1.9936708860759496e-05, + "loss": 3.0795, + "step": 630 + }, + { + "epoch": 1.02, + "eval_accuracy": 0.581573896353167, + "eval_loss": 2.8514750003814697, + "eval_runtime": 833.5163, + "eval_samples_per_second": 0.625, + "eval_steps_per_second": 0.079, + "step": 632 + }, + { + "epoch": 2.0005063291139242, + "grad_norm": 16.488386154174805, + "learning_rate": 2.0253164556962025e-05, + "loss": 2.7661, + "step": 640 + }, + { + "epoch": 2.0011392405063293, + "grad_norm": 13.507458686828613, + "learning_rate": 2.056962025316456e-05, + "loss": 2.6532, + "step": 650 + }, + { + "epoch": 2.0017721518987344, + "grad_norm": 12.340897560119629, + "learning_rate": 2.088607594936709e-05, + "loss": 2.6855, + "step": 660 + }, + { + "epoch": 2.0024050632911394, + "grad_norm": 13.872140884399414, + "learning_rate": 2.120253164556962e-05, + "loss": 2.8458, + "step": 670 + }, + { + "epoch": 2.0030379746835445, + "grad_norm": 12.209596633911133, + "learning_rate": 2.1518987341772153e-05, + "loss": 2.6217, + "step": 680 + }, + { + "epoch": 2.0036708860759496, + "grad_norm": 12.713300704956055, + "learning_rate": 2.1835443037974685e-05, + "loss": 2.6449, + "step": 690 + }, + { + "epoch": 2.0043037974683546, + "grad_norm": 13.209062576293945, + "learning_rate": 2.2151898734177217e-05, + "loss": 2.6258, + "step": 700 + }, + { + "epoch": 2.0049367088607597, + "grad_norm": 13.877063751220703, + "learning_rate": 2.246835443037975e-05, + "loss": 2.5975, + "step": 710 + }, + { + "epoch": 2.0055696202531648, + "grad_norm": 15.146308898925781, + "learning_rate": 2.278481012658228e-05, + "loss": 2.4536, + "step": 720 + }, + { + "epoch": 2.00620253164557, + "grad_norm": 12.468453407287598, + "learning_rate": 2.3101265822784813e-05, + "loss": 2.4758, + "step": 730 + }, + { + "epoch": 2.006835443037975, + "grad_norm": 13.691295623779297, + "learning_rate": 2.341772151898734e-05, + "loss": 2.6211, + "step": 740 + }, + { + "epoch": 2.00746835443038, + "grad_norm": 13.543225288391113, + "learning_rate": 2.3734177215189873e-05, + "loss": 2.5074, + "step": 750 + }, + { + "epoch": 2.008101265822785, + "grad_norm": 12.138303756713867, + "learning_rate": 2.4050632911392405e-05, + "loss": 2.2878, + "step": 760 + }, + { + "epoch": 2.0087341772151897, + "grad_norm": 12.926602363586426, + "learning_rate": 2.4367088607594937e-05, + "loss": 2.3154, + "step": 770 + }, + { + "epoch": 2.0093670886075947, + "grad_norm": 13.598923683166504, + "learning_rate": 2.468354430379747e-05, + "loss": 2.2751, + "step": 780 + }, + { + "epoch": 2.01, + "grad_norm": 
12.883562088012695, + "learning_rate": 2.5e-05, + "loss": 2.3078, + "step": 790 + }, + { + "epoch": 2.010632911392405, + "grad_norm": 13.546457290649414, + "learning_rate": 2.5316455696202533e-05, + "loss": 2.5038, + "step": 800 + }, + { + "epoch": 2.01126582278481, + "grad_norm": 15.554511070251465, + "learning_rate": 2.5632911392405062e-05, + "loss": 2.3539, + "step": 810 + }, + { + "epoch": 2.011898734177215, + "grad_norm": 12.446627616882324, + "learning_rate": 2.5949367088607597e-05, + "loss": 2.4136, + "step": 820 + }, + { + "epoch": 2.01253164556962, + "grad_norm": 17.281230926513672, + "learning_rate": 2.626582278481013e-05, + "loss": 2.2076, + "step": 830 + }, + { + "epoch": 2.013164556962025, + "grad_norm": 11.104803085327148, + "learning_rate": 2.6582278481012658e-05, + "loss": 2.1716, + "step": 840 + }, + { + "epoch": 2.01379746835443, + "grad_norm": 15.020671844482422, + "learning_rate": 2.689873417721519e-05, + "loss": 2.3163, + "step": 850 + }, + { + "epoch": 2.0144303797468353, + "grad_norm": 12.115602493286133, + "learning_rate": 2.7215189873417722e-05, + "loss": 2.1664, + "step": 860 + }, + { + "epoch": 2.0150632911392403, + "grad_norm": 13.693445205688477, + "learning_rate": 2.7531645569620257e-05, + "loss": 2.1606, + "step": 870 + }, + { + "epoch": 2.0156962025316454, + "grad_norm": 10.924906730651855, + "learning_rate": 2.7848101265822786e-05, + "loss": 2.136, + "step": 880 + }, + { + "epoch": 2.0163291139240505, + "grad_norm": 11.92796802520752, + "learning_rate": 2.8164556962025318e-05, + "loss": 2.1733, + "step": 890 + }, + { + "epoch": 2.0169620253164555, + "grad_norm": 12.855241775512695, + "learning_rate": 2.848101265822785e-05, + "loss": 2.07, + "step": 900 + }, + { + "epoch": 2.0175949367088606, + "grad_norm": 14.710478782653809, + "learning_rate": 2.879746835443038e-05, + "loss": 2.1004, + "step": 910 + }, + { + "epoch": 2.0182278481012657, + "grad_norm": 14.612150192260742, + "learning_rate": 2.9113924050632914e-05, + "loss": 2.011, + "step": 920 + }, + { + "epoch": 2.0188607594936707, + "grad_norm": 15.704828262329102, + "learning_rate": 2.9430379746835446e-05, + "loss": 2.0263, + "step": 930 + }, + { + "epoch": 2.019493670886076, + "grad_norm": 12.45781421661377, + "learning_rate": 2.9746835443037974e-05, + "loss": 1.8438, + "step": 940 + }, + { + "epoch": 2.02, + "eval_accuracy": 0.7332053742802304, + "eval_loss": 1.7508126497268677, + "eval_runtime": 838.4982, + "eval_samples_per_second": 0.621, + "eval_steps_per_second": 0.079, + "step": 948 + }, + { + "epoch": 3.000126582278481, + "grad_norm": 11.997598648071289, + "learning_rate": 3.0063291139240506e-05, + "loss": 1.8218, + "step": 950 + }, + { + "epoch": 3.000759493670886, + "grad_norm": 10.052958488464355, + "learning_rate": 3.0379746835443042e-05, + "loss": 1.8534, + "step": 960 + }, + { + "epoch": 3.001392405063291, + "grad_norm": 12.40282154083252, + "learning_rate": 3.0696202531645574e-05, + "loss": 1.7654, + "step": 970 + }, + { + "epoch": 3.002025316455696, + "grad_norm": 12.564291000366211, + "learning_rate": 3.10126582278481e-05, + "loss": 1.537, + "step": 980 + }, + { + "epoch": 3.002658227848101, + "grad_norm": 12.120141983032227, + "learning_rate": 3.132911392405064e-05, + "loss": 1.7112, + "step": 990 + }, + { + "epoch": 3.003291139240506, + "grad_norm": 9.203033447265625, + "learning_rate": 3.1645569620253167e-05, + "loss": 1.5344, + "step": 1000 + }, + { + "epoch": 3.0039240506329112, + "grad_norm": 10.776932716369629, + "learning_rate": 3.1962025316455695e-05, + "loss": 1.6746, + 
"step": 1010 + }, + { + "epoch": 3.0045569620253163, + "grad_norm": 13.07652759552002, + "learning_rate": 3.227848101265823e-05, + "loss": 1.7123, + "step": 1020 + }, + { + "epoch": 3.0051898734177214, + "grad_norm": 12.412099838256836, + "learning_rate": 3.2594936708860766e-05, + "loss": 1.6235, + "step": 1030 + }, + { + "epoch": 3.0058227848101264, + "grad_norm": 11.809402465820312, + "learning_rate": 3.291139240506329e-05, + "loss": 1.7162, + "step": 1040 + }, + { + "epoch": 3.0064556962025315, + "grad_norm": 10.014476776123047, + "learning_rate": 3.322784810126582e-05, + "loss": 1.5555, + "step": 1050 + }, + { + "epoch": 3.0070886075949366, + "grad_norm": 11.16119384765625, + "learning_rate": 3.354430379746836e-05, + "loss": 1.5755, + "step": 1060 + }, + { + "epoch": 3.0077215189873416, + "grad_norm": 11.865368843078613, + "learning_rate": 3.386075949367089e-05, + "loss": 1.4931, + "step": 1070 + }, + { + "epoch": 3.0083544303797467, + "grad_norm": 10.88665771484375, + "learning_rate": 3.4177215189873416e-05, + "loss": 1.4321, + "step": 1080 + }, + { + "epoch": 3.0089873417721518, + "grad_norm": 15.595149993896484, + "learning_rate": 3.449367088607595e-05, + "loss": 1.7013, + "step": 1090 + }, + { + "epoch": 3.009620253164557, + "grad_norm": 12.852721214294434, + "learning_rate": 3.4810126582278487e-05, + "loss": 1.4012, + "step": 1100 + }, + { + "epoch": 3.010253164556962, + "grad_norm": 11.31406307220459, + "learning_rate": 3.5126582278481015e-05, + "loss": 1.3529, + "step": 1110 + }, + { + "epoch": 3.010886075949367, + "grad_norm": 13.643885612487793, + "learning_rate": 3.5443037974683544e-05, + "loss": 1.6591, + "step": 1120 + }, + { + "epoch": 3.011518987341772, + "grad_norm": 8.9163818359375, + "learning_rate": 3.575949367088608e-05, + "loss": 1.5645, + "step": 1130 + }, + { + "epoch": 3.012151898734177, + "grad_norm": 11.10364818572998, + "learning_rate": 3.607594936708861e-05, + "loss": 1.573, + "step": 1140 + }, + { + "epoch": 3.012784810126582, + "grad_norm": 12.710495948791504, + "learning_rate": 3.639240506329114e-05, + "loss": 1.4811, + "step": 1150 + }, + { + "epoch": 3.0134177215189872, + "grad_norm": 9.857568740844727, + "learning_rate": 3.670886075949367e-05, + "loss": 1.3618, + "step": 1160 + }, + { + "epoch": 3.0140506329113923, + "grad_norm": 12.947936058044434, + "learning_rate": 3.70253164556962e-05, + "loss": 1.4507, + "step": 1170 + }, + { + "epoch": 3.0146835443037974, + "grad_norm": 7.388261795043945, + "learning_rate": 3.7341772151898736e-05, + "loss": 1.368, + "step": 1180 + }, + { + "epoch": 3.0153164556962024, + "grad_norm": 12.413993835449219, + "learning_rate": 3.765822784810127e-05, + "loss": 1.2501, + "step": 1190 + }, + { + "epoch": 3.0159493670886075, + "grad_norm": 9.680545806884766, + "learning_rate": 3.79746835443038e-05, + "loss": 1.3826, + "step": 1200 + }, + { + "epoch": 3.0165822784810126, + "grad_norm": 10.841100692749023, + "learning_rate": 3.829113924050633e-05, + "loss": 1.3361, + "step": 1210 + }, + { + "epoch": 3.0172151898734176, + "grad_norm": 10.57774829864502, + "learning_rate": 3.8607594936708864e-05, + "loss": 1.4978, + "step": 1220 + }, + { + "epoch": 3.0178481012658227, + "grad_norm": 20.876358032226562, + "learning_rate": 3.89240506329114e-05, + "loss": 1.4616, + "step": 1230 + }, + { + "epoch": 3.0184810126582278, + "grad_norm": 13.871617317199707, + "learning_rate": 3.924050632911392e-05, + "loss": 1.2869, + "step": 1240 + }, + { + "epoch": 3.019113924050633, + "grad_norm": 10.208392143249512, + "learning_rate": 
3.9556962025316456e-05, + "loss": 1.4173, + "step": 1250 + }, + { + "epoch": 3.019746835443038, + "grad_norm": 13.292877197265625, + "learning_rate": 3.987341772151899e-05, + "loss": 1.1451, + "step": 1260 + }, + { + "epoch": 3.02, + "eval_accuracy": 0.7389635316698656, + "eval_loss": 1.146372675895691, + "eval_runtime": 855.4763, + "eval_samples_per_second": 0.609, + "eval_steps_per_second": 0.077, + "step": 1264 + }, + { + "epoch": 4.000379746835443, + "grad_norm": 12.077232360839844, + "learning_rate": 4.018987341772152e-05, + "loss": 1.186, + "step": 1270 + }, + { + "epoch": 4.0010126582278485, + "grad_norm": 11.689604759216309, + "learning_rate": 4.050632911392405e-05, + "loss": 1.2715, + "step": 1280 + }, + { + "epoch": 4.001645569620253, + "grad_norm": 10.000913619995117, + "learning_rate": 4.0822784810126584e-05, + "loss": 1.1501, + "step": 1290 + }, + { + "epoch": 4.002278481012659, + "grad_norm": 11.559700965881348, + "learning_rate": 4.113924050632912e-05, + "loss": 1.3049, + "step": 1300 + }, + { + "epoch": 4.002911392405063, + "grad_norm": 9.99675178527832, + "learning_rate": 4.145569620253165e-05, + "loss": 1.1664, + "step": 1310 + }, + { + "epoch": 4.003544303797469, + "grad_norm": 10.537801742553711, + "learning_rate": 4.177215189873418e-05, + "loss": 1.0544, + "step": 1320 + }, + { + "epoch": 4.004177215189873, + "grad_norm": 7.792250633239746, + "learning_rate": 4.208860759493671e-05, + "loss": 1.185, + "step": 1330 + }, + { + "epoch": 4.004810126582279, + "grad_norm": 7.729872226715088, + "learning_rate": 4.240506329113924e-05, + "loss": 0.958, + "step": 1340 + }, + { + "epoch": 4.0054430379746835, + "grad_norm": 8.17205810546875, + "learning_rate": 4.2721518987341776e-05, + "loss": 1.1805, + "step": 1350 + }, + { + "epoch": 4.006075949367089, + "grad_norm": 8.281828880310059, + "learning_rate": 4.3037974683544305e-05, + "loss": 1.0389, + "step": 1360 + }, + { + "epoch": 4.006708860759494, + "grad_norm": 10.64874267578125, + "learning_rate": 4.3354430379746834e-05, + "loss": 1.1096, + "step": 1370 + }, + { + "epoch": 4.007341772151899, + "grad_norm": 10.857294082641602, + "learning_rate": 4.367088607594937e-05, + "loss": 1.0934, + "step": 1380 + }, + { + "epoch": 4.007974683544304, + "grad_norm": 10.056556701660156, + "learning_rate": 4.3987341772151904e-05, + "loss": 1.0382, + "step": 1390 + }, + { + "epoch": 4.008607594936709, + "grad_norm": 5.310425281524658, + "learning_rate": 4.430379746835443e-05, + "loss": 0.8079, + "step": 1400 + }, + { + "epoch": 4.009240506329114, + "grad_norm": 9.902403831481934, + "learning_rate": 4.462025316455696e-05, + "loss": 1.1619, + "step": 1410 + }, + { + "epoch": 4.009873417721519, + "grad_norm": 10.88439655303955, + "learning_rate": 4.49367088607595e-05, + "loss": 1.1363, + "step": 1420 + }, + { + "epoch": 4.010506329113924, + "grad_norm": 8.662186622619629, + "learning_rate": 4.525316455696203e-05, + "loss": 1.0104, + "step": 1430 + }, + { + "epoch": 4.0111392405063295, + "grad_norm": 7.63654899597168, + "learning_rate": 4.556962025316456e-05, + "loss": 0.9476, + "step": 1440 + }, + { + "epoch": 4.011772151898734, + "grad_norm": 15.624666213989258, + "learning_rate": 4.588607594936709e-05, + "loss": 1.0386, + "step": 1450 + }, + { + "epoch": 4.01240506329114, + "grad_norm": 10.169722557067871, + "learning_rate": 4.6202531645569625e-05, + "loss": 1.0147, + "step": 1460 + }, + { + "epoch": 4.013037974683544, + "grad_norm": 10.036338806152344, + "learning_rate": 4.6518987341772154e-05, + "loss": 1.0159, + "step": 1470 + }, + { + 
"epoch": 4.01367088607595, + "grad_norm": 10.780523300170898, + "learning_rate": 4.683544303797468e-05, + "loss": 1.0246, + "step": 1480 + }, + { + "epoch": 4.014303797468354, + "grad_norm": 12.800423622131348, + "learning_rate": 4.715189873417722e-05, + "loss": 0.9578, + "step": 1490 + }, + { + "epoch": 4.01493670886076, + "grad_norm": 11.1471586227417, + "learning_rate": 4.7468354430379746e-05, + "loss": 0.9799, + "step": 1500 + }, + { + "epoch": 4.0155696202531646, + "grad_norm": 5.804954528808594, + "learning_rate": 4.778481012658228e-05, + "loss": 0.9171, + "step": 1510 + }, + { + "epoch": 4.01620253164557, + "grad_norm": 14.239872932434082, + "learning_rate": 4.810126582278481e-05, + "loss": 1.1415, + "step": 1520 + }, + { + "epoch": 4.016835443037975, + "grad_norm": 6.897063255310059, + "learning_rate": 4.8417721518987346e-05, + "loss": 1.0636, + "step": 1530 + }, + { + "epoch": 4.017468354430379, + "grad_norm": 12.96029281616211, + "learning_rate": 4.8734177215189874e-05, + "loss": 0.9399, + "step": 1540 + }, + { + "epoch": 4.018101265822785, + "grad_norm": 17.250343322753906, + "learning_rate": 4.905063291139241e-05, + "loss": 0.8518, + "step": 1550 + }, + { + "epoch": 4.018734177215189, + "grad_norm": 10.851552963256836, + "learning_rate": 4.936708860759494e-05, + "loss": 1.091, + "step": 1560 + }, + { + "epoch": 4.019367088607595, + "grad_norm": 16.084505081176758, + "learning_rate": 4.968354430379747e-05, + "loss": 1.0378, + "step": 1570 + }, + { + "epoch": 4.02, + "grad_norm": 11.509261131286621, + "learning_rate": 5e-05, + "loss": 1.0637, + "step": 1580 + }, + { + "epoch": 4.02, + "eval_accuracy": 0.7773512476007678, + "eval_loss": 0.7994989156723022, + "eval_runtime": 871.4001, + "eval_samples_per_second": 0.598, + "eval_steps_per_second": 0.076, + "step": 1580 + }, + { + "epoch": 5.000632911392405, + "grad_norm": 8.971628189086914, + "learning_rate": 4.99648382559775e-05, + "loss": 0.8316, + "step": 1590 + }, + { + "epoch": 5.00126582278481, + "grad_norm": 13.378620147705078, + "learning_rate": 4.9929676511955e-05, + "loss": 0.8914, + "step": 1600 + }, + { + "epoch": 5.001898734177215, + "grad_norm": 9.115056037902832, + "learning_rate": 4.989451476793249e-05, + "loss": 0.9715, + "step": 1610 + }, + { + "epoch": 5.00253164556962, + "grad_norm": 20.863187789916992, + "learning_rate": 4.985935302390999e-05, + "loss": 0.7779, + "step": 1620 + }, + { + "epoch": 5.003164556962025, + "grad_norm": 10.088607788085938, + "learning_rate": 4.982419127988748e-05, + "loss": 0.8775, + "step": 1630 + }, + { + "epoch": 5.00379746835443, + "grad_norm": 6.800666332244873, + "learning_rate": 4.9789029535864986e-05, + "loss": 0.7337, + "step": 1640 + }, + { + "epoch": 5.004430379746835, + "grad_norm": 12.704118728637695, + "learning_rate": 4.975386779184248e-05, + "loss": 0.7668, + "step": 1650 + }, + { + "epoch": 5.0050632911392405, + "grad_norm": 7.471581935882568, + "learning_rate": 4.9718706047819975e-05, + "loss": 0.7418, + "step": 1660 + }, + { + "epoch": 5.005696202531645, + "grad_norm": 11.495733261108398, + "learning_rate": 4.968354430379747e-05, + "loss": 0.8782, + "step": 1670 + }, + { + "epoch": 5.006329113924051, + "grad_norm": 7.495334148406982, + "learning_rate": 4.964838255977497e-05, + "loss": 0.7234, + "step": 1680 + }, + { + "epoch": 5.006962025316455, + "grad_norm": 9.78715991973877, + "learning_rate": 4.9613220815752464e-05, + "loss": 0.8924, + "step": 1690 + }, + { + "epoch": 5.007594936708861, + "grad_norm": 6.057085990905762, + "learning_rate": 4.957805907172996e-05, + 
"loss": 0.6553, + "step": 1700 + }, + { + "epoch": 5.008227848101265, + "grad_norm": 7.595339298248291, + "learning_rate": 4.9542897327707454e-05, + "loss": 0.9012, + "step": 1710 + }, + { + "epoch": 5.008860759493671, + "grad_norm": 11.763632774353027, + "learning_rate": 4.950773558368496e-05, + "loss": 0.7808, + "step": 1720 + }, + { + "epoch": 5.0094936708860756, + "grad_norm": 9.754287719726562, + "learning_rate": 4.947257383966245e-05, + "loss": 0.8886, + "step": 1730 + }, + { + "epoch": 5.010126582278481, + "grad_norm": 7.873773097991943, + "learning_rate": 4.943741209563995e-05, + "loss": 0.6798, + "step": 1740 + }, + { + "epoch": 5.010759493670886, + "grad_norm": 9.378247261047363, + "learning_rate": 4.940225035161744e-05, + "loss": 0.763, + "step": 1750 + }, + { + "epoch": 5.011392405063291, + "grad_norm": 13.096108436584473, + "learning_rate": 4.936708860759494e-05, + "loss": 0.7834, + "step": 1760 + }, + { + "epoch": 5.012025316455696, + "grad_norm": 5.072068691253662, + "learning_rate": 4.933192686357244e-05, + "loss": 0.6282, + "step": 1770 + }, + { + "epoch": 5.012658227848101, + "grad_norm": 14.543283462524414, + "learning_rate": 4.929676511954993e-05, + "loss": 0.6679, + "step": 1780 + }, + { + "epoch": 5.013291139240506, + "grad_norm": 11.026363372802734, + "learning_rate": 4.9261603375527427e-05, + "loss": 0.7401, + "step": 1790 + }, + { + "epoch": 5.0139240506329115, + "grad_norm": 5.937496662139893, + "learning_rate": 4.9226441631504925e-05, + "loss": 0.7408, + "step": 1800 + }, + { + "epoch": 5.014556962025316, + "grad_norm": 9.093450546264648, + "learning_rate": 4.919127988748242e-05, + "loss": 0.9351, + "step": 1810 + }, + { + "epoch": 5.015189873417722, + "grad_norm": 6.913345813751221, + "learning_rate": 4.9156118143459915e-05, + "loss": 0.6614, + "step": 1820 + }, + { + "epoch": 5.015822784810126, + "grad_norm": 5.915560245513916, + "learning_rate": 4.912095639943741e-05, + "loss": 0.6584, + "step": 1830 + }, + { + "epoch": 5.016455696202532, + "grad_norm": 9.257479667663574, + "learning_rate": 4.908579465541491e-05, + "loss": 0.6744, + "step": 1840 + }, + { + "epoch": 5.017088607594936, + "grad_norm": 12.13564395904541, + "learning_rate": 4.905063291139241e-05, + "loss": 0.7035, + "step": 1850 + }, + { + "epoch": 5.017721518987342, + "grad_norm": 6.930849075317383, + "learning_rate": 4.90154711673699e-05, + "loss": 0.7628, + "step": 1860 + }, + { + "epoch": 5.0183544303797465, + "grad_norm": 13.400097846984863, + "learning_rate": 4.89803094233474e-05, + "loss": 0.6906, + "step": 1870 + }, + { + "epoch": 5.018987341772152, + "grad_norm": 3.020031452178955, + "learning_rate": 4.89451476793249e-05, + "loss": 0.8423, + "step": 1880 + }, + { + "epoch": 5.019620253164557, + "grad_norm": 11.807697296142578, + "learning_rate": 4.8909985935302396e-05, + "loss": 0.7795, + "step": 1890 + }, + { + "epoch": 5.02, + "eval_accuracy": 0.8829174664107485, + "eval_loss": 0.4938121438026428, + "eval_runtime": 888.2928, + "eval_samples_per_second": 0.587, + "eval_steps_per_second": 0.074, + "step": 1896 + }, + { + "epoch": 6.000253164556962, + "grad_norm": 1.9311364889144897, + "learning_rate": 4.887482419127989e-05, + "loss": 0.5037, + "step": 1900 + }, + { + "epoch": 6.000886075949367, + "grad_norm": 2.175624370574951, + "learning_rate": 4.8839662447257386e-05, + "loss": 0.5447, + "step": 1910 + }, + { + "epoch": 6.001518987341772, + "grad_norm": 6.488412380218506, + "learning_rate": 4.8804500703234885e-05, + "loss": 0.4324, + "step": 1920 + }, + { + "epoch": 6.002151898734177, + 
"grad_norm": 7.727587699890137, + "learning_rate": 4.876933895921238e-05, + "loss": 0.7189, + "step": 1930 + }, + { + "epoch": 6.002784810126582, + "grad_norm": 9.56769847869873, + "learning_rate": 4.8734177215189874e-05, + "loss": 0.4081, + "step": 1940 + }, + { + "epoch": 6.0034177215189874, + "grad_norm": 4.022727012634277, + "learning_rate": 4.869901547116737e-05, + "loss": 0.6191, + "step": 1950 + }, + { + "epoch": 6.004050632911392, + "grad_norm": 4.674831867218018, + "learning_rate": 4.866385372714487e-05, + "loss": 0.523, + "step": 1960 + }, + { + "epoch": 6.004683544303798, + "grad_norm": 9.083635330200195, + "learning_rate": 4.862869198312236e-05, + "loss": 0.5281, + "step": 1970 + }, + { + "epoch": 6.005316455696202, + "grad_norm": 11.267608642578125, + "learning_rate": 4.859353023909986e-05, + "loss": 0.3717, + "step": 1980 + }, + { + "epoch": 6.005949367088608, + "grad_norm": 4.938966751098633, + "learning_rate": 4.855836849507735e-05, + "loss": 0.4737, + "step": 1990 + }, + { + "epoch": 6.006582278481012, + "grad_norm": 7.700178623199463, + "learning_rate": 4.852320675105486e-05, + "loss": 0.507, + "step": 2000 + }, + { + "epoch": 6.007215189873418, + "grad_norm": 13.916314125061035, + "learning_rate": 4.848804500703235e-05, + "loss": 0.4627, + "step": 2010 + }, + { + "epoch": 6.0078481012658225, + "grad_norm": 9.637331008911133, + "learning_rate": 4.845288326300985e-05, + "loss": 0.509, + "step": 2020 + }, + { + "epoch": 6.008481012658228, + "grad_norm": 6.41615629196167, + "learning_rate": 4.8417721518987346e-05, + "loss": 0.4817, + "step": 2030 + }, + { + "epoch": 6.009113924050633, + "grad_norm": 4.391960144042969, + "learning_rate": 4.8382559774964844e-05, + "loss": 0.6306, + "step": 2040 + }, + { + "epoch": 6.009746835443038, + "grad_norm": 3.4830005168914795, + "learning_rate": 4.8347398030942336e-05, + "loss": 0.6192, + "step": 2050 + }, + { + "epoch": 6.010379746835443, + "grad_norm": 8.927491188049316, + "learning_rate": 4.8312236286919834e-05, + "loss": 0.3656, + "step": 2060 + }, + { + "epoch": 6.011012658227848, + "grad_norm": 6.872843265533447, + "learning_rate": 4.827707454289733e-05, + "loss": 0.4649, + "step": 2070 + }, + { + "epoch": 6.011645569620253, + "grad_norm": 5.7255682945251465, + "learning_rate": 4.824191279887483e-05, + "loss": 0.5626, + "step": 2080 + }, + { + "epoch": 6.012278481012658, + "grad_norm": 12.707902908325195, + "learning_rate": 4.820675105485232e-05, + "loss": 0.7542, + "step": 2090 + }, + { + "epoch": 6.012911392405063, + "grad_norm": 8.386948585510254, + "learning_rate": 4.817158931082982e-05, + "loss": 0.5574, + "step": 2100 + }, + { + "epoch": 6.0135443037974685, + "grad_norm": 5.364466667175293, + "learning_rate": 4.813642756680732e-05, + "loss": 0.5088, + "step": 2110 + }, + { + "epoch": 6.014177215189873, + "grad_norm": 7.0593791007995605, + "learning_rate": 4.810126582278481e-05, + "loss": 0.4507, + "step": 2120 + }, + { + "epoch": 6.014810126582279, + "grad_norm": 4.73020076751709, + "learning_rate": 4.806610407876231e-05, + "loss": 0.4129, + "step": 2130 + }, + { + "epoch": 6.015443037974683, + "grad_norm": 9.857553482055664, + "learning_rate": 4.80309423347398e-05, + "loss": 0.6213, + "step": 2140 + }, + { + "epoch": 6.016075949367089, + "grad_norm": 3.092639446258545, + "learning_rate": 4.7995780590717305e-05, + "loss": 0.465, + "step": 2150 + }, + { + "epoch": 6.016708860759493, + "grad_norm": 10.670005798339844, + "learning_rate": 4.79606188466948e-05, + "loss": 0.5071, + "step": 2160 + }, + { + "epoch": 
6.017341772151899, + "grad_norm": 10.013630867004395, + "learning_rate": 4.7925457102672295e-05, + "loss": 0.4878, + "step": 2170 + }, + { + "epoch": 6.0179746835443035, + "grad_norm": 5.06149959564209, + "learning_rate": 4.789029535864979e-05, + "loss": 0.4294, + "step": 2180 + }, + { + "epoch": 6.018607594936709, + "grad_norm": 6.84969425201416, + "learning_rate": 4.785513361462729e-05, + "loss": 0.4591, + "step": 2190 + }, + { + "epoch": 6.019240506329114, + "grad_norm": 16.97979736328125, + "learning_rate": 4.7819971870604783e-05, + "loss": 0.3969, + "step": 2200 + }, + { + "epoch": 6.019873417721519, + "grad_norm": 4.75033712387085, + "learning_rate": 4.778481012658228e-05, + "loss": 0.4484, + "step": 2210 + }, + { + "epoch": 6.02, + "eval_accuracy": 0.8829174664107485, + "eval_loss": 0.38333675265312195, + "eval_runtime": 852.5332, + "eval_samples_per_second": 0.611, + "eval_steps_per_second": 0.077, + "step": 2212 + }, + { + "epoch": 7.000506329113924, + "grad_norm": 14.740522384643555, + "learning_rate": 4.774964838255977e-05, + "loss": 0.4139, + "step": 2220 + }, + { + "epoch": 7.001139240506329, + "grad_norm": 2.6119625568389893, + "learning_rate": 4.771448663853728e-05, + "loss": 0.3493, + "step": 2230 + }, + { + "epoch": 7.001772151898734, + "grad_norm": 1.260722279548645, + "learning_rate": 4.767932489451477e-05, + "loss": 0.2562, + "step": 2240 + }, + { + "epoch": 7.002405063291139, + "grad_norm": 2.5697829723358154, + "learning_rate": 4.764416315049227e-05, + "loss": 0.3227, + "step": 2250 + }, + { + "epoch": 7.0030379746835445, + "grad_norm": 5.382594585418701, + "learning_rate": 4.760900140646976e-05, + "loss": 0.42, + "step": 2260 + }, + { + "epoch": 7.003670886075949, + "grad_norm": 3.63082218170166, + "learning_rate": 4.757383966244726e-05, + "loss": 0.3715, + "step": 2270 + }, + { + "epoch": 7.004303797468355, + "grad_norm": 4.501291751861572, + "learning_rate": 4.7538677918424756e-05, + "loss": 0.2527, + "step": 2280 + }, + { + "epoch": 7.004936708860759, + "grad_norm": 6.318272590637207, + "learning_rate": 4.7503516174402255e-05, + "loss": 0.2922, + "step": 2290 + }, + { + "epoch": 7.005569620253165, + "grad_norm": 15.981465339660645, + "learning_rate": 4.7468354430379746e-05, + "loss": 0.4605, + "step": 2300 + }, + { + "epoch": 7.006202531645569, + "grad_norm": 7.8365020751953125, + "learning_rate": 4.7433192686357245e-05, + "loss": 0.4728, + "step": 2310 + }, + { + "epoch": 7.006835443037975, + "grad_norm": 6.1255388259887695, + "learning_rate": 4.739803094233474e-05, + "loss": 0.3578, + "step": 2320 + }, + { + "epoch": 7.0074683544303795, + "grad_norm": 12.867307662963867, + "learning_rate": 4.7362869198312235e-05, + "loss": 0.3075, + "step": 2330 + }, + { + "epoch": 7.008101265822785, + "grad_norm": 3.2056193351745605, + "learning_rate": 4.732770745428973e-05, + "loss": 0.4281, + "step": 2340 + }, + { + "epoch": 7.00873417721519, + "grad_norm": 14.265457153320312, + "learning_rate": 4.729254571026723e-05, + "loss": 0.584, + "step": 2350 + }, + { + "epoch": 7.009367088607595, + "grad_norm": 5.145318984985352, + "learning_rate": 4.725738396624473e-05, + "loss": 0.3264, + "step": 2360 + }, + { + "epoch": 7.01, + "grad_norm": 12.106173515319824, + "learning_rate": 4.722222222222222e-05, + "loss": 0.4977, + "step": 2370 + }, + { + "epoch": 7.010632911392405, + "grad_norm": 2.557343006134033, + "learning_rate": 4.718706047819972e-05, + "loss": 0.2676, + "step": 2380 + }, + { + "epoch": 7.01126582278481, + "grad_norm": 9.406204223632812, + "learning_rate": 
4.715189873417722e-05, + "loss": 0.3632, + "step": 2390 + }, + { + "epoch": 7.011898734177215, + "grad_norm": 0.7909258604049683, + "learning_rate": 4.7116736990154716e-05, + "loss": 0.2164, + "step": 2400 + }, + { + "epoch": 7.01253164556962, + "grad_norm": 0.8705269694328308, + "learning_rate": 4.708157524613221e-05, + "loss": 0.3072, + "step": 2410 + }, + { + "epoch": 7.013164556962026, + "grad_norm": 3.603971481323242, + "learning_rate": 4.704641350210971e-05, + "loss": 0.2117, + "step": 2420 + }, + { + "epoch": 7.01379746835443, + "grad_norm": 7.447606563568115, + "learning_rate": 4.7011251758087204e-05, + "loss": 0.32, + "step": 2430 + }, + { + "epoch": 7.014430379746836, + "grad_norm": 7.379356384277344, + "learning_rate": 4.69760900140647e-05, + "loss": 0.3718, + "step": 2440 + }, + { + "epoch": 7.01506329113924, + "grad_norm": 17.737926483154297, + "learning_rate": 4.6940928270042194e-05, + "loss": 0.4529, + "step": 2450 + }, + { + "epoch": 7.015696202531646, + "grad_norm": 3.0855658054351807, + "learning_rate": 4.690576652601969e-05, + "loss": 0.2227, + "step": 2460 + }, + { + "epoch": 7.0163291139240505, + "grad_norm": 14.56204605102539, + "learning_rate": 4.687060478199719e-05, + "loss": 0.3014, + "step": 2470 + }, + { + "epoch": 7.016962025316456, + "grad_norm": 3.587031841278076, + "learning_rate": 4.683544303797468e-05, + "loss": 0.3073, + "step": 2480 + }, + { + "epoch": 7.017594936708861, + "grad_norm": 3.022749423980713, + "learning_rate": 4.680028129395218e-05, + "loss": 0.2003, + "step": 2490 + }, + { + "epoch": 7.018227848101266, + "grad_norm": 8.433323860168457, + "learning_rate": 4.676511954992968e-05, + "loss": 0.3411, + "step": 2500 + }, + { + "epoch": 7.018860759493671, + "grad_norm": 4.128199100494385, + "learning_rate": 4.672995780590718e-05, + "loss": 0.3518, + "step": 2510 + }, + { + "epoch": 7.019493670886076, + "grad_norm": 11.647120475769043, + "learning_rate": 4.669479606188467e-05, + "loss": 0.2162, + "step": 2520 + }, + { + "epoch": 7.02, + "eval_accuracy": 0.9155470249520153, + "eval_loss": 0.25118356943130493, + "eval_runtime": 859.0348, + "eval_samples_per_second": 0.606, + "eval_steps_per_second": 0.077, + "step": 2528 + }, + { + "epoch": 8.000126582278481, + "grad_norm": 21.305402755737305, + "learning_rate": 4.665963431786217e-05, + "loss": 0.3546, + "step": 2530 + }, + { + "epoch": 8.000759493670886, + "grad_norm": 9.045007705688477, + "learning_rate": 4.6624472573839666e-05, + "loss": 0.2433, + "step": 2540 + }, + { + "epoch": 8.00139240506329, + "grad_norm": 6.032439708709717, + "learning_rate": 4.6589310829817164e-05, + "loss": 0.2786, + "step": 2550 + }, + { + "epoch": 8.002025316455697, + "grad_norm": 5.416832447052002, + "learning_rate": 4.6554149085794655e-05, + "loss": 0.2595, + "step": 2560 + }, + { + "epoch": 8.002658227848102, + "grad_norm": 2.8014585971832275, + "learning_rate": 4.6518987341772154e-05, + "loss": 0.301, + "step": 2570 + }, + { + "epoch": 8.003291139240506, + "grad_norm": 1.782472014427185, + "learning_rate": 4.648382559774965e-05, + "loss": 0.1689, + "step": 2580 + }, + { + "epoch": 8.00392405063291, + "grad_norm": 2.679903507232666, + "learning_rate": 4.644866385372715e-05, + "loss": 0.2884, + "step": 2590 + }, + { + "epoch": 8.004556962025317, + "grad_norm": 2.2325003147125244, + "learning_rate": 4.641350210970464e-05, + "loss": 0.2047, + "step": 2600 + }, + { + "epoch": 8.005189873417722, + "grad_norm": 9.163021087646484, + "learning_rate": 4.637834036568214e-05, + "loss": 0.1786, + "step": 2610 + }, + { + "epoch": 
8.005822784810126, + "grad_norm": 3.2019553184509277, + "learning_rate": 4.634317862165964e-05, + "loss": 0.2159, + "step": 2620 + }, + { + "epoch": 8.006455696202531, + "grad_norm": 4.9009904861450195, + "learning_rate": 4.630801687763714e-05, + "loss": 0.249, + "step": 2630 + }, + { + "epoch": 8.007088607594937, + "grad_norm": 4.865252494812012, + "learning_rate": 4.627285513361463e-05, + "loss": 0.3338, + "step": 2640 + }, + { + "epoch": 8.007721518987342, + "grad_norm": 1.178444504737854, + "learning_rate": 4.623769338959213e-05, + "loss": 0.2716, + "step": 2650 + }, + { + "epoch": 8.008354430379747, + "grad_norm": 2.2412354946136475, + "learning_rate": 4.6202531645569625e-05, + "loss": 0.1516, + "step": 2660 + }, + { + "epoch": 8.008987341772151, + "grad_norm": 11.467605590820312, + "learning_rate": 4.616736990154712e-05, + "loss": 0.3324, + "step": 2670 + }, + { + "epoch": 8.009620253164558, + "grad_norm": 4.130499362945557, + "learning_rate": 4.6132208157524615e-05, + "loss": 0.3092, + "step": 2680 + }, + { + "epoch": 8.010253164556962, + "grad_norm": 8.060324668884277, + "learning_rate": 4.6097046413502107e-05, + "loss": 0.2931, + "step": 2690 + }, + { + "epoch": 8.010886075949367, + "grad_norm": 18.59467124938965, + "learning_rate": 4.606188466947961e-05, + "loss": 0.2696, + "step": 2700 + }, + { + "epoch": 8.011518987341772, + "grad_norm": 0.5714246034622192, + "learning_rate": 4.60267229254571e-05, + "loss": 0.2422, + "step": 2710 + }, + { + "epoch": 8.012151898734178, + "grad_norm": 2.771367311477661, + "learning_rate": 4.59915611814346e-05, + "loss": 0.2716, + "step": 2720 + }, + { + "epoch": 8.012784810126583, + "grad_norm": 1.5201733112335205, + "learning_rate": 4.595639943741209e-05, + "loss": 0.1204, + "step": 2730 + }, + { + "epoch": 8.013417721518987, + "grad_norm": 15.473625183105469, + "learning_rate": 4.59212376933896e-05, + "loss": 0.2585, + "step": 2740 + }, + { + "epoch": 8.014050632911392, + "grad_norm": 3.226699113845825, + "learning_rate": 4.588607594936709e-05, + "loss": 0.1866, + "step": 2750 + }, + { + "epoch": 8.014683544303798, + "grad_norm": 1.999643087387085, + "learning_rate": 4.585091420534459e-05, + "loss": 0.237, + "step": 2760 + }, + { + "epoch": 8.015316455696203, + "grad_norm": 18.86202049255371, + "learning_rate": 4.581575246132208e-05, + "loss": 0.2425, + "step": 2770 + }, + { + "epoch": 8.015949367088608, + "grad_norm": 0.8307034969329834, + "learning_rate": 4.5780590717299585e-05, + "loss": 0.1629, + "step": 2780 + }, + { + "epoch": 8.016582278481012, + "grad_norm": 15.315855979919434, + "learning_rate": 4.5745428973277076e-05, + "loss": 0.2366, + "step": 2790 + }, + { + "epoch": 8.017215189873419, + "grad_norm": 16.721681594848633, + "learning_rate": 4.5710267229254575e-05, + "loss": 0.3524, + "step": 2800 + }, + { + "epoch": 8.017848101265823, + "grad_norm": 0.7722905278205872, + "learning_rate": 4.5675105485232066e-05, + "loss": 0.1772, + "step": 2810 + }, + { + "epoch": 8.018481012658228, + "grad_norm": 1.7419816255569458, + "learning_rate": 4.5639943741209564e-05, + "loss": 0.206, + "step": 2820 + }, + { + "epoch": 8.019113924050632, + "grad_norm": 4.165656089782715, + "learning_rate": 4.560478199718706e-05, + "loss": 0.2458, + "step": 2830 + }, + { + "epoch": 8.019746835443039, + "grad_norm": 2.085017681121826, + "learning_rate": 4.556962025316456e-05, + "loss": 0.228, + "step": 2840 + }, + { + "epoch": 8.02, + "eval_accuracy": 0.9309021113243762, + "eval_loss": 0.19722820818424225, + "eval_runtime": 905.8613, + "eval_samples_per_second": 
0.575, + "eval_steps_per_second": 0.073, + "step": 2844 + }, + { + "epoch": 9.000379746835444, + "grad_norm": 13.498887062072754, + "learning_rate": 4.553445850914206e-05, + "loss": 0.1674, + "step": 2850 + }, + { + "epoch": 9.001012658227848, + "grad_norm": 0.7413739562034607, + "learning_rate": 4.549929676511955e-05, + "loss": 0.1371, + "step": 2860 + }, + { + "epoch": 9.001645569620253, + "grad_norm": 1.669324278831482, + "learning_rate": 4.546413502109705e-05, + "loss": 0.146, + "step": 2870 + }, + { + "epoch": 9.002278481012658, + "grad_norm": 2.6705691814422607, + "learning_rate": 4.542897327707454e-05, + "loss": 0.2867, + "step": 2880 + }, + { + "epoch": 9.002911392405064, + "grad_norm": 3.0164263248443604, + "learning_rate": 4.5393811533052046e-05, + "loss": 0.2902, + "step": 2890 + }, + { + "epoch": 9.003544303797469, + "grad_norm": 6.8492302894592285, + "learning_rate": 4.535864978902954e-05, + "loss": 0.1386, + "step": 2900 + }, + { + "epoch": 9.004177215189873, + "grad_norm": 0.835591733455658, + "learning_rate": 4.5323488045007036e-05, + "loss": 0.1567, + "step": 2910 + }, + { + "epoch": 9.004810126582278, + "grad_norm": 4.592350959777832, + "learning_rate": 4.528832630098453e-05, + "loss": 0.2066, + "step": 2920 + }, + { + "epoch": 9.005443037974684, + "grad_norm": 5.640721797943115, + "learning_rate": 4.525316455696203e-05, + "loss": 0.0528, + "step": 2930 + }, + { + "epoch": 9.006075949367089, + "grad_norm": 13.428877830505371, + "learning_rate": 4.5218002812939524e-05, + "loss": 0.1163, + "step": 2940 + }, + { + "epoch": 9.006708860759494, + "grad_norm": 0.22825801372528076, + "learning_rate": 4.518284106891702e-05, + "loss": 0.1288, + "step": 2950 + }, + { + "epoch": 9.007341772151898, + "grad_norm": 1.8029659986495972, + "learning_rate": 4.5147679324894514e-05, + "loss": 0.2082, + "step": 2960 + }, + { + "epoch": 9.007974683544305, + "grad_norm": 0.23065458238124847, + "learning_rate": 4.511251758087202e-05, + "loss": 0.2156, + "step": 2970 + }, + { + "epoch": 9.00860759493671, + "grad_norm": 3.216405153274536, + "learning_rate": 4.507735583684951e-05, + "loss": 0.2436, + "step": 2980 + }, + { + "epoch": 9.009240506329114, + "grad_norm": 0.4626835584640503, + "learning_rate": 4.504219409282701e-05, + "loss": 0.2139, + "step": 2990 + }, + { + "epoch": 9.009873417721519, + "grad_norm": 5.6381516456604, + "learning_rate": 4.50070323488045e-05, + "loss": 0.1326, + "step": 3000 + }, + { + "epoch": 9.010506329113925, + "grad_norm": 7.001626968383789, + "learning_rate": 4.4971870604782e-05, + "loss": 0.1666, + "step": 3010 + }, + { + "epoch": 9.01113924050633, + "grad_norm": 3.0966169834136963, + "learning_rate": 4.49367088607595e-05, + "loss": 0.2111, + "step": 3020 + }, + { + "epoch": 9.011772151898734, + "grad_norm": 7.884695053100586, + "learning_rate": 4.490154711673699e-05, + "loss": 0.2074, + "step": 3030 + }, + { + "epoch": 9.012405063291139, + "grad_norm": 18.61140251159668, + "learning_rate": 4.486638537271449e-05, + "loss": 0.1748, + "step": 3040 + }, + { + "epoch": 9.013037974683545, + "grad_norm": 23.860849380493164, + "learning_rate": 4.4831223628691985e-05, + "loss": 0.1367, + "step": 3050 + }, + { + "epoch": 9.01367088607595, + "grad_norm": 6.8134942054748535, + "learning_rate": 4.4796061884669484e-05, + "loss": 0.1653, + "step": 3060 + }, + { + "epoch": 9.014303797468354, + "grad_norm": 21.31640625, + "learning_rate": 4.4760900140646975e-05, + "loss": 0.3217, + "step": 3070 + }, + { + "epoch": 9.014936708860759, + "grad_norm": 3.0076091289520264, + 
"learning_rate": 4.4725738396624474e-05, + "loss": 0.2275, + "step": 3080 + }, + { + "epoch": 9.015569620253165, + "grad_norm": 0.38252460956573486, + "learning_rate": 4.469057665260197e-05, + "loss": 0.0889, + "step": 3090 + }, + { + "epoch": 9.01620253164557, + "grad_norm": 3.001718282699585, + "learning_rate": 4.465541490857947e-05, + "loss": 0.2065, + "step": 3100 + }, + { + "epoch": 9.016835443037975, + "grad_norm": 5.9051923751831055, + "learning_rate": 4.462025316455696e-05, + "loss": 0.2604, + "step": 3110 + }, + { + "epoch": 9.01746835443038, + "grad_norm": 1.5905555486679077, + "learning_rate": 4.458509142053446e-05, + "loss": 0.2658, + "step": 3120 + }, + { + "epoch": 9.018101265822784, + "grad_norm": 2.863093137741089, + "learning_rate": 4.454992967651196e-05, + "loss": 0.174, + "step": 3130 + }, + { + "epoch": 9.01873417721519, + "grad_norm": 2.637539863586426, + "learning_rate": 4.451476793248946e-05, + "loss": 0.1515, + "step": 3140 + }, + { + "epoch": 9.019367088607595, + "grad_norm": 0.3857173025608063, + "learning_rate": 4.447960618846695e-05, + "loss": 0.0724, + "step": 3150 + }, + { + "epoch": 9.02, + "grad_norm": 1.806871771812439, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.1711, + "step": 3160 + }, + { + "epoch": 9.02, + "eval_accuracy": 0.9481765834932822, + "eval_loss": 0.14262719452381134, + "eval_runtime": 899.0913, + "eval_samples_per_second": 0.579, + "eval_steps_per_second": 0.073, + "step": 3160 + }, + { + "epoch": 10.000632911392405, + "grad_norm": 5.219705104827881, + "learning_rate": 4.4409282700421945e-05, + "loss": 0.1425, + "step": 3170 + }, + { + "epoch": 10.00126582278481, + "grad_norm": 12.191892623901367, + "learning_rate": 4.4374120956399436e-05, + "loss": 0.1729, + "step": 3180 + }, + { + "epoch": 10.001898734177216, + "grad_norm": 1.2383445501327515, + "learning_rate": 4.4338959212376935e-05, + "loss": 0.2565, + "step": 3190 + }, + { + "epoch": 10.00253164556962, + "grad_norm": 0.5347968935966492, + "learning_rate": 4.430379746835443e-05, + "loss": 0.1981, + "step": 3200 + }, + { + "epoch": 10.003164556962025, + "grad_norm": 0.8782184720039368, + "learning_rate": 4.426863572433193e-05, + "loss": 0.0789, + "step": 3210 + }, + { + "epoch": 10.00379746835443, + "grad_norm": 0.14596301317214966, + "learning_rate": 4.423347398030942e-05, + "loss": 0.1853, + "step": 3220 + }, + { + "epoch": 10.004430379746836, + "grad_norm": 0.4674586057662964, + "learning_rate": 4.419831223628692e-05, + "loss": 0.1213, + "step": 3230 + }, + { + "epoch": 10.00506329113924, + "grad_norm": 5.122977256774902, + "learning_rate": 4.416315049226441e-05, + "loss": 0.1708, + "step": 3240 + }, + { + "epoch": 10.005696202531645, + "grad_norm": 2.872215509414673, + "learning_rate": 4.412798874824192e-05, + "loss": 0.2037, + "step": 3250 + }, + { + "epoch": 10.00632911392405, + "grad_norm": 3.31655216217041, + "learning_rate": 4.409282700421941e-05, + "loss": 0.1574, + "step": 3260 + }, + { + "epoch": 10.006962025316456, + "grad_norm": 0.5382121801376343, + "learning_rate": 4.405766526019691e-05, + "loss": 0.1031, + "step": 3270 + }, + { + "epoch": 10.00759493670886, + "grad_norm": 1.605431079864502, + "learning_rate": 4.4022503516174406e-05, + "loss": 0.1636, + "step": 3280 + }, + { + "epoch": 10.008227848101265, + "grad_norm": 9.264389038085938, + "learning_rate": 4.3987341772151904e-05, + "loss": 0.2556, + "step": 3290 + }, + { + "epoch": 10.00886075949367, + "grad_norm": 0.28090256452560425, + "learning_rate": 4.3952180028129396e-05, + "loss": 0.1123, + "step": 3300 
+ }, + { + "epoch": 10.009493670886076, + "grad_norm": 2.213951349258423, + "learning_rate": 4.3917018284106894e-05, + "loss": 0.0749, + "step": 3310 + }, + { + "epoch": 10.010126582278481, + "grad_norm": 6.6392035484313965, + "learning_rate": 4.388185654008439e-05, + "loss": 0.1676, + "step": 3320 + }, + { + "epoch": 10.010759493670886, + "grad_norm": 0.35711225867271423, + "learning_rate": 4.384669479606189e-05, + "loss": 0.0756, + "step": 3330 + }, + { + "epoch": 10.01139240506329, + "grad_norm": 0.24826167523860931, + "learning_rate": 4.381153305203938e-05, + "loss": 0.0955, + "step": 3340 + }, + { + "epoch": 10.012025316455697, + "grad_norm": 5.6375579833984375, + "learning_rate": 4.377637130801688e-05, + "loss": 0.1472, + "step": 3350 + }, + { + "epoch": 10.012658227848101, + "grad_norm": 0.28531956672668457, + "learning_rate": 4.374120956399438e-05, + "loss": 0.0843, + "step": 3360 + }, + { + "epoch": 10.013291139240506, + "grad_norm": 5.148308277130127, + "learning_rate": 4.370604781997187e-05, + "loss": 0.222, + "step": 3370 + }, + { + "epoch": 10.01392405063291, + "grad_norm": 0.3964751064777374, + "learning_rate": 4.367088607594937e-05, + "loss": 0.1279, + "step": 3380 + }, + { + "epoch": 10.014556962025317, + "grad_norm": 4.597806930541992, + "learning_rate": 4.363572433192686e-05, + "loss": 0.1258, + "step": 3390 + }, + { + "epoch": 10.015189873417722, + "grad_norm": 1.5554261207580566, + "learning_rate": 4.3600562587904366e-05, + "loss": 0.1193, + "step": 3400 + }, + { + "epoch": 10.015822784810126, + "grad_norm": 0.0851406529545784, + "learning_rate": 4.356540084388186e-05, + "loss": 0.1182, + "step": 3410 + }, + { + "epoch": 10.01645569620253, + "grad_norm": 3.5009474754333496, + "learning_rate": 4.3530239099859356e-05, + "loss": 0.155, + "step": 3420 + }, + { + "epoch": 10.017088607594937, + "grad_norm": 2.019500732421875, + "learning_rate": 4.349507735583685e-05, + "loss": 0.1827, + "step": 3430 + }, + { + "epoch": 10.017721518987342, + "grad_norm": 1.833369255065918, + "learning_rate": 4.345991561181435e-05, + "loss": 0.0991, + "step": 3440 + }, + { + "epoch": 10.018354430379746, + "grad_norm": 0.4851783215999603, + "learning_rate": 4.3424753867791844e-05, + "loss": 0.2478, + "step": 3450 + }, + { + "epoch": 10.018987341772151, + "grad_norm": 2.687387466430664, + "learning_rate": 4.338959212376934e-05, + "loss": 0.0995, + "step": 3460 + }, + { + "epoch": 10.019620253164558, + "grad_norm": 11.315146446228027, + "learning_rate": 4.3354430379746834e-05, + "loss": 0.2251, + "step": 3470 + }, + { + "epoch": 10.02, + "eval_accuracy": 0.9558541266794626, + "eval_loss": 0.0965082123875618, + "eval_runtime": 881.0116, + "eval_samples_per_second": 0.591, + "eval_steps_per_second": 0.075, + "step": 3476 + }, + { + "epoch": 11.000253164556963, + "grad_norm": 8.084904670715332, + "learning_rate": 4.331926863572434e-05, + "loss": 0.1515, + "step": 3480 + }, + { + "epoch": 11.000886075949367, + "grad_norm": 27.45550537109375, + "learning_rate": 4.328410689170183e-05, + "loss": 0.1284, + "step": 3490 + }, + { + "epoch": 11.001518987341772, + "grad_norm": 8.539572715759277, + "learning_rate": 4.324894514767933e-05, + "loss": 0.1598, + "step": 3500 + }, + { + "epoch": 11.002151898734176, + "grad_norm": 0.08583386987447739, + "learning_rate": 4.321378340365682e-05, + "loss": 0.0517, + "step": 3510 + }, + { + "epoch": 11.002784810126583, + "grad_norm": 0.14813897013664246, + "learning_rate": 4.317862165963432e-05, + "loss": 0.1125, + "step": 3520 + }, + { + "epoch": 11.003417721518987, + 
"grad_norm": 2.4185800552368164, + "learning_rate": 4.314345991561182e-05, + "loss": 0.1984, + "step": 3530 + }, + { + "epoch": 11.004050632911392, + "grad_norm": 0.21438997983932495, + "learning_rate": 4.3108298171589315e-05, + "loss": 0.0713, + "step": 3540 + }, + { + "epoch": 11.004683544303797, + "grad_norm": 0.394996702671051, + "learning_rate": 4.307313642756681e-05, + "loss": 0.1331, + "step": 3550 + }, + { + "epoch": 11.005316455696203, + "grad_norm": 1.3863253593444824, + "learning_rate": 4.3037974683544305e-05, + "loss": 0.0773, + "step": 3560 + }, + { + "epoch": 11.005949367088608, + "grad_norm": 1.7728908061981201, + "learning_rate": 4.3002812939521803e-05, + "loss": 0.1833, + "step": 3570 + }, + { + "epoch": 11.006582278481012, + "grad_norm": 0.08379014581441879, + "learning_rate": 4.2967651195499295e-05, + "loss": 0.0516, + "step": 3580 + }, + { + "epoch": 11.007215189873417, + "grad_norm": 2.6824705600738525, + "learning_rate": 4.293248945147679e-05, + "loss": 0.1616, + "step": 3590 + }, + { + "epoch": 11.007848101265823, + "grad_norm": 2.570587158203125, + "learning_rate": 4.289732770745429e-05, + "loss": 0.1306, + "step": 3600 + }, + { + "epoch": 11.008481012658228, + "grad_norm": 14.159221649169922, + "learning_rate": 4.286216596343179e-05, + "loss": 0.1083, + "step": 3610 + }, + { + "epoch": 11.009113924050633, + "grad_norm": 9.88803482055664, + "learning_rate": 4.282700421940928e-05, + "loss": 0.1391, + "step": 3620 + }, + { + "epoch": 11.009746835443037, + "grad_norm": 5.837044715881348, + "learning_rate": 4.279184247538678e-05, + "loss": 0.1622, + "step": 3630 + }, + { + "epoch": 11.010379746835444, + "grad_norm": 0.1928025782108307, + "learning_rate": 4.275668073136428e-05, + "loss": 0.1777, + "step": 3640 + }, + { + "epoch": 11.011012658227848, + "grad_norm": 0.26675212383270264, + "learning_rate": 4.2721518987341776e-05, + "loss": 0.088, + "step": 3650 + }, + { + "epoch": 11.011645569620253, + "grad_norm": 0.1680004894733429, + "learning_rate": 4.268635724331927e-05, + "loss": 0.1404, + "step": 3660 + }, + { + "epoch": 11.012278481012657, + "grad_norm": 2.046947717666626, + "learning_rate": 4.2651195499296766e-05, + "loss": 0.1198, + "step": 3670 + }, + { + "epoch": 11.012911392405064, + "grad_norm": 1.6072787046432495, + "learning_rate": 4.2616033755274265e-05, + "loss": 0.085, + "step": 3680 + }, + { + "epoch": 11.013544303797469, + "grad_norm": 0.38024991750717163, + "learning_rate": 4.258087201125176e-05, + "loss": 0.0692, + "step": 3690 + }, + { + "epoch": 11.014177215189873, + "grad_norm": 0.4724186062812805, + "learning_rate": 4.2545710267229255e-05, + "loss": 0.1008, + "step": 3700 + }, + { + "epoch": 11.014810126582278, + "grad_norm": 4.140766143798828, + "learning_rate": 4.251054852320675e-05, + "loss": 0.1582, + "step": 3710 + }, + { + "epoch": 11.015443037974684, + "grad_norm": 2.098039388656616, + "learning_rate": 4.247538677918425e-05, + "loss": 0.1359, + "step": 3720 + }, + { + "epoch": 11.016075949367089, + "grad_norm": 1.2859419584274292, + "learning_rate": 4.244022503516174e-05, + "loss": 0.0806, + "step": 3730 + }, + { + "epoch": 11.016708860759493, + "grad_norm": 4.074576377868652, + "learning_rate": 4.240506329113924e-05, + "loss": 0.2069, + "step": 3740 + }, + { + "epoch": 11.017341772151898, + "grad_norm": 11.83212947845459, + "learning_rate": 4.236990154711674e-05, + "loss": 0.1142, + "step": 3750 + }, + { + "epoch": 11.017974683544304, + "grad_norm": 2.487534999847412, + "learning_rate": 4.233473980309424e-05, + "loss": 0.1747, + "step": 
3760 + }, + { + "epoch": 11.018607594936709, + "grad_norm": 6.369086265563965, + "learning_rate": 4.229957805907173e-05, + "loss": 0.2394, + "step": 3770 + }, + { + "epoch": 11.019240506329114, + "grad_norm": 14.795226097106934, + "learning_rate": 4.226441631504923e-05, + "loss": 0.1638, + "step": 3780 + }, + { + "epoch": 11.019873417721518, + "grad_norm": 2.7429358959198, + "learning_rate": 4.2229254571026726e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 11.02, + "eval_accuracy": 0.9539347408829175, + "eval_loss": 0.11410919576883316, + "eval_runtime": 878.2486, + "eval_samples_per_second": 0.593, + "eval_steps_per_second": 0.075, + "step": 3792 + }, + { + "epoch": 12.000506329113923, + "grad_norm": 6.182743072509766, + "learning_rate": 4.2194092827004224e-05, + "loss": 0.1566, + "step": 3800 + }, + { + "epoch": 12.00113924050633, + "grad_norm": 3.826575994491577, + "learning_rate": 4.2158931082981716e-05, + "loss": 0.0969, + "step": 3810 + }, + { + "epoch": 12.001772151898734, + "grad_norm": 0.3016466498374939, + "learning_rate": 4.2123769338959214e-05, + "loss": 0.2192, + "step": 3820 + }, + { + "epoch": 12.002405063291139, + "grad_norm": 2.917588710784912, + "learning_rate": 4.208860759493671e-05, + "loss": 0.0723, + "step": 3830 + }, + { + "epoch": 12.003037974683544, + "grad_norm": 0.10706198215484619, + "learning_rate": 4.205344585091421e-05, + "loss": 0.136, + "step": 3840 + }, + { + "epoch": 12.00367088607595, + "grad_norm": 1.9893161058425903, + "learning_rate": 4.20182841068917e-05, + "loss": 0.1361, + "step": 3850 + }, + { + "epoch": 12.004303797468355, + "grad_norm": 1.9333717823028564, + "learning_rate": 4.19831223628692e-05, + "loss": 0.1467, + "step": 3860 + }, + { + "epoch": 12.00493670886076, + "grad_norm": 0.13750018179416656, + "learning_rate": 4.19479606188467e-05, + "loss": 0.072, + "step": 3870 + }, + { + "epoch": 12.005569620253164, + "grad_norm": 3.1651651859283447, + "learning_rate": 4.19127988748242e-05, + "loss": 0.1241, + "step": 3880 + }, + { + "epoch": 12.00620253164557, + "grad_norm": 1.7854634523391724, + "learning_rate": 4.187763713080169e-05, + "loss": 0.1204, + "step": 3890 + }, + { + "epoch": 12.006835443037975, + "grad_norm": 2.030898332595825, + "learning_rate": 4.184247538677919e-05, + "loss": 0.1028, + "step": 3900 + }, + { + "epoch": 12.00746835443038, + "grad_norm": 2.5038902759552, + "learning_rate": 4.1807313642756686e-05, + "loss": 0.0767, + "step": 3910 + }, + { + "epoch": 12.008101265822784, + "grad_norm": 0.07481967657804489, + "learning_rate": 4.177215189873418e-05, + "loss": 0.1092, + "step": 3920 + }, + { + "epoch": 12.00873417721519, + "grad_norm": 15.426689147949219, + "learning_rate": 4.1736990154711675e-05, + "loss": 0.1277, + "step": 3930 + }, + { + "epoch": 12.009367088607595, + "grad_norm": 0.38814952969551086, + "learning_rate": 4.170182841068917e-05, + "loss": 0.0369, + "step": 3940 + }, + { + "epoch": 12.01, + "grad_norm": 3.7652478218078613, + "learning_rate": 4.166666666666667e-05, + "loss": 0.0799, + "step": 3950 + }, + { + "epoch": 12.010632911392404, + "grad_norm": 0.06228223815560341, + "learning_rate": 4.1631504922644164e-05, + "loss": 0.1217, + "step": 3960 + }, + { + "epoch": 12.01126582278481, + "grad_norm": 0.3022870719432831, + "learning_rate": 4.159634317862166e-05, + "loss": 0.0405, + "step": 3970 + }, + { + "epoch": 12.011898734177215, + "grad_norm": 2.400820255279541, + "learning_rate": 4.1561181434599153e-05, + "loss": 0.0291, + "step": 3980 + }, + { + "epoch": 12.01253164556962, + "grad_norm": 
2.68705415725708, + "learning_rate": 4.152601969057666e-05, + "loss": 0.1775, + "step": 3990 + }, + { + "epoch": 12.013164556962025, + "grad_norm": 0.5902582406997681, + "learning_rate": 4.149085794655415e-05, + "loss": 0.1165, + "step": 4000 + }, + { + "epoch": 12.013797468354431, + "grad_norm": 0.9047210812568665, + "learning_rate": 4.145569620253165e-05, + "loss": 0.07, + "step": 4010 + }, + { + "epoch": 12.014430379746836, + "grad_norm": 0.09479296952486038, + "learning_rate": 4.142053445850914e-05, + "loss": 0.0495, + "step": 4020 + }, + { + "epoch": 12.01506329113924, + "grad_norm": 1.401077151298523, + "learning_rate": 4.1385372714486645e-05, + "loss": 0.0204, + "step": 4030 + }, + { + "epoch": 12.015696202531645, + "grad_norm": 4.037631988525391, + "learning_rate": 4.135021097046414e-05, + "loss": 0.1883, + "step": 4040 + }, + { + "epoch": 12.016329113924051, + "grad_norm": 1.7805005311965942, + "learning_rate": 4.1315049226441635e-05, + "loss": 0.1802, + "step": 4050 + }, + { + "epoch": 12.016962025316456, + "grad_norm": 3.6074600219726562, + "learning_rate": 4.1279887482419127e-05, + "loss": 0.0523, + "step": 4060 + }, + { + "epoch": 12.01759493670886, + "grad_norm": 0.1754535734653473, + "learning_rate": 4.1244725738396625e-05, + "loss": 0.0763, + "step": 4070 + }, + { + "epoch": 12.018227848101265, + "grad_norm": 1.0715147256851196, + "learning_rate": 4.120956399437412e-05, + "loss": 0.1114, + "step": 4080 + }, + { + "epoch": 12.018860759493672, + "grad_norm": 0.06572480499744415, + "learning_rate": 4.1174402250351615e-05, + "loss": 0.0499, + "step": 4090 + }, + { + "epoch": 12.019493670886076, + "grad_norm": 2.0521042346954346, + "learning_rate": 4.113924050632912e-05, + "loss": 0.1229, + "step": 4100 + }, + { + "epoch": 12.02, + "eval_accuracy": 0.9539347408829175, + "eval_loss": 0.13622045516967773, + "eval_runtime": 873.7358, + "eval_samples_per_second": 0.596, + "eval_steps_per_second": 0.076, + "step": 4108 + }, + { + "epoch": 13.000126582278481, + "grad_norm": 0.08451604843139648, + "learning_rate": 4.110407876230661e-05, + "loss": 0.119, + "step": 4110 + }, + { + "epoch": 13.000759493670886, + "grad_norm": 1.373328685760498, + "learning_rate": 4.106891701828411e-05, + "loss": 0.0773, + "step": 4120 + }, + { + "epoch": 13.00139240506329, + "grad_norm": 5.43549919128418, + "learning_rate": 4.10337552742616e-05, + "loss": 0.1298, + "step": 4130 + }, + { + "epoch": 13.002025316455697, + "grad_norm": 1.0948410034179688, + "learning_rate": 4.0998593530239106e-05, + "loss": 0.1255, + "step": 4140 + }, + { + "epoch": 13.002658227848102, + "grad_norm": 0.060861099511384964, + "learning_rate": 4.09634317862166e-05, + "loss": 0.1304, + "step": 4150 + }, + { + "epoch": 13.003291139240506, + "grad_norm": 15.299447059631348, + "learning_rate": 4.0928270042194096e-05, + "loss": 0.1789, + "step": 4160 + }, + { + "epoch": 13.00392405063291, + "grad_norm": 0.08800447732210159, + "learning_rate": 4.089310829817159e-05, + "loss": 0.0934, + "step": 4170 + }, + { + "epoch": 13.004556962025317, + "grad_norm": 2.429713010787964, + "learning_rate": 4.085794655414909e-05, + "loss": 0.1312, + "step": 4180 + }, + { + "epoch": 13.005189873417722, + "grad_norm": 0.05040478706359863, + "learning_rate": 4.0822784810126584e-05, + "loss": 0.0764, + "step": 4190 + }, + { + "epoch": 13.005822784810126, + "grad_norm": 12.875263214111328, + "learning_rate": 4.078762306610408e-05, + "loss": 0.2538, + "step": 4200 + }, + { + "epoch": 13.006455696202531, + "grad_norm": 1.7678494453430176, + "learning_rate": 
4.0752461322081574e-05, + "loss": 0.0656, + "step": 4210 + }, + { + "epoch": 13.007088607594937, + "grad_norm": 0.05160669609904289, + "learning_rate": 4.071729957805907e-05, + "loss": 0.1236, + "step": 4220 + }, + { + "epoch": 13.007721518987342, + "grad_norm": 2.3818519115448, + "learning_rate": 4.068213783403657e-05, + "loss": 0.1056, + "step": 4230 + }, + { + "epoch": 13.008354430379747, + "grad_norm": 1.16487717628479, + "learning_rate": 4.064697609001407e-05, + "loss": 0.1062, + "step": 4240 + }, + { + "epoch": 13.008987341772151, + "grad_norm": 1.0950814485549927, + "learning_rate": 4.061181434599156e-05, + "loss": 0.114, + "step": 4250 + }, + { + "epoch": 13.009620253164558, + "grad_norm": 2.856937885284424, + "learning_rate": 4.057665260196906e-05, + "loss": 0.1701, + "step": 4260 + }, + { + "epoch": 13.010253164556962, + "grad_norm": 0.057572536170482635, + "learning_rate": 4.054149085794656e-05, + "loss": 0.1006, + "step": 4270 + }, + { + "epoch": 13.010886075949367, + "grad_norm": 0.05074339732527733, + "learning_rate": 4.050632911392405e-05, + "loss": 0.1008, + "step": 4280 + }, + { + "epoch": 13.011518987341772, + "grad_norm": 3.1123783588409424, + "learning_rate": 4.047116736990155e-05, + "loss": 0.0352, + "step": 4290 + }, + { + "epoch": 13.012151898734178, + "grad_norm": 2.150893211364746, + "learning_rate": 4.0436005625879046e-05, + "loss": 0.0824, + "step": 4300 + }, + { + "epoch": 13.012784810126583, + "grad_norm": 2.5087695121765137, + "learning_rate": 4.0400843881856544e-05, + "loss": 0.1343, + "step": 4310 + }, + { + "epoch": 13.013417721518987, + "grad_norm": 1.6841331720352173, + "learning_rate": 4.0365682137834036e-05, + "loss": 0.1258, + "step": 4320 + }, + { + "epoch": 13.014050632911392, + "grad_norm": 0.3586716949939728, + "learning_rate": 4.0330520393811534e-05, + "loss": 0.1204, + "step": 4330 + }, + { + "epoch": 13.014683544303798, + "grad_norm": 0.045751988887786865, + "learning_rate": 4.029535864978903e-05, + "loss": 0.0682, + "step": 4340 + }, + { + "epoch": 13.015316455696203, + "grad_norm": 3.5271830558776855, + "learning_rate": 4.026019690576653e-05, + "loss": 0.261, + "step": 4350 + }, + { + "epoch": 13.015949367088608, + "grad_norm": 3.505953550338745, + "learning_rate": 4.022503516174402e-05, + "loss": 0.1309, + "step": 4360 + }, + { + "epoch": 13.016582278481012, + "grad_norm": 0.1506178230047226, + "learning_rate": 4.018987341772152e-05, + "loss": 0.0923, + "step": 4370 + }, + { + "epoch": 13.017215189873419, + "grad_norm": 0.3128964304924011, + "learning_rate": 4.015471167369902e-05, + "loss": 0.0573, + "step": 4380 + }, + { + "epoch": 13.017848101265823, + "grad_norm": 2.3400206565856934, + "learning_rate": 4.011954992967652e-05, + "loss": 0.099, + "step": 4390 + }, + { + "epoch": 13.018481012658228, + "grad_norm": 0.4466443955898285, + "learning_rate": 4.008438818565401e-05, + "loss": 0.0531, + "step": 4400 + }, + { + "epoch": 13.019113924050632, + "grad_norm": 2.060894250869751, + "learning_rate": 4.004922644163151e-05, + "loss": 0.0446, + "step": 4410 + }, + { + "epoch": 13.019746835443039, + "grad_norm": 1.2128475904464722, + "learning_rate": 4.0014064697609005e-05, + "loss": 0.0676, + "step": 4420 + }, + { + "epoch": 13.02, + "eval_accuracy": 0.9654510556621881, + "eval_loss": 0.07451339066028595, + "eval_runtime": 911.5098, + "eval_samples_per_second": 0.572, + "eval_steps_per_second": 0.072, + "step": 4424 + }, + { + "epoch": 14.000379746835444, + "grad_norm": 0.05884459242224693, + "learning_rate": 3.99789029535865e-05, + "loss": 
0.0896, + "step": 4430 + }, + { + "epoch": 14.001012658227848, + "grad_norm": 0.05943896621465683, + "learning_rate": 3.9943741209563995e-05, + "loss": 0.1132, + "step": 4440 + }, + { + "epoch": 14.001645569620253, + "grad_norm": 0.13569866120815277, + "learning_rate": 3.9908579465541493e-05, + "loss": 0.091, + "step": 4450 + }, + { + "epoch": 14.002278481012658, + "grad_norm": 0.12505175173282623, + "learning_rate": 3.987341772151899e-05, + "loss": 0.0719, + "step": 4460 + }, + { + "epoch": 14.002911392405064, + "grad_norm": 1.747598648071289, + "learning_rate": 3.983825597749648e-05, + "loss": 0.0855, + "step": 4470 + }, + { + "epoch": 14.003544303797469, + "grad_norm": 0.03635944798588753, + "learning_rate": 3.980309423347398e-05, + "loss": 0.1261, + "step": 4480 + }, + { + "epoch": 14.004177215189873, + "grad_norm": 1.7012404203414917, + "learning_rate": 3.976793248945147e-05, + "loss": 0.1321, + "step": 4490 + }, + { + "epoch": 14.004810126582278, + "grad_norm": 0.040289971977472305, + "learning_rate": 3.973277074542898e-05, + "loss": 0.0378, + "step": 4500 + }, + { + "epoch": 14.005443037974684, + "grad_norm": 2.7338550090789795, + "learning_rate": 3.969760900140647e-05, + "loss": 0.0765, + "step": 4510 + }, + { + "epoch": 14.006075949367089, + "grad_norm": 3.6036548614501953, + "learning_rate": 3.966244725738397e-05, + "loss": 0.1414, + "step": 4520 + }, + { + "epoch": 14.006708860759494, + "grad_norm": 0.07101106643676758, + "learning_rate": 3.9627285513361467e-05, + "loss": 0.0402, + "step": 4530 + }, + { + "epoch": 14.007341772151898, + "grad_norm": 2.1385743618011475, + "learning_rate": 3.9592123769338965e-05, + "loss": 0.082, + "step": 4540 + }, + { + "epoch": 14.007974683544305, + "grad_norm": 1.6399197578430176, + "learning_rate": 3.9556962025316456e-05, + "loss": 0.1278, + "step": 4550 + }, + { + "epoch": 14.00860759493671, + "grad_norm": 0.14580035209655762, + "learning_rate": 3.9521800281293955e-05, + "loss": 0.097, + "step": 4560 + }, + { + "epoch": 14.009240506329114, + "grad_norm": 0.044299304485321045, + "learning_rate": 3.948663853727145e-05, + "loss": 0.1258, + "step": 4570 + }, + { + "epoch": 14.009873417721519, + "grad_norm": 0.06222519651055336, + "learning_rate": 3.945147679324895e-05, + "loss": 0.0526, + "step": 4580 + }, + { + "epoch": 14.010506329113925, + "grad_norm": 2.3559911251068115, + "learning_rate": 3.941631504922644e-05, + "loss": 0.0934, + "step": 4590 + }, + { + "epoch": 14.01113924050633, + "grad_norm": 0.762189507484436, + "learning_rate": 3.938115330520394e-05, + "loss": 0.0937, + "step": 4600 + }, + { + "epoch": 14.011772151898734, + "grad_norm": 0.03613949567079544, + "learning_rate": 3.934599156118144e-05, + "loss": 0.1002, + "step": 4610 + }, + { + "epoch": 14.012405063291139, + "grad_norm": 0.0497271791100502, + "learning_rate": 3.931082981715893e-05, + "loss": 0.1768, + "step": 4620 + }, + { + "epoch": 14.013037974683545, + "grad_norm": 0.0881432294845581, + "learning_rate": 3.927566807313643e-05, + "loss": 0.111, + "step": 4630 + }, + { + "epoch": 14.01367088607595, + "grad_norm": 1.8317328691482544, + "learning_rate": 3.924050632911392e-05, + "loss": 0.0322, + "step": 4640 + }, + { + "epoch": 14.014303797468354, + "grad_norm": 0.1062496155500412, + "learning_rate": 3.9205344585091426e-05, + "loss": 0.0409, + "step": 4650 + }, + { + "epoch": 14.014936708860759, + "grad_norm": 0.05037945136427879, + "learning_rate": 3.917018284106892e-05, + "loss": 0.158, + "step": 4660 + }, + { + "epoch": 14.015569620253165, + "grad_norm": 
1.9508050680160522, + "learning_rate": 3.9135021097046416e-05, + "loss": 0.1135, + "step": 4670 + }, + { + "epoch": 14.01620253164557, + "grad_norm": 0.03313547745347023, + "learning_rate": 3.909985935302391e-05, + "loss": 0.0667, + "step": 4680 + }, + { + "epoch": 14.016835443037975, + "grad_norm": 0.0592011958360672, + "learning_rate": 3.906469760900141e-05, + "loss": 0.096, + "step": 4690 + }, + { + "epoch": 14.01746835443038, + "grad_norm": 0.03745780512690544, + "learning_rate": 3.9029535864978904e-05, + "loss": 0.1084, + "step": 4700 + }, + { + "epoch": 14.018101265822784, + "grad_norm": 2.3232529163360596, + "learning_rate": 3.89943741209564e-05, + "loss": 0.0571, + "step": 4710 + }, + { + "epoch": 14.01873417721519, + "grad_norm": 0.03583463653922081, + "learning_rate": 3.8959212376933894e-05, + "loss": 0.0409, + "step": 4720 + }, + { + "epoch": 14.019367088607595, + "grad_norm": 3.5223681926727295, + "learning_rate": 3.89240506329114e-05, + "loss": 0.0784, + "step": 4730 + }, + { + "epoch": 14.02, + "grad_norm": 0.07330437749624252, + "learning_rate": 3.888888888888889e-05, + "loss": 0.1228, + "step": 4740 + }, + { + "epoch": 14.02, + "eval_accuracy": 0.963531669865643, + "eval_loss": 0.08169866353273392, + "eval_runtime": 921.6107, + "eval_samples_per_second": 0.565, + "eval_steps_per_second": 0.072, + "step": 4740 + }, + { + "epoch": 15.000632911392405, + "grad_norm": 0.6237608194351196, + "learning_rate": 3.885372714486639e-05, + "loss": 0.0534, + "step": 4750 + }, + { + "epoch": 15.00126582278481, + "grad_norm": 1.1300462484359741, + "learning_rate": 3.881856540084388e-05, + "loss": 0.1765, + "step": 4760 + }, + { + "epoch": 15.001898734177216, + "grad_norm": 0.06267885118722916, + "learning_rate": 3.878340365682138e-05, + "loss": 0.0799, + "step": 4770 + }, + { + "epoch": 15.00253164556962, + "grad_norm": 0.0465877391397953, + "learning_rate": 3.874824191279888e-05, + "loss": 0.0156, + "step": 4780 + }, + { + "epoch": 15.003164556962025, + "grad_norm": 0.08922750502824783, + "learning_rate": 3.8713080168776376e-05, + "loss": 0.1283, + "step": 4790 + }, + { + "epoch": 15.00379746835443, + "grad_norm": 3.4005231857299805, + "learning_rate": 3.867791842475387e-05, + "loss": 0.0989, + "step": 4800 + }, + { + "epoch": 15.004430379746836, + "grad_norm": 0.07256881147623062, + "learning_rate": 3.8642756680731365e-05, + "loss": 0.099, + "step": 4810 + }, + { + "epoch": 15.00506329113924, + "grad_norm": 0.05675378069281578, + "learning_rate": 3.8607594936708864e-05, + "loss": 0.1094, + "step": 4820 + }, + { + "epoch": 15.005696202531645, + "grad_norm": 1.7418971061706543, + "learning_rate": 3.8572433192686355e-05, + "loss": 0.0712, + "step": 4830 + }, + { + "epoch": 15.00632911392405, + "grad_norm": 0.034461941570043564, + "learning_rate": 3.8537271448663854e-05, + "loss": 0.1568, + "step": 4840 + }, + { + "epoch": 15.006962025316456, + "grad_norm": 0.12834057211875916, + "learning_rate": 3.850210970464135e-05, + "loss": 0.071, + "step": 4850 + }, + { + "epoch": 15.00759493670886, + "grad_norm": 2.435114622116089, + "learning_rate": 3.846694796061885e-05, + "loss": 0.0736, + "step": 4860 + }, + { + "epoch": 15.008227848101265, + "grad_norm": 0.10457431524991989, + "learning_rate": 3.843178621659634e-05, + "loss": 0.0941, + "step": 4870 + }, + { + "epoch": 15.00886075949367, + "grad_norm": 1.1757208108901978, + "learning_rate": 3.839662447257384e-05, + "loss": 0.089, + "step": 4880 + }, + { + "epoch": 15.009493670886076, + "grad_norm": 0.11455094069242477, + "learning_rate": 
3.836146272855134e-05, + "loss": 0.146, + "step": 4890 + }, + { + "epoch": 15.010126582278481, + "grad_norm": 0.047735344618558884, + "learning_rate": 3.832630098452884e-05, + "loss": 0.0979, + "step": 4900 + }, + { + "epoch": 15.010759493670886, + "grad_norm": 2.26092267036438, + "learning_rate": 3.829113924050633e-05, + "loss": 0.0598, + "step": 4910 + }, + { + "epoch": 15.01139240506329, + "grad_norm": 0.04012997820973396, + "learning_rate": 3.825597749648383e-05, + "loss": 0.0603, + "step": 4920 + }, + { + "epoch": 15.012025316455697, + "grad_norm": 0.060564495623111725, + "learning_rate": 3.8220815752461325e-05, + "loss": 0.0079, + "step": 4930 + }, + { + "epoch": 15.012658227848101, + "grad_norm": 1.690059781074524, + "learning_rate": 3.8185654008438823e-05, + "loss": 0.1119, + "step": 4940 + }, + { + "epoch": 15.013291139240506, + "grad_norm": 0.04470343515276909, + "learning_rate": 3.8150492264416315e-05, + "loss": 0.0421, + "step": 4950 + }, + { + "epoch": 15.01392405063291, + "grad_norm": 2.4390032291412354, + "learning_rate": 3.811533052039381e-05, + "loss": 0.1052, + "step": 4960 + }, + { + "epoch": 15.014556962025317, + "grad_norm": 0.034953054040670395, + "learning_rate": 3.808016877637131e-05, + "loss": 0.0869, + "step": 4970 + }, + { + "epoch": 15.015189873417722, + "grad_norm": 1.8463102579116821, + "learning_rate": 3.80450070323488e-05, + "loss": 0.0601, + "step": 4980 + }, + { + "epoch": 15.015822784810126, + "grad_norm": 0.025022268295288086, + "learning_rate": 3.80098452883263e-05, + "loss": 0.1042, + "step": 4990 + }, + { + "epoch": 15.01645569620253, + "grad_norm": 2.0151307582855225, + "learning_rate": 3.79746835443038e-05, + "loss": 0.1235, + "step": 5000 + }, + { + "epoch": 15.017088607594937, + "grad_norm": 2.130545139312744, + "learning_rate": 3.79395218002813e-05, + "loss": 0.1464, + "step": 5010 + }, + { + "epoch": 15.017721518987342, + "grad_norm": 0.02256944589316845, + "learning_rate": 3.790436005625879e-05, + "loss": 0.057, + "step": 5020 + }, + { + "epoch": 15.018354430379746, + "grad_norm": 0.2824922204017639, + "learning_rate": 3.786919831223629e-05, + "loss": 0.0738, + "step": 5030 + }, + { + "epoch": 15.018987341772151, + "grad_norm": 0.2349025160074234, + "learning_rate": 3.7834036568213786e-05, + "loss": 0.125, + "step": 5040 + }, + { + "epoch": 15.019620253164558, + "grad_norm": 1.1086318492889404, + "learning_rate": 3.7798874824191285e-05, + "loss": 0.0143, + "step": 5050 + }, + { + "epoch": 15.02, + "eval_accuracy": 0.9692898272552783, + "eval_loss": 0.061516787856817245, + "eval_runtime": 919.3915, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.072, + "step": 5056 + }, + { + "epoch": 16.000253164556963, + "grad_norm": 0.026827219873666763, + "learning_rate": 3.7763713080168776e-05, + "loss": 0.0445, + "step": 5060 + }, + { + "epoch": 16.000886075949367, + "grad_norm": 0.04409428685903549, + "learning_rate": 3.7728551336146275e-05, + "loss": 0.0879, + "step": 5070 + }, + { + "epoch": 16.001518987341772, + "grad_norm": 0.038987431675195694, + "learning_rate": 3.769338959212377e-05, + "loss": 0.0599, + "step": 5080 + }, + { + "epoch": 16.002151898734176, + "grad_norm": 4.49123477935791, + "learning_rate": 3.765822784810127e-05, + "loss": 0.0703, + "step": 5090 + }, + { + "epoch": 16.00278481012658, + "grad_norm": 0.03677366301417351, + "learning_rate": 3.762306610407876e-05, + "loss": 0.075, + "step": 5100 + }, + { + "epoch": 16.003417721518986, + "grad_norm": 3.2981064319610596, + "learning_rate": 3.758790436005626e-05, + "loss": 
0.1017, + "step": 5110 + }, + { + "epoch": 16.004050632911394, + "grad_norm": 1.8348667621612549, + "learning_rate": 3.755274261603376e-05, + "loss": 0.0831, + "step": 5120 + }, + { + "epoch": 16.0046835443038, + "grad_norm": 0.023824410513043404, + "learning_rate": 3.751758087201125e-05, + "loss": 0.0696, + "step": 5130 + }, + { + "epoch": 16.005316455696203, + "grad_norm": 0.04150988906621933, + "learning_rate": 3.748241912798875e-05, + "loss": 0.0828, + "step": 5140 + }, + { + "epoch": 16.005949367088608, + "grad_norm": 0.020287124440073967, + "learning_rate": 3.744725738396625e-05, + "loss": 0.0027, + "step": 5150 + }, + { + "epoch": 16.006582278481012, + "grad_norm": 0.999756932258606, + "learning_rate": 3.7412095639943746e-05, + "loss": 0.0737, + "step": 5160 + }, + { + "epoch": 16.007215189873417, + "grad_norm": 0.02667284570634365, + "learning_rate": 3.737693389592124e-05, + "loss": 0.0686, + "step": 5170 + }, + { + "epoch": 16.00784810126582, + "grad_norm": 2.912930727005005, + "learning_rate": 3.7341772151898736e-05, + "loss": 0.0712, + "step": 5180 + }, + { + "epoch": 16.008481012658226, + "grad_norm": 0.03943086788058281, + "learning_rate": 3.730661040787623e-05, + "loss": 0.1255, + "step": 5190 + }, + { + "epoch": 16.009113924050634, + "grad_norm": 0.02671695314347744, + "learning_rate": 3.727144866385373e-05, + "loss": 0.1072, + "step": 5200 + }, + { + "epoch": 16.00974683544304, + "grad_norm": 0.052541956305503845, + "learning_rate": 3.7236286919831224e-05, + "loss": 0.0429, + "step": 5210 + }, + { + "epoch": 16.010379746835444, + "grad_norm": 1.465926170349121, + "learning_rate": 3.720112517580872e-05, + "loss": 0.0737, + "step": 5220 + }, + { + "epoch": 16.01101265822785, + "grad_norm": 0.02939579077064991, + "learning_rate": 3.7165963431786214e-05, + "loss": 0.026, + "step": 5230 + }, + { + "epoch": 16.011645569620253, + "grad_norm": 15.934955596923828, + "learning_rate": 3.713080168776372e-05, + "loss": 0.1409, + "step": 5240 + }, + { + "epoch": 16.012278481012657, + "grad_norm": 0.3337193727493286, + "learning_rate": 3.709563994374121e-05, + "loss": 0.1064, + "step": 5250 + }, + { + "epoch": 16.012911392405062, + "grad_norm": 1.509924292564392, + "learning_rate": 3.706047819971871e-05, + "loss": 0.1198, + "step": 5260 + }, + { + "epoch": 16.013544303797467, + "grad_norm": 1.6039562225341797, + "learning_rate": 3.70253164556962e-05, + "loss": 0.1751, + "step": 5270 + }, + { + "epoch": 16.014177215189875, + "grad_norm": 0.024508150294423103, + "learning_rate": 3.6990154711673706e-05, + "loss": 0.2054, + "step": 5280 + }, + { + "epoch": 16.01481012658228, + "grad_norm": 0.10826098173856735, + "learning_rate": 3.69549929676512e-05, + "loss": 0.0953, + "step": 5290 + }, + { + "epoch": 16.015443037974684, + "grad_norm": 0.1340492069721222, + "learning_rate": 3.6919831223628695e-05, + "loss": 0.1661, + "step": 5300 + }, + { + "epoch": 16.01607594936709, + "grad_norm": 0.20521284639835358, + "learning_rate": 3.688466947960619e-05, + "loss": 0.0446, + "step": 5310 + }, + { + "epoch": 16.016708860759493, + "grad_norm": 2.418673276901245, + "learning_rate": 3.6849507735583685e-05, + "loss": 0.1078, + "step": 5320 + }, + { + "epoch": 16.017341772151898, + "grad_norm": 0.0307689867913723, + "learning_rate": 3.6814345991561184e-05, + "loss": 0.0744, + "step": 5330 + }, + { + "epoch": 16.017974683544303, + "grad_norm": 12.458963394165039, + "learning_rate": 3.6779184247538675e-05, + "loss": 0.1081, + "step": 5340 + }, + { + "epoch": 16.018607594936707, + "grad_norm": 
0.07104455679655075, + "learning_rate": 3.674402250351618e-05, + "loss": 0.0946, + "step": 5350 + }, + { + "epoch": 16.019240506329115, + "grad_norm": 0.0781354159116745, + "learning_rate": 3.670886075949367e-05, + "loss": 0.0375, + "step": 5360 + }, + { + "epoch": 16.01987341772152, + "grad_norm": 0.041876938194036484, + "learning_rate": 3.667369901547117e-05, + "loss": 0.0621, + "step": 5370 + }, + { + "epoch": 16.02, + "eval_accuracy": 0.9596928982725528, + "eval_loss": 0.07680489867925644, + "eval_runtime": 887.3026, + "eval_samples_per_second": 0.587, + "eval_steps_per_second": 0.074, + "step": 5372 + }, + { + "epoch": 17.000506329113925, + "grad_norm": 0.03362284228205681, + "learning_rate": 3.663853727144866e-05, + "loss": 0.0939, + "step": 5380 + }, + { + "epoch": 17.00113924050633, + "grad_norm": 2.931182384490967, + "learning_rate": 3.660337552742617e-05, + "loss": 0.0942, + "step": 5390 + }, + { + "epoch": 17.001772151898734, + "grad_norm": 9.610499382019043, + "learning_rate": 3.656821378340366e-05, + "loss": 0.2421, + "step": 5400 + }, + { + "epoch": 17.00240506329114, + "grad_norm": 2.01033091545105, + "learning_rate": 3.653305203938116e-05, + "loss": 0.0545, + "step": 5410 + }, + { + "epoch": 17.003037974683544, + "grad_norm": 0.5791252851486206, + "learning_rate": 3.649789029535865e-05, + "loss": 0.0443, + "step": 5420 + }, + { + "epoch": 17.00367088607595, + "grad_norm": 2.2349610328674316, + "learning_rate": 3.646272855133615e-05, + "loss": 0.1949, + "step": 5430 + }, + { + "epoch": 17.004303797468353, + "grad_norm": 0.06409902125597, + "learning_rate": 3.6427566807313645e-05, + "loss": 0.0731, + "step": 5440 + }, + { + "epoch": 17.00493670886076, + "grad_norm": 0.11862684041261673, + "learning_rate": 3.639240506329114e-05, + "loss": 0.0298, + "step": 5450 + }, + { + "epoch": 17.005569620253166, + "grad_norm": 1.8723206520080566, + "learning_rate": 3.6357243319268635e-05, + "loss": 0.1438, + "step": 5460 + }, + { + "epoch": 17.00620253164557, + "grad_norm": 0.04506813734769821, + "learning_rate": 3.632208157524613e-05, + "loss": 0.1927, + "step": 5470 + }, + { + "epoch": 17.006835443037975, + "grad_norm": 0.07324240356683731, + "learning_rate": 3.628691983122363e-05, + "loss": 0.2084, + "step": 5480 + }, + { + "epoch": 17.00746835443038, + "grad_norm": 16.583223342895508, + "learning_rate": 3.625175808720113e-05, + "loss": 0.1256, + "step": 5490 + }, + { + "epoch": 17.008101265822784, + "grad_norm": 2.8780710697174072, + "learning_rate": 3.621659634317862e-05, + "loss": 0.1533, + "step": 5500 + }, + { + "epoch": 17.00873417721519, + "grad_norm": 0.049451783299446106, + "learning_rate": 3.618143459915612e-05, + "loss": 0.0673, + "step": 5510 + }, + { + "epoch": 17.009367088607593, + "grad_norm": 1.1349679231643677, + "learning_rate": 3.614627285513362e-05, + "loss": 0.009, + "step": 5520 + }, + { + "epoch": 17.01, + "grad_norm": 0.08764008432626724, + "learning_rate": 3.611111111111111e-05, + "loss": 0.0392, + "step": 5530 + }, + { + "epoch": 17.010632911392406, + "grad_norm": 0.04414183646440506, + "learning_rate": 3.607594936708861e-05, + "loss": 0.1909, + "step": 5540 + }, + { + "epoch": 17.01126582278481, + "grad_norm": 2.099726676940918, + "learning_rate": 3.6040787623066106e-05, + "loss": 0.3287, + "step": 5550 + }, + { + "epoch": 17.011898734177215, + "grad_norm": 0.1605502963066101, + "learning_rate": 3.6005625879043604e-05, + "loss": 0.1195, + "step": 5560 + }, + { + "epoch": 17.01253164556962, + "grad_norm": 0.11115490645170212, + "learning_rate": 
3.5970464135021096e-05, + "loss": 0.1011, + "step": 5570 + }, + { + "epoch": 17.013164556962025, + "grad_norm": 1.932108998298645, + "learning_rate": 3.5935302390998594e-05, + "loss": 0.1557, + "step": 5580 + }, + { + "epoch": 17.01379746835443, + "grad_norm": 0.022036854177713394, + "learning_rate": 3.590014064697609e-05, + "loss": 0.0526, + "step": 5590 + }, + { + "epoch": 17.014430379746834, + "grad_norm": 0.0311493631452322, + "learning_rate": 3.586497890295359e-05, + "loss": 0.1158, + "step": 5600 + }, + { + "epoch": 17.015063291139242, + "grad_norm": 0.37227359414100647, + "learning_rate": 3.582981715893108e-05, + "loss": 0.0607, + "step": 5610 + }, + { + "epoch": 17.015696202531647, + "grad_norm": 0.047853514552116394, + "learning_rate": 3.579465541490858e-05, + "loss": 0.1834, + "step": 5620 + }, + { + "epoch": 17.01632911392405, + "grad_norm": 0.798313558101654, + "learning_rate": 3.575949367088608e-05, + "loss": 0.0497, + "step": 5630 + }, + { + "epoch": 17.016962025316456, + "grad_norm": 0.020088857039809227, + "learning_rate": 3.572433192686358e-05, + "loss": 0.0975, + "step": 5640 + }, + { + "epoch": 17.01759493670886, + "grad_norm": 3.423915386199951, + "learning_rate": 3.568917018284107e-05, + "loss": 0.1255, + "step": 5650 + }, + { + "epoch": 17.018227848101265, + "grad_norm": 2.1361560821533203, + "learning_rate": 3.565400843881857e-05, + "loss": 0.1294, + "step": 5660 + }, + { + "epoch": 17.01886075949367, + "grad_norm": 3.7424023151397705, + "learning_rate": 3.5618846694796066e-05, + "loss": 0.21, + "step": 5670 + }, + { + "epoch": 17.019493670886074, + "grad_norm": 0.9021095633506775, + "learning_rate": 3.558368495077356e-05, + "loss": 0.0597, + "step": 5680 + }, + { + "epoch": 17.02, + "eval_accuracy": 0.963531669865643, + "eval_loss": 0.08725160360336304, + "eval_runtime": 887.9543, + "eval_samples_per_second": 0.587, + "eval_steps_per_second": 0.074, + "step": 5688 + }, + { + "epoch": 18.00012658227848, + "grad_norm": 9.254701614379883, + "learning_rate": 3.5548523206751056e-05, + "loss": 0.0679, + "step": 5690 + }, + { + "epoch": 18.000759493670888, + "grad_norm": 0.09168372303247452, + "learning_rate": 3.551336146272855e-05, + "loss": 0.0487, + "step": 5700 + }, + { + "epoch": 18.001392405063292, + "grad_norm": 1.7651177644729614, + "learning_rate": 3.547819971870605e-05, + "loss": 0.1359, + "step": 5710 + }, + { + "epoch": 18.002025316455697, + "grad_norm": 1.6308825016021729, + "learning_rate": 3.5443037974683544e-05, + "loss": 0.0374, + "step": 5720 + }, + { + "epoch": 18.0026582278481, + "grad_norm": 0.029071198776364326, + "learning_rate": 3.540787623066104e-05, + "loss": 0.0867, + "step": 5730 + }, + { + "epoch": 18.003291139240506, + "grad_norm": 1.1404354572296143, + "learning_rate": 3.5372714486638534e-05, + "loss": 0.1032, + "step": 5740 + }, + { + "epoch": 18.00392405063291, + "grad_norm": 0.060701146721839905, + "learning_rate": 3.533755274261604e-05, + "loss": 0.0807, + "step": 5750 + }, + { + "epoch": 18.004556962025315, + "grad_norm": 0.023883167654275894, + "learning_rate": 3.530239099859353e-05, + "loss": 0.0917, + "step": 5760 + }, + { + "epoch": 18.00518987341772, + "grad_norm": 2.6466054916381836, + "learning_rate": 3.526722925457103e-05, + "loss": 0.078, + "step": 5770 + }, + { + "epoch": 18.005822784810128, + "grad_norm": 0.2958976626396179, + "learning_rate": 3.523206751054853e-05, + "loss": 0.0468, + "step": 5780 + }, + { + "epoch": 18.006455696202533, + "grad_norm": 0.08598524332046509, + "learning_rate": 3.5196905766526025e-05, + "loss": 
0.0925, + "step": 5790 + }, + { + "epoch": 18.007088607594937, + "grad_norm": 0.02307640202343464, + "learning_rate": 3.516174402250352e-05, + "loss": 0.0784, + "step": 5800 + }, + { + "epoch": 18.007721518987342, + "grad_norm": 1.9384537935256958, + "learning_rate": 3.5126582278481015e-05, + "loss": 0.0645, + "step": 5810 + }, + { + "epoch": 18.008354430379747, + "grad_norm": 0.06027079373598099, + "learning_rate": 3.5091420534458513e-05, + "loss": 0.1358, + "step": 5820 + }, + { + "epoch": 18.00898734177215, + "grad_norm": 1.6383880376815796, + "learning_rate": 3.505625879043601e-05, + "loss": 0.0505, + "step": 5830 + }, + { + "epoch": 18.009620253164556, + "grad_norm": 1.355348825454712, + "learning_rate": 3.50210970464135e-05, + "loss": 0.0574, + "step": 5840 + }, + { + "epoch": 18.01025316455696, + "grad_norm": 2.113496780395508, + "learning_rate": 3.4985935302391e-05, + "loss": 0.1005, + "step": 5850 + }, + { + "epoch": 18.01088607594937, + "grad_norm": 0.022542107850313187, + "learning_rate": 3.49507735583685e-05, + "loss": 0.0671, + "step": 5860 + }, + { + "epoch": 18.011518987341773, + "grad_norm": 0.05946587771177292, + "learning_rate": 3.491561181434599e-05, + "loss": 0.0933, + "step": 5870 + }, + { + "epoch": 18.012151898734178, + "grad_norm": 1.936155915260315, + "learning_rate": 3.488045007032349e-05, + "loss": 0.0975, + "step": 5880 + }, + { + "epoch": 18.012784810126583, + "grad_norm": 0.03399858996272087, + "learning_rate": 3.484528832630098e-05, + "loss": 0.0909, + "step": 5890 + }, + { + "epoch": 18.013417721518987, + "grad_norm": 0.030509311705827713, + "learning_rate": 3.4810126582278487e-05, + "loss": 0.0685, + "step": 5900 + }, + { + "epoch": 18.014050632911392, + "grad_norm": 0.016794001683592796, + "learning_rate": 3.477496483825598e-05, + "loss": 0.0481, + "step": 5910 + }, + { + "epoch": 18.014683544303796, + "grad_norm": 2.368962049484253, + "learning_rate": 3.4739803094233476e-05, + "loss": 0.2066, + "step": 5920 + }, + { + "epoch": 18.0153164556962, + "grad_norm": 1.5589197874069214, + "learning_rate": 3.470464135021097e-05, + "loss": 0.0798, + "step": 5930 + }, + { + "epoch": 18.01594936708861, + "grad_norm": 2.055568218231201, + "learning_rate": 3.466947960618847e-05, + "loss": 0.0384, + "step": 5940 + }, + { + "epoch": 18.016582278481014, + "grad_norm": 2.212440013885498, + "learning_rate": 3.4634317862165965e-05, + "loss": 0.0979, + "step": 5950 + }, + { + "epoch": 18.01721518987342, + "grad_norm": 0.022211147472262383, + "learning_rate": 3.459915611814346e-05, + "loss": 0.0575, + "step": 5960 + }, + { + "epoch": 18.017848101265823, + "grad_norm": 2.110724687576294, + "learning_rate": 3.4563994374120954e-05, + "loss": 0.0568, + "step": 5970 + }, + { + "epoch": 18.018481012658228, + "grad_norm": 0.025756409391760826, + "learning_rate": 3.452883263009846e-05, + "loss": 0.0429, + "step": 5980 + }, + { + "epoch": 18.019113924050632, + "grad_norm": 2.1070611476898193, + "learning_rate": 3.449367088607595e-05, + "loss": 0.0821, + "step": 5990 + }, + { + "epoch": 18.019746835443037, + "grad_norm": 0.043322015553712845, + "learning_rate": 3.445850914205345e-05, + "loss": 0.0696, + "step": 6000 + }, + { + "epoch": 18.02, + "eval_accuracy": 0.9539347408829175, + "eval_loss": 0.11077545583248138, + "eval_runtime": 890.6431, + "eval_samples_per_second": 0.585, + "eval_steps_per_second": 0.074, + "step": 6004 + }, + { + "epoch": 19.000379746835442, + "grad_norm": 1.7157703638076782, + "learning_rate": 3.442334739803094e-05, + "loss": 0.184, + "step": 6010 + }, + { + 
"epoch": 19.001012658227847, + "grad_norm": 0.03436505049467087, + "learning_rate": 3.438818565400844e-05, + "loss": 0.1121, + "step": 6020 + }, + { + "epoch": 19.001645569620255, + "grad_norm": 0.02937289886176586, + "learning_rate": 3.435302390998594e-05, + "loss": 0.1085, + "step": 6030 + }, + { + "epoch": 19.00227848101266, + "grad_norm": 2.090529680252075, + "learning_rate": 3.431786216596343e-05, + "loss": 0.1076, + "step": 6040 + }, + { + "epoch": 19.002911392405064, + "grad_norm": 2.621086835861206, + "learning_rate": 3.428270042194093e-05, + "loss": 0.0475, + "step": 6050 + }, + { + "epoch": 19.00354430379747, + "grad_norm": 0.081081323325634, + "learning_rate": 3.4247538677918426e-05, + "loss": 0.1206, + "step": 6060 + }, + { + "epoch": 19.004177215189873, + "grad_norm": 0.9470970034599304, + "learning_rate": 3.4212376933895924e-05, + "loss": 0.0197, + "step": 6070 + }, + { + "epoch": 19.004810126582278, + "grad_norm": 0.6718599200248718, + "learning_rate": 3.4177215189873416e-05, + "loss": 0.0415, + "step": 6080 + }, + { + "epoch": 19.005443037974683, + "grad_norm": 0.017539095133543015, + "learning_rate": 3.4142053445850914e-05, + "loss": 0.0924, + "step": 6090 + }, + { + "epoch": 19.006075949367087, + "grad_norm": 0.04726846516132355, + "learning_rate": 3.410689170182841e-05, + "loss": 0.0091, + "step": 6100 + }, + { + "epoch": 19.006708860759495, + "grad_norm": 1.5941773653030396, + "learning_rate": 3.407172995780591e-05, + "loss": 0.1435, + "step": 6110 + }, + { + "epoch": 19.0073417721519, + "grad_norm": 2.2858848571777344, + "learning_rate": 3.40365682137834e-05, + "loss": 0.0776, + "step": 6120 + }, + { + "epoch": 19.007974683544305, + "grad_norm": 0.03234981372952461, + "learning_rate": 3.40014064697609e-05, + "loss": 0.0483, + "step": 6130 + }, + { + "epoch": 19.00860759493671, + "grad_norm": 0.07574064284563065, + "learning_rate": 3.39662447257384e-05, + "loss": 0.0784, + "step": 6140 + }, + { + "epoch": 19.009240506329114, + "grad_norm": 0.016080401837825775, + "learning_rate": 3.39310829817159e-05, + "loss": 0.0479, + "step": 6150 + }, + { + "epoch": 19.00987341772152, + "grad_norm": 0.546974241733551, + "learning_rate": 3.389592123769339e-05, + "loss": 0.0931, + "step": 6160 + }, + { + "epoch": 19.010506329113923, + "grad_norm": 18.685945510864258, + "learning_rate": 3.386075949367089e-05, + "loss": 0.0846, + "step": 6170 + }, + { + "epoch": 19.011139240506328, + "grad_norm": 3.7618796825408936, + "learning_rate": 3.3825597749648385e-05, + "loss": 0.1187, + "step": 6180 + }, + { + "epoch": 19.011772151898736, + "grad_norm": 3.0678319931030273, + "learning_rate": 3.3790436005625884e-05, + "loss": 0.2192, + "step": 6190 + }, + { + "epoch": 19.01240506329114, + "grad_norm": 0.021779673174023628, + "learning_rate": 3.3755274261603375e-05, + "loss": 0.0531, + "step": 6200 + }, + { + "epoch": 19.013037974683545, + "grad_norm": 2.7021806240081787, + "learning_rate": 3.3720112517580874e-05, + "loss": 0.0932, + "step": 6210 + }, + { + "epoch": 19.01367088607595, + "grad_norm": 2.251648426055908, + "learning_rate": 3.368495077355837e-05, + "loss": 0.1122, + "step": 6220 + }, + { + "epoch": 19.014303797468354, + "grad_norm": 18.46161651611328, + "learning_rate": 3.3649789029535864e-05, + "loss": 0.2242, + "step": 6230 + }, + { + "epoch": 19.01493670886076, + "grad_norm": 1.9724690914154053, + "learning_rate": 3.361462728551336e-05, + "loss": 0.1283, + "step": 6240 + }, + { + "epoch": 19.015569620253164, + "grad_norm": 1.8774776458740234, + "learning_rate": 
3.357946554149086e-05, + "loss": 0.0629, + "step": 6250 + }, + { + "epoch": 19.01620253164557, + "grad_norm": 0.08820010721683502, + "learning_rate": 3.354430379746836e-05, + "loss": 0.0487, + "step": 6260 + }, + { + "epoch": 19.016835443037976, + "grad_norm": 0.9228050708770752, + "learning_rate": 3.350914205344585e-05, + "loss": 0.1959, + "step": 6270 + }, + { + "epoch": 19.01746835443038, + "grad_norm": 0.061690233647823334, + "learning_rate": 3.347398030942335e-05, + "loss": 0.1182, + "step": 6280 + }, + { + "epoch": 19.018101265822786, + "grad_norm": 23.2619686126709, + "learning_rate": 3.343881856540085e-05, + "loss": 0.1069, + "step": 6290 + }, + { + "epoch": 19.01873417721519, + "grad_norm": 0.0341399684548378, + "learning_rate": 3.3403656821378345e-05, + "loss": 0.1889, + "step": 6300 + }, + { + "epoch": 19.019367088607595, + "grad_norm": 14.88992691040039, + "learning_rate": 3.3368495077355837e-05, + "loss": 0.1285, + "step": 6310 + }, + { + "epoch": 19.02, + "grad_norm": 0.3174104392528534, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.2761, + "step": 6320 + }, + { + "epoch": 19.02, + "eval_accuracy": 0.9520153550863724, + "eval_loss": 0.14126819372177124, + "eval_runtime": 862.4572, + "eval_samples_per_second": 0.604, + "eval_steps_per_second": 0.077, + "step": 6320 + }, + { + "epoch": 20.000632911392405, + "grad_norm": 15.443863868713379, + "learning_rate": 3.329817158931083e-05, + "loss": 0.2663, + "step": 6330 + }, + { + "epoch": 20.00126582278481, + "grad_norm": 0.15935222804546356, + "learning_rate": 3.326300984528833e-05, + "loss": 0.0412, + "step": 6340 + }, + { + "epoch": 20.001898734177214, + "grad_norm": 2.3951077461242676, + "learning_rate": 3.322784810126582e-05, + "loss": 0.1944, + "step": 6350 + }, + { + "epoch": 20.00253164556962, + "grad_norm": 0.055507030338048935, + "learning_rate": 3.319268635724332e-05, + "loss": 0.0692, + "step": 6360 + }, + { + "epoch": 20.003164556962027, + "grad_norm": 15.818922996520996, + "learning_rate": 3.315752461322082e-05, + "loss": 0.1544, + "step": 6370 + }, + { + "epoch": 20.00379746835443, + "grad_norm": 0.0613020621240139, + "learning_rate": 3.312236286919831e-05, + "loss": 0.0584, + "step": 6380 + }, + { + "epoch": 20.004430379746836, + "grad_norm": 1.5182740688323975, + "learning_rate": 3.308720112517581e-05, + "loss": 0.0507, + "step": 6390 + }, + { + "epoch": 20.00506329113924, + "grad_norm": 0.17965848743915558, + "learning_rate": 3.305203938115331e-05, + "loss": 0.1409, + "step": 6400 + }, + { + "epoch": 20.005696202531645, + "grad_norm": 13.088379859924316, + "learning_rate": 3.3016877637130806e-05, + "loss": 0.1933, + "step": 6410 + }, + { + "epoch": 20.00632911392405, + "grad_norm": 3.075198173522949, + "learning_rate": 3.29817158931083e-05, + "loss": 0.0366, + "step": 6420 + }, + { + "epoch": 20.006962025316454, + "grad_norm": 0.25585731863975525, + "learning_rate": 3.2946554149085796e-05, + "loss": 0.0869, + "step": 6430 + }, + { + "epoch": 20.00759493670886, + "grad_norm": 1.9562506675720215, + "learning_rate": 3.291139240506329e-05, + "loss": 0.0486, + "step": 6440 + }, + { + "epoch": 20.008227848101267, + "grad_norm": 1.8931715488433838, + "learning_rate": 3.287623066104079e-05, + "loss": 0.1142, + "step": 6450 + }, + { + "epoch": 20.008860759493672, + "grad_norm": 1.9258003234863281, + "learning_rate": 3.2841068917018284e-05, + "loss": 0.0716, + "step": 6460 + }, + { + "epoch": 20.009493670886076, + "grad_norm": 0.2476690262556076, + "learning_rate": 3.280590717299578e-05, + "loss": 0.0593, + "step": 
6470 + }, + { + "epoch": 20.01012658227848, + "grad_norm": 0.041350286453962326, + "learning_rate": 3.2770745428973274e-05, + "loss": 0.0521, + "step": 6480 + }, + { + "epoch": 20.010759493670886, + "grad_norm": 6.784064769744873, + "learning_rate": 3.273558368495078e-05, + "loss": 0.0805, + "step": 6490 + }, + { + "epoch": 20.01139240506329, + "grad_norm": 1.9818538427352905, + "learning_rate": 3.270042194092827e-05, + "loss": 0.0722, + "step": 6500 + }, + { + "epoch": 20.012025316455695, + "grad_norm": 2.0829291343688965, + "learning_rate": 3.266526019690577e-05, + "loss": 0.1185, + "step": 6510 + }, + { + "epoch": 20.0126582278481, + "grad_norm": 0.06409807503223419, + "learning_rate": 3.263009845288326e-05, + "loss": 0.0777, + "step": 6520 + }, + { + "epoch": 20.013291139240508, + "grad_norm": 0.052046943455934525, + "learning_rate": 3.2594936708860766e-05, + "loss": 0.053, + "step": 6530 + }, + { + "epoch": 20.013924050632912, + "grad_norm": 6.423180103302002, + "learning_rate": 3.255977496483826e-05, + "loss": 0.1333, + "step": 6540 + }, + { + "epoch": 20.014556962025317, + "grad_norm": 0.04574073851108551, + "learning_rate": 3.2524613220815756e-05, + "loss": 0.1059, + "step": 6550 + }, + { + "epoch": 20.01518987341772, + "grad_norm": 5.25015115737915, + "learning_rate": 3.248945147679325e-05, + "loss": 0.1471, + "step": 6560 + }, + { + "epoch": 20.015822784810126, + "grad_norm": 0.036584123969078064, + "learning_rate": 3.2454289732770746e-05, + "loss": 0.2641, + "step": 6570 + }, + { + "epoch": 20.01645569620253, + "grad_norm": 10.215600967407227, + "learning_rate": 3.2419127988748244e-05, + "loss": 0.204, + "step": 6580 + }, + { + "epoch": 20.017088607594935, + "grad_norm": 0.7967722415924072, + "learning_rate": 3.2383966244725736e-05, + "loss": 0.1105, + "step": 6590 + }, + { + "epoch": 20.01772151898734, + "grad_norm": 28.42278480529785, + "learning_rate": 3.234880450070324e-05, + "loss": 0.1647, + "step": 6600 + }, + { + "epoch": 20.01835443037975, + "grad_norm": 0.14231210947036743, + "learning_rate": 3.231364275668073e-05, + "loss": 0.0553, + "step": 6610 + }, + { + "epoch": 20.018987341772153, + "grad_norm": 0.05637258663773537, + "learning_rate": 3.227848101265823e-05, + "loss": 0.074, + "step": 6620 + }, + { + "epoch": 20.019620253164558, + "grad_norm": 0.2689402401447296, + "learning_rate": 3.224331926863572e-05, + "loss": 0.129, + "step": 6630 + }, + { + "epoch": 20.02, + "eval_accuracy": 0.9520153550863724, + "eval_loss": 0.1470969319343567, + "eval_runtime": 903.3982, + "eval_samples_per_second": 0.577, + "eval_steps_per_second": 0.073, + "step": 6636 + }, + { + "epoch": 21.000253164556963, + "grad_norm": 1.2210781574249268, + "learning_rate": 3.220815752461323e-05, + "loss": 0.1749, + "step": 6640 + }, + { + "epoch": 21.000886075949367, + "grad_norm": 1.918101191520691, + "learning_rate": 3.217299578059072e-05, + "loss": 0.109, + "step": 6650 + }, + { + "epoch": 21.001518987341772, + "grad_norm": 1.1142789125442505, + "learning_rate": 3.213783403656822e-05, + "loss": 0.2533, + "step": 6660 + }, + { + "epoch": 21.002151898734176, + "grad_norm": 2.3127224445343018, + "learning_rate": 3.210267229254571e-05, + "loss": 0.1336, + "step": 6670 + }, + { + "epoch": 21.00278481012658, + "grad_norm": 2.306687116622925, + "learning_rate": 3.2067510548523214e-05, + "loss": 0.1947, + "step": 6680 + }, + { + "epoch": 21.003417721518986, + "grad_norm": 0.024246342480182648, + "learning_rate": 3.2032348804500705e-05, + "loss": 0.0446, + "step": 6690 + }, + { + "epoch": 
21.004050632911394, + "grad_norm": 0.0923205316066742, + "learning_rate": 3.1997187060478204e-05, + "loss": 0.0261, + "step": 6700 + }, + { + "epoch": 21.0046835443038, + "grad_norm": 0.020252803340554237, + "learning_rate": 3.1962025316455695e-05, + "loss": 0.0664, + "step": 6710 + }, + { + "epoch": 21.005316455696203, + "grad_norm": 1.8104360103607178, + "learning_rate": 3.1926863572433193e-05, + "loss": 0.0501, + "step": 6720 + }, + { + "epoch": 21.005949367088608, + "grad_norm": 0.13536378741264343, + "learning_rate": 3.189170182841069e-05, + "loss": 0.1107, + "step": 6730 + }, + { + "epoch": 21.006582278481012, + "grad_norm": 0.023490697145462036, + "learning_rate": 3.185654008438819e-05, + "loss": 0.0581, + "step": 6740 + }, + { + "epoch": 21.007215189873417, + "grad_norm": 0.011167807504534721, + "learning_rate": 3.182137834036568e-05, + "loss": 0.0835, + "step": 6750 + }, + { + "epoch": 21.00784810126582, + "grad_norm": 2.162050485610962, + "learning_rate": 3.178621659634318e-05, + "loss": 0.0804, + "step": 6760 + }, + { + "epoch": 21.008481012658226, + "grad_norm": 0.026973918080329895, + "learning_rate": 3.175105485232068e-05, + "loss": 0.0677, + "step": 6770 + }, + { + "epoch": 21.009113924050634, + "grad_norm": 2.1330974102020264, + "learning_rate": 3.171589310829817e-05, + "loss": 0.1016, + "step": 6780 + }, + { + "epoch": 21.00974683544304, + "grad_norm": 1.0756934881210327, + "learning_rate": 3.168073136427567e-05, + "loss": 0.0232, + "step": 6790 + }, + { + "epoch": 21.010379746835444, + "grad_norm": 18.678178787231445, + "learning_rate": 3.1645569620253167e-05, + "loss": 0.0977, + "step": 6800 + }, + { + "epoch": 21.01101265822785, + "grad_norm": 1.5976709127426147, + "learning_rate": 3.1610407876230665e-05, + "loss": 0.069, + "step": 6810 + }, + { + "epoch": 21.011645569620253, + "grad_norm": 0.5849307775497437, + "learning_rate": 3.1575246132208156e-05, + "loss": 0.1015, + "step": 6820 + }, + { + "epoch": 21.012278481012657, + "grad_norm": 3.8058197498321533, + "learning_rate": 3.1540084388185655e-05, + "loss": 0.1165, + "step": 6830 + }, + { + "epoch": 21.012911392405062, + "grad_norm": 1.8044683933258057, + "learning_rate": 3.150492264416315e-05, + "loss": 0.0801, + "step": 6840 + }, + { + "epoch": 21.013544303797467, + "grad_norm": 1.89737868309021, + "learning_rate": 3.146976090014065e-05, + "loss": 0.0453, + "step": 6850 + }, + { + "epoch": 21.014177215189875, + "grad_norm": 2.135648250579834, + "learning_rate": 3.143459915611814e-05, + "loss": 0.0314, + "step": 6860 + }, + { + "epoch": 21.01481012658228, + "grad_norm": 0.015429515391588211, + "learning_rate": 3.139943741209564e-05, + "loss": 0.1356, + "step": 6870 + }, + { + "epoch": 21.015443037974684, + "grad_norm": 0.029538467526435852, + "learning_rate": 3.136427566807314e-05, + "loss": 0.0375, + "step": 6880 + }, + { + "epoch": 21.01607594936709, + "grad_norm": 0.013729414902627468, + "learning_rate": 3.132911392405064e-05, + "loss": 0.154, + "step": 6890 + }, + { + "epoch": 21.016708860759493, + "grad_norm": 1.6405558586120605, + "learning_rate": 3.129395218002813e-05, + "loss": 0.0856, + "step": 6900 + }, + { + "epoch": 21.017341772151898, + "grad_norm": 0.02224159613251686, + "learning_rate": 3.125879043600563e-05, + "loss": 0.0482, + "step": 6910 + }, + { + "epoch": 21.017974683544303, + "grad_norm": 0.024815354496240616, + "learning_rate": 3.1223628691983126e-05, + "loss": 0.0808, + "step": 6920 + }, + { + "epoch": 21.018607594936707, + "grad_norm": 2.0164098739624023, + "learning_rate": 
3.118846694796062e-05, + "loss": 0.0392, + "step": 6930 + }, + { + "epoch": 21.019240506329115, + "grad_norm": 1.8977909088134766, + "learning_rate": 3.1153305203938116e-05, + "loss": 0.0415, + "step": 6940 + }, + { + "epoch": 21.01987341772152, + "grad_norm": 2.1623473167419434, + "learning_rate": 3.111814345991561e-05, + "loss": 0.0828, + "step": 6950 + }, + { + "epoch": 21.02, + "eval_accuracy": 0.9673704414587332, + "eval_loss": 0.06080710142850876, + "eval_runtime": 897.4486, + "eval_samples_per_second": 0.581, + "eval_steps_per_second": 0.074, + "step": 6952 + }, + { + "epoch": 22.000506329113925, + "grad_norm": 1.7337490320205688, + "learning_rate": 3.108298171589311e-05, + "loss": 0.0741, + "step": 6960 + }, + { + "epoch": 22.00113924050633, + "grad_norm": 0.029260125011205673, + "learning_rate": 3.1047819971870604e-05, + "loss": 0.0384, + "step": 6970 + }, + { + "epoch": 22.001772151898734, + "grad_norm": 0.014634775929152966, + "learning_rate": 3.10126582278481e-05, + "loss": 0.0472, + "step": 6980 + }, + { + "epoch": 22.00240506329114, + "grad_norm": 0.031179388985037804, + "learning_rate": 3.0977496483825594e-05, + "loss": 0.0498, + "step": 6990 + }, + { + "epoch": 22.003037974683544, + "grad_norm": 2.017439126968384, + "learning_rate": 3.09423347398031e-05, + "loss": 0.0726, + "step": 7000 + }, + { + "epoch": 22.00367088607595, + "grad_norm": 0.01198955811560154, + "learning_rate": 3.090717299578059e-05, + "loss": 0.0546, + "step": 7010 + }, + { + "epoch": 22.004303797468353, + "grad_norm": 1.72632896900177, + "learning_rate": 3.087201125175809e-05, + "loss": 0.0686, + "step": 7020 + }, + { + "epoch": 22.00493670886076, + "grad_norm": 0.013193360529839993, + "learning_rate": 3.083684950773559e-05, + "loss": 0.0543, + "step": 7030 + }, + { + "epoch": 22.005569620253166, + "grad_norm": 0.2840830385684967, + "learning_rate": 3.0801687763713086e-05, + "loss": 0.0632, + "step": 7040 + }, + { + "epoch": 22.00620253164557, + "grad_norm": 3.4681308269500732, + "learning_rate": 3.076652601969058e-05, + "loss": 0.1418, + "step": 7050 + }, + { + "epoch": 22.006835443037975, + "grad_norm": 2.013949155807495, + "learning_rate": 3.0731364275668076e-05, + "loss": 0.1189, + "step": 7060 + }, + { + "epoch": 22.00746835443038, + "grad_norm": 0.03054201602935791, + "learning_rate": 3.0696202531645574e-05, + "loss": 0.0485, + "step": 7070 + }, + { + "epoch": 22.008101265822784, + "grad_norm": 0.01309112273156643, + "learning_rate": 3.0661040787623065e-05, + "loss": 0.0337, + "step": 7080 + }, + { + "epoch": 22.00873417721519, + "grad_norm": 0.009347977116703987, + "learning_rate": 3.0625879043600564e-05, + "loss": 0.1198, + "step": 7090 + }, + { + "epoch": 22.009367088607593, + "grad_norm": 1.491429328918457, + "learning_rate": 3.059071729957806e-05, + "loss": 0.0425, + "step": 7100 + }, + { + "epoch": 22.01, + "grad_norm": 2.0944862365722656, + "learning_rate": 3.055555555555556e-05, + "loss": 0.0553, + "step": 7110 + }, + { + "epoch": 22.010632911392406, + "grad_norm": 0.02638574317097664, + "learning_rate": 3.052039381153305e-05, + "loss": 0.034, + "step": 7120 + }, + { + "epoch": 22.01126582278481, + "grad_norm": 0.053611740469932556, + "learning_rate": 3.048523206751055e-05, + "loss": 0.0355, + "step": 7130 + }, + { + "epoch": 22.011898734177215, + "grad_norm": 0.020069239661097527, + "learning_rate": 3.0450070323488045e-05, + "loss": 0.0657, + "step": 7140 + }, + { + "epoch": 22.01253164556962, + "grad_norm": 0.7463811635971069, + "learning_rate": 3.0414908579465547e-05, + "loss": 0.119, + 
"step": 7150 + }, + { + "epoch": 22.013164556962025, + "grad_norm": 1.8718035221099854, + "learning_rate": 3.0379746835443042e-05, + "loss": 0.0826, + "step": 7160 + }, + { + "epoch": 22.01379746835443, + "grad_norm": 1.7672007083892822, + "learning_rate": 3.0344585091420537e-05, + "loss": 0.0616, + "step": 7170 + }, + { + "epoch": 22.014430379746834, + "grad_norm": 0.014087089337408543, + "learning_rate": 3.0309423347398032e-05, + "loss": 0.0753, + "step": 7180 + }, + { + "epoch": 22.015063291139242, + "grad_norm": 2.0793707370758057, + "learning_rate": 3.027426160337553e-05, + "loss": 0.0782, + "step": 7190 + }, + { + "epoch": 22.015696202531647, + "grad_norm": 0.040184978395700455, + "learning_rate": 3.0239099859353025e-05, + "loss": 0.0132, + "step": 7200 + }, + { + "epoch": 22.01632911392405, + "grad_norm": 3.020726203918457, + "learning_rate": 3.020393811533052e-05, + "loss": 0.0371, + "step": 7210 + }, + { + "epoch": 22.016962025316456, + "grad_norm": 0.02047577127814293, + "learning_rate": 3.0168776371308015e-05, + "loss": 0.0535, + "step": 7220 + }, + { + "epoch": 22.01759493670886, + "grad_norm": 0.014698415994644165, + "learning_rate": 3.0133614627285517e-05, + "loss": 0.063, + "step": 7230 + }, + { + "epoch": 22.018227848101265, + "grad_norm": 0.016574831679463387, + "learning_rate": 3.009845288326301e-05, + "loss": 0.0531, + "step": 7240 + }, + { + "epoch": 22.01886075949367, + "grad_norm": 0.010280176065862179, + "learning_rate": 3.0063291139240506e-05, + "loss": 0.0952, + "step": 7250 + }, + { + "epoch": 22.019493670886074, + "grad_norm": 0.03817346692085266, + "learning_rate": 3.0028129395218e-05, + "loss": 0.0544, + "step": 7260 + }, + { + "epoch": 22.02, + "eval_accuracy": 0.9712092130518234, + "eval_loss": 0.05330738425254822, + "eval_runtime": 868.0763, + "eval_samples_per_second": 0.6, + "eval_steps_per_second": 0.076, + "step": 7268 + }, + { + "epoch": 23.00012658227848, + "grad_norm": 0.014792737551033497, + "learning_rate": 2.9992967651195503e-05, + "loss": 0.0958, + "step": 7270 + }, + { + "epoch": 23.000759493670888, + "grad_norm": 0.01068217121064663, + "learning_rate": 2.9957805907172998e-05, + "loss": 0.0747, + "step": 7280 + }, + { + "epoch": 23.001392405063292, + "grad_norm": 1.8503497838974, + "learning_rate": 2.9922644163150493e-05, + "loss": 0.0916, + "step": 7290 + }, + { + "epoch": 23.002025316455697, + "grad_norm": 1.8655881881713867, + "learning_rate": 2.9887482419127988e-05, + "loss": 0.0526, + "step": 7300 + }, + { + "epoch": 23.0026582278481, + "grad_norm": 2.7641124725341797, + "learning_rate": 2.985232067510549e-05, + "loss": 0.0862, + "step": 7310 + }, + { + "epoch": 23.003291139240506, + "grad_norm": 0.013922685757279396, + "learning_rate": 2.9817158931082985e-05, + "loss": 0.0557, + "step": 7320 + }, + { + "epoch": 23.00392405063291, + "grad_norm": 1.8126795291900635, + "learning_rate": 2.978199718706048e-05, + "loss": 0.1077, + "step": 7330 + }, + { + "epoch": 23.004556962025315, + "grad_norm": 0.016684675589203835, + "learning_rate": 2.9746835443037974e-05, + "loss": 0.1824, + "step": 7340 + }, + { + "epoch": 23.00518987341772, + "grad_norm": 0.010096821002662182, + "learning_rate": 2.9711673699015473e-05, + "loss": 0.0616, + "step": 7350 + }, + { + "epoch": 23.005822784810128, + "grad_norm": 0.015037915669381618, + "learning_rate": 2.9676511954992968e-05, + "loss": 0.029, + "step": 7360 + }, + { + "epoch": 23.006455696202533, + "grad_norm": 0.34261658787727356, + "learning_rate": 2.9641350210970466e-05, + "loss": 0.0256, + "step": 7370 + }, 
+ { + "epoch": 23.007088607594937, + "grad_norm": 2.0921599864959717, + "learning_rate": 2.960618846694796e-05, + "loss": 0.0621, + "step": 7380 + }, + { + "epoch": 23.007721518987342, + "grad_norm": 0.009613982401788235, + "learning_rate": 2.957102672292546e-05, + "loss": 0.0634, + "step": 7390 + }, + { + "epoch": 23.008354430379747, + "grad_norm": 0.009403366595506668, + "learning_rate": 2.9535864978902954e-05, + "loss": 0.1097, + "step": 7400 + }, + { + "epoch": 23.00898734177215, + "grad_norm": 0.009238903410732746, + "learning_rate": 2.950070323488045e-05, + "loss": 0.0313, + "step": 7410 + }, + { + "epoch": 23.009620253164556, + "grad_norm": 0.01602065935730934, + "learning_rate": 2.9465541490857944e-05, + "loss": 0.0149, + "step": 7420 + }, + { + "epoch": 23.01025316455696, + "grad_norm": 0.01591380685567856, + "learning_rate": 2.9430379746835446e-05, + "loss": 0.0474, + "step": 7430 + }, + { + "epoch": 23.01088607594937, + "grad_norm": 1.879072666168213, + "learning_rate": 2.939521800281294e-05, + "loss": 0.0615, + "step": 7440 + }, + { + "epoch": 23.011518987341773, + "grad_norm": 0.014801949262619019, + "learning_rate": 2.9360056258790436e-05, + "loss": 0.0654, + "step": 7450 + }, + { + "epoch": 23.012151898734178, + "grad_norm": 0.05049045756459236, + "learning_rate": 2.9324894514767937e-05, + "loss": 0.025, + "step": 7460 + }, + { + "epoch": 23.012784810126583, + "grad_norm": 1.6079883575439453, + "learning_rate": 2.9289732770745432e-05, + "loss": 0.0663, + "step": 7470 + }, + { + "epoch": 23.013417721518987, + "grad_norm": 0.018271761015057564, + "learning_rate": 2.9254571026722927e-05, + "loss": 0.0164, + "step": 7480 + }, + { + "epoch": 23.014050632911392, + "grad_norm": 0.9706446528434753, + "learning_rate": 2.9219409282700422e-05, + "loss": 0.0981, + "step": 7490 + }, + { + "epoch": 23.014683544303796, + "grad_norm": 0.010271470062434673, + "learning_rate": 2.9184247538677924e-05, + "loss": 0.0293, + "step": 7500 + }, + { + "epoch": 23.0153164556962, + "grad_norm": 1.9835656881332397, + "learning_rate": 2.914908579465542e-05, + "loss": 0.1089, + "step": 7510 + }, + { + "epoch": 23.01594936708861, + "grad_norm": 0.01825304701924324, + "learning_rate": 2.9113924050632914e-05, + "loss": 0.0524, + "step": 7520 + }, + { + "epoch": 23.016582278481014, + "grad_norm": 0.009788533672690392, + "learning_rate": 2.907876230661041e-05, + "loss": 0.065, + "step": 7530 + }, + { + "epoch": 23.01721518987342, + "grad_norm": 2.2043983936309814, + "learning_rate": 2.9043600562587907e-05, + "loss": 0.1945, + "step": 7540 + }, + { + "epoch": 23.017848101265823, + "grad_norm": 0.00593506870791316, + "learning_rate": 2.9008438818565402e-05, + "loss": 0.0846, + "step": 7550 + }, + { + "epoch": 23.018481012658228, + "grad_norm": 0.016028795391321182, + "learning_rate": 2.8973277074542897e-05, + "loss": 0.0697, + "step": 7560 + }, + { + "epoch": 23.019113924050632, + "grad_norm": 2.0164456367492676, + "learning_rate": 2.8938115330520392e-05, + "loss": 0.0579, + "step": 7570 + }, + { + "epoch": 23.019746835443037, + "grad_norm": 0.009185468778014183, + "learning_rate": 2.8902953586497894e-05, + "loss": 0.0509, + "step": 7580 + }, + { + "epoch": 23.02, + "eval_accuracy": 0.9750479846449136, + "eval_loss": 0.04993312060832977, + "eval_runtime": 884.1639, + "eval_samples_per_second": 0.589, + "eval_steps_per_second": 0.075, + "step": 7584 + }, + { + "epoch": 24.000379746835442, + "grad_norm": 0.007606441620737314, + "learning_rate": 2.886779184247539e-05, + "loss": 0.086, + "step": 7590 + }, + { + 
"epoch": 24.001012658227847, + "grad_norm": 1.587971806526184, + "learning_rate": 2.8832630098452884e-05, + "loss": 0.0666, + "step": 7600 + }, + { + "epoch": 24.001645569620255, + "grad_norm": 0.06458862125873566, + "learning_rate": 2.879746835443038e-05, + "loss": 0.1167, + "step": 7610 + }, + { + "epoch": 24.00227848101266, + "grad_norm": 0.008511193096637726, + "learning_rate": 2.876230661040788e-05, + "loss": 0.0582, + "step": 7620 + }, + { + "epoch": 24.002911392405064, + "grad_norm": 0.01144248154014349, + "learning_rate": 2.8727144866385375e-05, + "loss": 0.0537, + "step": 7630 + }, + { + "epoch": 24.00354430379747, + "grad_norm": 2.393695116043091, + "learning_rate": 2.869198312236287e-05, + "loss": 0.0521, + "step": 7640 + }, + { + "epoch": 24.004177215189873, + "grad_norm": 0.010813113301992416, + "learning_rate": 2.8656821378340365e-05, + "loss": 0.0567, + "step": 7650 + }, + { + "epoch": 24.004810126582278, + "grad_norm": 0.6152679920196533, + "learning_rate": 2.8621659634317867e-05, + "loss": 0.0387, + "step": 7660 + }, + { + "epoch": 24.005443037974683, + "grad_norm": 1.79938542842865, + "learning_rate": 2.858649789029536e-05, + "loss": 0.0198, + "step": 7670 + }, + { + "epoch": 24.006075949367087, + "grad_norm": 2.5060055255889893, + "learning_rate": 2.8551336146272857e-05, + "loss": 0.142, + "step": 7680 + }, + { + "epoch": 24.006708860759495, + "grad_norm": 0.009323079138994217, + "learning_rate": 2.851617440225035e-05, + "loss": 0.0479, + "step": 7690 + }, + { + "epoch": 24.0073417721519, + "grad_norm": 2.835371732711792, + "learning_rate": 2.848101265822785e-05, + "loss": 0.0361, + "step": 7700 + }, + { + "epoch": 24.007974683544305, + "grad_norm": 2.476736545562744, + "learning_rate": 2.8445850914205345e-05, + "loss": 0.1028, + "step": 7710 + }, + { + "epoch": 24.00860759493671, + "grad_norm": 0.011100457981228828, + "learning_rate": 2.8410689170182843e-05, + "loss": 0.109, + "step": 7720 + }, + { + "epoch": 24.009240506329114, + "grad_norm": 0.009698892012238503, + "learning_rate": 2.8375527426160338e-05, + "loss": 0.0402, + "step": 7730 + }, + { + "epoch": 24.00987341772152, + "grad_norm": 0.32084423303604126, + "learning_rate": 2.8340365682137836e-05, + "loss": 0.0261, + "step": 7740 + }, + { + "epoch": 24.010506329113923, + "grad_norm": 0.03804817050695419, + "learning_rate": 2.830520393811533e-05, + "loss": 0.0519, + "step": 7750 + }, + { + "epoch": 24.011139240506328, + "grad_norm": 0.02210908569395542, + "learning_rate": 2.8270042194092826e-05, + "loss": 0.1201, + "step": 7760 + }, + { + "epoch": 24.011772151898736, + "grad_norm": 0.013488766737282276, + "learning_rate": 2.823488045007032e-05, + "loss": 0.1115, + "step": 7770 + }, + { + "epoch": 24.01240506329114, + "grad_norm": 0.01102654542773962, + "learning_rate": 2.8199718706047823e-05, + "loss": 0.0377, + "step": 7780 + }, + { + "epoch": 24.013037974683545, + "grad_norm": 2.595425844192505, + "learning_rate": 2.8164556962025318e-05, + "loss": 0.2162, + "step": 7790 + }, + { + "epoch": 24.01367088607595, + "grad_norm": 0.3236597180366516, + "learning_rate": 2.8129395218002813e-05, + "loss": 0.107, + "step": 7800 + }, + { + "epoch": 24.014303797468354, + "grad_norm": 0.769616961479187, + "learning_rate": 2.8094233473980308e-05, + "loss": 0.1602, + "step": 7810 + }, + { + "epoch": 24.01493670886076, + "grad_norm": 0.10283089429140091, + "learning_rate": 2.805907172995781e-05, + "loss": 0.0909, + "step": 7820 + }, + { + "epoch": 24.015569620253164, + "grad_norm": 0.02031938172876835, + "learning_rate": 
2.8023909985935304e-05, + "loss": 0.1172, + "step": 7830 + }, + { + "epoch": 24.01620253164557, + "grad_norm": 1.7063891887664795, + "learning_rate": 2.79887482419128e-05, + "loss": 0.0797, + "step": 7840 + }, + { + "epoch": 24.016835443037976, + "grad_norm": 1.6002981662750244, + "learning_rate": 2.7953586497890294e-05, + "loss": 0.0447, + "step": 7850 + }, + { + "epoch": 24.01746835443038, + "grad_norm": 1.945346474647522, + "learning_rate": 2.7918424753867796e-05, + "loss": 0.0461, + "step": 7860 + }, + { + "epoch": 24.018101265822786, + "grad_norm": 2.0498995780944824, + "learning_rate": 2.788326300984529e-05, + "loss": 0.1525, + "step": 7870 + }, + { + "epoch": 24.01873417721519, + "grad_norm": 0.018395045772194862, + "learning_rate": 2.7848101265822786e-05, + "loss": 0.1554, + "step": 7880 + }, + { + "epoch": 24.019367088607595, + "grad_norm": 2.4818601608276367, + "learning_rate": 2.7812939521800284e-05, + "loss": 0.2115, + "step": 7890 + }, + { + "epoch": 24.02, + "grad_norm": 0.01920860819518566, + "learning_rate": 2.777777777777778e-05, + "loss": 0.0308, + "step": 7900 + }, + { + "epoch": 24.02, + "eval_accuracy": 0.9596928982725528, + "eval_loss": 0.09555886685848236, + "eval_runtime": 895.4045, + "eval_samples_per_second": 0.582, + "eval_steps_per_second": 0.074, + "step": 7900 + }, + { + "epoch": 25.000632911392405, + "grad_norm": 0.9809036254882812, + "learning_rate": 2.7742616033755274e-05, + "loss": 0.0833, + "step": 7910 + }, + { + "epoch": 25.00126582278481, + "grad_norm": 0.0848427340388298, + "learning_rate": 2.770745428973277e-05, + "loss": 0.04, + "step": 7920 + }, + { + "epoch": 25.001898734177214, + "grad_norm": 1.635724663734436, + "learning_rate": 2.767229254571027e-05, + "loss": 0.1677, + "step": 7930 + }, + { + "epoch": 25.00253164556962, + "grad_norm": 1.4175573587417603, + "learning_rate": 2.7637130801687766e-05, + "loss": 0.0733, + "step": 7940 + }, + { + "epoch": 25.003164556962027, + "grad_norm": 2.555398464202881, + "learning_rate": 2.760196905766526e-05, + "loss": 0.0981, + "step": 7950 + }, + { + "epoch": 25.00379746835443, + "grad_norm": 0.024168817326426506, + "learning_rate": 2.7566807313642756e-05, + "loss": 0.0517, + "step": 7960 + }, + { + "epoch": 25.004430379746836, + "grad_norm": 2.4232094287872314, + "learning_rate": 2.7531645569620257e-05, + "loss": 0.1317, + "step": 7970 + }, + { + "epoch": 25.00506329113924, + "grad_norm": 0.030434638261795044, + "learning_rate": 2.7496483825597752e-05, + "loss": 0.162, + "step": 7980 + }, + { + "epoch": 25.005696202531645, + "grad_norm": 0.24577154219150543, + "learning_rate": 2.7461322081575247e-05, + "loss": 0.0346, + "step": 7990 + }, + { + "epoch": 25.00632911392405, + "grad_norm": 0.009798028506338596, + "learning_rate": 2.7426160337552742e-05, + "loss": 0.0209, + "step": 8000 + }, + { + "epoch": 25.006962025316454, + "grad_norm": 0.009709338657557964, + "learning_rate": 2.7390998593530244e-05, + "loss": 0.0329, + "step": 8010 + }, + { + "epoch": 25.00759493670886, + "grad_norm": 0.025753848254680634, + "learning_rate": 2.735583684950774e-05, + "loss": 0.0368, + "step": 8020 + }, + { + "epoch": 25.008227848101267, + "grad_norm": 1.9421815872192383, + "learning_rate": 2.7320675105485234e-05, + "loss": 0.0755, + "step": 8030 + }, + { + "epoch": 25.008860759493672, + "grad_norm": 0.00867847353219986, + "learning_rate": 2.728551336146273e-05, + "loss": 0.0265, + "step": 8040 + }, + { + "epoch": 25.009493670886076, + "grad_norm": 0.008214855566620827, + "learning_rate": 2.7250351617440227e-05, + "loss": 
0.0698, + "step": 8050 + }, + { + "epoch": 25.01012658227848, + "grad_norm": 0.013393707573413849, + "learning_rate": 2.7215189873417722e-05, + "loss": 0.1019, + "step": 8060 + }, + { + "epoch": 25.010759493670886, + "grad_norm": 0.007839508354663849, + "learning_rate": 2.718002812939522e-05, + "loss": 0.0401, + "step": 8070 + }, + { + "epoch": 25.01139240506329, + "grad_norm": 1.8051772117614746, + "learning_rate": 2.7144866385372715e-05, + "loss": 0.0971, + "step": 8080 + }, + { + "epoch": 25.012025316455695, + "grad_norm": 1.6515543460845947, + "learning_rate": 2.7109704641350213e-05, + "loss": 0.097, + "step": 8090 + }, + { + "epoch": 25.0126582278481, + "grad_norm": 1.732775330543518, + "learning_rate": 2.707454289732771e-05, + "loss": 0.0661, + "step": 8100 + }, + { + "epoch": 25.013291139240508, + "grad_norm": 2.0589871406555176, + "learning_rate": 2.7039381153305203e-05, + "loss": 0.0727, + "step": 8110 + }, + { + "epoch": 25.013924050632912, + "grad_norm": 0.017162639647722244, + "learning_rate": 2.7004219409282698e-05, + "loss": 0.0666, + "step": 8120 + }, + { + "epoch": 25.014556962025317, + "grad_norm": 0.025243617594242096, + "learning_rate": 2.69690576652602e-05, + "loss": 0.1164, + "step": 8130 + }, + { + "epoch": 25.01518987341772, + "grad_norm": 0.011467033065855503, + "learning_rate": 2.6933895921237695e-05, + "loss": 0.0974, + "step": 8140 + }, + { + "epoch": 25.015822784810126, + "grad_norm": 2.5216453075408936, + "learning_rate": 2.689873417721519e-05, + "loss": 0.1147, + "step": 8150 + }, + { + "epoch": 25.01645569620253, + "grad_norm": 0.011125218123197556, + "learning_rate": 2.6863572433192685e-05, + "loss": 0.011, + "step": 8160 + }, + { + "epoch": 25.017088607594935, + "grad_norm": 0.0575890839099884, + "learning_rate": 2.6828410689170186e-05, + "loss": 0.0757, + "step": 8170 + }, + { + "epoch": 25.01772151898734, + "grad_norm": 0.01529190968722105, + "learning_rate": 2.679324894514768e-05, + "loss": 0.1118, + "step": 8180 + }, + { + "epoch": 25.01835443037975, + "grad_norm": 1.783360481262207, + "learning_rate": 2.6758087201125176e-05, + "loss": 0.0738, + "step": 8190 + }, + { + "epoch": 25.018987341772153, + "grad_norm": 0.2645050585269928, + "learning_rate": 2.672292545710267e-05, + "loss": 0.0719, + "step": 8200 + }, + { + "epoch": 25.019620253164558, + "grad_norm": 1.2059509754180908, + "learning_rate": 2.6687763713080173e-05, + "loss": 0.0729, + "step": 8210 + }, + { + "epoch": 25.02, + "eval_accuracy": 0.9731285988483686, + "eval_loss": 0.07531290501356125, + "eval_runtime": 883.3121, + "eval_samples_per_second": 0.59, + "eval_steps_per_second": 0.075, + "step": 8216 + }, + { + "epoch": 26.000253164556963, + "grad_norm": 0.11341290175914764, + "learning_rate": 2.6652601969057668e-05, + "loss": 0.1398, + "step": 8220 + }, + { + "epoch": 26.000886075949367, + "grad_norm": 0.1513560563325882, + "learning_rate": 2.6617440225035163e-05, + "loss": 0.0599, + "step": 8230 + }, + { + "epoch": 26.001518987341772, + "grad_norm": 0.008346611633896828, + "learning_rate": 2.6582278481012658e-05, + "loss": 0.0526, + "step": 8240 + }, + { + "epoch": 26.002151898734176, + "grad_norm": 0.04121720790863037, + "learning_rate": 2.6547116736990156e-05, + "loss": 0.0438, + "step": 8250 + }, + { + "epoch": 26.00278481012658, + "grad_norm": 0.019222108647227287, + "learning_rate": 2.651195499296765e-05, + "loss": 0.0679, + "step": 8260 + }, + { + "epoch": 26.003417721518986, + "grad_norm": 2.1048424243927, + "learning_rate": 2.6476793248945146e-05, + "loss": 0.0947, + "step": 8270 
+ }, + { + "epoch": 26.004050632911394, + "grad_norm": 2.955946683883667, + "learning_rate": 2.6441631504922648e-05, + "loss": 0.1254, + "step": 8280 + }, + { + "epoch": 26.0046835443038, + "grad_norm": 0.013249853625893593, + "learning_rate": 2.6406469760900143e-05, + "loss": 0.0344, + "step": 8290 + }, + { + "epoch": 26.005316455696203, + "grad_norm": 0.008436400443315506, + "learning_rate": 2.6371308016877638e-05, + "loss": 0.0611, + "step": 8300 + }, + { + "epoch": 26.005949367088608, + "grad_norm": 1.8616793155670166, + "learning_rate": 2.6336146272855133e-05, + "loss": 0.0528, + "step": 8310 + }, + { + "epoch": 26.006582278481012, + "grad_norm": 0.030170224606990814, + "learning_rate": 2.6300984528832634e-05, + "loss": 0.0981, + "step": 8320 + }, + { + "epoch": 26.007215189873417, + "grad_norm": 0.011277982033789158, + "learning_rate": 2.626582278481013e-05, + "loss": 0.0216, + "step": 8330 + }, + { + "epoch": 26.00784810126582, + "grad_norm": 0.009784871712327003, + "learning_rate": 2.6230661040787624e-05, + "loss": 0.0485, + "step": 8340 + }, + { + "epoch": 26.008481012658226, + "grad_norm": 4.80241584777832, + "learning_rate": 2.619549929676512e-05, + "loss": 0.1301, + "step": 8350 + }, + { + "epoch": 26.009113924050634, + "grad_norm": 2.4445245265960693, + "learning_rate": 2.616033755274262e-05, + "loss": 0.0571, + "step": 8360 + }, + { + "epoch": 26.00974683544304, + "grad_norm": 0.009524138644337654, + "learning_rate": 2.6125175808720116e-05, + "loss": 0.0229, + "step": 8370 + }, + { + "epoch": 26.010379746835444, + "grad_norm": 0.010364298708736897, + "learning_rate": 2.609001406469761e-05, + "loss": 0.0559, + "step": 8380 + }, + { + "epoch": 26.01101265822785, + "grad_norm": 0.047712892293930054, + "learning_rate": 2.6054852320675106e-05, + "loss": 0.1409, + "step": 8390 + }, + { + "epoch": 26.011645569620253, + "grad_norm": 0.010216879658401012, + "learning_rate": 2.6019690576652604e-05, + "loss": 0.0701, + "step": 8400 + }, + { + "epoch": 26.012278481012657, + "grad_norm": 1.5211375951766968, + "learning_rate": 2.5984528832630102e-05, + "loss": 0.0774, + "step": 8410 + }, + { + "epoch": 26.012911392405062, + "grad_norm": 4.259005546569824, + "learning_rate": 2.5949367088607597e-05, + "loss": 0.0846, + "step": 8420 + }, + { + "epoch": 26.013544303797467, + "grad_norm": 0.008692068047821522, + "learning_rate": 2.5914205344585092e-05, + "loss": 0.2192, + "step": 8430 + }, + { + "epoch": 26.014177215189875, + "grad_norm": 0.9112133383750916, + "learning_rate": 2.587904360056259e-05, + "loss": 0.112, + "step": 8440 + }, + { + "epoch": 26.01481012658228, + "grad_norm": 1.4133566617965698, + "learning_rate": 2.5843881856540085e-05, + "loss": 0.0895, + "step": 8450 + }, + { + "epoch": 26.015443037974684, + "grad_norm": 0.019907064735889435, + "learning_rate": 2.580872011251758e-05, + "loss": 0.0752, + "step": 8460 + }, + { + "epoch": 26.01607594936709, + "grad_norm": 0.22535677254199982, + "learning_rate": 2.5773558368495075e-05, + "loss": 0.1745, + "step": 8470 + }, + { + "epoch": 26.016708860759493, + "grad_norm": 0.013840797357261181, + "learning_rate": 2.5738396624472577e-05, + "loss": 0.0569, + "step": 8480 + }, + { + "epoch": 26.017341772151898, + "grad_norm": 1.7949068546295166, + "learning_rate": 2.5703234880450072e-05, + "loss": 0.1005, + "step": 8490 + }, + { + "epoch": 26.017974683544303, + "grad_norm": 5.203663349151611, + "learning_rate": 2.5668073136427567e-05, + "loss": 0.1344, + "step": 8500 + }, + { + "epoch": 26.018607594936707, + "grad_norm": 1.7371711730957031, 
+ "learning_rate": 2.5632911392405062e-05, + "loss": 0.049, + "step": 8510 + }, + { + "epoch": 26.019240506329115, + "grad_norm": 0.03200330212712288, + "learning_rate": 2.5597749648382564e-05, + "loss": 0.0778, + "step": 8520 + }, + { + "epoch": 26.01987341772152, + "grad_norm": 16.085002899169922, + "learning_rate": 2.556258790436006e-05, + "loss": 0.2328, + "step": 8530 + }, + { + "epoch": 26.02, + "eval_accuracy": 0.9654510556621881, + "eval_loss": 0.07737769931554794, + "eval_runtime": 872.9767, + "eval_samples_per_second": 0.597, + "eval_steps_per_second": 0.076, + "step": 8532 + }, + { + "epoch": 27.000506329113925, + "grad_norm": 2.101548194885254, + "learning_rate": 2.5527426160337553e-05, + "loss": 0.0209, + "step": 8540 + }, + { + "epoch": 27.00113924050633, + "grad_norm": 2.257962942123413, + "learning_rate": 2.549226441631505e-05, + "loss": 0.2402, + "step": 8550 + }, + { + "epoch": 27.001772151898734, + "grad_norm": 1.685132622718811, + "learning_rate": 2.545710267229255e-05, + "loss": 0.0636, + "step": 8560 + }, + { + "epoch": 27.00240506329114, + "grad_norm": 1.7050256729125977, + "learning_rate": 2.5421940928270045e-05, + "loss": 0.0449, + "step": 8570 + }, + { + "epoch": 27.003037974683544, + "grad_norm": 1.991898775100708, + "learning_rate": 2.538677918424754e-05, + "loss": 0.0455, + "step": 8580 + }, + { + "epoch": 27.00367088607595, + "grad_norm": 1.5045628547668457, + "learning_rate": 2.5351617440225035e-05, + "loss": 0.0672, + "step": 8590 + }, + { + "epoch": 27.004303797468353, + "grad_norm": 0.01319907046854496, + "learning_rate": 2.5316455696202533e-05, + "loss": 0.094, + "step": 8600 + }, + { + "epoch": 27.00493670886076, + "grad_norm": 0.01158622931689024, + "learning_rate": 2.5281293952180028e-05, + "loss": 0.068, + "step": 8610 + }, + { + "epoch": 27.005569620253166, + "grad_norm": 1.6995807886123657, + "learning_rate": 2.5246132208157523e-05, + "loss": 0.1678, + "step": 8620 + }, + { + "epoch": 27.00620253164557, + "grad_norm": 0.012468636967241764, + "learning_rate": 2.521097046413502e-05, + "loss": 0.0623, + "step": 8630 + }, + { + "epoch": 27.006835443037975, + "grad_norm": 1.8979823589324951, + "learning_rate": 2.517580872011252e-05, + "loss": 0.1167, + "step": 8640 + }, + { + "epoch": 27.00746835443038, + "grad_norm": 0.03327177092432976, + "learning_rate": 2.5140646976090015e-05, + "loss": 0.1498, + "step": 8650 + }, + { + "epoch": 27.008101265822784, + "grad_norm": 0.013360361568629742, + "learning_rate": 2.510548523206751e-05, + "loss": 0.149, + "step": 8660 + }, + { + "epoch": 27.00873417721519, + "grad_norm": 0.04175138100981712, + "learning_rate": 2.5070323488045005e-05, + "loss": 0.0581, + "step": 8670 + }, + { + "epoch": 27.009367088607593, + "grad_norm": 1.2819527387619019, + "learning_rate": 2.5035161744022506e-05, + "loss": 0.0572, + "step": 8680 + }, + { + "epoch": 27.01, + "grad_norm": 3.946152448654175, + "learning_rate": 2.5e-05, + "loss": 0.1165, + "step": 8690 + }, + { + "epoch": 27.010632911392406, + "grad_norm": 1.6609853506088257, + "learning_rate": 2.49648382559775e-05, + "loss": 0.0659, + "step": 8700 + }, + { + "epoch": 27.01126582278481, + "grad_norm": 0.019263017922639847, + "learning_rate": 2.4929676511954994e-05, + "loss": 0.1063, + "step": 8710 + }, + { + "epoch": 27.011898734177215, + "grad_norm": 0.01123301312327385, + "learning_rate": 2.4894514767932493e-05, + "loss": 0.0287, + "step": 8720 + }, + { + "epoch": 27.01253164556962, + "grad_norm": 0.03187470883131027, + "learning_rate": 2.4859353023909988e-05, + "loss": 0.0525, 
+ "step": 8730 + }, + { + "epoch": 27.013164556962025, + "grad_norm": 0.030269309878349304, + "learning_rate": 2.4824191279887486e-05, + "loss": 0.0155, + "step": 8740 + }, + { + "epoch": 27.01379746835443, + "grad_norm": 0.04713929817080498, + "learning_rate": 2.478902953586498e-05, + "loss": 0.103, + "step": 8750 + }, + { + "epoch": 27.014430379746834, + "grad_norm": 0.007141268812119961, + "learning_rate": 2.475386779184248e-05, + "loss": 0.0338, + "step": 8760 + }, + { + "epoch": 27.015063291139242, + "grad_norm": 0.008212805725634098, + "learning_rate": 2.4718706047819974e-05, + "loss": 0.0764, + "step": 8770 + }, + { + "epoch": 27.015696202531647, + "grad_norm": 0.007574188522994518, + "learning_rate": 2.468354430379747e-05, + "loss": 0.0772, + "step": 8780 + }, + { + "epoch": 27.01632911392405, + "grad_norm": 1.347557544708252, + "learning_rate": 2.4648382559774964e-05, + "loss": 0.0767, + "step": 8790 + }, + { + "epoch": 27.016962025316456, + "grad_norm": 0.008004224859178066, + "learning_rate": 2.4613220815752462e-05, + "loss": 0.0405, + "step": 8800 + }, + { + "epoch": 27.01759493670886, + "grad_norm": 0.06255421787500381, + "learning_rate": 2.4578059071729957e-05, + "loss": 0.0634, + "step": 8810 + }, + { + "epoch": 27.018227848101265, + "grad_norm": 0.014694114215672016, + "learning_rate": 2.4542897327707456e-05, + "loss": 0.0704, + "step": 8820 + }, + { + "epoch": 27.01886075949367, + "grad_norm": 0.0064834184013307095, + "learning_rate": 2.450773558368495e-05, + "loss": 0.0492, + "step": 8830 + }, + { + "epoch": 27.019493670886074, + "grad_norm": 0.01545525249093771, + "learning_rate": 2.447257383966245e-05, + "loss": 0.1085, + "step": 8840 + }, + { + "epoch": 27.02, + "eval_accuracy": 0.9692898272552783, + "eval_loss": 0.06086457893252373, + "eval_runtime": 940.0459, + "eval_samples_per_second": 0.554, + "eval_steps_per_second": 0.07, + "step": 8848 + }, + { + "epoch": 28.00012658227848, + "grad_norm": 1.5009100437164307, + "learning_rate": 2.4437412095639944e-05, + "loss": 0.0116, + "step": 8850 + }, + { + "epoch": 28.000759493670888, + "grad_norm": 1.8109427690505981, + "learning_rate": 2.4402250351617442e-05, + "loss": 0.1101, + "step": 8860 + }, + { + "epoch": 28.001392405063292, + "grad_norm": 0.016067614778876305, + "learning_rate": 2.4367088607594937e-05, + "loss": 0.0632, + "step": 8870 + }, + { + "epoch": 28.002025316455697, + "grad_norm": 1.9723069667816162, + "learning_rate": 2.4331926863572436e-05, + "loss": 0.0556, + "step": 8880 + }, + { + "epoch": 28.0026582278481, + "grad_norm": 6.840577125549316, + "learning_rate": 2.429676511954993e-05, + "loss": 0.0614, + "step": 8890 + }, + { + "epoch": 28.003291139240506, + "grad_norm": 0.00987607054412365, + "learning_rate": 2.426160337552743e-05, + "loss": 0.0619, + "step": 8900 + }, + { + "epoch": 28.00392405063291, + "grad_norm": 1.5094603300094604, + "learning_rate": 2.4226441631504924e-05, + "loss": 0.0817, + "step": 8910 + }, + { + "epoch": 28.004556962025315, + "grad_norm": 0.0056548151187598705, + "learning_rate": 2.4191279887482422e-05, + "loss": 0.0395, + "step": 8920 + }, + { + "epoch": 28.00518987341772, + "grad_norm": 1.6165566444396973, + "learning_rate": 2.4156118143459917e-05, + "loss": 0.0196, + "step": 8930 + }, + { + "epoch": 28.005822784810128, + "grad_norm": 0.011191487312316895, + "learning_rate": 2.4120956399437415e-05, + "loss": 0.0488, + "step": 8940 + }, + { + "epoch": 28.006455696202533, + "grad_norm": 0.013476500287652016, + "learning_rate": 2.408579465541491e-05, + "loss": 0.077, + "step": 
8950 + }, + { + "epoch": 28.007088607594937, + "grad_norm": 0.01594310626387596, + "learning_rate": 2.4050632911392405e-05, + "loss": 0.0764, + "step": 8960 + }, + { + "epoch": 28.007721518987342, + "grad_norm": 2.4658524990081787, + "learning_rate": 2.40154711673699e-05, + "loss": 0.0816, + "step": 8970 + }, + { + "epoch": 28.008354430379747, + "grad_norm": 0.050282131880521774, + "learning_rate": 2.39803094233474e-05, + "loss": 0.0483, + "step": 8980 + }, + { + "epoch": 28.00898734177215, + "grad_norm": 0.0738360807299614, + "learning_rate": 2.3945147679324893e-05, + "loss": 0.0325, + "step": 8990 + }, + { + "epoch": 28.009620253164556, + "grad_norm": 2.087075710296631, + "learning_rate": 2.3909985935302392e-05, + "loss": 0.1095, + "step": 9000 + }, + { + "epoch": 28.01025316455696, + "grad_norm": 0.012715667486190796, + "learning_rate": 2.3874824191279887e-05, + "loss": 0.0548, + "step": 9010 + }, + { + "epoch": 28.01088607594937, + "grad_norm": 2.233593463897705, + "learning_rate": 2.3839662447257385e-05, + "loss": 0.0342, + "step": 9020 + }, + { + "epoch": 28.011518987341773, + "grad_norm": 0.016205696389079094, + "learning_rate": 2.380450070323488e-05, + "loss": 0.0916, + "step": 9030 + }, + { + "epoch": 28.012151898734178, + "grad_norm": 0.008019771426916122, + "learning_rate": 2.3769338959212378e-05, + "loss": 0.0851, + "step": 9040 + }, + { + "epoch": 28.012784810126583, + "grad_norm": 1.6921287775039673, + "learning_rate": 2.3734177215189873e-05, + "loss": 0.0543, + "step": 9050 + }, + { + "epoch": 28.013417721518987, + "grad_norm": 0.03432793170213699, + "learning_rate": 2.369901547116737e-05, + "loss": 0.1162, + "step": 9060 + }, + { + "epoch": 28.014050632911392, + "grad_norm": 4.5309648513793945, + "learning_rate": 2.3663853727144866e-05, + "loss": 0.1159, + "step": 9070 + }, + { + "epoch": 28.014683544303796, + "grad_norm": 1.7229171991348267, + "learning_rate": 2.3628691983122365e-05, + "loss": 0.0453, + "step": 9080 + }, + { + "epoch": 28.0153164556962, + "grad_norm": 1.9000073671340942, + "learning_rate": 2.359353023909986e-05, + "loss": 0.0311, + "step": 9090 + }, + { + "epoch": 28.01594936708861, + "grad_norm": 23.23397445678711, + "learning_rate": 2.3558368495077358e-05, + "loss": 0.1287, + "step": 9100 + }, + { + "epoch": 28.016582278481014, + "grad_norm": 0.14218182861804962, + "learning_rate": 2.3523206751054856e-05, + "loss": 0.0783, + "step": 9110 + }, + { + "epoch": 28.01721518987342, + "grad_norm": 1.8179298639297485, + "learning_rate": 2.348804500703235e-05, + "loss": 0.0228, + "step": 9120 + }, + { + "epoch": 28.017848101265823, + "grad_norm": 0.01124257780611515, + "learning_rate": 2.3452883263009846e-05, + "loss": 0.0606, + "step": 9130 + }, + { + "epoch": 28.018481012658228, + "grad_norm": 0.06378079950809479, + "learning_rate": 2.341772151898734e-05, + "loss": 0.0613, + "step": 9140 + }, + { + "epoch": 28.019113924050632, + "grad_norm": 0.5730488896369934, + "learning_rate": 2.338255977496484e-05, + "loss": 0.0922, + "step": 9150 + }, + { + "epoch": 28.019746835443037, + "grad_norm": 0.1582580953836441, + "learning_rate": 2.3347398030942334e-05, + "loss": 0.099, + "step": 9160 + }, + { + "epoch": 28.02, + "eval_accuracy": 0.9673704414587332, + "eval_loss": 0.06770172715187073, + "eval_runtime": 874.5137, + "eval_samples_per_second": 0.596, + "eval_steps_per_second": 0.075, + "step": 9164 + }, + { + "epoch": 29.000379746835442, + "grad_norm": 1.0581729412078857, + "learning_rate": 2.3312236286919833e-05, + "loss": 0.0999, + "step": 9170 + }, + { + "epoch": 
29.001012658227847, + "grad_norm": 1.9762561321258545, + "learning_rate": 2.3277074542897328e-05, + "loss": 0.0571, + "step": 9180 + }, + { + "epoch": 29.001645569620255, + "grad_norm": 1.5621942281723022, + "learning_rate": 2.3241912798874826e-05, + "loss": 0.0549, + "step": 9190 + }, + { + "epoch": 29.00227848101266, + "grad_norm": 0.007985897362232208, + "learning_rate": 2.320675105485232e-05, + "loss": 0.0897, + "step": 9200 + }, + { + "epoch": 29.002911392405064, + "grad_norm": 1.7005807161331177, + "learning_rate": 2.317158931082982e-05, + "loss": 0.0953, + "step": 9210 + }, + { + "epoch": 29.00354430379747, + "grad_norm": 0.01191030628979206, + "learning_rate": 2.3136427566807314e-05, + "loss": 0.0495, + "step": 9220 + }, + { + "epoch": 29.004177215189873, + "grad_norm": 2.738387107849121, + "learning_rate": 2.3101265822784813e-05, + "loss": 0.0748, + "step": 9230 + }, + { + "epoch": 29.004810126582278, + "grad_norm": 1.7861073017120361, + "learning_rate": 2.3066104078762308e-05, + "loss": 0.1279, + "step": 9240 + }, + { + "epoch": 29.005443037974683, + "grad_norm": 2.4535388946533203, + "learning_rate": 2.3030942334739806e-05, + "loss": 0.1033, + "step": 9250 + }, + { + "epoch": 29.006075949367087, + "grad_norm": 0.028340207412838936, + "learning_rate": 2.29957805907173e-05, + "loss": 0.2441, + "step": 9260 + }, + { + "epoch": 29.006708860759495, + "grad_norm": 0.020006628707051277, + "learning_rate": 2.29606188466948e-05, + "loss": 0.1339, + "step": 9270 + }, + { + "epoch": 29.0073417721519, + "grad_norm": 0.06801651418209076, + "learning_rate": 2.2925457102672294e-05, + "loss": 0.0383, + "step": 9280 + }, + { + "epoch": 29.007974683544305, + "grad_norm": 0.03369883447885513, + "learning_rate": 2.2890295358649792e-05, + "loss": 0.0552, + "step": 9290 + }, + { + "epoch": 29.00860759493671, + "grad_norm": 1.2586785554885864, + "learning_rate": 2.2855133614627287e-05, + "loss": 0.0745, + "step": 9300 + }, + { + "epoch": 29.009240506329114, + "grad_norm": 0.008519505150616169, + "learning_rate": 2.2819971870604782e-05, + "loss": 0.1307, + "step": 9310 + }, + { + "epoch": 29.00987341772152, + "grad_norm": 0.04472700506448746, + "learning_rate": 2.278481012658228e-05, + "loss": 0.0757, + "step": 9320 + }, + { + "epoch": 29.010506329113923, + "grad_norm": 1.7951302528381348, + "learning_rate": 2.2749648382559775e-05, + "loss": 0.0504, + "step": 9330 + }, + { + "epoch": 29.011139240506328, + "grad_norm": 7.45163106918335, + "learning_rate": 2.271448663853727e-05, + "loss": 0.0637, + "step": 9340 + }, + { + "epoch": 29.011772151898736, + "grad_norm": 1.5808919668197632, + "learning_rate": 2.267932489451477e-05, + "loss": 0.0681, + "step": 9350 + }, + { + "epoch": 29.01240506329114, + "grad_norm": 0.009355315938591957, + "learning_rate": 2.2644163150492264e-05, + "loss": 0.0789, + "step": 9360 + }, + { + "epoch": 29.013037974683545, + "grad_norm": 36.12007522583008, + "learning_rate": 2.2609001406469762e-05, + "loss": 0.1642, + "step": 9370 + }, + { + "epoch": 29.01367088607595, + "grad_norm": 0.007421193178743124, + "learning_rate": 2.2573839662447257e-05, + "loss": 0.0126, + "step": 9380 + }, + { + "epoch": 29.014303797468354, + "grad_norm": 1.9807149171829224, + "learning_rate": 2.2538677918424755e-05, + "loss": 0.1069, + "step": 9390 + }, + { + "epoch": 29.01493670886076, + "grad_norm": 0.016664525493979454, + "learning_rate": 2.250351617440225e-05, + "loss": 0.174, + "step": 9400 + }, + { + "epoch": 29.015569620253164, + "grad_norm": 0.015380149707198143, + "learning_rate": 
2.246835443037975e-05, + "loss": 0.0959, + "step": 9410 + }, + { + "epoch": 29.01620253164557, + "grad_norm": 1.9330257177352905, + "learning_rate": 2.2433192686357243e-05, + "loss": 0.0604, + "step": 9420 + }, + { + "epoch": 29.016835443037976, + "grad_norm": 55.1077880859375, + "learning_rate": 2.2398030942334742e-05, + "loss": 0.0694, + "step": 9430 + }, + { + "epoch": 29.01746835443038, + "grad_norm": 1.9939329624176025, + "learning_rate": 2.2362869198312237e-05, + "loss": 0.1348, + "step": 9440 + }, + { + "epoch": 29.018101265822786, + "grad_norm": 0.015108354389667511, + "learning_rate": 2.2327707454289735e-05, + "loss": 0.0449, + "step": 9450 + }, + { + "epoch": 29.01873417721519, + "grad_norm": 12.801469802856445, + "learning_rate": 2.229254571026723e-05, + "loss": 0.1694, + "step": 9460 + }, + { + "epoch": 29.019367088607595, + "grad_norm": 2.3264384269714355, + "learning_rate": 2.225738396624473e-05, + "loss": 0.044, + "step": 9470 + }, + { + "epoch": 29.02, + "grad_norm": 0.13212990760803223, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.1988, + "step": 9480 + }, + { + "epoch": 29.02, + "eval_accuracy": 0.9558541266794626, + "eval_loss": 0.14145594835281372, + "eval_runtime": 928.9271, + "eval_samples_per_second": 0.561, + "eval_steps_per_second": 0.071, + "step": 9480 + }, + { + "epoch": 30.000632911392405, + "grad_norm": 2.014655113220215, + "learning_rate": 2.2187060478199718e-05, + "loss": 0.0823, + "step": 9490 + }, + { + "epoch": 30.00126582278481, + "grad_norm": 7.12656831741333, + "learning_rate": 2.2151898734177217e-05, + "loss": 0.0963, + "step": 9500 + }, + { + "epoch": 30.001898734177214, + "grad_norm": 1.7992146015167236, + "learning_rate": 2.211673699015471e-05, + "loss": 0.1107, + "step": 9510 + }, + { + "epoch": 30.00253164556962, + "grad_norm": 7.597923755645752, + "learning_rate": 2.2081575246132206e-05, + "loss": 0.102, + "step": 9520 + }, + { + "epoch": 30.003164556962027, + "grad_norm": 0.026418205350637436, + "learning_rate": 2.2046413502109705e-05, + "loss": 0.1202, + "step": 9530 + }, + { + "epoch": 30.00379746835443, + "grad_norm": 0.01834811083972454, + "learning_rate": 2.2011251758087203e-05, + "loss": 0.0728, + "step": 9540 + }, + { + "epoch": 30.004430379746836, + "grad_norm": 0.02628573216497898, + "learning_rate": 2.1976090014064698e-05, + "loss": 0.0948, + "step": 9550 + }, + { + "epoch": 30.00506329113924, + "grad_norm": 0.023844990879297256, + "learning_rate": 2.1940928270042196e-05, + "loss": 0.0601, + "step": 9560 + }, + { + "epoch": 30.005696202531645, + "grad_norm": 0.019756905734539032, + "learning_rate": 2.190576652601969e-05, + "loss": 0.0759, + "step": 9570 + }, + { + "epoch": 30.00632911392405, + "grad_norm": 6.277482986450195, + "learning_rate": 2.187060478199719e-05, + "loss": 0.053, + "step": 9580 + }, + { + "epoch": 30.006962025316454, + "grad_norm": 0.5496724843978882, + "learning_rate": 2.1835443037974685e-05, + "loss": 0.0653, + "step": 9590 + }, + { + "epoch": 30.00759493670886, + "grad_norm": 1.7114007472991943, + "learning_rate": 2.1800281293952183e-05, + "loss": 0.0466, + "step": 9600 + }, + { + "epoch": 30.008227848101267, + "grad_norm": 2.283554792404175, + "learning_rate": 2.1765119549929678e-05, + "loss": 0.0608, + "step": 9610 + }, + { + "epoch": 30.008860759493672, + "grad_norm": 4.336071014404297, + "learning_rate": 2.1729957805907176e-05, + "loss": 0.0702, + "step": 9620 + }, + { + "epoch": 30.009493670886076, + "grad_norm": 0.2585821747779846, + "learning_rate": 2.169479606188467e-05, + "loss": 0.0716, + 
"step": 9630 + }, + { + "epoch": 30.01012658227848, + "grad_norm": 2.0628931522369385, + "learning_rate": 2.165963431786217e-05, + "loss": 0.0975, + "step": 9640 + }, + { + "epoch": 30.010759493670886, + "grad_norm": 0.9379414319992065, + "learning_rate": 2.1624472573839664e-05, + "loss": 0.0643, + "step": 9650 + }, + { + "epoch": 30.01139240506329, + "grad_norm": 1.865263819694519, + "learning_rate": 2.158931082981716e-05, + "loss": 0.089, + "step": 9660 + }, + { + "epoch": 30.012025316455695, + "grad_norm": 0.01576172187924385, + "learning_rate": 2.1554149085794658e-05, + "loss": 0.0863, + "step": 9670 + }, + { + "epoch": 30.0126582278481, + "grad_norm": 8.34028434753418, + "learning_rate": 2.1518987341772153e-05, + "loss": 0.0266, + "step": 9680 + }, + { + "epoch": 30.013291139240508, + "grad_norm": 2.5415472984313965, + "learning_rate": 2.1483825597749647e-05, + "loss": 0.1101, + "step": 9690 + }, + { + "epoch": 30.013924050632912, + "grad_norm": 0.010155543684959412, + "learning_rate": 2.1448663853727146e-05, + "loss": 0.037, + "step": 9700 + }, + { + "epoch": 30.014556962025317, + "grad_norm": 2.1569736003875732, + "learning_rate": 2.141350210970464e-05, + "loss": 0.03, + "step": 9710 + }, + { + "epoch": 30.01518987341772, + "grad_norm": 2.8239643573760986, + "learning_rate": 2.137834036568214e-05, + "loss": 0.1167, + "step": 9720 + }, + { + "epoch": 30.015822784810126, + "grad_norm": 1.6771701574325562, + "learning_rate": 2.1343178621659634e-05, + "loss": 0.1783, + "step": 9730 + }, + { + "epoch": 30.01645569620253, + "grad_norm": 1.3176885843276978, + "learning_rate": 2.1308016877637132e-05, + "loss": 0.0518, + "step": 9740 + }, + { + "epoch": 30.017088607594935, + "grad_norm": 2.2316536903381348, + "learning_rate": 2.1272855133614627e-05, + "loss": 0.0688, + "step": 9750 + }, + { + "epoch": 30.01772151898734, + "grad_norm": 0.02535848319530487, + "learning_rate": 2.1237693389592126e-05, + "loss": 0.1062, + "step": 9760 + }, + { + "epoch": 30.01835443037975, + "grad_norm": 0.026239003986120224, + "learning_rate": 2.120253164556962e-05, + "loss": 0.0434, + "step": 9770 + }, + { + "epoch": 30.018987341772153, + "grad_norm": 0.008352408185601234, + "learning_rate": 2.116736990154712e-05, + "loss": 0.1434, + "step": 9780 + }, + { + "epoch": 30.019620253164558, + "grad_norm": 0.008794278837740421, + "learning_rate": 2.1132208157524614e-05, + "loss": 0.0747, + "step": 9790 + }, + { + "epoch": 30.02, + "eval_accuracy": 0.9712092130518234, + "eval_loss": 0.058070629835128784, + "eval_runtime": 882.2085, + "eval_samples_per_second": 0.591, + "eval_steps_per_second": 0.075, + "step": 9796 + }, + { + "epoch": 31.000253164556963, + "grad_norm": 0.15315893292427063, + "learning_rate": 2.1097046413502112e-05, + "loss": 0.1227, + "step": 9800 + }, + { + "epoch": 31.000886075949367, + "grad_norm": 0.007744469679892063, + "learning_rate": 2.1061884669479607e-05, + "loss": 0.0588, + "step": 9810 + }, + { + "epoch": 31.001518987341772, + "grad_norm": 1.7301479578018188, + "learning_rate": 2.1026722925457105e-05, + "loss": 0.1045, + "step": 9820 + }, + { + "epoch": 31.002151898734176, + "grad_norm": 0.016438119113445282, + "learning_rate": 2.09915611814346e-05, + "loss": 0.1089, + "step": 9830 + }, + { + "epoch": 31.00278481012658, + "grad_norm": 2.2902982234954834, + "learning_rate": 2.09563994374121e-05, + "loss": 0.0569, + "step": 9840 + }, + { + "epoch": 31.003417721518986, + "grad_norm": 0.01934031955897808, + "learning_rate": 2.0921237693389594e-05, + "loss": 0.0613, + "step": 9850 + }, + { + 
"epoch": 31.004050632911394, + "grad_norm": 0.10376526415348053, + "learning_rate": 2.088607594936709e-05, + "loss": 0.0278, + "step": 9860 + }, + { + "epoch": 31.0046835443038, + "grad_norm": 0.03141545131802559, + "learning_rate": 2.0850914205344583e-05, + "loss": 0.0557, + "step": 9870 + }, + { + "epoch": 31.005316455696203, + "grad_norm": 0.024358389899134636, + "learning_rate": 2.0815752461322082e-05, + "loss": 0.0358, + "step": 9880 + }, + { + "epoch": 31.005949367088608, + "grad_norm": 0.09153765439987183, + "learning_rate": 2.0780590717299577e-05, + "loss": 0.0682, + "step": 9890 + }, + { + "epoch": 31.006582278481012, + "grad_norm": 0.012928824871778488, + "learning_rate": 2.0745428973277075e-05, + "loss": 0.044, + "step": 9900 + }, + { + "epoch": 31.007215189873417, + "grad_norm": 0.02423839271068573, + "learning_rate": 2.071026722925457e-05, + "loss": 0.029, + "step": 9910 + }, + { + "epoch": 31.00784810126582, + "grad_norm": 0.007467347197234631, + "learning_rate": 2.067510548523207e-05, + "loss": 0.1689, + "step": 9920 + }, + { + "epoch": 31.008481012658226, + "grad_norm": 61.641075134277344, + "learning_rate": 2.0639943741209563e-05, + "loss": 0.0393, + "step": 9930 + }, + { + "epoch": 31.009113924050634, + "grad_norm": 0.013753454200923443, + "learning_rate": 2.060478199718706e-05, + "loss": 0.0425, + "step": 9940 + }, + { + "epoch": 31.00974683544304, + "grad_norm": 2.26615047454834, + "learning_rate": 2.056962025316456e-05, + "loss": 0.0482, + "step": 9950 + }, + { + "epoch": 31.010379746835444, + "grad_norm": 17.814420700073242, + "learning_rate": 2.0534458509142055e-05, + "loss": 0.1064, + "step": 9960 + }, + { + "epoch": 31.01101265822785, + "grad_norm": 0.0065340083092451096, + "learning_rate": 2.0499296765119553e-05, + "loss": 0.0222, + "step": 9970 + }, + { + "epoch": 31.011645569620253, + "grad_norm": 0.004512071143835783, + "learning_rate": 2.0464135021097048e-05, + "loss": 0.0561, + "step": 9980 + }, + { + "epoch": 31.012278481012657, + "grad_norm": 2.7562315464019775, + "learning_rate": 2.0428973277074546e-05, + "loss": 0.0865, + "step": 9990 + }, + { + "epoch": 31.012911392405062, + "grad_norm": 0.01689945161342621, + "learning_rate": 2.039381153305204e-05, + "loss": 0.04, + "step": 10000 + }, + { + "epoch": 31.013544303797467, + "grad_norm": 2.3072574138641357, + "learning_rate": 2.0358649789029536e-05, + "loss": 0.1108, + "step": 10010 + }, + { + "epoch": 31.014177215189875, + "grad_norm": 2.39740252494812, + "learning_rate": 2.0323488045007035e-05, + "loss": 0.0603, + "step": 10020 + }, + { + "epoch": 31.01481012658228, + "grad_norm": 2.2645068168640137, + "learning_rate": 2.028832630098453e-05, + "loss": 0.1595, + "step": 10030 + }, + { + "epoch": 31.015443037974684, + "grad_norm": 0.020067648962140083, + "learning_rate": 2.0253164556962025e-05, + "loss": 0.0824, + "step": 10040 + }, + { + "epoch": 31.01607594936709, + "grad_norm": 1.732828974723816, + "learning_rate": 2.0218002812939523e-05, + "loss": 0.0979, + "step": 10050 + }, + { + "epoch": 31.016708860759493, + "grad_norm": 0.006261592730879784, + "learning_rate": 2.0182841068917018e-05, + "loss": 0.0294, + "step": 10060 + }, + { + "epoch": 31.017341772151898, + "grad_norm": 0.009185651317238808, + "learning_rate": 2.0147679324894516e-05, + "loss": 0.0684, + "step": 10070 + }, + { + "epoch": 31.017974683544303, + "grad_norm": 2.076765298843384, + "learning_rate": 2.011251758087201e-05, + "loss": 0.0763, + "step": 10080 + }, + { + "epoch": 31.018607594936707, + "grad_norm": 0.010820266790688038, + 
"learning_rate": 2.007735583684951e-05, + "loss": 0.1151, + "step": 10090 + }, + { + "epoch": 31.019240506329115, + "grad_norm": 0.9778680801391602, + "learning_rate": 2.0042194092827004e-05, + "loss": 0.0274, + "step": 10100 + }, + { + "epoch": 31.01987341772152, + "grad_norm": 0.03660624101758003, + "learning_rate": 2.0007032348804503e-05, + "loss": 0.0556, + "step": 10110 + }, + { + "epoch": 31.02, + "eval_accuracy": 0.9692898272552783, + "eval_loss": 0.051862768828868866, + "eval_runtime": 895.6108, + "eval_samples_per_second": 0.582, + "eval_steps_per_second": 0.074, + "step": 10112 + }, + { + "epoch": 32.000506329113925, + "grad_norm": 0.014388811774551868, + "learning_rate": 1.9971870604781998e-05, + "loss": 0.0289, + "step": 10120 + }, + { + "epoch": 32.001139240506326, + "grad_norm": 1.698613166809082, + "learning_rate": 1.9936708860759496e-05, + "loss": 0.0312, + "step": 10130 + }, + { + "epoch": 32.001772151898734, + "grad_norm": 0.026271585375070572, + "learning_rate": 1.990154711673699e-05, + "loss": 0.0451, + "step": 10140 + }, + { + "epoch": 32.00240506329114, + "grad_norm": 0.00675854692235589, + "learning_rate": 1.986638537271449e-05, + "loss": 0.0247, + "step": 10150 + }, + { + "epoch": 32.003037974683544, + "grad_norm": 0.022877847775816917, + "learning_rate": 1.9831223628691984e-05, + "loss": 0.0235, + "step": 10160 + }, + { + "epoch": 32.00367088607595, + "grad_norm": 0.008142875507473946, + "learning_rate": 1.9796061884669482e-05, + "loss": 0.0699, + "step": 10170 + }, + { + "epoch": 32.00430379746835, + "grad_norm": 0.004865671973675489, + "learning_rate": 1.9760900140646977e-05, + "loss": 0.0645, + "step": 10180 + }, + { + "epoch": 32.00493670886076, + "grad_norm": 0.05031589791178703, + "learning_rate": 1.9725738396624476e-05, + "loss": 0.0675, + "step": 10190 + }, + { + "epoch": 32.00556962025316, + "grad_norm": 0.006223399192094803, + "learning_rate": 1.969057665260197e-05, + "loss": 0.0812, + "step": 10200 + }, + { + "epoch": 32.00620253164557, + "grad_norm": 0.008529792539775372, + "learning_rate": 1.9655414908579466e-05, + "loss": 0.1027, + "step": 10210 + }, + { + "epoch": 32.00683544303797, + "grad_norm": 0.004334003198891878, + "learning_rate": 1.962025316455696e-05, + "loss": 0.0511, + "step": 10220 + }, + { + "epoch": 32.00746835443038, + "grad_norm": 0.004973968956619501, + "learning_rate": 1.958509142053446e-05, + "loss": 0.0556, + "step": 10230 + }, + { + "epoch": 32.00810126582279, + "grad_norm": 0.005091523285955191, + "learning_rate": 1.9549929676511954e-05, + "loss": 0.0889, + "step": 10240 + }, + { + "epoch": 32.00873417721519, + "grad_norm": 0.004862161818891764, + "learning_rate": 1.9514767932489452e-05, + "loss": 0.0728, + "step": 10250 + }, + { + "epoch": 32.0093670886076, + "grad_norm": 0.43511274456977844, + "learning_rate": 1.9479606188466947e-05, + "loss": 0.0398, + "step": 10260 + }, + { + "epoch": 32.01, + "grad_norm": 0.006799314171075821, + "learning_rate": 1.9444444444444445e-05, + "loss": 0.0543, + "step": 10270 + }, + { + "epoch": 32.010632911392406, + "grad_norm": 2.4008827209472656, + "learning_rate": 1.940928270042194e-05, + "loss": 0.0623, + "step": 10280 + }, + { + "epoch": 32.01126582278481, + "grad_norm": 0.00461224839091301, + "learning_rate": 1.937412095639944e-05, + "loss": 0.0434, + "step": 10290 + }, + { + "epoch": 32.011898734177215, + "grad_norm": 0.004693826660513878, + "learning_rate": 1.9338959212376934e-05, + "loss": 0.0904, + "step": 10300 + }, + { + "epoch": 32.012531645569624, + "grad_norm": 
0.006937893573194742, + "learning_rate": 1.9303797468354432e-05, + "loss": 0.0769, + "step": 10310 + }, + { + "epoch": 32.013164556962025, + "grad_norm": 0.005765580106526613, + "learning_rate": 1.9268635724331927e-05, + "loss": 0.0743, + "step": 10320 + }, + { + "epoch": 32.01379746835443, + "grad_norm": 0.003979605156928301, + "learning_rate": 1.9233473980309425e-05, + "loss": 0.0406, + "step": 10330 + }, + { + "epoch": 32.014430379746834, + "grad_norm": 1.9673800468444824, + "learning_rate": 1.919831223628692e-05, + "loss": 0.0928, + "step": 10340 + }, + { + "epoch": 32.01506329113924, + "grad_norm": 2.0642683506011963, + "learning_rate": 1.916315049226442e-05, + "loss": 0.0872, + "step": 10350 + }, + { + "epoch": 32.01569620253164, + "grad_norm": 2.0600333213806152, + "learning_rate": 1.9127988748241913e-05, + "loss": 0.0747, + "step": 10360 + }, + { + "epoch": 32.01632911392405, + "grad_norm": 2.7668960094451904, + "learning_rate": 1.9092827004219412e-05, + "loss": 0.0496, + "step": 10370 + }, + { + "epoch": 32.01696202531645, + "grad_norm": 0.0035948660224676132, + "learning_rate": 1.9057665260196907e-05, + "loss": 0.0713, + "step": 10380 + }, + { + "epoch": 32.01759493670886, + "grad_norm": 1.5214300155639648, + "learning_rate": 1.90225035161744e-05, + "loss": 0.0854, + "step": 10390 + }, + { + "epoch": 32.01822784810127, + "grad_norm": 1.6459559202194214, + "learning_rate": 1.89873417721519e-05, + "loss": 0.0765, + "step": 10400 + }, + { + "epoch": 32.01886075949367, + "grad_norm": 1.9180864095687866, + "learning_rate": 1.8952180028129395e-05, + "loss": 0.0367, + "step": 10410 + }, + { + "epoch": 32.01949367088608, + "grad_norm": 2.7413058280944824, + "learning_rate": 1.8917018284106893e-05, + "loss": 0.0763, + "step": 10420 + }, + { + "epoch": 32.02, + "eval_accuracy": 0.9731285988483686, + "eval_loss": 0.050615016371011734, + "eval_runtime": 876.7902, + "eval_samples_per_second": 0.594, + "eval_steps_per_second": 0.075, + "step": 10428 + }, + { + "epoch": 33.00012658227848, + "grad_norm": 0.003906856290996075, + "learning_rate": 1.8881856540084388e-05, + "loss": 0.0514, + "step": 10430 + }, + { + "epoch": 33.000759493670884, + "grad_norm": 1.0877649784088135, + "learning_rate": 1.8846694796061886e-05, + "loss": 0.0264, + "step": 10440 + }, + { + "epoch": 33.00139240506329, + "grad_norm": 0.0038005015812814236, + "learning_rate": 1.881153305203938e-05, + "loss": 0.1031, + "step": 10450 + }, + { + "epoch": 33.00202531645569, + "grad_norm": 0.0038557087536901236, + "learning_rate": 1.877637130801688e-05, + "loss": 0.055, + "step": 10460 + }, + { + "epoch": 33.0026582278481, + "grad_norm": 0.005424036644399166, + "learning_rate": 1.8741209563994375e-05, + "loss": 0.0734, + "step": 10470 + }, + { + "epoch": 33.00329113924051, + "grad_norm": 0.003683975664898753, + "learning_rate": 1.8706047819971873e-05, + "loss": 0.0477, + "step": 10480 + }, + { + "epoch": 33.00392405063291, + "grad_norm": 0.3991844952106476, + "learning_rate": 1.8670886075949368e-05, + "loss": 0.0014, + "step": 10490 + }, + { + "epoch": 33.00455696202532, + "grad_norm": 1.7069263458251953, + "learning_rate": 1.8635724331926866e-05, + "loss": 0.0679, + "step": 10500 + }, + { + "epoch": 33.00518987341772, + "grad_norm": 1.4248631000518799, + "learning_rate": 1.860056258790436e-05, + "loss": 0.0478, + "step": 10510 + }, + { + "epoch": 33.00582278481013, + "grad_norm": 0.006337147206068039, + "learning_rate": 1.856540084388186e-05, + "loss": 0.0769, + "step": 10520 + }, + { + "epoch": 33.00645569620253, + "grad_norm": 
0.003814426949247718, + "learning_rate": 1.8530239099859354e-05, + "loss": 0.1142, + "step": 10530 + }, + { + "epoch": 33.00708860759494, + "grad_norm": 0.003283862257376313, + "learning_rate": 1.8495077355836853e-05, + "loss": 0.0826, + "step": 10540 + }, + { + "epoch": 33.00772151898734, + "grad_norm": 0.7462538480758667, + "learning_rate": 1.8459915611814348e-05, + "loss": 0.078, + "step": 10550 + }, + { + "epoch": 33.00835443037975, + "grad_norm": 1.6937042474746704, + "learning_rate": 1.8424753867791843e-05, + "loss": 0.0805, + "step": 10560 + }, + { + "epoch": 33.008987341772155, + "grad_norm": 2.729776382446289, + "learning_rate": 1.8389592123769338e-05, + "loss": 0.0982, + "step": 10570 + }, + { + "epoch": 33.009620253164556, + "grad_norm": 0.005251895170658827, + "learning_rate": 1.8354430379746836e-05, + "loss": 0.0348, + "step": 10580 + }, + { + "epoch": 33.010253164556964, + "grad_norm": 0.005169942043721676, + "learning_rate": 1.831926863572433e-05, + "loss": 0.1224, + "step": 10590 + }, + { + "epoch": 33.010886075949365, + "grad_norm": 2.058396577835083, + "learning_rate": 1.828410689170183e-05, + "loss": 0.0732, + "step": 10600 + }, + { + "epoch": 33.01151898734177, + "grad_norm": 0.004916292615234852, + "learning_rate": 1.8248945147679324e-05, + "loss": 0.0203, + "step": 10610 + }, + { + "epoch": 33.012151898734174, + "grad_norm": 0.004748203791677952, + "learning_rate": 1.8213783403656822e-05, + "loss": 0.0726, + "step": 10620 + }, + { + "epoch": 33.01278481012658, + "grad_norm": 0.004022388719022274, + "learning_rate": 1.8178621659634317e-05, + "loss": 0.0105, + "step": 10630 + }, + { + "epoch": 33.01341772151899, + "grad_norm": 0.002989129861816764, + "learning_rate": 1.8143459915611816e-05, + "loss": 0.0093, + "step": 10640 + }, + { + "epoch": 33.01405063291139, + "grad_norm": 0.0042097545228898525, + "learning_rate": 1.810829817158931e-05, + "loss": 0.0425, + "step": 10650 + }, + { + "epoch": 33.0146835443038, + "grad_norm": 0.008987381123006344, + "learning_rate": 1.807313642756681e-05, + "loss": 0.0721, + "step": 10660 + }, + { + "epoch": 33.0153164556962, + "grad_norm": 0.004144416656345129, + "learning_rate": 1.8037974683544304e-05, + "loss": 0.0474, + "step": 10670 + }, + { + "epoch": 33.01594936708861, + "grad_norm": 0.0029516241047531366, + "learning_rate": 1.8002812939521802e-05, + "loss": 0.0501, + "step": 10680 + }, + { + "epoch": 33.01658227848101, + "grad_norm": 0.00844433344900608, + "learning_rate": 1.7967651195499297e-05, + "loss": 0.0652, + "step": 10690 + }, + { + "epoch": 33.01721518987342, + "grad_norm": 2.04638409614563, + "learning_rate": 1.7932489451476795e-05, + "loss": 0.084, + "step": 10700 + }, + { + "epoch": 33.01784810126582, + "grad_norm": 0.004865339025855064, + "learning_rate": 1.789732770745429e-05, + "loss": 0.0698, + "step": 10710 + }, + { + "epoch": 33.01848101265823, + "grad_norm": 2.136470079421997, + "learning_rate": 1.786216596343179e-05, + "loss": 0.0798, + "step": 10720 + }, + { + "epoch": 33.019113924050636, + "grad_norm": 0.004824280273169279, + "learning_rate": 1.7827004219409284e-05, + "loss": 0.0244, + "step": 10730 + }, + { + "epoch": 33.01974683544304, + "grad_norm": 2.0193657875061035, + "learning_rate": 1.779184247538678e-05, + "loss": 0.0635, + "step": 10740 + }, + { + "epoch": 33.02, + "eval_accuracy": 0.9750479846449136, + "eval_loss": 0.049176860600709915, + "eval_runtime": 874.5591, + "eval_samples_per_second": 0.596, + "eval_steps_per_second": 0.075, + "step": 10744 + }, + { + "epoch": 34.00037974683544, + 
"grad_norm": 0.004216327331960201, + "learning_rate": 1.7756680731364274e-05, + "loss": 0.0312, + "step": 10750 + }, + { + "epoch": 34.00101265822785, + "grad_norm": 0.004103939048945904, + "learning_rate": 1.7721518987341772e-05, + "loss": 0.041, + "step": 10760 + }, + { + "epoch": 34.00164556962025, + "grad_norm": 0.014023522846400738, + "learning_rate": 1.7686357243319267e-05, + "loss": 0.0759, + "step": 10770 + }, + { + "epoch": 34.00227848101266, + "grad_norm": 0.003603527322411537, + "learning_rate": 1.7651195499296765e-05, + "loss": 0.0688, + "step": 10780 + }, + { + "epoch": 34.00291139240506, + "grad_norm": 0.0033313506282866, + "learning_rate": 1.7616033755274263e-05, + "loss": 0.0465, + "step": 10790 + }, + { + "epoch": 34.00354430379747, + "grad_norm": 1.8435585498809814, + "learning_rate": 1.758087201125176e-05, + "loss": 0.0949, + "step": 10800 + }, + { + "epoch": 34.00417721518988, + "grad_norm": 1.6445870399475098, + "learning_rate": 1.7545710267229257e-05, + "loss": 0.1285, + "step": 10810 + }, + { + "epoch": 34.00481012658228, + "grad_norm": 0.002922611776739359, + "learning_rate": 1.751054852320675e-05, + "loss": 0.0706, + "step": 10820 + }, + { + "epoch": 34.005443037974686, + "grad_norm": 2.673938274383545, + "learning_rate": 1.747538677918425e-05, + "loss": 0.0942, + "step": 10830 + }, + { + "epoch": 34.00607594936709, + "grad_norm": 0.004203031305223703, + "learning_rate": 1.7440225035161745e-05, + "loss": 0.0326, + "step": 10840 + }, + { + "epoch": 34.006708860759495, + "grad_norm": 1.8749343156814575, + "learning_rate": 1.7405063291139243e-05, + "loss": 0.1136, + "step": 10850 + }, + { + "epoch": 34.0073417721519, + "grad_norm": 0.004128487780690193, + "learning_rate": 1.7369901547116738e-05, + "loss": 0.0677, + "step": 10860 + }, + { + "epoch": 34.007974683544305, + "grad_norm": 0.00483922241255641, + "learning_rate": 1.7334739803094237e-05, + "loss": 0.0719, + "step": 10870 + }, + { + "epoch": 34.008607594936706, + "grad_norm": 1.308681607246399, + "learning_rate": 1.729957805907173e-05, + "loss": 0.0435, + "step": 10880 + }, + { + "epoch": 34.009240506329114, + "grad_norm": 0.0035531967878341675, + "learning_rate": 1.726441631504923e-05, + "loss": 0.0415, + "step": 10890 + }, + { + "epoch": 34.00987341772152, + "grad_norm": 1.937238335609436, + "learning_rate": 1.7229254571026725e-05, + "loss": 0.0518, + "step": 10900 + }, + { + "epoch": 34.01050632911392, + "grad_norm": 0.0039803506806492805, + "learning_rate": 1.719409282700422e-05, + "loss": 0.065, + "step": 10910 + }, + { + "epoch": 34.01113924050633, + "grad_norm": 1.8601768016815186, + "learning_rate": 1.7158931082981715e-05, + "loss": 0.0599, + "step": 10920 + }, + { + "epoch": 34.01177215189873, + "grad_norm": 0.003120964393019676, + "learning_rate": 1.7123769338959213e-05, + "loss": 0.0747, + "step": 10930 + }, + { + "epoch": 34.01240506329114, + "grad_norm": 2.323869228363037, + "learning_rate": 1.7088607594936708e-05, + "loss": 0.0448, + "step": 10940 + }, + { + "epoch": 34.01303797468354, + "grad_norm": 0.007831540890038013, + "learning_rate": 1.7053445850914206e-05, + "loss": 0.1065, + "step": 10950 + }, + { + "epoch": 34.01367088607595, + "grad_norm": 0.007483489811420441, + "learning_rate": 1.70182841068917e-05, + "loss": 0.057, + "step": 10960 + }, + { + "epoch": 34.01430379746836, + "grad_norm": 1.7984932661056519, + "learning_rate": 1.69831223628692e-05, + "loss": 0.1047, + "step": 10970 + }, + { + "epoch": 34.01493670886076, + "grad_norm": 0.011337379924952984, + "learning_rate": 
1.6947960618846694e-05, + "loss": 0.0281, + "step": 10980 + }, + { + "epoch": 34.01556962025317, + "grad_norm": 0.004797440022230148, + "learning_rate": 1.6912798874824193e-05, + "loss": 0.0295, + "step": 10990 + }, + { + "epoch": 34.01620253164557, + "grad_norm": 0.0023510728497058153, + "learning_rate": 1.6877637130801688e-05, + "loss": 0.0391, + "step": 11000 + }, + { + "epoch": 34.01683544303798, + "grad_norm": 0.015410896390676498, + "learning_rate": 1.6842475386779186e-05, + "loss": 0.0316, + "step": 11010 + }, + { + "epoch": 34.01746835443038, + "grad_norm": 0.008044656366109848, + "learning_rate": 1.680731364275668e-05, + "loss": 0.036, + "step": 11020 + }, + { + "epoch": 34.018101265822786, + "grad_norm": 0.004154231399297714, + "learning_rate": 1.677215189873418e-05, + "loss": 0.0852, + "step": 11030 + }, + { + "epoch": 34.01873417721519, + "grad_norm": 0.006638134364038706, + "learning_rate": 1.6736990154711674e-05, + "loss": 0.0142, + "step": 11040 + }, + { + "epoch": 34.019367088607595, + "grad_norm": 0.0038269851356744766, + "learning_rate": 1.6701828410689173e-05, + "loss": 0.0155, + "step": 11050 + }, + { + "epoch": 34.02, + "grad_norm": 0.0027346955612301826, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0729, + "step": 11060 + }, + { + "epoch": 34.02, + "eval_accuracy": 0.9692898272552783, + "eval_loss": 0.048282425850629807, + "eval_runtime": 866.1958, + "eval_samples_per_second": 0.601, + "eval_steps_per_second": 0.076, + "step": 11060 + }, + { + "epoch": 35.00063291139241, + "grad_norm": 0.0029359341133385897, + "learning_rate": 1.6631504922644166e-05, + "loss": 0.0964, + "step": 11070 + }, + { + "epoch": 35.00126582278481, + "grad_norm": 0.0031327735632658005, + "learning_rate": 1.659634317862166e-05, + "loss": 0.057, + "step": 11080 + }, + { + "epoch": 35.00189873417722, + "grad_norm": 0.004360498860478401, + "learning_rate": 1.6561181434599156e-05, + "loss": 0.0264, + "step": 11090 + }, + { + "epoch": 35.00253164556962, + "grad_norm": 0.013268140144646168, + "learning_rate": 1.6526019690576654e-05, + "loss": 0.0785, + "step": 11100 + }, + { + "epoch": 35.00316455696203, + "grad_norm": 0.0029599510598927736, + "learning_rate": 1.649085794655415e-05, + "loss": 0.0914, + "step": 11110 + }, + { + "epoch": 35.00379746835443, + "grad_norm": 0.00652879336848855, + "learning_rate": 1.6455696202531644e-05, + "loss": 0.0423, + "step": 11120 + }, + { + "epoch": 35.004430379746836, + "grad_norm": 0.004876864142715931, + "learning_rate": 1.6420534458509142e-05, + "loss": 0.0367, + "step": 11130 + }, + { + "epoch": 35.00506329113924, + "grad_norm": 0.0030262030195444822, + "learning_rate": 1.6385372714486637e-05, + "loss": 0.0579, + "step": 11140 + }, + { + "epoch": 35.005696202531645, + "grad_norm": 0.005096379201859236, + "learning_rate": 1.6350210970464135e-05, + "loss": 0.0162, + "step": 11150 + }, + { + "epoch": 35.00632911392405, + "grad_norm": 0.005958447232842445, + "learning_rate": 1.631504922644163e-05, + "loss": 0.0704, + "step": 11160 + }, + { + "epoch": 35.006962025316454, + "grad_norm": 0.09998659044504166, + "learning_rate": 1.627988748241913e-05, + "loss": 0.0396, + "step": 11170 + }, + { + "epoch": 35.00759493670886, + "grad_norm": 2.7944371700286865, + "learning_rate": 1.6244725738396624e-05, + "loss": 0.0813, + "step": 11180 + }, + { + "epoch": 35.008227848101264, + "grad_norm": 0.00864170491695404, + "learning_rate": 1.6209563994374122e-05, + "loss": 0.069, + "step": 11190 + }, + { + "epoch": 35.00886075949367, + "grad_norm": 0.008192246779799461, + 
"learning_rate": 1.617440225035162e-05, + "loss": 0.1284, + "step": 11200 + }, + { + "epoch": 35.00949367088607, + "grad_norm": 0.007694208063185215, + "learning_rate": 1.6139240506329115e-05, + "loss": 0.0198, + "step": 11210 + }, + { + "epoch": 35.01012658227848, + "grad_norm": 0.0043876878917217255, + "learning_rate": 1.6104078762306614e-05, + "loss": 0.0759, + "step": 11220 + }, + { + "epoch": 35.01075949367089, + "grad_norm": 5.103017330169678, + "learning_rate": 1.606891701828411e-05, + "loss": 0.0263, + "step": 11230 + }, + { + "epoch": 35.01139240506329, + "grad_norm": 0.038662366569042206, + "learning_rate": 1.6033755274261607e-05, + "loss": 0.1215, + "step": 11240 + }, + { + "epoch": 35.0120253164557, + "grad_norm": 0.0065099457278847694, + "learning_rate": 1.5998593530239102e-05, + "loss": 0.1129, + "step": 11250 + }, + { + "epoch": 35.0126582278481, + "grad_norm": 3.32816219329834, + "learning_rate": 1.5963431786216597e-05, + "loss": 0.1243, + "step": 11260 + }, + { + "epoch": 35.01329113924051, + "grad_norm": 2.0049562454223633, + "learning_rate": 1.5928270042194095e-05, + "loss": 0.0641, + "step": 11270 + }, + { + "epoch": 35.01392405063291, + "grad_norm": 0.7978969216346741, + "learning_rate": 1.589310829817159e-05, + "loss": 0.0625, + "step": 11280 + }, + { + "epoch": 35.01455696202532, + "grad_norm": 0.004707363899797201, + "learning_rate": 1.5857946554149085e-05, + "loss": 0.064, + "step": 11290 + }, + { + "epoch": 35.01518987341772, + "grad_norm": 0.0142443822696805, + "learning_rate": 1.5822784810126583e-05, + "loss": 0.0638, + "step": 11300 + }, + { + "epoch": 35.015822784810126, + "grad_norm": 0.011079243384301662, + "learning_rate": 1.5787623066104078e-05, + "loss": 0.0004, + "step": 11310 + }, + { + "epoch": 35.016455696202534, + "grad_norm": 2.3243215084075928, + "learning_rate": 1.5752461322081577e-05, + "loss": 0.0786, + "step": 11320 + }, + { + "epoch": 35.017088607594935, + "grad_norm": 0.004174171946942806, + "learning_rate": 1.571729957805907e-05, + "loss": 0.0151, + "step": 11330 + }, + { + "epoch": 35.017721518987344, + "grad_norm": 0.006861343514174223, + "learning_rate": 1.568213783403657e-05, + "loss": 0.1092, + "step": 11340 + }, + { + "epoch": 35.018354430379745, + "grad_norm": 2.0469024181365967, + "learning_rate": 1.5646976090014065e-05, + "loss": 0.035, + "step": 11350 + }, + { + "epoch": 35.01898734177215, + "grad_norm": 2.0664782524108887, + "learning_rate": 1.5611814345991563e-05, + "loss": 0.0619, + "step": 11360 + }, + { + "epoch": 35.019620253164554, + "grad_norm": 0.004111067857593298, + "learning_rate": 1.5576652601969058e-05, + "loss": 0.0692, + "step": 11370 + }, + { + "epoch": 35.02, + "eval_accuracy": 0.9750479846449136, + "eval_loss": 0.04807919263839722, + "eval_runtime": 894.7526, + "eval_samples_per_second": 0.582, + "eval_steps_per_second": 0.074, + "step": 11376 + }, + { + "epoch": 36.00025316455696, + "grad_norm": 0.0060658580623567104, + "learning_rate": 1.5541490857946556e-05, + "loss": 0.0883, + "step": 11380 + }, + { + "epoch": 36.00088607594937, + "grad_norm": 2.9155092239379883, + "learning_rate": 1.550632911392405e-05, + "loss": 0.0509, + "step": 11390 + }, + { + "epoch": 36.001518987341775, + "grad_norm": 0.004734630696475506, + "learning_rate": 1.547116736990155e-05, + "loss": 0.0554, + "step": 11400 + }, + { + "epoch": 36.002151898734176, + "grad_norm": 2.3105368614196777, + "learning_rate": 1.5436005625879045e-05, + "loss": 0.0768, + "step": 11410 + }, + { + "epoch": 36.002784810126585, + "grad_norm": 
0.0034461403265595436, + "learning_rate": 1.5400843881856543e-05, + "loss": 0.0336, + "step": 11420 + }, + { + "epoch": 36.003417721518986, + "grad_norm": 2.3787004947662354, + "learning_rate": 1.5365682137834038e-05, + "loss": 0.0541, + "step": 11430 + }, + { + "epoch": 36.004050632911394, + "grad_norm": 0.003510306589305401, + "learning_rate": 1.5330520393811533e-05, + "loss": 0.0132, + "step": 11440 + }, + { + "epoch": 36.004683544303795, + "grad_norm": 2.900848627090454, + "learning_rate": 1.529535864978903e-05, + "loss": 0.0892, + "step": 11450 + }, + { + "epoch": 36.0053164556962, + "grad_norm": 0.0045495242811739445, + "learning_rate": 1.5260196905766526e-05, + "loss": 0.0582, + "step": 11460 + }, + { + "epoch": 36.005949367088604, + "grad_norm": 2.1150054931640625, + "learning_rate": 1.5225035161744023e-05, + "loss": 0.0567, + "step": 11470 + }, + { + "epoch": 36.00658227848101, + "grad_norm": 2.303497791290283, + "learning_rate": 1.5189873417721521e-05, + "loss": 0.0471, + "step": 11480 + }, + { + "epoch": 36.00721518987342, + "grad_norm": 1.5887198448181152, + "learning_rate": 1.5154711673699016e-05, + "loss": 0.0762, + "step": 11490 + }, + { + "epoch": 36.00784810126582, + "grad_norm": 0.009701371192932129, + "learning_rate": 1.5119549929676513e-05, + "loss": 0.0994, + "step": 11500 + }, + { + "epoch": 36.00848101265823, + "grad_norm": 1.875978708267212, + "learning_rate": 1.5084388185654007e-05, + "loss": 0.037, + "step": 11510 + }, + { + "epoch": 36.00911392405063, + "grad_norm": 1.7351700067520142, + "learning_rate": 1.5049226441631506e-05, + "loss": 0.1036, + "step": 11520 + }, + { + "epoch": 36.00974683544304, + "grad_norm": 2.9241271018981934, + "learning_rate": 1.5014064697609e-05, + "loss": 0.0546, + "step": 11530 + }, + { + "epoch": 36.01037974683544, + "grad_norm": 0.00652047386392951, + "learning_rate": 1.4978902953586499e-05, + "loss": 0.0837, + "step": 11540 + }, + { + "epoch": 36.01101265822785, + "grad_norm": 0.005075695458799601, + "learning_rate": 1.4943741209563994e-05, + "loss": 0.0413, + "step": 11550 + }, + { + "epoch": 36.011645569620256, + "grad_norm": 0.005239785648882389, + "learning_rate": 1.4908579465541492e-05, + "loss": 0.0714, + "step": 11560 + }, + { + "epoch": 36.01227848101266, + "grad_norm": 0.0033316484186798334, + "learning_rate": 1.4873417721518987e-05, + "loss": 0.0297, + "step": 11570 + }, + { + "epoch": 36.012911392405066, + "grad_norm": 2.3135130405426025, + "learning_rate": 1.4838255977496484e-05, + "loss": 0.1521, + "step": 11580 + }, + { + "epoch": 36.01354430379747, + "grad_norm": 2.01491117477417, + "learning_rate": 1.480309423347398e-05, + "loss": 0.0542, + "step": 11590 + }, + { + "epoch": 36.014177215189875, + "grad_norm": 0.003011218272149563, + "learning_rate": 1.4767932489451477e-05, + "loss": 0.0605, + "step": 11600 + }, + { + "epoch": 36.014810126582276, + "grad_norm": 0.003551116678863764, + "learning_rate": 1.4732770745428972e-05, + "loss": 0.0359, + "step": 11610 + }, + { + "epoch": 36.015443037974684, + "grad_norm": 0.004775399807840586, + "learning_rate": 1.469760900140647e-05, + "loss": 0.0181, + "step": 11620 + }, + { + "epoch": 36.016075949367085, + "grad_norm": 0.0036576108541339636, + "learning_rate": 1.4662447257383969e-05, + "loss": 0.1157, + "step": 11630 + }, + { + "epoch": 36.01670886075949, + "grad_norm": 0.00402354309335351, + "learning_rate": 1.4627285513361464e-05, + "loss": 0.0341, + "step": 11640 + }, + { + "epoch": 36.0173417721519, + "grad_norm": 0.003662722185254097, + "learning_rate": 
1.4592123769338962e-05, + "loss": 0.0393, + "step": 11650 + }, + { + "epoch": 36.0179746835443, + "grad_norm": 2.0367825031280518, + "learning_rate": 1.4556962025316457e-05, + "loss": 0.0609, + "step": 11660 + }, + { + "epoch": 36.01860759493671, + "grad_norm": 0.4210270345211029, + "learning_rate": 1.4521800281293954e-05, + "loss": 0.0793, + "step": 11670 + }, + { + "epoch": 36.01924050632911, + "grad_norm": 2.6557528972625732, + "learning_rate": 1.4486638537271449e-05, + "loss": 0.0707, + "step": 11680 + }, + { + "epoch": 36.01987341772152, + "grad_norm": 2.917741537094116, + "learning_rate": 1.4451476793248947e-05, + "loss": 0.1023, + "step": 11690 + }, + { + "epoch": 36.02, + "eval_accuracy": 0.9712092130518234, + "eval_loss": 0.047843087464571, + "eval_runtime": 935.2983, + "eval_samples_per_second": 0.557, + "eval_steps_per_second": 0.071, + "step": 11692 + }, + { + "epoch": 37.000506329113925, + "grad_norm": 0.004461715929210186, + "learning_rate": 1.4416315049226442e-05, + "loss": 0.0132, + "step": 11700 + }, + { + "epoch": 37.001139240506326, + "grad_norm": 2.282515525817871, + "learning_rate": 1.438115330520394e-05, + "loss": 0.097, + "step": 11710 + }, + { + "epoch": 37.001772151898734, + "grad_norm": 1.2947165966033936, + "learning_rate": 1.4345991561181435e-05, + "loss": 0.0536, + "step": 11720 + }, + { + "epoch": 37.00240506329114, + "grad_norm": 0.003223339095711708, + "learning_rate": 1.4310829817158933e-05, + "loss": 0.0295, + "step": 11730 + }, + { + "epoch": 37.003037974683544, + "grad_norm": 0.003211404662579298, + "learning_rate": 1.4275668073136428e-05, + "loss": 0.0408, + "step": 11740 + }, + { + "epoch": 37.00367088607595, + "grad_norm": 0.004511035978794098, + "learning_rate": 1.4240506329113925e-05, + "loss": 0.0594, + "step": 11750 + }, + { + "epoch": 37.00430379746835, + "grad_norm": 0.9744784235954285, + "learning_rate": 1.4205344585091422e-05, + "loss": 0.0235, + "step": 11760 + }, + { + "epoch": 37.00493670886076, + "grad_norm": 0.0027787466533482075, + "learning_rate": 1.4170182841068918e-05, + "loss": 0.0331, + "step": 11770 + }, + { + "epoch": 37.00556962025316, + "grad_norm": 2.0447912216186523, + "learning_rate": 1.4135021097046413e-05, + "loss": 0.0689, + "step": 11780 + }, + { + "epoch": 37.00620253164557, + "grad_norm": 0.0037356666289269924, + "learning_rate": 1.4099859353023911e-05, + "loss": 0.1165, + "step": 11790 + }, + { + "epoch": 37.00683544303797, + "grad_norm": 0.00903794914484024, + "learning_rate": 1.4064697609001406e-05, + "loss": 0.0184, + "step": 11800 + }, + { + "epoch": 37.00746835443038, + "grad_norm": 2.4592278003692627, + "learning_rate": 1.4029535864978905e-05, + "loss": 0.0623, + "step": 11810 + }, + { + "epoch": 37.00810126582279, + "grad_norm": 30.086641311645508, + "learning_rate": 1.39943741209564e-05, + "loss": 0.0916, + "step": 11820 + }, + { + "epoch": 37.00873417721519, + "grad_norm": 0.006698968820273876, + "learning_rate": 1.3959212376933898e-05, + "loss": 0.0514, + "step": 11830 + }, + { + "epoch": 37.0093670886076, + "grad_norm": 2.0741055011749268, + "learning_rate": 1.3924050632911393e-05, + "loss": 0.1023, + "step": 11840 + }, + { + "epoch": 37.01, + "grad_norm": 3.4092001914978027, + "learning_rate": 1.388888888888889e-05, + "loss": 0.1079, + "step": 11850 + }, + { + "epoch": 37.010632911392406, + "grad_norm": 1.7051777839660645, + "learning_rate": 1.3853727144866384e-05, + "loss": 0.1146, + "step": 11860 + }, + { + "epoch": 37.01126582278481, + "grad_norm": 0.007281308062374592, + "learning_rate": 
1.3818565400843883e-05, + "loss": 0.0593, + "step": 11870 + }, + { + "epoch": 37.011898734177215, + "grad_norm": 0.01118686143308878, + "learning_rate": 1.3783403656821378e-05, + "loss": 0.0342, + "step": 11880 + }, + { + "epoch": 37.012531645569624, + "grad_norm": 0.014248767867684364, + "learning_rate": 1.3748241912798876e-05, + "loss": 0.0685, + "step": 11890 + }, + { + "epoch": 37.013164556962025, + "grad_norm": 1.9155502319335938, + "learning_rate": 1.3713080168776371e-05, + "loss": 0.0562, + "step": 11900 + }, + { + "epoch": 37.01379746835443, + "grad_norm": 2.529233694076538, + "learning_rate": 1.367791842475387e-05, + "loss": 0.0588, + "step": 11910 + }, + { + "epoch": 37.014430379746834, + "grad_norm": 0.006225684192031622, + "learning_rate": 1.3642756680731364e-05, + "loss": 0.046, + "step": 11920 + }, + { + "epoch": 37.01506329113924, + "grad_norm": 1.680939793586731, + "learning_rate": 1.3607594936708861e-05, + "loss": 0.0775, + "step": 11930 + }, + { + "epoch": 37.01569620253164, + "grad_norm": 1.7170394659042358, + "learning_rate": 1.3572433192686358e-05, + "loss": 0.0498, + "step": 11940 + }, + { + "epoch": 37.01632911392405, + "grad_norm": 5.160343170166016, + "learning_rate": 1.3537271448663854e-05, + "loss": 0.0707, + "step": 11950 + }, + { + "epoch": 37.01696202531645, + "grad_norm": 2.1870486736297607, + "learning_rate": 1.3502109704641349e-05, + "loss": 0.0583, + "step": 11960 + }, + { + "epoch": 37.01759493670886, + "grad_norm": 2.745607852935791, + "learning_rate": 1.3466947960618847e-05, + "loss": 0.0737, + "step": 11970 + }, + { + "epoch": 37.01822784810127, + "grad_norm": 0.005020118784159422, + "learning_rate": 1.3431786216596342e-05, + "loss": 0.0867, + "step": 11980 + }, + { + "epoch": 37.01886075949367, + "grad_norm": 2.2842442989349365, + "learning_rate": 1.339662447257384e-05, + "loss": 0.0794, + "step": 11990 + }, + { + "epoch": 37.01949367088608, + "grad_norm": 0.011187170632183552, + "learning_rate": 1.3361462728551336e-05, + "loss": 0.0863, + "step": 12000 + }, + { + "epoch": 37.02, + "eval_accuracy": 0.9750479846449136, + "eval_loss": 0.04793384671211243, + "eval_runtime": 885.4471, + "eval_samples_per_second": 0.588, + "eval_steps_per_second": 0.075, + "step": 12008 + }, + { + "epoch": 38.00012658227848, + "grad_norm": 0.004787384066730738, + "learning_rate": 1.3326300984528834e-05, + "loss": 0.0004, + "step": 12010 + }, + { + "epoch": 38.000759493670884, + "grad_norm": 0.006256919354200363, + "learning_rate": 1.3291139240506329e-05, + "loss": 0.0641, + "step": 12020 + }, + { + "epoch": 38.00139240506329, + "grad_norm": 2.1292147636413574, + "learning_rate": 1.3255977496483826e-05, + "loss": 0.0635, + "step": 12030 + }, + { + "epoch": 38.00202531645569, + "grad_norm": 1.8341985940933228, + "learning_rate": 1.3220815752461324e-05, + "loss": 0.0635, + "step": 12040 + }, + { + "epoch": 38.0026582278481, + "grad_norm": 0.012860477901995182, + "learning_rate": 1.3185654008438819e-05, + "loss": 0.0439, + "step": 12050 + }, + { + "epoch": 38.00329113924051, + "grad_norm": 0.005083575379103422, + "learning_rate": 1.3150492264416317e-05, + "loss": 0.0468, + "step": 12060 + }, + { + "epoch": 38.00392405063291, + "grad_norm": 1.4542759656906128, + "learning_rate": 1.3115330520393812e-05, + "loss": 0.0388, + "step": 12070 + }, + { + "epoch": 38.00455696202532, + "grad_norm": 1.959895372390747, + "learning_rate": 1.308016877637131e-05, + "loss": 0.0416, + "step": 12080 + }, + { + "epoch": 38.00518987341772, + "grad_norm": 2.081799030303955, + "learning_rate": 
1.3045007032348805e-05, + "loss": 0.1381, + "step": 12090 + }, + { + "epoch": 38.00582278481013, + "grad_norm": 0.004892106633633375, + "learning_rate": 1.3009845288326302e-05, + "loss": 0.0568, + "step": 12100 + }, + { + "epoch": 38.00645569620253, + "grad_norm": 0.005953035783022642, + "learning_rate": 1.2974683544303799e-05, + "loss": 0.0619, + "step": 12110 + }, + { + "epoch": 38.00708860759494, + "grad_norm": 0.00476424815133214, + "learning_rate": 1.2939521800281295e-05, + "loss": 0.0487, + "step": 12120 + }, + { + "epoch": 38.00772151898734, + "grad_norm": 0.004643861670047045, + "learning_rate": 1.290436005625879e-05, + "loss": 0.078, + "step": 12130 + }, + { + "epoch": 38.00835443037975, + "grad_norm": 0.004535979591310024, + "learning_rate": 1.2869198312236289e-05, + "loss": 0.1059, + "step": 12140 + }, + { + "epoch": 38.008987341772155, + "grad_norm": 0.9413653612136841, + "learning_rate": 1.2834036568213783e-05, + "loss": 0.0808, + "step": 12150 + }, + { + "epoch": 38.009620253164556, + "grad_norm": 0.23469361662864685, + "learning_rate": 1.2798874824191282e-05, + "loss": 0.0552, + "step": 12160 + }, + { + "epoch": 38.010253164556964, + "grad_norm": 9.23499870300293, + "learning_rate": 1.2763713080168777e-05, + "loss": 0.1727, + "step": 12170 + }, + { + "epoch": 38.010886075949365, + "grad_norm": 2.0388455390930176, + "learning_rate": 1.2728551336146275e-05, + "loss": 0.1044, + "step": 12180 + }, + { + "epoch": 38.01151898734177, + "grad_norm": 1.715149164199829, + "learning_rate": 1.269338959212377e-05, + "loss": 0.0649, + "step": 12190 + }, + { + "epoch": 38.012151898734174, + "grad_norm": 0.0031002135947346687, + "learning_rate": 1.2658227848101267e-05, + "loss": 0.0792, + "step": 12200 + }, + { + "epoch": 38.01278481012658, + "grad_norm": 0.00621592253446579, + "learning_rate": 1.2623066104078762e-05, + "loss": 0.0711, + "step": 12210 + }, + { + "epoch": 38.01341772151899, + "grad_norm": 0.010905969887971878, + "learning_rate": 1.258790436005626e-05, + "loss": 0.05, + "step": 12220 + }, + { + "epoch": 38.01405063291139, + "grad_norm": 0.014951992779970169, + "learning_rate": 1.2552742616033755e-05, + "loss": 0.0523, + "step": 12230 + }, + { + "epoch": 38.0146835443038, + "grad_norm": 1.8830513954162598, + "learning_rate": 1.2517580872011253e-05, + "loss": 0.0341, + "step": 12240 + }, + { + "epoch": 38.0153164556962, + "grad_norm": 2.3067173957824707, + "learning_rate": 1.248241912798875e-05, + "loss": 0.0422, + "step": 12250 + }, + { + "epoch": 38.01594936708861, + "grad_norm": 0.008932225406169891, + "learning_rate": 1.2447257383966246e-05, + "loss": 0.0639, + "step": 12260 + }, + { + "epoch": 38.01658227848101, + "grad_norm": 0.00562430452555418, + "learning_rate": 1.2412095639943743e-05, + "loss": 0.0559, + "step": 12270 + }, + { + "epoch": 38.01721518987342, + "grad_norm": 0.003999199718236923, + "learning_rate": 1.237693389592124e-05, + "loss": 0.0529, + "step": 12280 + }, + { + "epoch": 38.01784810126582, + "grad_norm": 0.01232972927391529, + "learning_rate": 1.2341772151898735e-05, + "loss": 0.0269, + "step": 12290 + }, + { + "epoch": 38.01848101265823, + "grad_norm": 0.004044785164296627, + "learning_rate": 1.2306610407876231e-05, + "loss": 0.0154, + "step": 12300 + }, + { + "epoch": 38.019113924050636, + "grad_norm": 0.0040607457049191, + "learning_rate": 1.2271448663853728e-05, + "loss": 0.0157, + "step": 12310 + }, + { + "epoch": 38.01974683544304, + "grad_norm": 1.6415541172027588, + "learning_rate": 1.2236286919831224e-05, + "loss": 0.0934, + "step": 12320 + }, 
+ { + "epoch": 38.02, + "eval_accuracy": 0.9712092130518234, + "eval_loss": 0.04641543701291084, + "eval_runtime": 886.1854, + "eval_samples_per_second": 0.588, + "eval_steps_per_second": 0.074, + "step": 12324 + }, + { + "epoch": 39.00037974683544, + "grad_norm": 1.787951111793518, + "learning_rate": 1.2201125175808721e-05, + "loss": 0.049, + "step": 12330 + }, + { + "epoch": 39.00101265822785, + "grad_norm": 0.0033679732587188482, + "learning_rate": 1.2165963431786218e-05, + "loss": 0.0464, + "step": 12340 + }, + { + "epoch": 39.00164556962025, + "grad_norm": 0.003388448618352413, + "learning_rate": 1.2130801687763714e-05, + "loss": 0.0743, + "step": 12350 + }, + { + "epoch": 39.00227848101266, + "grad_norm": 0.004701158031821251, + "learning_rate": 1.2095639943741211e-05, + "loss": 0.0753, + "step": 12360 + }, + { + "epoch": 39.00291139240506, + "grad_norm": 4.90346097946167, + "learning_rate": 1.2060478199718708e-05, + "loss": 0.0743, + "step": 12370 + }, + { + "epoch": 39.00354430379747, + "grad_norm": 1.0210288763046265, + "learning_rate": 1.2025316455696203e-05, + "loss": 0.0239, + "step": 12380 + }, + { + "epoch": 39.00417721518988, + "grad_norm": 0.0035583917051553726, + "learning_rate": 1.19901547116737e-05, + "loss": 0.0558, + "step": 12390 + }, + { + "epoch": 39.00481012658228, + "grad_norm": 0.010518099181354046, + "learning_rate": 1.1954992967651196e-05, + "loss": 0.0485, + "step": 12400 + }, + { + "epoch": 39.005443037974686, + "grad_norm": 1.8886620998382568, + "learning_rate": 1.1919831223628692e-05, + "loss": 0.0778, + "step": 12410 + }, + { + "epoch": 39.00607594936709, + "grad_norm": 1.6945433616638184, + "learning_rate": 1.1884669479606189e-05, + "loss": 0.0958, + "step": 12420 + }, + { + "epoch": 39.006708860759495, + "grad_norm": 0.005481211934238672, + "learning_rate": 1.1849507735583686e-05, + "loss": 0.072, + "step": 12430 + }, + { + "epoch": 39.0073417721519, + "grad_norm": 0.009708485566079617, + "learning_rate": 1.1814345991561182e-05, + "loss": 0.0492, + "step": 12440 + }, + { + "epoch": 39.007974683544305, + "grad_norm": 2.023264169692993, + "learning_rate": 1.1779184247538679e-05, + "loss": 0.0423, + "step": 12450 + }, + { + "epoch": 39.008607594936706, + "grad_norm": 0.002309863455593586, + "learning_rate": 1.1744022503516176e-05, + "loss": 0.0424, + "step": 12460 + }, + { + "epoch": 39.009240506329114, + "grad_norm": 3.2563633918762207, + "learning_rate": 1.170886075949367e-05, + "loss": 0.0603, + "step": 12470 + }, + { + "epoch": 39.00987341772152, + "grad_norm": 0.005214280914515257, + "learning_rate": 1.1673699015471167e-05, + "loss": 0.0973, + "step": 12480 + }, + { + "epoch": 39.01050632911392, + "grad_norm": 0.0032180920243263245, + "learning_rate": 1.1638537271448664e-05, + "loss": 0.0352, + "step": 12490 + }, + { + "epoch": 39.01113924050633, + "grad_norm": 0.0023686427157372236, + "learning_rate": 1.160337552742616e-05, + "loss": 0.0459, + "step": 12500 + }, + { + "epoch": 39.01177215189873, + "grad_norm": 0.0045258644968271255, + "learning_rate": 1.1568213783403657e-05, + "loss": 0.0683, + "step": 12510 + }, + { + "epoch": 39.01240506329114, + "grad_norm": 0.0025700768455863, + "learning_rate": 1.1533052039381154e-05, + "loss": 0.0412, + "step": 12520 + }, + { + "epoch": 39.01303797468354, + "grad_norm": 0.0044273072853684425, + "learning_rate": 1.149789029535865e-05, + "loss": 0.0338, + "step": 12530 + }, + { + "epoch": 39.01367088607595, + "grad_norm": 0.007956129498779774, + "learning_rate": 1.1462728551336147e-05, + "loss": 0.0658, + "step": 
12540 + }, + { + "epoch": 39.01430379746836, + "grad_norm": 0.005596678238362074, + "learning_rate": 1.1427566807313644e-05, + "loss": 0.0658, + "step": 12550 + }, + { + "epoch": 39.01493670886076, + "grad_norm": 0.003668359015136957, + "learning_rate": 1.139240506329114e-05, + "loss": 0.0757, + "step": 12560 + }, + { + "epoch": 39.01556962025317, + "grad_norm": 0.0037127614486962557, + "learning_rate": 1.1357243319268635e-05, + "loss": 0.0477, + "step": 12570 + }, + { + "epoch": 39.01620253164557, + "grad_norm": 1.9813635349273682, + "learning_rate": 1.1322081575246132e-05, + "loss": 0.0453, + "step": 12580 + }, + { + "epoch": 39.01683544303798, + "grad_norm": 0.004934421740472317, + "learning_rate": 1.1286919831223628e-05, + "loss": 0.0507, + "step": 12590 + }, + { + "epoch": 39.01746835443038, + "grad_norm": 0.0019371870439499617, + "learning_rate": 1.1251758087201125e-05, + "loss": 0.053, + "step": 12600 + }, + { + "epoch": 39.018101265822786, + "grad_norm": 0.011108608916401863, + "learning_rate": 1.1216596343178622e-05, + "loss": 0.0283, + "step": 12610 + }, + { + "epoch": 39.01873417721519, + "grad_norm": 2.060152530670166, + "learning_rate": 1.1181434599156118e-05, + "loss": 0.0567, + "step": 12620 + }, + { + "epoch": 39.019367088607595, + "grad_norm": 0.002437671646475792, + "learning_rate": 1.1146272855133615e-05, + "loss": 0.0331, + "step": 12630 + }, + { + "epoch": 39.02, + "grad_norm": 0.0036968346685171127, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.0927, + "step": 12640 + }, + { + "epoch": 39.02, + "eval_accuracy": 0.9712092130518234, + "eval_loss": 0.04621642827987671, + "eval_runtime": 901.4604, + "eval_samples_per_second": 0.578, + "eval_steps_per_second": 0.073, + "step": 12640 + }, + { + "epoch": 40.00063291139241, + "grad_norm": 0.0033778073266148567, + "learning_rate": 1.1075949367088608e-05, + "loss": 0.0145, + "step": 12650 + }, + { + "epoch": 40.00126582278481, + "grad_norm": 2.246983528137207, + "learning_rate": 1.1040787623066103e-05, + "loss": 0.0527, + "step": 12660 + }, + { + "epoch": 40.00189873417722, + "grad_norm": 0.002830845071002841, + "learning_rate": 1.1005625879043602e-05, + "loss": 0.0437, + "step": 12670 + }, + { + "epoch": 40.00253164556962, + "grad_norm": 1.5313416719436646, + "learning_rate": 1.0970464135021098e-05, + "loss": 0.0223, + "step": 12680 + }, + { + "epoch": 40.00316455696203, + "grad_norm": 1.529272437095642, + "learning_rate": 1.0935302390998595e-05, + "loss": 0.0464, + "step": 12690 + }, + { + "epoch": 40.00379746835443, + "grad_norm": 0.0037761295679956675, + "learning_rate": 1.0900140646976091e-05, + "loss": 0.0859, + "step": 12700 + }, + { + "epoch": 40.004430379746836, + "grad_norm": 0.003269642125815153, + "learning_rate": 1.0864978902953588e-05, + "loss": 0.068, + "step": 12710 + }, + { + "epoch": 40.00506329113924, + "grad_norm": 2.6645495891571045, + "learning_rate": 1.0829817158931085e-05, + "loss": 0.0689, + "step": 12720 + }, + { + "epoch": 40.005696202531645, + "grad_norm": 2.0125551223754883, + "learning_rate": 1.079465541490858e-05, + "loss": 0.0139, + "step": 12730 + }, + { + "epoch": 40.00632911392405, + "grad_norm": 0.002355450764298439, + "learning_rate": 1.0759493670886076e-05, + "loss": 0.0098, + "step": 12740 + }, + { + "epoch": 40.006962025316454, + "grad_norm": 0.004568996839225292, + "learning_rate": 1.0724331926863573e-05, + "loss": 0.0692, + "step": 12750 + }, + { + "epoch": 40.00759493670886, + "grad_norm": 0.020498152822256088, + "learning_rate": 1.068917018284107e-05, + "loss": 0.0148, + 
"step": 12760 + }, + { + "epoch": 40.008227848101264, + "grad_norm": 0.006843405310064554, + "learning_rate": 1.0654008438818566e-05, + "loss": 0.0895, + "step": 12770 + }, + { + "epoch": 40.00886075949367, + "grad_norm": 1.5150411128997803, + "learning_rate": 1.0618846694796063e-05, + "loss": 0.0739, + "step": 12780 + }, + { + "epoch": 40.00949367088607, + "grad_norm": 0.003931212704628706, + "learning_rate": 1.058368495077356e-05, + "loss": 0.1223, + "step": 12790 + }, + { + "epoch": 40.01012658227848, + "grad_norm": 0.007164576090872288, + "learning_rate": 1.0548523206751056e-05, + "loss": 0.0627, + "step": 12800 + }, + { + "epoch": 40.01075949367089, + "grad_norm": 0.009334239177405834, + "learning_rate": 1.0513361462728553e-05, + "loss": 0.0764, + "step": 12810 + }, + { + "epoch": 40.01139240506329, + "grad_norm": 5.856595516204834, + "learning_rate": 1.047819971870605e-05, + "loss": 0.1549, + "step": 12820 + }, + { + "epoch": 40.0120253164557, + "grad_norm": 0.006400711834430695, + "learning_rate": 1.0443037974683544e-05, + "loss": 0.1071, + "step": 12830 + }, + { + "epoch": 40.0126582278481, + "grad_norm": 0.005136122461408377, + "learning_rate": 1.0407876230661041e-05, + "loss": 0.0683, + "step": 12840 + }, + { + "epoch": 40.01329113924051, + "grad_norm": 1.7356547117233276, + "learning_rate": 1.0372714486638538e-05, + "loss": 0.0767, + "step": 12850 + }, + { + "epoch": 40.01392405063291, + "grad_norm": 0.008359666913747787, + "learning_rate": 1.0337552742616034e-05, + "loss": 0.0188, + "step": 12860 + }, + { + "epoch": 40.01455696202532, + "grad_norm": 0.0056666117161512375, + "learning_rate": 1.030239099859353e-05, + "loss": 0.1222, + "step": 12870 + }, + { + "epoch": 40.01518987341772, + "grad_norm": 3.0140154361724854, + "learning_rate": 1.0267229254571027e-05, + "loss": 0.0681, + "step": 12880 + }, + { + "epoch": 40.015822784810126, + "grad_norm": 2.062131881713867, + "learning_rate": 1.0232067510548524e-05, + "loss": 0.1132, + "step": 12890 + }, + { + "epoch": 40.016455696202534, + "grad_norm": 0.027009891346096992, + "learning_rate": 1.019690576652602e-05, + "loss": 0.0292, + "step": 12900 + }, + { + "epoch": 40.017088607594935, + "grad_norm": 0.004601217340677977, + "learning_rate": 1.0161744022503517e-05, + "loss": 0.0366, + "step": 12910 + }, + { + "epoch": 40.017721518987344, + "grad_norm": 0.004661747720092535, + "learning_rate": 1.0126582278481012e-05, + "loss": 0.0386, + "step": 12920 + }, + { + "epoch": 40.018354430379745, + "grad_norm": 0.00786674115806818, + "learning_rate": 1.0091420534458509e-05, + "loss": 0.0872, + "step": 12930 + }, + { + "epoch": 40.01898734177215, + "grad_norm": 0.00579045619815588, + "learning_rate": 1.0056258790436006e-05, + "loss": 0.0319, + "step": 12940 + }, + { + "epoch": 40.019620253164554, + "grad_norm": 1.1140903234481812, + "learning_rate": 1.0021097046413502e-05, + "loss": 0.0254, + "step": 12950 + }, + { + "epoch": 40.02, + "eval_accuracy": 0.9731285988483686, + "eval_loss": 0.04482452943921089, + "eval_runtime": 863.7445, + "eval_samples_per_second": 0.603, + "eval_steps_per_second": 0.076, + "step": 12956 + }, + { + "epoch": 41.00025316455696, + "grad_norm": 1.9278745651245117, + "learning_rate": 9.985935302390999e-06, + "loss": 0.1063, + "step": 12960 + }, + { + "epoch": 41.00088607594937, + "grad_norm": 0.011978060938417912, + "learning_rate": 9.950773558368495e-06, + "loss": 0.047, + "step": 12970 + }, + { + "epoch": 41.001518987341775, + "grad_norm": 0.0027657151222229004, + "learning_rate": 9.915611814345992e-06, + "loss": 
0.0534, + "step": 12980 + }, + { + "epoch": 41.002151898734176, + "grad_norm": 0.003485744819045067, + "learning_rate": 9.880450070323489e-06, + "loss": 0.045, + "step": 12990 + }, + { + "epoch": 41.002784810126585, + "grad_norm": 0.019399795681238174, + "learning_rate": 9.845288326300985e-06, + "loss": 0.0372, + "step": 13000 + }, + { + "epoch": 41.003417721518986, + "grad_norm": 2.4368183612823486, + "learning_rate": 9.81012658227848e-06, + "loss": 0.0384, + "step": 13010 + }, + { + "epoch": 41.004050632911394, + "grad_norm": 0.0074830991216003895, + "learning_rate": 9.774964838255977e-06, + "loss": 0.0164, + "step": 13020 + }, + { + "epoch": 41.004683544303795, + "grad_norm": 1.7207592725753784, + "learning_rate": 9.739803094233474e-06, + "loss": 0.025, + "step": 13030 + }, + { + "epoch": 41.0053164556962, + "grad_norm": 2.901007652282715, + "learning_rate": 9.70464135021097e-06, + "loss": 0.0496, + "step": 13040 + }, + { + "epoch": 41.005949367088604, + "grad_norm": 2.028984785079956, + "learning_rate": 9.669479606188467e-06, + "loss": 0.0475, + "step": 13050 + }, + { + "epoch": 41.00658227848101, + "grad_norm": 2.09635066986084, + "learning_rate": 9.634317862165963e-06, + "loss": 0.1207, + "step": 13060 + }, + { + "epoch": 41.00721518987342, + "grad_norm": 1.6683522462844849, + "learning_rate": 9.59915611814346e-06, + "loss": 0.0922, + "step": 13070 + }, + { + "epoch": 41.00784810126582, + "grad_norm": 5.185677528381348, + "learning_rate": 9.563994374120957e-06, + "loss": 0.0592, + "step": 13080 + }, + { + "epoch": 41.00848101265823, + "grad_norm": 1.2516071796417236, + "learning_rate": 9.528832630098453e-06, + "loss": 0.0768, + "step": 13090 + }, + { + "epoch": 41.00911392405063, + "grad_norm": 0.0021934949327260256, + "learning_rate": 9.49367088607595e-06, + "loss": 0.1049, + "step": 13100 + }, + { + "epoch": 41.00974683544304, + "grad_norm": 0.0028072514105588198, + "learning_rate": 9.458509142053447e-06, + "loss": 0.0724, + "step": 13110 + }, + { + "epoch": 41.01037974683544, + "grad_norm": 0.0021622704807668924, + "learning_rate": 9.423347398030943e-06, + "loss": 0.05, + "step": 13120 + }, + { + "epoch": 41.01101265822785, + "grad_norm": 0.0022336526308208704, + "learning_rate": 9.38818565400844e-06, + "loss": 0.0391, + "step": 13130 + }, + { + "epoch": 41.011645569620256, + "grad_norm": 3.3134849071502686, + "learning_rate": 9.353023909985936e-06, + "loss": 0.0497, + "step": 13140 + }, + { + "epoch": 41.01227848101266, + "grad_norm": 2.5143470764160156, + "learning_rate": 9.317862165963433e-06, + "loss": 0.0941, + "step": 13150 + }, + { + "epoch": 41.012911392405066, + "grad_norm": 0.003348015947267413, + "learning_rate": 9.28270042194093e-06, + "loss": 0.0363, + "step": 13160 + }, + { + "epoch": 41.01354430379747, + "grad_norm": 0.00388367404229939, + "learning_rate": 9.247538677918426e-06, + "loss": 0.0579, + "step": 13170 + }, + { + "epoch": 41.014177215189875, + "grad_norm": 2.559396266937256, + "learning_rate": 9.212376933895921e-06, + "loss": 0.0345, + "step": 13180 + }, + { + "epoch": 41.014810126582276, + "grad_norm": 0.0016660054679960012, + "learning_rate": 9.177215189873418e-06, + "loss": 0.0792, + "step": 13190 + }, + { + "epoch": 41.015443037974684, + "grad_norm": 1.8110253810882568, + "learning_rate": 9.142053445850915e-06, + "loss": 0.0731, + "step": 13200 + }, + { + "epoch": 41.016075949367085, + "grad_norm": 1.7615784406661987, + "learning_rate": 9.106891701828411e-06, + "loss": 0.0562, + "step": 13210 + }, + { + "epoch": 41.01670886075949, + "grad_norm": 
0.002290283562615514, + "learning_rate": 9.071729957805908e-06, + "loss": 0.0677, + "step": 13220 + }, + { + "epoch": 41.0173417721519, + "grad_norm": 1.738335132598877, + "learning_rate": 9.036568213783404e-06, + "loss": 0.0458, + "step": 13230 + }, + { + "epoch": 41.0179746835443, + "grad_norm": 0.003094849642366171, + "learning_rate": 9.001406469760901e-06, + "loss": 0.0521, + "step": 13240 + }, + { + "epoch": 41.01860759493671, + "grad_norm": 2.013655662536621, + "learning_rate": 8.966244725738398e-06, + "loss": 0.0521, + "step": 13250 + }, + { + "epoch": 41.01924050632911, + "grad_norm": 0.0036079958081245422, + "learning_rate": 8.931082981715894e-06, + "loss": 0.07, + "step": 13260 + }, + { + "epoch": 41.01987341772152, + "grad_norm": 0.0022296609822660685, + "learning_rate": 8.89592123769339e-06, + "loss": 0.043, + "step": 13270 + }, + { + "epoch": 41.02, + "eval_accuracy": 0.9750479846449136, + "eval_loss": 0.044986799359321594, + "eval_runtime": 875.8809, + "eval_samples_per_second": 0.595, + "eval_steps_per_second": 0.075, + "step": 13272 + }, + { + "epoch": 42.000506329113925, + "grad_norm": 0.004048566333949566, + "learning_rate": 8.860759493670886e-06, + "loss": 0.0185, + "step": 13280 + }, + { + "epoch": 42.001139240506326, + "grad_norm": 0.003146181348711252, + "learning_rate": 8.825597749648383e-06, + "loss": 0.0577, + "step": 13290 + }, + { + "epoch": 42.001772151898734, + "grad_norm": 1.8363380432128906, + "learning_rate": 8.79043600562588e-06, + "loss": 0.0858, + "step": 13300 + }, + { + "epoch": 42.00240506329114, + "grad_norm": 0.010127292014658451, + "learning_rate": 8.755274261603376e-06, + "loss": 0.0343, + "step": 13310 + }, + { + "epoch": 42.003037974683544, + "grad_norm": 0.003768622875213623, + "learning_rate": 8.720112517580872e-06, + "loss": 0.0568, + "step": 13320 + }, + { + "epoch": 42.00367088607595, + "grad_norm": 4.169876575469971, + "learning_rate": 8.684950773558369e-06, + "loss": 0.0695, + "step": 13330 + }, + { + "epoch": 42.00430379746835, + "grad_norm": 1.0784434080123901, + "learning_rate": 8.649789029535866e-06, + "loss": 0.0755, + "step": 13340 + }, + { + "epoch": 42.00493670886076, + "grad_norm": 2.58248233795166, + "learning_rate": 8.614627285513362e-06, + "loss": 0.051, + "step": 13350 + }, + { + "epoch": 42.00556962025316, + "grad_norm": 0.001903594471514225, + "learning_rate": 8.579465541490857e-06, + "loss": 0.0168, + "step": 13360 + }, + { + "epoch": 42.00620253164557, + "grad_norm": 0.0027827962767332792, + "learning_rate": 8.544303797468354e-06, + "loss": 0.015, + "step": 13370 + }, + { + "epoch": 42.00683544303797, + "grad_norm": 1.1959846019744873, + "learning_rate": 8.50914205344585e-06, + "loss": 0.0271, + "step": 13380 + }, + { + "epoch": 42.00746835443038, + "grad_norm": 0.0016241382109001279, + "learning_rate": 8.473980309423347e-06, + "loss": 0.0164, + "step": 13390 + }, + { + "epoch": 42.00810126582279, + "grad_norm": 1.522375226020813, + "learning_rate": 8.438818565400844e-06, + "loss": 0.0359, + "step": 13400 + }, + { + "epoch": 42.00873417721519, + "grad_norm": 1.8420836925506592, + "learning_rate": 8.40365682137834e-06, + "loss": 0.0759, + "step": 13410 + }, + { + "epoch": 42.0093670886076, + "grad_norm": 0.0029541512485593557, + "learning_rate": 8.368495077355837e-06, + "loss": 0.0002, + "step": 13420 + }, + { + "epoch": 42.01, + "grad_norm": 1.9524579048156738, + "learning_rate": 8.333333333333334e-06, + "loss": 0.0445, + "step": 13430 + }, + { + "epoch": 42.010632911392406, + "grad_norm": 1.539594292640686, + 
"learning_rate": 8.29817158931083e-06, + "loss": 0.1011, + "step": 13440 + }, + { + "epoch": 42.01126582278481, + "grad_norm": 2.301950693130493, + "learning_rate": 8.263009845288327e-06, + "loss": 0.1041, + "step": 13450 + }, + { + "epoch": 42.011898734177215, + "grad_norm": 0.004067489877343178, + "learning_rate": 8.227848101265822e-06, + "loss": 0.086, + "step": 13460 + }, + { + "epoch": 42.012531645569624, + "grad_norm": 2.2229349613189697, + "learning_rate": 8.192686357243319e-06, + "loss": 0.1346, + "step": 13470 + }, + { + "epoch": 42.013164556962025, + "grad_norm": 0.005787982139736414, + "learning_rate": 8.157524613220815e-06, + "loss": 0.0442, + "step": 13480 + }, + { + "epoch": 42.01379746835443, + "grad_norm": 0.0025440517347306013, + "learning_rate": 8.122362869198312e-06, + "loss": 0.0091, + "step": 13490 + }, + { + "epoch": 42.014430379746834, + "grad_norm": 3.941728115081787, + "learning_rate": 8.08720112517581e-06, + "loss": 0.0812, + "step": 13500 + }, + { + "epoch": 42.01506329113924, + "grad_norm": 0.6516626477241516, + "learning_rate": 8.052039381153307e-06, + "loss": 0.0892, + "step": 13510 + }, + { + "epoch": 42.01569620253164, + "grad_norm": 0.002771928673610091, + "learning_rate": 8.016877637130803e-06, + "loss": 0.0256, + "step": 13520 + }, + { + "epoch": 42.01632911392405, + "grad_norm": 1.8547269105911255, + "learning_rate": 7.981715893108298e-06, + "loss": 0.0839, + "step": 13530 + }, + { + "epoch": 42.01696202531645, + "grad_norm": 0.0014696965226903558, + "learning_rate": 7.946554149085795e-06, + "loss": 0.0356, + "step": 13540 + }, + { + "epoch": 42.01759493670886, + "grad_norm": 1.8392219543457031, + "learning_rate": 7.911392405063292e-06, + "loss": 0.0614, + "step": 13550 + }, + { + "epoch": 42.01822784810127, + "grad_norm": 0.0020308021921664476, + "learning_rate": 7.876230661040788e-06, + "loss": 0.0192, + "step": 13560 + }, + { + "epoch": 42.01886075949367, + "grad_norm": 1.8742437362670898, + "learning_rate": 7.841068917018285e-06, + "loss": 0.0993, + "step": 13570 + }, + { + "epoch": 42.01949367088608, + "grad_norm": 2.1382617950439453, + "learning_rate": 7.805907172995782e-06, + "loss": 0.0695, + "step": 13580 + }, + { + "epoch": 42.02, + "eval_accuracy": 0.9750479846449136, + "eval_loss": 0.04481815919280052, + "eval_runtime": 864.6404, + "eval_samples_per_second": 0.603, + "eval_steps_per_second": 0.076, + "step": 13588 + }, + { + "epoch": 43.00012658227848, + "grad_norm": 0.0016333996318280697, + "learning_rate": 7.770745428973278e-06, + "loss": 0.0536, + "step": 13590 + }, + { + "epoch": 43.000759493670884, + "grad_norm": 2.334437131881714, + "learning_rate": 7.735583684950775e-06, + "loss": 0.0156, + "step": 13600 + }, + { + "epoch": 43.00139240506329, + "grad_norm": 1.4097012281417847, + "learning_rate": 7.700421940928271e-06, + "loss": 0.0638, + "step": 13610 + }, + { + "epoch": 43.00202531645569, + "grad_norm": 0.0024437035899609327, + "learning_rate": 7.665260196905766e-06, + "loss": 0.0184, + "step": 13620 + }, + { + "epoch": 43.0026582278481, + "grad_norm": 1.7008681297302246, + "learning_rate": 7.630098452883263e-06, + "loss": 0.0664, + "step": 13630 + }, + { + "epoch": 43.00329113924051, + "grad_norm": 0.0019239940447732806, + "learning_rate": 7.5949367088607605e-06, + "loss": 0.1075, + "step": 13640 + }, + { + "epoch": 43.00392405063291, + "grad_norm": 0.007515426259487867, + "learning_rate": 7.559774964838256e-06, + "loss": 0.0423, + "step": 13650 + }, + { + "epoch": 43.00455696202532, + "grad_norm": 2.043806552886963, + 
"learning_rate": 7.524613220815753e-06, + "loss": 0.057, + "step": 13660 + }, + { + "epoch": 43.00518987341772, + "grad_norm": 1.7065999507904053, + "learning_rate": 7.4894514767932495e-06, + "loss": 0.0867, + "step": 13670 + }, + { + "epoch": 43.00582278481013, + "grad_norm": 0.02369196154177189, + "learning_rate": 7.454289732770746e-06, + "loss": 0.035, + "step": 13680 + }, + { + "epoch": 43.00645569620253, + "grad_norm": 2.008984088897705, + "learning_rate": 7.419127988748242e-06, + "loss": 0.0754, + "step": 13690 + }, + { + "epoch": 43.00708860759494, + "grad_norm": 0.002092506969347596, + "learning_rate": 7.3839662447257386e-06, + "loss": 0.0776, + "step": 13700 + }, + { + "epoch": 43.00772151898734, + "grad_norm": 2.8015787601470947, + "learning_rate": 7.348804500703235e-06, + "loss": 0.0597, + "step": 13710 + }, + { + "epoch": 43.00835443037975, + "grad_norm": 0.002701831515878439, + "learning_rate": 7.313642756680732e-06, + "loss": 0.0401, + "step": 13720 + }, + { + "epoch": 43.008987341772155, + "grad_norm": 0.002448461716994643, + "learning_rate": 7.2784810126582285e-06, + "loss": 0.061, + "step": 13730 + }, + { + "epoch": 43.009620253164556, + "grad_norm": 0.001818045973777771, + "learning_rate": 7.243319268635724e-06, + "loss": 0.0198, + "step": 13740 + }, + { + "epoch": 43.010253164556964, + "grad_norm": 1.9535473585128784, + "learning_rate": 7.208157524613221e-06, + "loss": 0.1018, + "step": 13750 + }, + { + "epoch": 43.010886075949365, + "grad_norm": 0.0013298611156642437, + "learning_rate": 7.1729957805907175e-06, + "loss": 0.0232, + "step": 13760 + }, + { + "epoch": 43.01151898734177, + "grad_norm": 0.002159226918593049, + "learning_rate": 7.137834036568214e-06, + "loss": 0.0652, + "step": 13770 + }, + { + "epoch": 43.012151898734174, + "grad_norm": 2.004695177078247, + "learning_rate": 7.102672292545711e-06, + "loss": 0.0787, + "step": 13780 + }, + { + "epoch": 43.01278481012658, + "grad_norm": 0.002241474576294422, + "learning_rate": 7.0675105485232066e-06, + "loss": 0.0564, + "step": 13790 + }, + { + "epoch": 43.01341772151899, + "grad_norm": 0.0015954429982230067, + "learning_rate": 7.032348804500703e-06, + "loss": 0.0715, + "step": 13800 + }, + { + "epoch": 43.01405063291139, + "grad_norm": 0.04250750690698624, + "learning_rate": 6.9971870604782e-06, + "loss": 0.0477, + "step": 13810 + }, + { + "epoch": 43.0146835443038, + "grad_norm": 0.002124907448887825, + "learning_rate": 6.9620253164556965e-06, + "loss": 0.0152, + "step": 13820 + }, + { + "epoch": 43.0153164556962, + "grad_norm": 0.00806543417274952, + "learning_rate": 6.926863572433192e-06, + "loss": 0.0002, + "step": 13830 + }, + { + "epoch": 43.01594936708861, + "grad_norm": 0.0016234181821346283, + "learning_rate": 6.891701828410689e-06, + "loss": 0.0443, + "step": 13840 + }, + { + "epoch": 43.01658227848101, + "grad_norm": 0.0016976363258436322, + "learning_rate": 6.8565400843881855e-06, + "loss": 0.0811, + "step": 13850 + }, + { + "epoch": 43.01721518987342, + "grad_norm": 0.002512180246412754, + "learning_rate": 6.821378340365682e-06, + "loss": 0.0635, + "step": 13860 + }, + { + "epoch": 43.01784810126582, + "grad_norm": 0.014850087463855743, + "learning_rate": 6.786216596343179e-06, + "loss": 0.1057, + "step": 13870 + }, + { + "epoch": 43.01848101265823, + "grad_norm": 1.433249831199646, + "learning_rate": 6.7510548523206746e-06, + "loss": 0.0757, + "step": 13880 + }, + { + "epoch": 43.019113924050636, + "grad_norm": 0.0025778491981327534, + "learning_rate": 6.715893108298171e-06, + "loss": 0.01, + 
"step": 13890 + }, + { + "epoch": 43.01974683544304, + "grad_norm": 0.0015471552032977343, + "learning_rate": 6.680731364275668e-06, + "loss": 0.0398, + "step": 13900 + }, + { + "epoch": 43.02, + "eval_accuracy": 0.9769673704414588, + "eval_loss": 0.04403243213891983, + "eval_runtime": 918.4271, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.072, + "step": 13904 + }, + { + "epoch": 44.00037974683544, + "grad_norm": 0.002836668398231268, + "learning_rate": 6.6455696202531645e-06, + "loss": 0.0786, + "step": 13910 + }, + { + "epoch": 44.00101265822785, + "grad_norm": 1.8331170082092285, + "learning_rate": 6.610407876230662e-06, + "loss": 0.0305, + "step": 13920 + }, + { + "epoch": 44.00164556962025, + "grad_norm": 0.002194299828261137, + "learning_rate": 6.5752461322081586e-06, + "loss": 0.032, + "step": 13930 + }, + { + "epoch": 44.00227848101266, + "grad_norm": 0.19766800105571747, + "learning_rate": 6.540084388185655e-06, + "loss": 0.0536, + "step": 13940 + }, + { + "epoch": 44.00291139240506, + "grad_norm": 0.0020373559091240168, + "learning_rate": 6.504922644163151e-06, + "loss": 0.0457, + "step": 13950 + }, + { + "epoch": 44.00354430379747, + "grad_norm": 1.5892828702926636, + "learning_rate": 6.469760900140648e-06, + "loss": 0.091, + "step": 13960 + }, + { + "epoch": 44.00417721518988, + "grad_norm": 0.002311758464202285, + "learning_rate": 6.434599156118144e-06, + "loss": 0.0358, + "step": 13970 + }, + { + "epoch": 44.00481012658228, + "grad_norm": 1.6445496082305908, + "learning_rate": 6.399437412095641e-06, + "loss": 0.063, + "step": 13980 + }, + { + "epoch": 44.005443037974686, + "grad_norm": 0.012705209665000439, + "learning_rate": 6.3642756680731375e-06, + "loss": 0.0365, + "step": 13990 + }, + { + "epoch": 44.00607594936709, + "grad_norm": 2.040945053100586, + "learning_rate": 6.329113924050633e-06, + "loss": 0.094, + "step": 14000 + }, + { + "epoch": 44.006708860759495, + "grad_norm": 0.0017350486014038324, + "learning_rate": 6.29395218002813e-06, + "loss": 0.0668, + "step": 14010 + }, + { + "epoch": 44.0073417721519, + "grad_norm": 1.4773507118225098, + "learning_rate": 6.2587904360056266e-06, + "loss": 0.0389, + "step": 14020 + }, + { + "epoch": 44.007974683544305, + "grad_norm": 1.701064109802246, + "learning_rate": 6.223628691983123e-06, + "loss": 0.0213, + "step": 14030 + }, + { + "epoch": 44.008607594936706, + "grad_norm": 2.707864284515381, + "learning_rate": 6.18846694796062e-06, + "loss": 0.1098, + "step": 14040 + }, + { + "epoch": 44.009240506329114, + "grad_norm": 0.10958682745695114, + "learning_rate": 6.153305203938116e-06, + "loss": 0.1058, + "step": 14050 + }, + { + "epoch": 44.00987341772152, + "grad_norm": 1.8932132720947266, + "learning_rate": 6.118143459915612e-06, + "loss": 0.0379, + "step": 14060 + }, + { + "epoch": 44.01050632911392, + "grad_norm": 0.0022401248570531607, + "learning_rate": 6.082981715893109e-06, + "loss": 0.0445, + "step": 14070 + }, + { + "epoch": 44.01113924050633, + "grad_norm": 0.00331929256208241, + "learning_rate": 6.0478199718706055e-06, + "loss": 0.0807, + "step": 14080 + }, + { + "epoch": 44.01177215189873, + "grad_norm": 1.2681527137756348, + "learning_rate": 6.012658227848101e-06, + "loss": 0.0283, + "step": 14090 + }, + { + "epoch": 44.01240506329114, + "grad_norm": 0.0029717902652919292, + "learning_rate": 5.977496483825598e-06, + "loss": 0.0867, + "step": 14100 + }, + { + "epoch": 44.01303797468354, + "grad_norm": 1.9497371912002563, + "learning_rate": 5.9423347398030946e-06, + "loss": 0.0512, + "step": 
14110 + }, + { + "epoch": 44.01367088607595, + "grad_norm": 0.0076379780657589436, + "learning_rate": 5.907172995780591e-06, + "loss": 0.0647, + "step": 14120 + }, + { + "epoch": 44.01430379746836, + "grad_norm": 0.0024882254656404257, + "learning_rate": 5.872011251758088e-06, + "loss": 0.0205, + "step": 14130 + }, + { + "epoch": 44.01493670886076, + "grad_norm": 0.0016955797327682376, + "learning_rate": 5.836849507735584e-06, + "loss": 0.0855, + "step": 14140 + }, + { + "epoch": 44.01556962025317, + "grad_norm": 1.9731398820877075, + "learning_rate": 5.80168776371308e-06, + "loss": 0.0623, + "step": 14150 + }, + { + "epoch": 44.01620253164557, + "grad_norm": 0.0020470027811825275, + "learning_rate": 5.766526019690577e-06, + "loss": 0.0151, + "step": 14160 + }, + { + "epoch": 44.01683544303798, + "grad_norm": 0.0018880012212321162, + "learning_rate": 5.7313642756680735e-06, + "loss": 0.0393, + "step": 14170 + }, + { + "epoch": 44.01746835443038, + "grad_norm": 0.003355634631589055, + "learning_rate": 5.69620253164557e-06, + "loss": 0.024, + "step": 14180 + }, + { + "epoch": 44.018101265822786, + "grad_norm": 0.0030694929882884026, + "learning_rate": 5.661040787623066e-06, + "loss": 0.0285, + "step": 14190 + }, + { + "epoch": 44.01873417721519, + "grad_norm": 0.001790488138794899, + "learning_rate": 5.6258790436005626e-06, + "loss": 0.0361, + "step": 14200 + }, + { + "epoch": 44.019367088607595, + "grad_norm": 0.0015849280171096325, + "learning_rate": 5.590717299578059e-06, + "loss": 0.0828, + "step": 14210 + }, + { + "epoch": 44.02, + "grad_norm": 1.4295841455459595, + "learning_rate": 5.555555555555556e-06, + "loss": 0.0455, + "step": 14220 + }, + { + "epoch": 44.02, + "eval_accuracy": 0.9769673704414588, + "eval_loss": 0.04359356313943863, + "eval_runtime": 920.1088, + "eval_samples_per_second": 0.566, + "eval_steps_per_second": 0.072, + "step": 14220 + }, + { + "epoch": 45.00063291139241, + "grad_norm": 0.006928425282239914, + "learning_rate": 5.520393811533052e-06, + "loss": 0.0489, + "step": 14230 + }, + { + "epoch": 45.00126582278481, + "grad_norm": 0.0019474881701171398, + "learning_rate": 5.485232067510549e-06, + "loss": 0.0236, + "step": 14240 + }, + { + "epoch": 45.00189873417722, + "grad_norm": 1.7010918855667114, + "learning_rate": 5.450070323488046e-06, + "loss": 0.0395, + "step": 14250 + }, + { + "epoch": 45.00253164556962, + "grad_norm": 0.0026303452905267477, + "learning_rate": 5.414908579465542e-06, + "loss": 0.0223, + "step": 14260 + }, + { + "epoch": 45.00316455696203, + "grad_norm": 0.0020397694315761328, + "learning_rate": 5.379746835443038e-06, + "loss": 0.0431, + "step": 14270 + }, + { + "epoch": 45.00379746835443, + "grad_norm": 2.7060606479644775, + "learning_rate": 5.344585091420535e-06, + "loss": 0.1383, + "step": 14280 + }, + { + "epoch": 45.004430379746836, + "grad_norm": 0.0019224517745897174, + "learning_rate": 5.309423347398031e-06, + "loss": 0.0401, + "step": 14290 + }, + { + "epoch": 45.00506329113924, + "grad_norm": 1.5191059112548828, + "learning_rate": 5.274261603375528e-06, + "loss": 0.0563, + "step": 14300 + }, + { + "epoch": 45.005696202531645, + "grad_norm": 1.7753527164459229, + "learning_rate": 5.239099859353025e-06, + "loss": 0.0452, + "step": 14310 + }, + { + "epoch": 45.00632911392405, + "grad_norm": 0.0027101896703243256, + "learning_rate": 5.2039381153305205e-06, + "loss": 0.0733, + "step": 14320 + }, + { + "epoch": 45.006962025316454, + "grad_norm": 0.0018217426259070635, + "learning_rate": 5.168776371308017e-06, + "loss": 0.0601, + "step": 
14330 + }, + { + "epoch": 45.00759493670886, + "grad_norm": 1.4243252277374268, + "learning_rate": 5.133614627285514e-06, + "loss": 0.0995, + "step": 14340 + }, + { + "epoch": 45.008227848101264, + "grad_norm": 0.007520051673054695, + "learning_rate": 5.09845288326301e-06, + "loss": 0.0881, + "step": 14350 + }, + { + "epoch": 45.00886075949367, + "grad_norm": 1.565136194229126, + "learning_rate": 5.063291139240506e-06, + "loss": 0.0648, + "step": 14360 + }, + { + "epoch": 45.00949367088607, + "grad_norm": 0.0016765049658715725, + "learning_rate": 5.028129395218003e-06, + "loss": 0.0751, + "step": 14370 + }, + { + "epoch": 45.01012658227848, + "grad_norm": 0.0014801392098888755, + "learning_rate": 4.992967651195499e-06, + "loss": 0.0246, + "step": 14380 + }, + { + "epoch": 45.01075949367089, + "grad_norm": 0.0016110733849927783, + "learning_rate": 4.957805907172996e-06, + "loss": 0.0396, + "step": 14390 + }, + { + "epoch": 45.01139240506329, + "grad_norm": 0.002271553035825491, + "learning_rate": 4.922644163150493e-06, + "loss": 0.0268, + "step": 14400 + }, + { + "epoch": 45.0120253164557, + "grad_norm": 0.002145587233826518, + "learning_rate": 4.8874824191279884e-06, + "loss": 0.077, + "step": 14410 + }, + { + "epoch": 45.0126582278481, + "grad_norm": 0.001231012400239706, + "learning_rate": 4.852320675105485e-06, + "loss": 0.0633, + "step": 14420 + }, + { + "epoch": 45.01329113924051, + "grad_norm": 2.76226544380188, + "learning_rate": 4.817158931082982e-06, + "loss": 0.0718, + "step": 14430 + }, + { + "epoch": 45.01392405063291, + "grad_norm": 0.002031494863331318, + "learning_rate": 4.781997187060478e-06, + "loss": 0.0752, + "step": 14440 + }, + { + "epoch": 45.01455696202532, + "grad_norm": 2.092021942138672, + "learning_rate": 4.746835443037975e-06, + "loss": 0.0301, + "step": 14450 + }, + { + "epoch": 45.01518987341772, + "grad_norm": 0.0018518840661272407, + "learning_rate": 4.711673699015472e-06, + "loss": 0.0203, + "step": 14460 + }, + { + "epoch": 45.015822784810126, + "grad_norm": 0.0020814728923141956, + "learning_rate": 4.676511954992968e-06, + "loss": 0.0284, + "step": 14470 + }, + { + "epoch": 45.016455696202534, + "grad_norm": 0.001710868556983769, + "learning_rate": 4.641350210970465e-06, + "loss": 0.0455, + "step": 14480 + }, + { + "epoch": 45.017088607594935, + "grad_norm": 0.002616223180666566, + "learning_rate": 4.606188466947961e-06, + "loss": 0.0875, + "step": 14490 + }, + { + "epoch": 45.017721518987344, + "grad_norm": 0.0022610658779740334, + "learning_rate": 4.571026722925457e-06, + "loss": 0.0313, + "step": 14500 + }, + { + "epoch": 45.018354430379745, + "grad_norm": 2.640284776687622, + "learning_rate": 4.535864978902954e-06, + "loss": 0.073, + "step": 14510 + }, + { + "epoch": 45.01898734177215, + "grad_norm": 0.0019336823606863618, + "learning_rate": 4.5007032348804506e-06, + "loss": 0.0416, + "step": 14520 + }, + { + "epoch": 45.019620253164554, + "grad_norm": 0.0023440378718078136, + "learning_rate": 4.465541490857947e-06, + "loss": 0.0423, + "step": 14530 + }, + { + "epoch": 45.02, + "eval_accuracy": 0.9750479846449136, + "eval_loss": 0.043731071054935455, + "eval_runtime": 852.3259, + "eval_samples_per_second": 0.611, + "eval_steps_per_second": 0.077, + "step": 14536 + }, + { + "epoch": 46.00025316455696, + "grad_norm": 0.0025269004981964827, + "learning_rate": 4.430379746835443e-06, + "loss": 0.0336, + "step": 14540 + }, + { + "epoch": 46.00088607594937, + "grad_norm": 0.0019006689544767141, + "learning_rate": 4.39521800281294e-06, + "loss": 0.006, + 
"step": 14550 + }, + { + "epoch": 46.001518987341775, + "grad_norm": 0.0017826940165832639, + "learning_rate": 4.360056258790436e-06, + "loss": 0.0263, + "step": 14560 + }, + { + "epoch": 46.002151898734176, + "grad_norm": 0.0013272733194753528, + "learning_rate": 4.324894514767933e-06, + "loss": 0.0522, + "step": 14570 + }, + { + "epoch": 46.002784810126585, + "grad_norm": 1.84543776512146, + "learning_rate": 4.289732770745429e-06, + "loss": 0.0337, + "step": 14580 + }, + { + "epoch": 46.003417721518986, + "grad_norm": 2.3933355808258057, + "learning_rate": 4.254571026722925e-06, + "loss": 0.0667, + "step": 14590 + }, + { + "epoch": 46.004050632911394, + "grad_norm": 1.3483564853668213, + "learning_rate": 4.219409282700422e-06, + "loss": 0.0358, + "step": 14600 + }, + { + "epoch": 46.004683544303795, + "grad_norm": 0.0019236382795497775, + "learning_rate": 4.1842475386779186e-06, + "loss": 0.105, + "step": 14610 + }, + { + "epoch": 46.0053164556962, + "grad_norm": 1.8564695119857788, + "learning_rate": 4.149085794655415e-06, + "loss": 0.0618, + "step": 14620 + }, + { + "epoch": 46.005949367088604, + "grad_norm": 0.0019053075229749084, + "learning_rate": 4.113924050632911e-06, + "loss": 0.0308, + "step": 14630 + }, + { + "epoch": 46.00658227848101, + "grad_norm": 0.888851523399353, + "learning_rate": 4.078762306610408e-06, + "loss": 0.0046, + "step": 14640 + }, + { + "epoch": 46.00721518987342, + "grad_norm": 0.0015094269765540957, + "learning_rate": 4.043600562587905e-06, + "loss": 0.0597, + "step": 14650 + }, + { + "epoch": 46.00784810126582, + "grad_norm": 0.0023652520030736923, + "learning_rate": 4.008438818565402e-06, + "loss": 0.0719, + "step": 14660 + }, + { + "epoch": 46.00848101265823, + "grad_norm": 0.0014642721507698298, + "learning_rate": 3.9732770745428975e-06, + "loss": 0.0657, + "step": 14670 + }, + { + "epoch": 46.00911392405063, + "grad_norm": 2.2750301361083984, + "learning_rate": 3.938115330520394e-06, + "loss": 0.0324, + "step": 14680 + }, + { + "epoch": 46.00974683544304, + "grad_norm": 0.007223762571811676, + "learning_rate": 3.902953586497891e-06, + "loss": 0.0269, + "step": 14690 + }, + { + "epoch": 46.01037974683544, + "grad_norm": 0.0021259100176393986, + "learning_rate": 3.867791842475387e-06, + "loss": 0.0067, + "step": 14700 + }, + { + "epoch": 46.01101265822785, + "grad_norm": 0.00181837088894099, + "learning_rate": 3.832630098452883e-06, + "loss": 0.0759, + "step": 14710 + }, + { + "epoch": 46.011645569620256, + "grad_norm": 0.006951657589524984, + "learning_rate": 3.7974683544303802e-06, + "loss": 0.0472, + "step": 14720 + }, + { + "epoch": 46.01227848101266, + "grad_norm": 0.0019946214742958546, + "learning_rate": 3.7623066104078764e-06, + "loss": 0.0398, + "step": 14730 + }, + { + "epoch": 46.012911392405066, + "grad_norm": 0.0024334420450031757, + "learning_rate": 3.727144866385373e-06, + "loss": 0.0486, + "step": 14740 + }, + { + "epoch": 46.01354430379747, + "grad_norm": 1.671848177909851, + "learning_rate": 3.6919831223628693e-06, + "loss": 0.0479, + "step": 14750 + }, + { + "epoch": 46.014177215189875, + "grad_norm": 0.0020218833815306425, + "learning_rate": 3.656821378340366e-06, + "loss": 0.0696, + "step": 14760 + }, + { + "epoch": 46.014810126582276, + "grad_norm": 2.070733070373535, + "learning_rate": 3.621659634317862e-06, + "loss": 0.0623, + "step": 14770 + }, + { + "epoch": 46.015443037974684, + "grad_norm": 0.00212723552249372, + "learning_rate": 3.5864978902953588e-06, + "loss": 0.0344, + "step": 14780 + }, + { + "epoch": 46.016075949367085, 
+ "grad_norm": 0.007334825582802296, + "learning_rate": 3.5513361462728554e-06, + "loss": 0.0943, + "step": 14790 + }, + { + "epoch": 46.01670886075949, + "grad_norm": 0.00477689690887928, + "learning_rate": 3.5161744022503516e-06, + "loss": 0.0391, + "step": 14800 + }, + { + "epoch": 46.0173417721519, + "grad_norm": 0.0014756337041035295, + "learning_rate": 3.4810126582278482e-06, + "loss": 0.0578, + "step": 14810 + }, + { + "epoch": 46.0179746835443, + "grad_norm": 0.0018279188079759479, + "learning_rate": 3.4458509142053444e-06, + "loss": 0.1147, + "step": 14820 + }, + { + "epoch": 46.01860759493671, + "grad_norm": 2.317160129547119, + "learning_rate": 3.410689170182841e-06, + "loss": 0.0934, + "step": 14830 + }, + { + "epoch": 46.01924050632911, + "grad_norm": 0.007261245045810938, + "learning_rate": 3.3755274261603373e-06, + "loss": 0.0732, + "step": 14840 + }, + { + "epoch": 46.01987341772152, + "grad_norm": 2.7471020221710205, + "learning_rate": 3.340365682137834e-06, + "loss": 0.0602, + "step": 14850 + }, + { + "epoch": 46.02, + "eval_accuracy": 0.9769673704414588, + "eval_loss": 0.0437638945877552, + "eval_runtime": 913.7302, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.072, + "step": 14852 + }, + { + "epoch": 47.000506329113925, + "grad_norm": 1.6660641431808472, + "learning_rate": 3.305203938115331e-06, + "loss": 0.0939, + "step": 14860 + }, + { + "epoch": 47.001139240506326, + "grad_norm": 1.8603593111038208, + "learning_rate": 3.2700421940928276e-06, + "loss": 0.1073, + "step": 14870 + }, + { + "epoch": 47.001772151898734, + "grad_norm": 0.0009881872683763504, + "learning_rate": 3.234880450070324e-06, + "loss": 0.059, + "step": 14880 + }, + { + "epoch": 47.00240506329114, + "grad_norm": 0.0015653116861358285, + "learning_rate": 3.1997187060478204e-06, + "loss": 0.0548, + "step": 14890 + }, + { + "epoch": 47.003037974683544, + "grad_norm": 1.5998444557189941, + "learning_rate": 3.1645569620253167e-06, + "loss": 0.0465, + "step": 14900 + }, + { + "epoch": 47.00367088607595, + "grad_norm": 0.0016954662278294563, + "learning_rate": 3.1293952180028133e-06, + "loss": 0.0684, + "step": 14910 + }, + { + "epoch": 47.00430379746835, + "grad_norm": 1.6929208040237427, + "learning_rate": 3.09423347398031e-06, + "loss": 0.0489, + "step": 14920 + }, + { + "epoch": 47.00493670886076, + "grad_norm": 1.6551687717437744, + "learning_rate": 3.059071729957806e-06, + "loss": 0.0799, + "step": 14930 + }, + { + "epoch": 47.00556962025316, + "grad_norm": 0.0021378027740865946, + "learning_rate": 3.0239099859353028e-06, + "loss": 0.0199, + "step": 14940 + }, + { + "epoch": 47.00620253164557, + "grad_norm": 0.001334396773017943, + "learning_rate": 2.988748241912799e-06, + "loss": 0.0181, + "step": 14950 + }, + { + "epoch": 47.00683544303797, + "grad_norm": 0.007229403592646122, + "learning_rate": 2.9535864978902956e-06, + "loss": 0.0588, + "step": 14960 + }, + { + "epoch": 47.00746835443038, + "grad_norm": 0.0016747723566368222, + "learning_rate": 2.918424753867792e-06, + "loss": 0.034, + "step": 14970 + }, + { + "epoch": 47.00810126582279, + "grad_norm": 0.8091289401054382, + "learning_rate": 2.8832630098452884e-06, + "loss": 0.0476, + "step": 14980 + }, + { + "epoch": 47.00873417721519, + "grad_norm": 1.8234349489212036, + "learning_rate": 2.848101265822785e-06, + "loss": 0.0711, + "step": 14990 + }, + { + "epoch": 47.0093670886076, + "grad_norm": 1.9740657806396484, + "learning_rate": 2.8129395218002813e-06, + "loss": 0.0338, + "step": 15000 + }, + { + "epoch": 47.01, + 
"grad_norm": 0.0016077395994216204, + "learning_rate": 2.777777777777778e-06, + "loss": 0.0454, + "step": 15010 + }, + { + "epoch": 47.010632911392406, + "grad_norm": 0.004215624183416367, + "learning_rate": 2.7426160337552745e-06, + "loss": 0.0438, + "step": 15020 + }, + { + "epoch": 47.01126582278481, + "grad_norm": 0.0015969170490279794, + "learning_rate": 2.707454289732771e-06, + "loss": 0.0418, + "step": 15030 + }, + { + "epoch": 47.011898734177215, + "grad_norm": 1.9868484735488892, + "learning_rate": 2.6722925457102674e-06, + "loss": 0.0777, + "step": 15040 + }, + { + "epoch": 47.012531645569624, + "grad_norm": 0.006545115727931261, + "learning_rate": 2.637130801687764e-06, + "loss": 0.0402, + "step": 15050 + }, + { + "epoch": 47.013164556962025, + "grad_norm": 0.008419793099164963, + "learning_rate": 2.6019690576652602e-06, + "loss": 0.0617, + "step": 15060 + }, + { + "epoch": 47.01379746835443, + "grad_norm": 1.8613364696502686, + "learning_rate": 2.566807313642757e-06, + "loss": 0.0477, + "step": 15070 + }, + { + "epoch": 47.014430379746834, + "grad_norm": 1.6971709728240967, + "learning_rate": 2.531645569620253e-06, + "loss": 0.0393, + "step": 15080 + }, + { + "epoch": 47.01506329113924, + "grad_norm": 0.0011392009910196066, + "learning_rate": 2.4964838255977497e-06, + "loss": 0.0481, + "step": 15090 + }, + { + "epoch": 47.01569620253164, + "grad_norm": 2.0638561248779297, + "learning_rate": 2.4613220815752463e-06, + "loss": 0.0648, + "step": 15100 + }, + { + "epoch": 47.01632911392405, + "grad_norm": 0.007941323332488537, + "learning_rate": 2.4261603375527425e-06, + "loss": 0.07, + "step": 15110 + }, + { + "epoch": 47.01696202531645, + "grad_norm": 0.0015764615964144468, + "learning_rate": 2.390998593530239e-06, + "loss": 0.0537, + "step": 15120 + }, + { + "epoch": 47.01759493670886, + "grad_norm": 1.6305996179580688, + "learning_rate": 2.355836849507736e-06, + "loss": 0.0408, + "step": 15130 + }, + { + "epoch": 47.01822784810127, + "grad_norm": 1.698041558265686, + "learning_rate": 2.3206751054852324e-06, + "loss": 0.0421, + "step": 15140 + }, + { + "epoch": 47.01886075949367, + "grad_norm": 1.406259536743164, + "learning_rate": 2.2855133614627286e-06, + "loss": 0.0367, + "step": 15150 + }, + { + "epoch": 47.01949367088608, + "grad_norm": 0.0014187191845849156, + "learning_rate": 2.2503516174402253e-06, + "loss": 0.0407, + "step": 15160 + }, + { + "epoch": 47.02, + "eval_accuracy": 0.9750479846449136, + "eval_loss": 0.04366590827703476, + "eval_runtime": 877.8986, + "eval_samples_per_second": 0.593, + "eval_steps_per_second": 0.075, + "step": 15168 + }, + { + "epoch": 48.00012658227848, + "grad_norm": 0.0016180831007659435, + "learning_rate": 2.2151898734177215e-06, + "loss": 0.0144, + "step": 15170 + }, + { + "epoch": 48.000759493670884, + "grad_norm": 1.924263596534729, + "learning_rate": 2.180028129395218e-06, + "loss": 0.0882, + "step": 15180 + }, + { + "epoch": 48.00139240506329, + "grad_norm": 1.9763191938400269, + "learning_rate": 2.1448663853727143e-06, + "loss": 0.051, + "step": 15190 + }, + { + "epoch": 48.00202531645569, + "grad_norm": 2.0363032817840576, + "learning_rate": 2.109704641350211e-06, + "loss": 0.048, + "step": 15200 + }, + { + "epoch": 48.0026582278481, + "grad_norm": 1.593593716621399, + "learning_rate": 2.0745428973277076e-06, + "loss": 0.0373, + "step": 15210 + }, + { + "epoch": 48.00329113924051, + "grad_norm": 1.4322104454040527, + "learning_rate": 2.039381153305204e-06, + "loss": 0.0681, + "step": 15220 + }, + { + "epoch": 48.00392405063291, + 
"grad_norm": 0.0023278342559933662, + "learning_rate": 2.004219409282701e-06, + "loss": 0.0085, + "step": 15230 + }, + { + "epoch": 48.00455696202532, + "grad_norm": 2.485316514968872, + "learning_rate": 1.969057665260197e-06, + "loss": 0.0902, + "step": 15240 + }, + { + "epoch": 48.00518987341772, + "grad_norm": 0.0025991136208176613, + "learning_rate": 1.9338959212376937e-06, + "loss": 0.0845, + "step": 15250 + }, + { + "epoch": 48.00582278481013, + "grad_norm": 0.001590245054103434, + "learning_rate": 1.8987341772151901e-06, + "loss": 0.0218, + "step": 15260 + }, + { + "epoch": 48.00645569620253, + "grad_norm": 0.001682686386629939, + "learning_rate": 1.8635724331926865e-06, + "loss": 0.0623, + "step": 15270 + }, + { + "epoch": 48.00708860759494, + "grad_norm": 1.803706169128418, + "learning_rate": 1.828410689170183e-06, + "loss": 0.0568, + "step": 15280 + }, + { + "epoch": 48.00772151898734, + "grad_norm": 0.003340956987813115, + "learning_rate": 1.7932489451476794e-06, + "loss": 0.0419, + "step": 15290 + }, + { + "epoch": 48.00835443037975, + "grad_norm": 0.0032536042854189873, + "learning_rate": 1.7580872011251758e-06, + "loss": 0.0861, + "step": 15300 + }, + { + "epoch": 48.008987341772155, + "grad_norm": 1.4462651014328003, + "learning_rate": 1.7229254571026722e-06, + "loss": 0.0639, + "step": 15310 + }, + { + "epoch": 48.009620253164556, + "grad_norm": 0.0021976635325700045, + "learning_rate": 1.6877637130801686e-06, + "loss": 0.0624, + "step": 15320 + }, + { + "epoch": 48.010253164556964, + "grad_norm": 0.07273641228675842, + "learning_rate": 1.6526019690576655e-06, + "loss": 0.0662, + "step": 15330 + }, + { + "epoch": 48.010886075949365, + "grad_norm": 0.0021416894160211086, + "learning_rate": 1.617440225035162e-06, + "loss": 0.0562, + "step": 15340 + }, + { + "epoch": 48.01151898734177, + "grad_norm": 1.788458228111267, + "learning_rate": 1.5822784810126583e-06, + "loss": 0.047, + "step": 15350 + }, + { + "epoch": 48.012151898734174, + "grad_norm": 0.03488551825284958, + "learning_rate": 1.547116736990155e-06, + "loss": 0.0406, + "step": 15360 + }, + { + "epoch": 48.01278481012658, + "grad_norm": 0.002074979245662689, + "learning_rate": 1.5119549929676514e-06, + "loss": 0.0363, + "step": 15370 + }, + { + "epoch": 48.01341772151899, + "grad_norm": 0.00142192211933434, + "learning_rate": 1.4767932489451478e-06, + "loss": 0.0473, + "step": 15380 + }, + { + "epoch": 48.01405063291139, + "grad_norm": 1.6816853284835815, + "learning_rate": 1.4416315049226442e-06, + "loss": 0.0692, + "step": 15390 + }, + { + "epoch": 48.0146835443038, + "grad_norm": 0.0019629504531621933, + "learning_rate": 1.4064697609001406e-06, + "loss": 0.0566, + "step": 15400 + }, + { + "epoch": 48.0153164556962, + "grad_norm": 1.898903250694275, + "learning_rate": 1.3713080168776373e-06, + "loss": 0.0501, + "step": 15410 + }, + { + "epoch": 48.01594936708861, + "grad_norm": 0.0017211647937074304, + "learning_rate": 1.3361462728551337e-06, + "loss": 0.0268, + "step": 15420 + }, + { + "epoch": 48.01658227848101, + "grad_norm": 0.0072492752224206924, + "learning_rate": 1.3009845288326301e-06, + "loss": 0.0376, + "step": 15430 + }, + { + "epoch": 48.01721518987342, + "grad_norm": 0.0022765600588172674, + "learning_rate": 1.2658227848101265e-06, + "loss": 0.0423, + "step": 15440 + }, + { + "epoch": 48.01784810126582, + "grad_norm": 0.001874367124401033, + "learning_rate": 1.2306610407876232e-06, + "loss": 0.0247, + "step": 15450 + }, + { + "epoch": 48.01848101265823, + "grad_norm": 0.0012237579794600606, + 
"learning_rate": 1.1954992967651196e-06, + "loss": 0.0355, + "step": 15460 + }, + { + "epoch": 48.019113924050636, + "grad_norm": 1.6173341274261475, + "learning_rate": 1.1603375527426162e-06, + "loss": 0.07, + "step": 15470 + }, + { + "epoch": 48.01974683544304, + "grad_norm": 0.001786409760825336, + "learning_rate": 1.1251758087201126e-06, + "loss": 0.0435, + "step": 15480 + }, + { + "epoch": 48.02, + "eval_accuracy": 0.9769673704414588, + "eval_loss": 0.043503157794475555, + "eval_runtime": 856.2754, + "eval_samples_per_second": 0.608, + "eval_steps_per_second": 0.077, + "step": 15484 + }, + { + "epoch": 49.00037974683544, + "grad_norm": 1.8200874328613281, + "learning_rate": 1.090014064697609e-06, + "loss": 0.0552, + "step": 15490 + }, + { + "epoch": 49.00101265822785, + "grad_norm": 0.002268126467242837, + "learning_rate": 1.0548523206751055e-06, + "loss": 0.0194, + "step": 15500 + }, + { + "epoch": 49.00164556962025, + "grad_norm": 0.7153782844543457, + "learning_rate": 1.019690576652602e-06, + "loss": 0.104, + "step": 15510 + }, + { + "epoch": 49.00227848101266, + "grad_norm": 0.0011382299708202481, + "learning_rate": 9.845288326300985e-07, + "loss": 0.0602, + "step": 15520 + }, + { + "epoch": 49.00291139240506, + "grad_norm": 1.8290748596191406, + "learning_rate": 9.493670886075951e-07, + "loss": 0.0151, + "step": 15530 + }, + { + "epoch": 49.00354430379747, + "grad_norm": 0.0012674119789153337, + "learning_rate": 9.142053445850915e-07, + "loss": 0.0333, + "step": 15540 + }, + { + "epoch": 49.00417721518988, + "grad_norm": 1.7633377313613892, + "learning_rate": 8.790436005625879e-07, + "loss": 0.0616, + "step": 15550 + }, + { + "epoch": 49.00481012658228, + "grad_norm": 1.8792239427566528, + "learning_rate": 8.438818565400843e-07, + "loss": 0.0621, + "step": 15560 + }, + { + "epoch": 49.005443037974686, + "grad_norm": 0.002730746753513813, + "learning_rate": 8.08720112517581e-07, + "loss": 0.0706, + "step": 15570 + }, + { + "epoch": 49.00607594936709, + "grad_norm": 0.0031078814063221216, + "learning_rate": 7.735583684950775e-07, + "loss": 0.0644, + "step": 15580 + }, + { + "epoch": 49.006708860759495, + "grad_norm": 0.00169693015050143, + "learning_rate": 7.383966244725739e-07, + "loss": 0.0387, + "step": 15590 + }, + { + "epoch": 49.0073417721519, + "grad_norm": 0.0015373198548331857, + "learning_rate": 7.032348804500703e-07, + "loss": 0.0601, + "step": 15600 + }, + { + "epoch": 49.007974683544305, + "grad_norm": 0.008119100704789162, + "learning_rate": 6.680731364275668e-07, + "loss": 0.0289, + "step": 15610 + }, + { + "epoch": 49.008607594936706, + "grad_norm": 2.2671926021575928, + "learning_rate": 6.329113924050633e-07, + "loss": 0.0466, + "step": 15620 + }, + { + "epoch": 49.009240506329114, + "grad_norm": 0.0015134834684431553, + "learning_rate": 5.977496483825598e-07, + "loss": 0.0606, + "step": 15630 + }, + { + "epoch": 49.00987341772152, + "grad_norm": 1.3951467275619507, + "learning_rate": 5.625879043600563e-07, + "loss": 0.0563, + "step": 15640 + }, + { + "epoch": 49.01050632911392, + "grad_norm": 0.0020348818507045507, + "learning_rate": 5.274261603375527e-07, + "loss": 0.0662, + "step": 15650 + }, + { + "epoch": 49.01113924050633, + "grad_norm": 2.6892151832580566, + "learning_rate": 4.922644163150493e-07, + "loss": 0.0594, + "step": 15660 + }, + { + "epoch": 49.01177215189873, + "grad_norm": 0.0018980724271386862, + "learning_rate": 4.5710267229254574e-07, + "loss": 0.0116, + "step": 15670 + }, + { + "epoch": 49.01240506329114, + "grad_norm": 0.0024212722200900316, 
+ "learning_rate": 4.2194092827004216e-07, + "loss": 0.0489, + "step": 15680 + }, + { + "epoch": 49.01303797468354, + "grad_norm": 0.0014871886232867837, + "learning_rate": 3.8677918424753874e-07, + "loss": 0.0409, + "step": 15690 + }, + { + "epoch": 49.01367088607595, + "grad_norm": 0.0018424575682729483, + "learning_rate": 3.5161744022503516e-07, + "loss": 0.0436, + "step": 15700 + }, + { + "epoch": 49.01430379746836, + "grad_norm": 0.0016427412629127502, + "learning_rate": 3.1645569620253163e-07, + "loss": 0.0623, + "step": 15710 + }, + { + "epoch": 49.01493670886076, + "grad_norm": 0.00207283697091043, + "learning_rate": 2.8129395218002816e-07, + "loss": 0.099, + "step": 15720 + }, + { + "epoch": 49.01556962025317, + "grad_norm": 0.001088398857973516, + "learning_rate": 2.4613220815752463e-07, + "loss": 0.0557, + "step": 15730 + }, + { + "epoch": 49.01620253164557, + "grad_norm": 1.5488436222076416, + "learning_rate": 2.1097046413502108e-07, + "loss": 0.0514, + "step": 15740 + }, + { + "epoch": 49.01683544303798, + "grad_norm": 0.0019001478794962168, + "learning_rate": 1.7580872011251758e-07, + "loss": 0.0426, + "step": 15750 + }, + { + "epoch": 49.01746835443038, + "grad_norm": 0.0017688849475234747, + "learning_rate": 1.4064697609001408e-07, + "loss": 0.0131, + "step": 15760 + }, + { + "epoch": 49.018101265822786, + "grad_norm": 0.0013972530141472816, + "learning_rate": 1.0548523206751054e-07, + "loss": 0.0471, + "step": 15770 + }, + { + "epoch": 49.01873417721519, + "grad_norm": 0.0011928863823413849, + "learning_rate": 7.032348804500704e-08, + "loss": 0.0233, + "step": 15780 + }, + { + "epoch": 49.019367088607595, + "grad_norm": 0.0023855739273130894, + "learning_rate": 3.516174402250352e-08, + "loss": 0.0767, + "step": 15790 + }, + { + "epoch": 49.02, + "grad_norm": 0.0015424606390297413, + "learning_rate": 0.0, + "loss": 0.0463, + "step": 15800 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.9769673704414588, + "eval_loss": 0.04361777752637863, + "eval_runtime": 919.5227, + "eval_samples_per_second": 0.567, + "eval_steps_per_second": 0.072, + "step": 15800 + }, + { + "epoch": 49.02, + "step": 15800, + "total_flos": 1.5772028175840707e+20, + "train_loss": 0.3727006455846838, + "train_runtime": 274155.4834, + "train_samples_per_second": 0.461, + "train_steps_per_second": 0.058 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.8875, + "eval_loss": 0.4278368353843689, + "eval_runtime": 1123.5362, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.071, + "step": 15800 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.8875, + "eval_loss": 0.4278368353843689, + "eval_runtime": 1123.0215, + "eval_samples_per_second": 0.57, + "eval_steps_per_second": 0.071, + "step": 15800 + } + ], + "logging_steps": 10, + "max_steps": 15800, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "total_flos": 1.5772028175840707e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}