{ "best_metric": 0.9769673704414588, "best_model_checkpoint": "videomae-base-finetuned-isl-numbers-alphabet-nouns/checkpoint-13904", "epoch": 49.02, "eval_steps": 500, "global_step": 15800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006329113924050633, "grad_norm": 7.342864990234375, "learning_rate": 3.1645569620253163e-07, "loss": 5.0266, "step": 10 }, { "epoch": 0.0012658227848101266, "grad_norm": 7.16239070892334, "learning_rate": 6.329113924050633e-07, "loss": 5.0833, "step": 20 }, { "epoch": 0.0018987341772151898, "grad_norm": 7.36508846282959, "learning_rate": 9.493670886075951e-07, "loss": 5.0852, "step": 30 }, { "epoch": 0.002531645569620253, "grad_norm": 7.174118995666504, "learning_rate": 1.2658227848101265e-06, "loss": 5.1223, "step": 40 }, { "epoch": 0.0031645569620253164, "grad_norm": 7.650392055511475, "learning_rate": 1.5822784810126583e-06, "loss": 5.0765, "step": 50 }, { "epoch": 0.0037974683544303796, "grad_norm": 7.764711856842041, "learning_rate": 1.8987341772151901e-06, "loss": 5.0422, "step": 60 }, { "epoch": 0.004430379746835443, "grad_norm": 11.08586311340332, "learning_rate": 2.2151898734177215e-06, "loss": 5.0661, "step": 70 }, { "epoch": 0.005063291139240506, "grad_norm": 7.217874050140381, "learning_rate": 2.531645569620253e-06, "loss": 5.0943, "step": 80 }, { "epoch": 0.00569620253164557, "grad_norm": 7.349112510681152, "learning_rate": 2.848101265822785e-06, "loss": 5.1201, "step": 90 }, { "epoch": 0.006329113924050633, "grad_norm": 7.260085582733154, "learning_rate": 3.1645569620253167e-06, "loss": 5.0797, "step": 100 }, { "epoch": 0.006962025316455696, "grad_norm": 7.140063762664795, "learning_rate": 3.4810126582278482e-06, "loss": 5.0803, "step": 110 }, { "epoch": 0.007594936708860759, "grad_norm": 8.30470085144043, "learning_rate": 3.7974683544303802e-06, "loss": 5.061, "step": 120 }, { "epoch": 0.008227848101265823, "grad_norm": 8.53849983215332, "learning_rate": 4.113924050632911e-06, "loss": 5.0407, "step": 130 }, { "epoch": 0.008860759493670886, "grad_norm": 7.535670757293701, "learning_rate": 4.430379746835443e-06, "loss": 5.0505, "step": 140 }, { "epoch": 0.00949367088607595, "grad_norm": 8.011333465576172, "learning_rate": 4.746835443037975e-06, "loss": 5.0643, "step": 150 }, { "epoch": 0.010126582278481013, "grad_norm": 8.219789505004883, "learning_rate": 5.063291139240506e-06, "loss": 4.9714, "step": 160 }, { "epoch": 0.010759493670886076, "grad_norm": 8.42809009552002, "learning_rate": 5.379746835443038e-06, "loss": 4.9477, "step": 170 }, { "epoch": 0.01139240506329114, "grad_norm": 11.367712020874023, "learning_rate": 5.69620253164557e-06, "loss": 4.9509, "step": 180 }, { "epoch": 0.012025316455696202, "grad_norm": 9.428906440734863, "learning_rate": 6.012658227848101e-06, "loss": 4.9267, "step": 190 }, { "epoch": 0.012658227848101266, "grad_norm": 8.949156761169434, "learning_rate": 6.329113924050633e-06, "loss": 4.8931, "step": 200 }, { "epoch": 0.013291139240506329, "grad_norm": 8.922019004821777, "learning_rate": 6.6455696202531645e-06, "loss": 4.921, "step": 210 }, { "epoch": 0.013924050632911392, "grad_norm": 10.950883865356445, "learning_rate": 6.9620253164556965e-06, "loss": 4.781, "step": 220 }, { "epoch": 0.014556962025316455, "grad_norm": 10.333183288574219, "learning_rate": 7.2784810126582285e-06, "loss": 4.8136, "step": 230 }, { "epoch": 0.015189873417721518, "grad_norm": 11.363033294677734, "learning_rate": 7.5949367088607605e-06, "loss": 4.8154, "step": 240 }, { "epoch": 0.015822784810126583, "grad_norm": 10.919289588928223, "learning_rate": 7.911392405063292e-06, "loss": 4.8229, "step": 250 }, { "epoch": 0.016455696202531647, "grad_norm": 13.473514556884766, "learning_rate": 8.227848101265822e-06, "loss": 4.6251, "step": 260 }, { "epoch": 0.01708860759493671, "grad_norm": 13.167325973510742, "learning_rate": 8.544303797468354e-06, "loss": 4.6354, "step": 270 }, { "epoch": 0.017721518987341773, "grad_norm": 12.691937446594238, "learning_rate": 8.860759493670886e-06, "loss": 4.6029, "step": 280 }, { "epoch": 0.018354430379746836, "grad_norm": 13.479981422424316, "learning_rate": 9.177215189873418e-06, "loss": 4.621, "step": 290 }, { "epoch": 0.0189873417721519, "grad_norm": 12.953887939453125, "learning_rate": 9.49367088607595e-06, "loss": 4.5616, "step": 300 }, { "epoch": 0.019620253164556962, "grad_norm": 13.775838851928711, "learning_rate": 9.81012658227848e-06, "loss": 4.5228, "step": 310 }, { "epoch": 0.02, "eval_accuracy": 0.2533589251439539, "eval_loss": 4.3514204025268555, "eval_runtime": 834.4919, "eval_samples_per_second": 0.624, "eval_steps_per_second": 0.079, "step": 316 }, { "epoch": 1.0002531645569621, "grad_norm": 12.405595779418945, "learning_rate": 1.0126582278481012e-05, "loss": 4.3858, "step": 320 }, { "epoch": 1.0008860759493672, "grad_norm": 14.15708065032959, "learning_rate": 1.0443037974683544e-05, "loss": 4.3131, "step": 330 }, { "epoch": 1.0015189873417722, "grad_norm": 12.798722267150879, "learning_rate": 1.0759493670886076e-05, "loss": 4.3399, "step": 340 }, { "epoch": 1.0021518987341773, "grad_norm": 15.9908447265625, "learning_rate": 1.1075949367088608e-05, "loss": 4.2376, "step": 350 }, { "epoch": 1.0027848101265824, "grad_norm": 12.808874130249023, "learning_rate": 1.139240506329114e-05, "loss": 4.1385, "step": 360 }, { "epoch": 1.0034177215189874, "grad_norm": 15.553874969482422, "learning_rate": 1.170886075949367e-05, "loss": 4.202, "step": 370 }, { "epoch": 1.0040506329113925, "grad_norm": 12.774052619934082, "learning_rate": 1.2025316455696203e-05, "loss": 4.0002, "step": 380 }, { "epoch": 1.0046835443037974, "grad_norm": 15.000925064086914, "learning_rate": 1.2341772151898735e-05, "loss": 4.0083, "step": 390 }, { "epoch": 1.0053164556962024, "grad_norm": 13.524576187133789, "learning_rate": 1.2658227848101267e-05, "loss": 4.086, "step": 400 }, { "epoch": 1.0059493670886075, "grad_norm": 14.740311622619629, "learning_rate": 1.2974683544303799e-05, "loss": 4.0018, "step": 410 }, { "epoch": 1.0065822784810126, "grad_norm": 13.781682968139648, "learning_rate": 1.3291139240506329e-05, "loss": 3.9788, "step": 420 }, { "epoch": 1.0072151898734176, "grad_norm": 17.60359764099121, "learning_rate": 1.3607594936708861e-05, "loss": 3.7957, "step": 430 }, { "epoch": 1.0078481012658227, "grad_norm": 15.860666275024414, "learning_rate": 1.3924050632911393e-05, "loss": 3.8316, "step": 440 }, { "epoch": 1.0084810126582278, "grad_norm": 18.55211639404297, "learning_rate": 1.4240506329113925e-05, "loss": 3.7908, "step": 450 }, { "epoch": 1.0091139240506328, "grad_norm": 12.959016799926758, "learning_rate": 1.4556962025316457e-05, "loss": 3.6121, "step": 460 }, { "epoch": 1.009746835443038, "grad_norm": 14.276836395263672, "learning_rate": 1.4873417721518987e-05, "loss": 3.6771, "step": 470 }, { "epoch": 1.010379746835443, "grad_norm": 15.222319602966309, "learning_rate": 1.5189873417721521e-05, "loss": 3.695, "step": 480 }, { "epoch": 1.011012658227848, "grad_norm": 12.566011428833008, "learning_rate": 1.550632911392405e-05, "loss": 3.6071, "step": 490 }, { "epoch": 1.011645569620253, "grad_norm": 13.210318565368652, "learning_rate": 1.5822784810126583e-05, "loss": 3.4652, "step": 500 }, { "epoch": 1.0122784810126582, "grad_norm": 14.671469688415527, "learning_rate": 1.6139240506329115e-05, "loss": 3.457, "step": 510 }, { "epoch": 1.0129113924050632, "grad_norm": 12.707144737243652, "learning_rate": 1.6455696202531644e-05, "loss": 3.5506, "step": 520 }, { "epoch": 1.0135443037974683, "grad_norm": 15.84700870513916, "learning_rate": 1.677215189873418e-05, "loss": 3.433, "step": 530 }, { "epoch": 1.0141772151898734, "grad_norm": 13.466840744018555, "learning_rate": 1.7088607594936708e-05, "loss": 3.4244, "step": 540 }, { "epoch": 1.0148101265822784, "grad_norm": 12.980550765991211, "learning_rate": 1.7405063291139243e-05, "loss": 3.3659, "step": 550 }, { "epoch": 1.0154430379746835, "grad_norm": 12.236871719360352, "learning_rate": 1.7721518987341772e-05, "loss": 3.3406, "step": 560 }, { "epoch": 1.0160759493670886, "grad_norm": 14.126879692077637, "learning_rate": 1.8037974683544304e-05, "loss": 3.2676, "step": 570 }, { "epoch": 1.0167088607594936, "grad_norm": 14.927568435668945, "learning_rate": 1.8354430379746836e-05, "loss": 3.1862, "step": 580 }, { "epoch": 1.0173417721518987, "grad_norm": 14.192325592041016, "learning_rate": 1.8670886075949368e-05, "loss": 3.2144, "step": 590 }, { "epoch": 1.0179746835443038, "grad_norm": 12.938292503356934, "learning_rate": 1.89873417721519e-05, "loss": 3.1199, "step": 600 }, { "epoch": 1.0186075949367088, "grad_norm": 15.724618911743164, "learning_rate": 1.9303797468354432e-05, "loss": 3.1761, "step": 610 }, { "epoch": 1.019240506329114, "grad_norm": 12.795076370239258, "learning_rate": 1.962025316455696e-05, "loss": 3.0823, "step": 620 }, { "epoch": 1.019873417721519, "grad_norm": 16.50122833251953, "learning_rate": 1.9936708860759496e-05, "loss": 3.0795, "step": 630 }, { "epoch": 1.02, "eval_accuracy": 0.581573896353167, "eval_loss": 2.8514750003814697, "eval_runtime": 833.5163, "eval_samples_per_second": 0.625, "eval_steps_per_second": 0.079, "step": 632 }, { "epoch": 2.0005063291139242, "grad_norm": 16.488386154174805, "learning_rate": 2.0253164556962025e-05, "loss": 2.7661, "step": 640 }, { "epoch": 2.0011392405063293, "grad_norm": 13.507458686828613, "learning_rate": 2.056962025316456e-05, "loss": 2.6532, "step": 650 }, { "epoch": 2.0017721518987344, "grad_norm": 12.340897560119629, "learning_rate": 2.088607594936709e-05, "loss": 2.6855, "step": 660 }, { "epoch": 2.0024050632911394, "grad_norm": 13.872140884399414, "learning_rate": 2.120253164556962e-05, "loss": 2.8458, "step": 670 }, { "epoch": 2.0030379746835445, "grad_norm": 12.209596633911133, "learning_rate": 2.1518987341772153e-05, "loss": 2.6217, "step": 680 }, { "epoch": 2.0036708860759496, "grad_norm": 12.713300704956055, "learning_rate": 2.1835443037974685e-05, "loss": 2.6449, "step": 690 }, { "epoch": 2.0043037974683546, "grad_norm": 13.209062576293945, "learning_rate": 2.2151898734177217e-05, "loss": 2.6258, "step": 700 }, { "epoch": 2.0049367088607597, "grad_norm": 13.877063751220703, "learning_rate": 2.246835443037975e-05, "loss": 2.5975, "step": 710 }, { "epoch": 2.0055696202531648, "grad_norm": 15.146308898925781, "learning_rate": 2.278481012658228e-05, "loss": 2.4536, "step": 720 }, { "epoch": 2.00620253164557, "grad_norm": 12.468453407287598, "learning_rate": 2.3101265822784813e-05, "loss": 2.4758, "step": 730 }, { "epoch": 2.006835443037975, "grad_norm": 13.691295623779297, "learning_rate": 2.341772151898734e-05, "loss": 2.6211, "step": 740 }, { "epoch": 2.00746835443038, "grad_norm": 13.543225288391113, "learning_rate": 2.3734177215189873e-05, "loss": 2.5074, "step": 750 }, { "epoch": 2.008101265822785, "grad_norm": 12.138303756713867, "learning_rate": 2.4050632911392405e-05, "loss": 2.2878, "step": 760 }, { "epoch": 2.0087341772151897, "grad_norm": 12.926602363586426, "learning_rate": 2.4367088607594937e-05, "loss": 2.3154, "step": 770 }, { "epoch": 2.0093670886075947, "grad_norm": 13.598923683166504, "learning_rate": 2.468354430379747e-05, "loss": 2.2751, "step": 780 }, { "epoch": 2.01, "grad_norm": 12.883562088012695, "learning_rate": 2.5e-05, "loss": 2.3078, "step": 790 }, { "epoch": 2.010632911392405, "grad_norm": 13.546457290649414, "learning_rate": 2.5316455696202533e-05, "loss": 2.5038, "step": 800 }, { "epoch": 2.01126582278481, "grad_norm": 15.554511070251465, "learning_rate": 2.5632911392405062e-05, "loss": 2.3539, "step": 810 }, { "epoch": 2.011898734177215, "grad_norm": 12.446627616882324, "learning_rate": 2.5949367088607597e-05, "loss": 2.4136, "step": 820 }, { "epoch": 2.01253164556962, "grad_norm": 17.281230926513672, "learning_rate": 2.626582278481013e-05, "loss": 2.2076, "step": 830 }, { "epoch": 2.013164556962025, "grad_norm": 11.104803085327148, "learning_rate": 2.6582278481012658e-05, "loss": 2.1716, "step": 840 }, { "epoch": 2.01379746835443, "grad_norm": 15.020671844482422, "learning_rate": 2.689873417721519e-05, "loss": 2.3163, "step": 850 }, { "epoch": 2.0144303797468353, "grad_norm": 12.115602493286133, "learning_rate": 2.7215189873417722e-05, "loss": 2.1664, "step": 860 }, { "epoch": 2.0150632911392403, "grad_norm": 13.693445205688477, "learning_rate": 2.7531645569620257e-05, "loss": 2.1606, "step": 870 }, { "epoch": 2.0156962025316454, "grad_norm": 10.924906730651855, "learning_rate": 2.7848101265822786e-05, "loss": 2.136, "step": 880 }, { "epoch": 2.0163291139240505, "grad_norm": 11.92796802520752, "learning_rate": 2.8164556962025318e-05, "loss": 2.1733, "step": 890 }, { "epoch": 2.0169620253164555, "grad_norm": 12.855241775512695, "learning_rate": 2.848101265822785e-05, "loss": 2.07, "step": 900 }, { "epoch": 2.0175949367088606, "grad_norm": 14.710478782653809, "learning_rate": 2.879746835443038e-05, "loss": 2.1004, "step": 910 }, { "epoch": 2.0182278481012657, "grad_norm": 14.612150192260742, "learning_rate": 2.9113924050632914e-05, "loss": 2.011, "step": 920 }, { "epoch": 2.0188607594936707, "grad_norm": 15.704828262329102, "learning_rate": 2.9430379746835446e-05, "loss": 2.0263, "step": 930 }, { "epoch": 2.019493670886076, "grad_norm": 12.45781421661377, "learning_rate": 2.9746835443037974e-05, "loss": 1.8438, "step": 940 }, { "epoch": 2.02, "eval_accuracy": 0.7332053742802304, "eval_loss": 1.7508126497268677, "eval_runtime": 838.4982, "eval_samples_per_second": 0.621, "eval_steps_per_second": 0.079, "step": 948 }, { "epoch": 3.000126582278481, "grad_norm": 11.997598648071289, "learning_rate": 3.0063291139240506e-05, "loss": 1.8218, "step": 950 }, { "epoch": 3.000759493670886, "grad_norm": 10.052958488464355, "learning_rate": 3.0379746835443042e-05, "loss": 1.8534, "step": 960 }, { "epoch": 3.001392405063291, "grad_norm": 12.40282154083252, "learning_rate": 3.0696202531645574e-05, "loss": 1.7654, "step": 970 }, { "epoch": 3.002025316455696, "grad_norm": 12.564291000366211, "learning_rate": 3.10126582278481e-05, "loss": 1.537, "step": 980 }, { "epoch": 3.002658227848101, "grad_norm": 12.120141983032227, "learning_rate": 3.132911392405064e-05, "loss": 1.7112, "step": 990 }, { "epoch": 3.003291139240506, "grad_norm": 9.203033447265625, "learning_rate": 3.1645569620253167e-05, "loss": 1.5344, "step": 1000 }, { "epoch": 3.0039240506329112, "grad_norm": 10.776932716369629, "learning_rate": 3.1962025316455695e-05, "loss": 1.6746, "step": 1010 }, { "epoch": 3.0045569620253163, "grad_norm": 13.07652759552002, "learning_rate": 3.227848101265823e-05, "loss": 1.7123, "step": 1020 }, { "epoch": 3.0051898734177214, "grad_norm": 12.412099838256836, "learning_rate": 3.2594936708860766e-05, "loss": 1.6235, "step": 1030 }, { "epoch": 3.0058227848101264, "grad_norm": 11.809402465820312, "learning_rate": 3.291139240506329e-05, "loss": 1.7162, "step": 1040 }, { "epoch": 3.0064556962025315, "grad_norm": 10.014476776123047, "learning_rate": 3.322784810126582e-05, "loss": 1.5555, "step": 1050 }, { "epoch": 3.0070886075949366, "grad_norm": 11.16119384765625, "learning_rate": 3.354430379746836e-05, "loss": 1.5755, "step": 1060 }, { "epoch": 3.0077215189873416, "grad_norm": 11.865368843078613, "learning_rate": 3.386075949367089e-05, "loss": 1.4931, "step": 1070 }, { "epoch": 3.0083544303797467, "grad_norm": 10.88665771484375, "learning_rate": 3.4177215189873416e-05, "loss": 1.4321, "step": 1080 }, { "epoch": 3.0089873417721518, "grad_norm": 15.595149993896484, "learning_rate": 3.449367088607595e-05, "loss": 1.7013, "step": 1090 }, { "epoch": 3.009620253164557, "grad_norm": 12.852721214294434, "learning_rate": 3.4810126582278487e-05, "loss": 1.4012, "step": 1100 }, { "epoch": 3.010253164556962, "grad_norm": 11.31406307220459, "learning_rate": 3.5126582278481015e-05, "loss": 1.3529, "step": 1110 }, { "epoch": 3.010886075949367, "grad_norm": 13.643885612487793, "learning_rate": 3.5443037974683544e-05, "loss": 1.6591, "step": 1120 }, { "epoch": 3.011518987341772, "grad_norm": 8.9163818359375, "learning_rate": 3.575949367088608e-05, "loss": 1.5645, "step": 1130 }, { "epoch": 3.012151898734177, "grad_norm": 11.10364818572998, "learning_rate": 3.607594936708861e-05, "loss": 1.573, "step": 1140 }, { "epoch": 3.012784810126582, "grad_norm": 12.710495948791504, "learning_rate": 3.639240506329114e-05, "loss": 1.4811, "step": 1150 }, { "epoch": 3.0134177215189872, "grad_norm": 9.857568740844727, "learning_rate": 3.670886075949367e-05, "loss": 1.3618, "step": 1160 }, { "epoch": 3.0140506329113923, "grad_norm": 12.947936058044434, "learning_rate": 3.70253164556962e-05, "loss": 1.4507, "step": 1170 }, { "epoch": 3.0146835443037974, "grad_norm": 7.388261795043945, "learning_rate": 3.7341772151898736e-05, "loss": 1.368, "step": 1180 }, { "epoch": 3.0153164556962024, "grad_norm": 12.413993835449219, "learning_rate": 3.765822784810127e-05, "loss": 1.2501, "step": 1190 }, { "epoch": 3.0159493670886075, "grad_norm": 9.680545806884766, "learning_rate": 3.79746835443038e-05, "loss": 1.3826, "step": 1200 }, { "epoch": 3.0165822784810126, "grad_norm": 10.841100692749023, "learning_rate": 3.829113924050633e-05, "loss": 1.3361, "step": 1210 }, { "epoch": 3.0172151898734176, "grad_norm": 10.57774829864502, "learning_rate": 3.8607594936708864e-05, "loss": 1.4978, "step": 1220 }, { "epoch": 3.0178481012658227, "grad_norm": 20.876358032226562, "learning_rate": 3.89240506329114e-05, "loss": 1.4616, "step": 1230 }, { "epoch": 3.0184810126582278, "grad_norm": 13.871617317199707, "learning_rate": 3.924050632911392e-05, "loss": 1.2869, "step": 1240 }, { "epoch": 3.019113924050633, "grad_norm": 10.208392143249512, "learning_rate": 3.9556962025316456e-05, "loss": 1.4173, "step": 1250 }, { "epoch": 3.019746835443038, "grad_norm": 13.292877197265625, "learning_rate": 3.987341772151899e-05, "loss": 1.1451, "step": 1260 }, { "epoch": 3.02, "eval_accuracy": 0.7389635316698656, "eval_loss": 1.146372675895691, "eval_runtime": 855.4763, "eval_samples_per_second": 0.609, "eval_steps_per_second": 0.077, "step": 1264 }, { "epoch": 4.000379746835443, "grad_norm": 12.077232360839844, "learning_rate": 4.018987341772152e-05, "loss": 1.186, "step": 1270 }, { "epoch": 4.0010126582278485, "grad_norm": 11.689604759216309, "learning_rate": 4.050632911392405e-05, "loss": 1.2715, "step": 1280 }, { "epoch": 4.001645569620253, "grad_norm": 10.000913619995117, "learning_rate": 4.0822784810126584e-05, "loss": 1.1501, "step": 1290 }, { "epoch": 4.002278481012659, "grad_norm": 11.559700965881348, "learning_rate": 4.113924050632912e-05, "loss": 1.3049, "step": 1300 }, { "epoch": 4.002911392405063, "grad_norm": 9.99675178527832, "learning_rate": 4.145569620253165e-05, "loss": 1.1664, "step": 1310 }, { "epoch": 4.003544303797469, "grad_norm": 10.537801742553711, "learning_rate": 4.177215189873418e-05, "loss": 1.0544, "step": 1320 }, { "epoch": 4.004177215189873, "grad_norm": 7.792250633239746, "learning_rate": 4.208860759493671e-05, "loss": 1.185, "step": 1330 }, { "epoch": 4.004810126582279, "grad_norm": 7.729872226715088, "learning_rate": 4.240506329113924e-05, "loss": 0.958, "step": 1340 }, { "epoch": 4.0054430379746835, "grad_norm": 8.17205810546875, "learning_rate": 4.2721518987341776e-05, "loss": 1.1805, "step": 1350 }, { "epoch": 4.006075949367089, "grad_norm": 8.281828880310059, "learning_rate": 4.3037974683544305e-05, "loss": 1.0389, "step": 1360 }, { "epoch": 4.006708860759494, "grad_norm": 10.64874267578125, "learning_rate": 4.3354430379746834e-05, "loss": 1.1096, "step": 1370 }, { "epoch": 4.007341772151899, "grad_norm": 10.857294082641602, "learning_rate": 4.367088607594937e-05, "loss": 1.0934, "step": 1380 }, { "epoch": 4.007974683544304, "grad_norm": 10.056556701660156, "learning_rate": 4.3987341772151904e-05, "loss": 1.0382, "step": 1390 }, { "epoch": 4.008607594936709, "grad_norm": 5.310425281524658, "learning_rate": 4.430379746835443e-05, "loss": 0.8079, "step": 1400 }, { "epoch": 4.009240506329114, "grad_norm": 9.902403831481934, "learning_rate": 4.462025316455696e-05, "loss": 1.1619, "step": 1410 }, { "epoch": 4.009873417721519, "grad_norm": 10.88439655303955, "learning_rate": 4.49367088607595e-05, "loss": 1.1363, "step": 1420 }, { "epoch": 4.010506329113924, "grad_norm": 8.662186622619629, "learning_rate": 4.525316455696203e-05, "loss": 1.0104, "step": 1430 }, { "epoch": 4.0111392405063295, "grad_norm": 7.63654899597168, "learning_rate": 4.556962025316456e-05, "loss": 0.9476, "step": 1440 }, { "epoch": 4.011772151898734, "grad_norm": 15.624666213989258, "learning_rate": 4.588607594936709e-05, "loss": 1.0386, "step": 1450 }, { "epoch": 4.01240506329114, "grad_norm": 10.169722557067871, "learning_rate": 4.6202531645569625e-05, "loss": 1.0147, "step": 1460 }, { "epoch": 4.013037974683544, "grad_norm": 10.036338806152344, "learning_rate": 4.6518987341772154e-05, "loss": 1.0159, "step": 1470 }, { "epoch": 4.01367088607595, "grad_norm": 10.780523300170898, "learning_rate": 4.683544303797468e-05, "loss": 1.0246, "step": 1480 }, { "epoch": 4.014303797468354, "grad_norm": 12.800423622131348, "learning_rate": 4.715189873417722e-05, "loss": 0.9578, "step": 1490 }, { "epoch": 4.01493670886076, "grad_norm": 11.1471586227417, "learning_rate": 4.7468354430379746e-05, "loss": 0.9799, "step": 1500 }, { "epoch": 4.0155696202531646, "grad_norm": 5.804954528808594, "learning_rate": 4.778481012658228e-05, "loss": 0.9171, "step": 1510 }, { "epoch": 4.01620253164557, "grad_norm": 14.239872932434082, "learning_rate": 4.810126582278481e-05, "loss": 1.1415, "step": 1520 }, { "epoch": 4.016835443037975, "grad_norm": 6.897063255310059, "learning_rate": 4.8417721518987346e-05, "loss": 1.0636, "step": 1530 }, { "epoch": 4.017468354430379, "grad_norm": 12.96029281616211, "learning_rate": 4.8734177215189874e-05, "loss": 0.9399, "step": 1540 }, { "epoch": 4.018101265822785, "grad_norm": 17.250343322753906, "learning_rate": 4.905063291139241e-05, "loss": 0.8518, "step": 1550 }, { "epoch": 4.018734177215189, "grad_norm": 10.851552963256836, "learning_rate": 4.936708860759494e-05, "loss": 1.091, "step": 1560 }, { "epoch": 4.019367088607595, "grad_norm": 16.084505081176758, "learning_rate": 4.968354430379747e-05, "loss": 1.0378, "step": 1570 }, { "epoch": 4.02, "grad_norm": 11.509261131286621, "learning_rate": 5e-05, "loss": 1.0637, "step": 1580 }, { "epoch": 4.02, "eval_accuracy": 0.7773512476007678, "eval_loss": 0.7994989156723022, "eval_runtime": 871.4001, "eval_samples_per_second": 0.598, "eval_steps_per_second": 0.076, "step": 1580 }, { "epoch": 5.000632911392405, "grad_norm": 8.971628189086914, "learning_rate": 4.99648382559775e-05, "loss": 0.8316, "step": 1590 }, { "epoch": 5.00126582278481, "grad_norm": 13.378620147705078, "learning_rate": 4.9929676511955e-05, "loss": 0.8914, "step": 1600 }, { "epoch": 5.001898734177215, "grad_norm": 9.115056037902832, "learning_rate": 4.989451476793249e-05, "loss": 0.9715, "step": 1610 }, { "epoch": 5.00253164556962, "grad_norm": 20.863187789916992, "learning_rate": 4.985935302390999e-05, "loss": 0.7779, "step": 1620 }, { "epoch": 5.003164556962025, "grad_norm": 10.088607788085938, "learning_rate": 4.982419127988748e-05, "loss": 0.8775, "step": 1630 }, { "epoch": 5.00379746835443, "grad_norm": 6.800666332244873, "learning_rate": 4.9789029535864986e-05, "loss": 0.7337, "step": 1640 }, { "epoch": 5.004430379746835, "grad_norm": 12.704118728637695, "learning_rate": 4.975386779184248e-05, "loss": 0.7668, "step": 1650 }, { "epoch": 5.0050632911392405, "grad_norm": 7.471581935882568, "learning_rate": 4.9718706047819975e-05, "loss": 0.7418, "step": 1660 }, { "epoch": 5.005696202531645, "grad_norm": 11.495733261108398, "learning_rate": 4.968354430379747e-05, "loss": 0.8782, "step": 1670 }, { "epoch": 5.006329113924051, "grad_norm": 7.495334148406982, "learning_rate": 4.964838255977497e-05, "loss": 0.7234, "step": 1680 }, { "epoch": 5.006962025316455, "grad_norm": 9.78715991973877, "learning_rate": 4.9613220815752464e-05, "loss": 0.8924, "step": 1690 }, { "epoch": 5.007594936708861, "grad_norm": 6.057085990905762, "learning_rate": 4.957805907172996e-05, "loss": 0.6553, "step": 1700 }, { "epoch": 5.008227848101265, "grad_norm": 7.595339298248291, "learning_rate": 4.9542897327707454e-05, "loss": 0.9012, "step": 1710 }, { "epoch": 5.008860759493671, "grad_norm": 11.763632774353027, "learning_rate": 4.950773558368496e-05, "loss": 0.7808, "step": 1720 }, { "epoch": 5.0094936708860756, "grad_norm": 9.754287719726562, "learning_rate": 4.947257383966245e-05, "loss": 0.8886, "step": 1730 }, { "epoch": 5.010126582278481, "grad_norm": 7.873773097991943, "learning_rate": 4.943741209563995e-05, "loss": 0.6798, "step": 1740 }, { "epoch": 5.010759493670886, "grad_norm": 9.378247261047363, "learning_rate": 4.940225035161744e-05, "loss": 0.763, "step": 1750 }, { "epoch": 5.011392405063291, "grad_norm": 13.096108436584473, "learning_rate": 4.936708860759494e-05, "loss": 0.7834, "step": 1760 }, { "epoch": 5.012025316455696, "grad_norm": 5.072068691253662, "learning_rate": 4.933192686357244e-05, "loss": 0.6282, "step": 1770 }, { "epoch": 5.012658227848101, "grad_norm": 14.543283462524414, "learning_rate": 4.929676511954993e-05, "loss": 0.6679, "step": 1780 }, { "epoch": 5.013291139240506, "grad_norm": 11.026363372802734, "learning_rate": 4.9261603375527427e-05, "loss": 0.7401, "step": 1790 }, { "epoch": 5.0139240506329115, "grad_norm": 5.937496662139893, "learning_rate": 4.9226441631504925e-05, "loss": 0.7408, "step": 1800 }, { "epoch": 5.014556962025316, "grad_norm": 9.093450546264648, "learning_rate": 4.919127988748242e-05, "loss": 0.9351, "step": 1810 }, { "epoch": 5.015189873417722, "grad_norm": 6.913345813751221, "learning_rate": 4.9156118143459915e-05, "loss": 0.6614, "step": 1820 }, { "epoch": 5.015822784810126, "grad_norm": 5.915560245513916, "learning_rate": 4.912095639943741e-05, "loss": 0.6584, "step": 1830 }, { "epoch": 5.016455696202532, "grad_norm": 9.257479667663574, "learning_rate": 4.908579465541491e-05, "loss": 0.6744, "step": 1840 }, { "epoch": 5.017088607594936, "grad_norm": 12.13564395904541, "learning_rate": 4.905063291139241e-05, "loss": 0.7035, "step": 1850 }, { "epoch": 5.017721518987342, "grad_norm": 6.930849075317383, "learning_rate": 4.90154711673699e-05, "loss": 0.7628, "step": 1860 }, { "epoch": 5.0183544303797465, "grad_norm": 13.400097846984863, "learning_rate": 4.89803094233474e-05, "loss": 0.6906, "step": 1870 }, { "epoch": 5.018987341772152, "grad_norm": 3.020031452178955, "learning_rate": 4.89451476793249e-05, "loss": 0.8423, "step": 1880 }, { "epoch": 5.019620253164557, "grad_norm": 11.807697296142578, "learning_rate": 4.8909985935302396e-05, "loss": 0.7795, "step": 1890 }, { "epoch": 5.02, "eval_accuracy": 0.8829174664107485, "eval_loss": 0.4938121438026428, "eval_runtime": 888.2928, "eval_samples_per_second": 0.587, "eval_steps_per_second": 0.074, "step": 1896 }, { "epoch": 6.000253164556962, "grad_norm": 1.9311364889144897, "learning_rate": 4.887482419127989e-05, "loss": 0.5037, "step": 1900 }, { "epoch": 6.000886075949367, "grad_norm": 2.175624370574951, "learning_rate": 4.8839662447257386e-05, "loss": 0.5447, "step": 1910 }, { "epoch": 6.001518987341772, "grad_norm": 6.488412380218506, "learning_rate": 4.8804500703234885e-05, "loss": 0.4324, "step": 1920 }, { "epoch": 6.002151898734177, "grad_norm": 7.727587699890137, "learning_rate": 4.876933895921238e-05, "loss": 0.7189, "step": 1930 }, { "epoch": 6.002784810126582, "grad_norm": 9.56769847869873, "learning_rate": 4.8734177215189874e-05, "loss": 0.4081, "step": 1940 }, { "epoch": 6.0034177215189874, "grad_norm": 4.022727012634277, "learning_rate": 4.869901547116737e-05, "loss": 0.6191, "step": 1950 }, { "epoch": 6.004050632911392, "grad_norm": 4.674831867218018, "learning_rate": 4.866385372714487e-05, "loss": 0.523, "step": 1960 }, { "epoch": 6.004683544303798, "grad_norm": 9.083635330200195, "learning_rate": 4.862869198312236e-05, "loss": 0.5281, "step": 1970 }, { "epoch": 6.005316455696202, "grad_norm": 11.267608642578125, "learning_rate": 4.859353023909986e-05, "loss": 0.3717, "step": 1980 }, { "epoch": 6.005949367088608, "grad_norm": 4.938966751098633, "learning_rate": 4.855836849507735e-05, "loss": 0.4737, "step": 1990 }, { "epoch": 6.006582278481012, "grad_norm": 7.700178623199463, "learning_rate": 4.852320675105486e-05, "loss": 0.507, "step": 2000 }, { "epoch": 6.007215189873418, "grad_norm": 13.916314125061035, "learning_rate": 4.848804500703235e-05, "loss": 0.4627, "step": 2010 }, { "epoch": 6.0078481012658225, "grad_norm": 9.637331008911133, "learning_rate": 4.845288326300985e-05, "loss": 0.509, "step": 2020 }, { "epoch": 6.008481012658228, "grad_norm": 6.41615629196167, "learning_rate": 4.8417721518987346e-05, "loss": 0.4817, "step": 2030 }, { "epoch": 6.009113924050633, "grad_norm": 4.391960144042969, "learning_rate": 4.8382559774964844e-05, "loss": 0.6306, "step": 2040 }, { "epoch": 6.009746835443038, "grad_norm": 3.4830005168914795, "learning_rate": 4.8347398030942336e-05, "loss": 0.6192, "step": 2050 }, { "epoch": 6.010379746835443, "grad_norm": 8.927491188049316, "learning_rate": 4.8312236286919834e-05, "loss": 0.3656, "step": 2060 }, { "epoch": 6.011012658227848, "grad_norm": 6.872843265533447, "learning_rate": 4.827707454289733e-05, "loss": 0.4649, "step": 2070 }, { "epoch": 6.011645569620253, "grad_norm": 5.7255682945251465, "learning_rate": 4.824191279887483e-05, "loss": 0.5626, "step": 2080 }, { "epoch": 6.012278481012658, "grad_norm": 12.707902908325195, "learning_rate": 4.820675105485232e-05, "loss": 0.7542, "step": 2090 }, { "epoch": 6.012911392405063, "grad_norm": 8.386948585510254, "learning_rate": 4.817158931082982e-05, "loss": 0.5574, "step": 2100 }, { "epoch": 6.0135443037974685, "grad_norm": 5.364466667175293, "learning_rate": 4.813642756680732e-05, "loss": 0.5088, "step": 2110 }, { "epoch": 6.014177215189873, "grad_norm": 7.0593791007995605, "learning_rate": 4.810126582278481e-05, "loss": 0.4507, "step": 2120 }, { "epoch": 6.014810126582279, "grad_norm": 4.73020076751709, "learning_rate": 4.806610407876231e-05, "loss": 0.4129, "step": 2130 }, { "epoch": 6.015443037974683, "grad_norm": 9.857553482055664, "learning_rate": 4.80309423347398e-05, "loss": 0.6213, "step": 2140 }, { "epoch": 6.016075949367089, "grad_norm": 3.092639446258545, "learning_rate": 4.7995780590717305e-05, "loss": 0.465, "step": 2150 }, { "epoch": 6.016708860759493, "grad_norm": 10.670005798339844, "learning_rate": 4.79606188466948e-05, "loss": 0.5071, "step": 2160 }, { "epoch": 6.017341772151899, "grad_norm": 10.013630867004395, "learning_rate": 4.7925457102672295e-05, "loss": 0.4878, "step": 2170 }, { "epoch": 6.0179746835443035, "grad_norm": 5.06149959564209, "learning_rate": 4.789029535864979e-05, "loss": 0.4294, "step": 2180 }, { "epoch": 6.018607594936709, "grad_norm": 6.84969425201416, "learning_rate": 4.785513361462729e-05, "loss": 0.4591, "step": 2190 }, { "epoch": 6.019240506329114, "grad_norm": 16.97979736328125, "learning_rate": 4.7819971870604783e-05, "loss": 0.3969, "step": 2200 }, { "epoch": 6.019873417721519, "grad_norm": 4.75033712387085, "learning_rate": 4.778481012658228e-05, "loss": 0.4484, "step": 2210 }, { "epoch": 6.02, "eval_accuracy": 0.8829174664107485, "eval_loss": 0.38333675265312195, "eval_runtime": 852.5332, "eval_samples_per_second": 0.611, "eval_steps_per_second": 0.077, "step": 2212 }, { "epoch": 7.000506329113924, "grad_norm": 14.740522384643555, "learning_rate": 4.774964838255977e-05, "loss": 0.4139, "step": 2220 }, { "epoch": 7.001139240506329, "grad_norm": 2.6119625568389893, "learning_rate": 4.771448663853728e-05, "loss": 0.3493, "step": 2230 }, { "epoch": 7.001772151898734, "grad_norm": 1.260722279548645, "learning_rate": 4.767932489451477e-05, "loss": 0.2562, "step": 2240 }, { "epoch": 7.002405063291139, "grad_norm": 2.5697829723358154, "learning_rate": 4.764416315049227e-05, "loss": 0.3227, "step": 2250 }, { "epoch": 7.0030379746835445, "grad_norm": 5.382594585418701, "learning_rate": 4.760900140646976e-05, "loss": 0.42, "step": 2260 }, { "epoch": 7.003670886075949, "grad_norm": 3.63082218170166, "learning_rate": 4.757383966244726e-05, "loss": 0.3715, "step": 2270 }, { "epoch": 7.004303797468355, "grad_norm": 4.501291751861572, "learning_rate": 4.7538677918424756e-05, "loss": 0.2527, "step": 2280 }, { "epoch": 7.004936708860759, "grad_norm": 6.318272590637207, "learning_rate": 4.7503516174402255e-05, "loss": 0.2922, "step": 2290 }, { "epoch": 7.005569620253165, "grad_norm": 15.981465339660645, "learning_rate": 4.7468354430379746e-05, "loss": 0.4605, "step": 2300 }, { "epoch": 7.006202531645569, "grad_norm": 7.8365020751953125, "learning_rate": 4.7433192686357245e-05, "loss": 0.4728, "step": 2310 }, { "epoch": 7.006835443037975, "grad_norm": 6.1255388259887695, "learning_rate": 4.739803094233474e-05, "loss": 0.3578, "step": 2320 }, { "epoch": 7.0074683544303795, "grad_norm": 12.867307662963867, "learning_rate": 4.7362869198312235e-05, "loss": 0.3075, "step": 2330 }, { "epoch": 7.008101265822785, "grad_norm": 3.2056193351745605, "learning_rate": 4.732770745428973e-05, "loss": 0.4281, "step": 2340 }, { "epoch": 7.00873417721519, "grad_norm": 14.265457153320312, "learning_rate": 4.729254571026723e-05, "loss": 0.584, "step": 2350 }, { "epoch": 7.009367088607595, "grad_norm": 5.145318984985352, "learning_rate": 4.725738396624473e-05, "loss": 0.3264, "step": 2360 }, { "epoch": 7.01, "grad_norm": 12.106173515319824, "learning_rate": 4.722222222222222e-05, "loss": 0.4977, "step": 2370 }, { "epoch": 7.010632911392405, "grad_norm": 2.557343006134033, "learning_rate": 4.718706047819972e-05, "loss": 0.2676, "step": 2380 }, { "epoch": 7.01126582278481, "grad_norm": 9.406204223632812, "learning_rate": 4.715189873417722e-05, "loss": 0.3632, "step": 2390 }, { "epoch": 7.011898734177215, "grad_norm": 0.7909258604049683, "learning_rate": 4.7116736990154716e-05, "loss": 0.2164, "step": 2400 }, { "epoch": 7.01253164556962, "grad_norm": 0.8705269694328308, "learning_rate": 4.708157524613221e-05, "loss": 0.3072, "step": 2410 }, { "epoch": 7.013164556962026, "grad_norm": 3.603971481323242, "learning_rate": 4.704641350210971e-05, "loss": 0.2117, "step": 2420 }, { "epoch": 7.01379746835443, "grad_norm": 7.447606563568115, "learning_rate": 4.7011251758087204e-05, "loss": 0.32, "step": 2430 }, { "epoch": 7.014430379746836, "grad_norm": 7.379356384277344, "learning_rate": 4.69760900140647e-05, "loss": 0.3718, "step": 2440 }, { "epoch": 7.01506329113924, "grad_norm": 17.737926483154297, "learning_rate": 4.6940928270042194e-05, "loss": 0.4529, "step": 2450 }, { "epoch": 7.015696202531646, "grad_norm": 3.0855658054351807, "learning_rate": 4.690576652601969e-05, "loss": 0.2227, "step": 2460 }, { "epoch": 7.0163291139240505, "grad_norm": 14.56204605102539, "learning_rate": 4.687060478199719e-05, "loss": 0.3014, "step": 2470 }, { "epoch": 7.016962025316456, "grad_norm": 3.587031841278076, "learning_rate": 4.683544303797468e-05, "loss": 0.3073, "step": 2480 }, { "epoch": 7.017594936708861, "grad_norm": 3.022749423980713, "learning_rate": 4.680028129395218e-05, "loss": 0.2003, "step": 2490 }, { "epoch": 7.018227848101266, "grad_norm": 8.433323860168457, "learning_rate": 4.676511954992968e-05, "loss": 0.3411, "step": 2500 }, { "epoch": 7.018860759493671, "grad_norm": 4.128199100494385, "learning_rate": 4.672995780590718e-05, "loss": 0.3518, "step": 2510 }, { "epoch": 7.019493670886076, "grad_norm": 11.647120475769043, "learning_rate": 4.669479606188467e-05, "loss": 0.2162, "step": 2520 }, { "epoch": 7.02, "eval_accuracy": 0.9155470249520153, "eval_loss": 0.25118356943130493, "eval_runtime": 859.0348, "eval_samples_per_second": 0.606, "eval_steps_per_second": 0.077, "step": 2528 }, { "epoch": 8.000126582278481, "grad_norm": 21.305402755737305, "learning_rate": 4.665963431786217e-05, "loss": 0.3546, "step": 2530 }, { "epoch": 8.000759493670886, "grad_norm": 9.045007705688477, "learning_rate": 4.6624472573839666e-05, "loss": 0.2433, "step": 2540 }, { "epoch": 8.00139240506329, "grad_norm": 6.032439708709717, "learning_rate": 4.6589310829817164e-05, "loss": 0.2786, "step": 2550 }, { "epoch": 8.002025316455697, "grad_norm": 5.416832447052002, "learning_rate": 4.6554149085794655e-05, "loss": 0.2595, "step": 2560 }, { "epoch": 8.002658227848102, "grad_norm": 2.8014585971832275, "learning_rate": 4.6518987341772154e-05, "loss": 0.301, "step": 2570 }, { "epoch": 8.003291139240506, "grad_norm": 1.782472014427185, "learning_rate": 4.648382559774965e-05, "loss": 0.1689, "step": 2580 }, { "epoch": 8.00392405063291, "grad_norm": 2.679903507232666, "learning_rate": 4.644866385372715e-05, "loss": 0.2884, "step": 2590 }, { "epoch": 8.004556962025317, "grad_norm": 2.2325003147125244, "learning_rate": 4.641350210970464e-05, "loss": 0.2047, "step": 2600 }, { "epoch": 8.005189873417722, "grad_norm": 9.163021087646484, "learning_rate": 4.637834036568214e-05, "loss": 0.1786, "step": 2610 }, { "epoch": 8.005822784810126, "grad_norm": 3.2019553184509277, "learning_rate": 4.634317862165964e-05, "loss": 0.2159, "step": 2620 }, { "epoch": 8.006455696202531, "grad_norm": 4.9009904861450195, "learning_rate": 4.630801687763714e-05, "loss": 0.249, "step": 2630 }, { "epoch": 8.007088607594937, "grad_norm": 4.865252494812012, "learning_rate": 4.627285513361463e-05, "loss": 0.3338, "step": 2640 }, { "epoch": 8.007721518987342, "grad_norm": 1.178444504737854, "learning_rate": 4.623769338959213e-05, "loss": 0.2716, "step": 2650 }, { "epoch": 8.008354430379747, "grad_norm": 2.2412354946136475, "learning_rate": 4.6202531645569625e-05, "loss": 0.1516, "step": 2660 }, { "epoch": 8.008987341772151, "grad_norm": 11.467605590820312, "learning_rate": 4.616736990154712e-05, "loss": 0.3324, "step": 2670 }, { "epoch": 8.009620253164558, "grad_norm": 4.130499362945557, "learning_rate": 4.6132208157524615e-05, "loss": 0.3092, "step": 2680 }, { "epoch": 8.010253164556962, "grad_norm": 8.060324668884277, "learning_rate": 4.6097046413502107e-05, "loss": 0.2931, "step": 2690 }, { "epoch": 8.010886075949367, "grad_norm": 18.59467124938965, "learning_rate": 4.606188466947961e-05, "loss": 0.2696, "step": 2700 }, { "epoch": 8.011518987341772, "grad_norm": 0.5714246034622192, "learning_rate": 4.60267229254571e-05, "loss": 0.2422, "step": 2710 }, { "epoch": 8.012151898734178, "grad_norm": 2.771367311477661, "learning_rate": 4.59915611814346e-05, "loss": 0.2716, "step": 2720 }, { "epoch": 8.012784810126583, "grad_norm": 1.5201733112335205, "learning_rate": 4.595639943741209e-05, "loss": 0.1204, "step": 2730 }, { "epoch": 8.013417721518987, "grad_norm": 15.473625183105469, "learning_rate": 4.59212376933896e-05, "loss": 0.2585, "step": 2740 }, { "epoch": 8.014050632911392, "grad_norm": 3.226699113845825, "learning_rate": 4.588607594936709e-05, "loss": 0.1866, "step": 2750 }, { "epoch": 8.014683544303798, "grad_norm": 1.999643087387085, "learning_rate": 4.585091420534459e-05, "loss": 0.237, "step": 2760 }, { "epoch": 8.015316455696203, "grad_norm": 18.86202049255371, "learning_rate": 4.581575246132208e-05, "loss": 0.2425, "step": 2770 }, { "epoch": 8.015949367088608, "grad_norm": 0.8307034969329834, "learning_rate": 4.5780590717299585e-05, "loss": 0.1629, "step": 2780 }, { "epoch": 8.016582278481012, "grad_norm": 15.315855979919434, "learning_rate": 4.5745428973277076e-05, "loss": 0.2366, "step": 2790 }, { "epoch": 8.017215189873419, "grad_norm": 16.721681594848633, "learning_rate": 4.5710267229254575e-05, "loss": 0.3524, "step": 2800 }, { "epoch": 8.017848101265823, "grad_norm": 0.7722905278205872, "learning_rate": 4.5675105485232066e-05, "loss": 0.1772, "step": 2810 }, { "epoch": 8.018481012658228, "grad_norm": 1.7419816255569458, "learning_rate": 4.5639943741209564e-05, "loss": 0.206, "step": 2820 }, { "epoch": 8.019113924050632, "grad_norm": 4.165656089782715, "learning_rate": 4.560478199718706e-05, "loss": 0.2458, "step": 2830 }, { "epoch": 8.019746835443039, "grad_norm": 2.085017681121826, "learning_rate": 4.556962025316456e-05, "loss": 0.228, "step": 2840 }, { "epoch": 8.02, "eval_accuracy": 0.9309021113243762, "eval_loss": 0.19722820818424225, "eval_runtime": 905.8613, "eval_samples_per_second": 0.575, "eval_steps_per_second": 0.073, "step": 2844 }, { "epoch": 9.000379746835444, "grad_norm": 13.498887062072754, "learning_rate": 4.553445850914206e-05, "loss": 0.1674, "step": 2850 }, { "epoch": 9.001012658227848, "grad_norm": 0.7413739562034607, "learning_rate": 4.549929676511955e-05, "loss": 0.1371, "step": 2860 }, { "epoch": 9.001645569620253, "grad_norm": 1.669324278831482, "learning_rate": 4.546413502109705e-05, "loss": 0.146, "step": 2870 }, { "epoch": 9.002278481012658, "grad_norm": 2.6705691814422607, "learning_rate": 4.542897327707454e-05, "loss": 0.2867, "step": 2880 }, { "epoch": 9.002911392405064, "grad_norm": 3.0164263248443604, "learning_rate": 4.5393811533052046e-05, "loss": 0.2902, "step": 2890 }, { "epoch": 9.003544303797469, "grad_norm": 6.8492302894592285, "learning_rate": 4.535864978902954e-05, "loss": 0.1386, "step": 2900 }, { "epoch": 9.004177215189873, "grad_norm": 0.835591733455658, "learning_rate": 4.5323488045007036e-05, "loss": 0.1567, "step": 2910 }, { "epoch": 9.004810126582278, "grad_norm": 4.592350959777832, "learning_rate": 4.528832630098453e-05, "loss": 0.2066, "step": 2920 }, { "epoch": 9.005443037974684, "grad_norm": 5.640721797943115, "learning_rate": 4.525316455696203e-05, "loss": 0.0528, "step": 2930 }, { "epoch": 9.006075949367089, "grad_norm": 13.428877830505371, "learning_rate": 4.5218002812939524e-05, "loss": 0.1163, "step": 2940 }, { "epoch": 9.006708860759494, "grad_norm": 0.22825801372528076, "learning_rate": 4.518284106891702e-05, "loss": 0.1288, "step": 2950 }, { "epoch": 9.007341772151898, "grad_norm": 1.8029659986495972, "learning_rate": 4.5147679324894514e-05, "loss": 0.2082, "step": 2960 }, { "epoch": 9.007974683544305, "grad_norm": 0.23065458238124847, "learning_rate": 4.511251758087202e-05, "loss": 0.2156, "step": 2970 }, { "epoch": 9.00860759493671, "grad_norm": 3.216405153274536, "learning_rate": 4.507735583684951e-05, "loss": 0.2436, "step": 2980 }, { "epoch": 9.009240506329114, "grad_norm": 0.4626835584640503, "learning_rate": 4.504219409282701e-05, "loss": 0.2139, "step": 2990 }, { "epoch": 9.009873417721519, "grad_norm": 5.6381516456604, "learning_rate": 4.50070323488045e-05, "loss": 0.1326, "step": 3000 }, { "epoch": 9.010506329113925, "grad_norm": 7.001626968383789, "learning_rate": 4.4971870604782e-05, "loss": 0.1666, "step": 3010 }, { "epoch": 9.01113924050633, "grad_norm": 3.0966169834136963, "learning_rate": 4.49367088607595e-05, "loss": 0.2111, "step": 3020 }, { "epoch": 9.011772151898734, "grad_norm": 7.884695053100586, "learning_rate": 4.490154711673699e-05, "loss": 0.2074, "step": 3030 }, { "epoch": 9.012405063291139, "grad_norm": 18.61140251159668, "learning_rate": 4.486638537271449e-05, "loss": 0.1748, "step": 3040 }, { "epoch": 9.013037974683545, "grad_norm": 23.860849380493164, "learning_rate": 4.4831223628691985e-05, "loss": 0.1367, "step": 3050 }, { "epoch": 9.01367088607595, "grad_norm": 6.8134942054748535, "learning_rate": 4.4796061884669484e-05, "loss": 0.1653, "step": 3060 }, { "epoch": 9.014303797468354, "grad_norm": 21.31640625, "learning_rate": 4.4760900140646975e-05, "loss": 0.3217, "step": 3070 }, { "epoch": 9.014936708860759, "grad_norm": 3.0076091289520264, "learning_rate": 4.4725738396624474e-05, "loss": 0.2275, "step": 3080 }, { "epoch": 9.015569620253165, "grad_norm": 0.38252460956573486, "learning_rate": 4.469057665260197e-05, "loss": 0.0889, "step": 3090 }, { "epoch": 9.01620253164557, "grad_norm": 3.001718282699585, "learning_rate": 4.465541490857947e-05, "loss": 0.2065, "step": 3100 }, { "epoch": 9.016835443037975, "grad_norm": 5.9051923751831055, "learning_rate": 4.462025316455696e-05, "loss": 0.2604, "step": 3110 }, { "epoch": 9.01746835443038, "grad_norm": 1.5905555486679077, "learning_rate": 4.458509142053446e-05, "loss": 0.2658, "step": 3120 }, { "epoch": 9.018101265822784, "grad_norm": 2.863093137741089, "learning_rate": 4.454992967651196e-05, "loss": 0.174, "step": 3130 }, { "epoch": 9.01873417721519, "grad_norm": 2.637539863586426, "learning_rate": 4.451476793248946e-05, "loss": 0.1515, "step": 3140 }, { "epoch": 9.019367088607595, "grad_norm": 0.3857173025608063, "learning_rate": 4.447960618846695e-05, "loss": 0.0724, "step": 3150 }, { "epoch": 9.02, "grad_norm": 1.806871771812439, "learning_rate": 4.4444444444444447e-05, "loss": 0.1711, "step": 3160 }, { "epoch": 9.02, "eval_accuracy": 0.9481765834932822, "eval_loss": 0.14262719452381134, "eval_runtime": 899.0913, "eval_samples_per_second": 0.579, "eval_steps_per_second": 0.073, "step": 3160 }, { "epoch": 10.000632911392405, "grad_norm": 5.219705104827881, "learning_rate": 4.4409282700421945e-05, "loss": 0.1425, "step": 3170 }, { "epoch": 10.00126582278481, "grad_norm": 12.191892623901367, "learning_rate": 4.4374120956399436e-05, "loss": 0.1729, "step": 3180 }, { "epoch": 10.001898734177216, "grad_norm": 1.2383445501327515, "learning_rate": 4.4338959212376935e-05, "loss": 0.2565, "step": 3190 }, { "epoch": 10.00253164556962, "grad_norm": 0.5347968935966492, "learning_rate": 4.430379746835443e-05, "loss": 0.1981, "step": 3200 }, { "epoch": 10.003164556962025, "grad_norm": 0.8782184720039368, "learning_rate": 4.426863572433193e-05, "loss": 0.0789, "step": 3210 }, { "epoch": 10.00379746835443, "grad_norm": 0.14596301317214966, "learning_rate": 4.423347398030942e-05, "loss": 0.1853, "step": 3220 }, { "epoch": 10.004430379746836, "grad_norm": 0.4674586057662964, "learning_rate": 4.419831223628692e-05, "loss": 0.1213, "step": 3230 }, { "epoch": 10.00506329113924, "grad_norm": 5.122977256774902, "learning_rate": 4.416315049226441e-05, "loss": 0.1708, "step": 3240 }, { "epoch": 10.005696202531645, "grad_norm": 2.872215509414673, "learning_rate": 4.412798874824192e-05, "loss": 0.2037, "step": 3250 }, { "epoch": 10.00632911392405, "grad_norm": 3.31655216217041, "learning_rate": 4.409282700421941e-05, "loss": 0.1574, "step": 3260 }, { "epoch": 10.006962025316456, "grad_norm": 0.5382121801376343, "learning_rate": 4.405766526019691e-05, "loss": 0.1031, "step": 3270 }, { "epoch": 10.00759493670886, "grad_norm": 1.605431079864502, "learning_rate": 4.4022503516174406e-05, "loss": 0.1636, "step": 3280 }, { "epoch": 10.008227848101265, "grad_norm": 9.264389038085938, "learning_rate": 4.3987341772151904e-05, "loss": 0.2556, "step": 3290 }, { "epoch": 10.00886075949367, "grad_norm": 0.28090256452560425, "learning_rate": 4.3952180028129396e-05, "loss": 0.1123, "step": 3300 }, { "epoch": 10.009493670886076, "grad_norm": 2.213951349258423, "learning_rate": 4.3917018284106894e-05, "loss": 0.0749, "step": 3310 }, { "epoch": 10.010126582278481, "grad_norm": 6.6392035484313965, "learning_rate": 4.388185654008439e-05, "loss": 0.1676, "step": 3320 }, { "epoch": 10.010759493670886, "grad_norm": 0.35711225867271423, "learning_rate": 4.384669479606189e-05, "loss": 0.0756, "step": 3330 }, { "epoch": 10.01139240506329, "grad_norm": 0.24826167523860931, "learning_rate": 4.381153305203938e-05, "loss": 0.0955, "step": 3340 }, { "epoch": 10.012025316455697, "grad_norm": 5.6375579833984375, "learning_rate": 4.377637130801688e-05, "loss": 0.1472, "step": 3350 }, { "epoch": 10.012658227848101, "grad_norm": 0.28531956672668457, "learning_rate": 4.374120956399438e-05, "loss": 0.0843, "step": 3360 }, { "epoch": 10.013291139240506, "grad_norm": 5.148308277130127, "learning_rate": 4.370604781997187e-05, "loss": 0.222, "step": 3370 }, { "epoch": 10.01392405063291, "grad_norm": 0.3964751064777374, "learning_rate": 4.367088607594937e-05, "loss": 0.1279, "step": 3380 }, { "epoch": 10.014556962025317, "grad_norm": 4.597806930541992, "learning_rate": 4.363572433192686e-05, "loss": 0.1258, "step": 3390 }, { "epoch": 10.015189873417722, "grad_norm": 1.5554261207580566, "learning_rate": 4.3600562587904366e-05, "loss": 0.1193, "step": 3400 }, { "epoch": 10.015822784810126, "grad_norm": 0.0851406529545784, "learning_rate": 4.356540084388186e-05, "loss": 0.1182, "step": 3410 }, { "epoch": 10.01645569620253, "grad_norm": 3.5009474754333496, "learning_rate": 4.3530239099859356e-05, "loss": 0.155, "step": 3420 }, { "epoch": 10.017088607594937, "grad_norm": 2.019500732421875, "learning_rate": 4.349507735583685e-05, "loss": 0.1827, "step": 3430 }, { "epoch": 10.017721518987342, "grad_norm": 1.833369255065918, "learning_rate": 4.345991561181435e-05, "loss": 0.0991, "step": 3440 }, { "epoch": 10.018354430379746, "grad_norm": 0.4851783215999603, "learning_rate": 4.3424753867791844e-05, "loss": 0.2478, "step": 3450 }, { "epoch": 10.018987341772151, "grad_norm": 2.687387466430664, "learning_rate": 4.338959212376934e-05, "loss": 0.0995, "step": 3460 }, { "epoch": 10.019620253164558, "grad_norm": 11.315146446228027, "learning_rate": 4.3354430379746834e-05, "loss": 0.2251, "step": 3470 }, { "epoch": 10.02, "eval_accuracy": 0.9558541266794626, "eval_loss": 0.0965082123875618, "eval_runtime": 881.0116, "eval_samples_per_second": 0.591, "eval_steps_per_second": 0.075, "step": 3476 }, { "epoch": 11.000253164556963, "grad_norm": 8.084904670715332, "learning_rate": 4.331926863572434e-05, "loss": 0.1515, "step": 3480 }, { "epoch": 11.000886075949367, "grad_norm": 27.45550537109375, "learning_rate": 4.328410689170183e-05, "loss": 0.1284, "step": 3490 }, { "epoch": 11.001518987341772, "grad_norm": 8.539572715759277, "learning_rate": 4.324894514767933e-05, "loss": 0.1598, "step": 3500 }, { "epoch": 11.002151898734176, "grad_norm": 0.08583386987447739, "learning_rate": 4.321378340365682e-05, "loss": 0.0517, "step": 3510 }, { "epoch": 11.002784810126583, "grad_norm": 0.14813897013664246, "learning_rate": 4.317862165963432e-05, "loss": 0.1125, "step": 3520 }, { "epoch": 11.003417721518987, "grad_norm": 2.4185800552368164, "learning_rate": 4.314345991561182e-05, "loss": 0.1984, "step": 3530 }, { "epoch": 11.004050632911392, "grad_norm": 0.21438997983932495, "learning_rate": 4.3108298171589315e-05, "loss": 0.0713, "step": 3540 }, { "epoch": 11.004683544303797, "grad_norm": 0.394996702671051, "learning_rate": 4.307313642756681e-05, "loss": 0.1331, "step": 3550 }, { "epoch": 11.005316455696203, "grad_norm": 1.3863253593444824, "learning_rate": 4.3037974683544305e-05, "loss": 0.0773, "step": 3560 }, { "epoch": 11.005949367088608, "grad_norm": 1.7728908061981201, "learning_rate": 4.3002812939521803e-05, "loss": 0.1833, "step": 3570 }, { "epoch": 11.006582278481012, "grad_norm": 0.08379014581441879, "learning_rate": 4.2967651195499295e-05, "loss": 0.0516, "step": 3580 }, { "epoch": 11.007215189873417, "grad_norm": 2.6824705600738525, "learning_rate": 4.293248945147679e-05, "loss": 0.1616, "step": 3590 }, { "epoch": 11.007848101265823, "grad_norm": 2.570587158203125, "learning_rate": 4.289732770745429e-05, "loss": 0.1306, "step": 3600 }, { "epoch": 11.008481012658228, "grad_norm": 14.159221649169922, "learning_rate": 4.286216596343179e-05, "loss": 0.1083, "step": 3610 }, { "epoch": 11.009113924050633, "grad_norm": 9.88803482055664, "learning_rate": 4.282700421940928e-05, "loss": 0.1391, "step": 3620 }, { "epoch": 11.009746835443037, "grad_norm": 5.837044715881348, "learning_rate": 4.279184247538678e-05, "loss": 0.1622, "step": 3630 }, { "epoch": 11.010379746835444, "grad_norm": 0.1928025782108307, "learning_rate": 4.275668073136428e-05, "loss": 0.1777, "step": 3640 }, { "epoch": 11.011012658227848, "grad_norm": 0.26675212383270264, "learning_rate": 4.2721518987341776e-05, "loss": 0.088, "step": 3650 }, { "epoch": 11.011645569620253, "grad_norm": 0.1680004894733429, "learning_rate": 4.268635724331927e-05, "loss": 0.1404, "step": 3660 }, { "epoch": 11.012278481012657, "grad_norm": 2.046947717666626, "learning_rate": 4.2651195499296766e-05, "loss": 0.1198, "step": 3670 }, { "epoch": 11.012911392405064, "grad_norm": 1.6072787046432495, "learning_rate": 4.2616033755274265e-05, "loss": 0.085, "step": 3680 }, { "epoch": 11.013544303797469, "grad_norm": 0.38024991750717163, "learning_rate": 4.258087201125176e-05, "loss": 0.0692, "step": 3690 }, { "epoch": 11.014177215189873, "grad_norm": 0.4724186062812805, "learning_rate": 4.2545710267229255e-05, "loss": 0.1008, "step": 3700 }, { "epoch": 11.014810126582278, "grad_norm": 4.140766143798828, "learning_rate": 4.251054852320675e-05, "loss": 0.1582, "step": 3710 }, { "epoch": 11.015443037974684, "grad_norm": 2.098039388656616, "learning_rate": 4.247538677918425e-05, "loss": 0.1359, "step": 3720 }, { "epoch": 11.016075949367089, "grad_norm": 1.2859419584274292, "learning_rate": 4.244022503516174e-05, "loss": 0.0806, "step": 3730 }, { "epoch": 11.016708860759493, "grad_norm": 4.074576377868652, "learning_rate": 4.240506329113924e-05, "loss": 0.2069, "step": 3740 }, { "epoch": 11.017341772151898, "grad_norm": 11.83212947845459, "learning_rate": 4.236990154711674e-05, "loss": 0.1142, "step": 3750 }, { "epoch": 11.017974683544304, "grad_norm": 2.487534999847412, "learning_rate": 4.233473980309424e-05, "loss": 0.1747, "step": 3760 }, { "epoch": 11.018607594936709, "grad_norm": 6.369086265563965, "learning_rate": 4.229957805907173e-05, "loss": 0.2394, "step": 3770 }, { "epoch": 11.019240506329114, "grad_norm": 14.795226097106934, "learning_rate": 4.226441631504923e-05, "loss": 0.1638, "step": 3780 }, { "epoch": 11.019873417721518, "grad_norm": 2.7429358959198, "learning_rate": 4.2229254571026726e-05, "loss": 0.1697, "step": 3790 }, { "epoch": 11.02, "eval_accuracy": 0.9539347408829175, "eval_loss": 0.11410919576883316, "eval_runtime": 878.2486, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.075, "step": 3792 }, { "epoch": 12.000506329113923, "grad_norm": 6.182743072509766, "learning_rate": 4.2194092827004224e-05, "loss": 0.1566, "step": 3800 }, { "epoch": 12.00113924050633, "grad_norm": 3.826575994491577, "learning_rate": 4.2158931082981716e-05, "loss": 0.0969, "step": 3810 }, { "epoch": 12.001772151898734, "grad_norm": 0.3016466498374939, "learning_rate": 4.2123769338959214e-05, "loss": 0.2192, "step": 3820 }, { "epoch": 12.002405063291139, "grad_norm": 2.917588710784912, "learning_rate": 4.208860759493671e-05, "loss": 0.0723, "step": 3830 }, { "epoch": 12.003037974683544, "grad_norm": 0.10706198215484619, "learning_rate": 4.205344585091421e-05, "loss": 0.136, "step": 3840 }, { "epoch": 12.00367088607595, "grad_norm": 1.9893161058425903, "learning_rate": 4.20182841068917e-05, "loss": 0.1361, "step": 3850 }, { "epoch": 12.004303797468355, "grad_norm": 1.9333717823028564, "learning_rate": 4.19831223628692e-05, "loss": 0.1467, "step": 3860 }, { "epoch": 12.00493670886076, "grad_norm": 0.13750018179416656, "learning_rate": 4.19479606188467e-05, "loss": 0.072, "step": 3870 }, { "epoch": 12.005569620253164, "grad_norm": 3.1651651859283447, "learning_rate": 4.19127988748242e-05, "loss": 0.1241, "step": 3880 }, { "epoch": 12.00620253164557, "grad_norm": 1.7854634523391724, "learning_rate": 4.187763713080169e-05, "loss": 0.1204, "step": 3890 }, { "epoch": 12.006835443037975, "grad_norm": 2.030898332595825, "learning_rate": 4.184247538677919e-05, "loss": 0.1028, "step": 3900 }, { "epoch": 12.00746835443038, "grad_norm": 2.5038902759552, "learning_rate": 4.1807313642756686e-05, "loss": 0.0767, "step": 3910 }, { "epoch": 12.008101265822784, "grad_norm": 0.07481967657804489, "learning_rate": 4.177215189873418e-05, "loss": 0.1092, "step": 3920 }, { "epoch": 12.00873417721519, "grad_norm": 15.426689147949219, "learning_rate": 4.1736990154711675e-05, "loss": 0.1277, "step": 3930 }, { "epoch": 12.009367088607595, "grad_norm": 0.38814952969551086, "learning_rate": 4.170182841068917e-05, "loss": 0.0369, "step": 3940 }, { "epoch": 12.01, "grad_norm": 3.7652478218078613, "learning_rate": 4.166666666666667e-05, "loss": 0.0799, "step": 3950 }, { "epoch": 12.010632911392404, "grad_norm": 0.06228223815560341, "learning_rate": 4.1631504922644164e-05, "loss": 0.1217, "step": 3960 }, { "epoch": 12.01126582278481, "grad_norm": 0.3022870719432831, "learning_rate": 4.159634317862166e-05, "loss": 0.0405, "step": 3970 }, { "epoch": 12.011898734177215, "grad_norm": 2.400820255279541, "learning_rate": 4.1561181434599153e-05, "loss": 0.0291, "step": 3980 }, { "epoch": 12.01253164556962, "grad_norm": 2.68705415725708, "learning_rate": 4.152601969057666e-05, "loss": 0.1775, "step": 3990 }, { "epoch": 12.013164556962025, "grad_norm": 0.5902582406997681, "learning_rate": 4.149085794655415e-05, "loss": 0.1165, "step": 4000 }, { "epoch": 12.013797468354431, "grad_norm": 0.9047210812568665, "learning_rate": 4.145569620253165e-05, "loss": 0.07, "step": 4010 }, { "epoch": 12.014430379746836, "grad_norm": 0.09479296952486038, "learning_rate": 4.142053445850914e-05, "loss": 0.0495, "step": 4020 }, { "epoch": 12.01506329113924, "grad_norm": 1.401077151298523, "learning_rate": 4.1385372714486645e-05, "loss": 0.0204, "step": 4030 }, { "epoch": 12.015696202531645, "grad_norm": 4.037631988525391, "learning_rate": 4.135021097046414e-05, "loss": 0.1883, "step": 4040 }, { "epoch": 12.016329113924051, "grad_norm": 1.7805005311965942, "learning_rate": 4.1315049226441635e-05, "loss": 0.1802, "step": 4050 }, { "epoch": 12.016962025316456, "grad_norm": 3.6074600219726562, "learning_rate": 4.1279887482419127e-05, "loss": 0.0523, "step": 4060 }, { "epoch": 12.01759493670886, "grad_norm": 0.1754535734653473, "learning_rate": 4.1244725738396625e-05, "loss": 0.0763, "step": 4070 }, { "epoch": 12.018227848101265, "grad_norm": 1.0715147256851196, "learning_rate": 4.120956399437412e-05, "loss": 0.1114, "step": 4080 }, { "epoch": 12.018860759493672, "grad_norm": 0.06572480499744415, "learning_rate": 4.1174402250351615e-05, "loss": 0.0499, "step": 4090 }, { "epoch": 12.019493670886076, "grad_norm": 2.0521042346954346, "learning_rate": 4.113924050632912e-05, "loss": 0.1229, "step": 4100 }, { "epoch": 12.02, "eval_accuracy": 0.9539347408829175, "eval_loss": 0.13622045516967773, "eval_runtime": 873.7358, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.076, "step": 4108 }, { "epoch": 13.000126582278481, "grad_norm": 0.08451604843139648, "learning_rate": 4.110407876230661e-05, "loss": 0.119, "step": 4110 }, { "epoch": 13.000759493670886, "grad_norm": 1.373328685760498, "learning_rate": 4.106891701828411e-05, "loss": 0.0773, "step": 4120 }, { "epoch": 13.00139240506329, "grad_norm": 5.43549919128418, "learning_rate": 4.10337552742616e-05, "loss": 0.1298, "step": 4130 }, { "epoch": 13.002025316455697, "grad_norm": 1.0948410034179688, "learning_rate": 4.0998593530239106e-05, "loss": 0.1255, "step": 4140 }, { "epoch": 13.002658227848102, "grad_norm": 0.060861099511384964, "learning_rate": 4.09634317862166e-05, "loss": 0.1304, "step": 4150 }, { "epoch": 13.003291139240506, "grad_norm": 15.299447059631348, "learning_rate": 4.0928270042194096e-05, "loss": 0.1789, "step": 4160 }, { "epoch": 13.00392405063291, "grad_norm": 0.08800447732210159, "learning_rate": 4.089310829817159e-05, "loss": 0.0934, "step": 4170 }, { "epoch": 13.004556962025317, "grad_norm": 2.429713010787964, "learning_rate": 4.085794655414909e-05, "loss": 0.1312, "step": 4180 }, { "epoch": 13.005189873417722, "grad_norm": 0.05040478706359863, "learning_rate": 4.0822784810126584e-05, "loss": 0.0764, "step": 4190 }, { "epoch": 13.005822784810126, "grad_norm": 12.875263214111328, "learning_rate": 4.078762306610408e-05, "loss": 0.2538, "step": 4200 }, { "epoch": 13.006455696202531, "grad_norm": 1.7678494453430176, "learning_rate": 4.0752461322081574e-05, "loss": 0.0656, "step": 4210 }, { "epoch": 13.007088607594937, "grad_norm": 0.05160669609904289, "learning_rate": 4.071729957805907e-05, "loss": 0.1236, "step": 4220 }, { "epoch": 13.007721518987342, "grad_norm": 2.3818519115448, "learning_rate": 4.068213783403657e-05, "loss": 0.1056, "step": 4230 }, { "epoch": 13.008354430379747, "grad_norm": 1.16487717628479, "learning_rate": 4.064697609001407e-05, "loss": 0.1062, "step": 4240 }, { "epoch": 13.008987341772151, "grad_norm": 1.0950814485549927, "learning_rate": 4.061181434599156e-05, "loss": 0.114, "step": 4250 }, { "epoch": 13.009620253164558, "grad_norm": 2.856937885284424, "learning_rate": 4.057665260196906e-05, "loss": 0.1701, "step": 4260 }, { "epoch": 13.010253164556962, "grad_norm": 0.057572536170482635, "learning_rate": 4.054149085794656e-05, "loss": 0.1006, "step": 4270 }, { "epoch": 13.010886075949367, "grad_norm": 0.05074339732527733, "learning_rate": 4.050632911392405e-05, "loss": 0.1008, "step": 4280 }, { "epoch": 13.011518987341772, "grad_norm": 3.1123783588409424, "learning_rate": 4.047116736990155e-05, "loss": 0.0352, "step": 4290 }, { "epoch": 13.012151898734178, "grad_norm": 2.150893211364746, "learning_rate": 4.0436005625879046e-05, "loss": 0.0824, "step": 4300 }, { "epoch": 13.012784810126583, "grad_norm": 2.5087695121765137, "learning_rate": 4.0400843881856544e-05, "loss": 0.1343, "step": 4310 }, { "epoch": 13.013417721518987, "grad_norm": 1.6841331720352173, "learning_rate": 4.0365682137834036e-05, "loss": 0.1258, "step": 4320 }, { "epoch": 13.014050632911392, "grad_norm": 0.3586716949939728, "learning_rate": 4.0330520393811534e-05, "loss": 0.1204, "step": 4330 }, { "epoch": 13.014683544303798, "grad_norm": 0.045751988887786865, "learning_rate": 4.029535864978903e-05, "loss": 0.0682, "step": 4340 }, { "epoch": 13.015316455696203, "grad_norm": 3.5271830558776855, "learning_rate": 4.026019690576653e-05, "loss": 0.261, "step": 4350 }, { "epoch": 13.015949367088608, "grad_norm": 3.505953550338745, "learning_rate": 4.022503516174402e-05, "loss": 0.1309, "step": 4360 }, { "epoch": 13.016582278481012, "grad_norm": 0.1506178230047226, "learning_rate": 4.018987341772152e-05, "loss": 0.0923, "step": 4370 }, { "epoch": 13.017215189873419, "grad_norm": 0.3128964304924011, "learning_rate": 4.015471167369902e-05, "loss": 0.0573, "step": 4380 }, { "epoch": 13.017848101265823, "grad_norm": 2.3400206565856934, "learning_rate": 4.011954992967652e-05, "loss": 0.099, "step": 4390 }, { "epoch": 13.018481012658228, "grad_norm": 0.4466443955898285, "learning_rate": 4.008438818565401e-05, "loss": 0.0531, "step": 4400 }, { "epoch": 13.019113924050632, "grad_norm": 2.060894250869751, "learning_rate": 4.004922644163151e-05, "loss": 0.0446, "step": 4410 }, { "epoch": 13.019746835443039, "grad_norm": 1.2128475904464722, "learning_rate": 4.0014064697609005e-05, "loss": 0.0676, "step": 4420 }, { "epoch": 13.02, "eval_accuracy": 0.9654510556621881, "eval_loss": 0.07451339066028595, "eval_runtime": 911.5098, "eval_samples_per_second": 0.572, "eval_steps_per_second": 0.072, "step": 4424 }, { "epoch": 14.000379746835444, "grad_norm": 0.05884459242224693, "learning_rate": 3.99789029535865e-05, "loss": 0.0896, "step": 4430 }, { "epoch": 14.001012658227848, "grad_norm": 0.05943896621465683, "learning_rate": 3.9943741209563995e-05, "loss": 0.1132, "step": 4440 }, { "epoch": 14.001645569620253, "grad_norm": 0.13569866120815277, "learning_rate": 3.9908579465541493e-05, "loss": 0.091, "step": 4450 }, { "epoch": 14.002278481012658, "grad_norm": 0.12505175173282623, "learning_rate": 3.987341772151899e-05, "loss": 0.0719, "step": 4460 }, { "epoch": 14.002911392405064, "grad_norm": 1.747598648071289, "learning_rate": 3.983825597749648e-05, "loss": 0.0855, "step": 4470 }, { "epoch": 14.003544303797469, "grad_norm": 0.03635944798588753, "learning_rate": 3.980309423347398e-05, "loss": 0.1261, "step": 4480 }, { "epoch": 14.004177215189873, "grad_norm": 1.7012404203414917, "learning_rate": 3.976793248945147e-05, "loss": 0.1321, "step": 4490 }, { "epoch": 14.004810126582278, "grad_norm": 0.040289971977472305, "learning_rate": 3.973277074542898e-05, "loss": 0.0378, "step": 4500 }, { "epoch": 14.005443037974684, "grad_norm": 2.7338550090789795, "learning_rate": 3.969760900140647e-05, "loss": 0.0765, "step": 4510 }, { "epoch": 14.006075949367089, "grad_norm": 3.6036548614501953, "learning_rate": 3.966244725738397e-05, "loss": 0.1414, "step": 4520 }, { "epoch": 14.006708860759494, "grad_norm": 0.07101106643676758, "learning_rate": 3.9627285513361467e-05, "loss": 0.0402, "step": 4530 }, { "epoch": 14.007341772151898, "grad_norm": 2.1385743618011475, "learning_rate": 3.9592123769338965e-05, "loss": 0.082, "step": 4540 }, { "epoch": 14.007974683544305, "grad_norm": 1.6399197578430176, "learning_rate": 3.9556962025316456e-05, "loss": 0.1278, "step": 4550 }, { "epoch": 14.00860759493671, "grad_norm": 0.14580035209655762, "learning_rate": 3.9521800281293955e-05, "loss": 0.097, "step": 4560 }, { "epoch": 14.009240506329114, "grad_norm": 0.044299304485321045, "learning_rate": 3.948663853727145e-05, "loss": 0.1258, "step": 4570 }, { "epoch": 14.009873417721519, "grad_norm": 0.06222519651055336, "learning_rate": 3.945147679324895e-05, "loss": 0.0526, "step": 4580 }, { "epoch": 14.010506329113925, "grad_norm": 2.3559911251068115, "learning_rate": 3.941631504922644e-05, "loss": 0.0934, "step": 4590 }, { "epoch": 14.01113924050633, "grad_norm": 0.762189507484436, "learning_rate": 3.938115330520394e-05, "loss": 0.0937, "step": 4600 }, { "epoch": 14.011772151898734, "grad_norm": 0.03613949567079544, "learning_rate": 3.934599156118144e-05, "loss": 0.1002, "step": 4610 }, { "epoch": 14.012405063291139, "grad_norm": 0.0497271791100502, "learning_rate": 3.931082981715893e-05, "loss": 0.1768, "step": 4620 }, { "epoch": 14.013037974683545, "grad_norm": 0.0881432294845581, "learning_rate": 3.927566807313643e-05, "loss": 0.111, "step": 4630 }, { "epoch": 14.01367088607595, "grad_norm": 1.8317328691482544, "learning_rate": 3.924050632911392e-05, "loss": 0.0322, "step": 4640 }, { "epoch": 14.014303797468354, "grad_norm": 0.1062496155500412, "learning_rate": 3.9205344585091426e-05, "loss": 0.0409, "step": 4650 }, { "epoch": 14.014936708860759, "grad_norm": 0.05037945136427879, "learning_rate": 3.917018284106892e-05, "loss": 0.158, "step": 4660 }, { "epoch": 14.015569620253165, "grad_norm": 1.9508050680160522, "learning_rate": 3.9135021097046416e-05, "loss": 0.1135, "step": 4670 }, { "epoch": 14.01620253164557, "grad_norm": 0.03313547745347023, "learning_rate": 3.909985935302391e-05, "loss": 0.0667, "step": 4680 }, { "epoch": 14.016835443037975, "grad_norm": 0.0592011958360672, "learning_rate": 3.906469760900141e-05, "loss": 0.096, "step": 4690 }, { "epoch": 14.01746835443038, "grad_norm": 0.03745780512690544, "learning_rate": 3.9029535864978904e-05, "loss": 0.1084, "step": 4700 }, { "epoch": 14.018101265822784, "grad_norm": 2.3232529163360596, "learning_rate": 3.89943741209564e-05, "loss": 0.0571, "step": 4710 }, { "epoch": 14.01873417721519, "grad_norm": 0.03583463653922081, "learning_rate": 3.8959212376933894e-05, "loss": 0.0409, "step": 4720 }, { "epoch": 14.019367088607595, "grad_norm": 3.5223681926727295, "learning_rate": 3.89240506329114e-05, "loss": 0.0784, "step": 4730 }, { "epoch": 14.02, "grad_norm": 0.07330437749624252, "learning_rate": 3.888888888888889e-05, "loss": 0.1228, "step": 4740 }, { "epoch": 14.02, "eval_accuracy": 0.963531669865643, "eval_loss": 0.08169866353273392, "eval_runtime": 921.6107, "eval_samples_per_second": 0.565, "eval_steps_per_second": 0.072, "step": 4740 }, { "epoch": 15.000632911392405, "grad_norm": 0.6237608194351196, "learning_rate": 3.885372714486639e-05, "loss": 0.0534, "step": 4750 }, { "epoch": 15.00126582278481, "grad_norm": 1.1300462484359741, "learning_rate": 3.881856540084388e-05, "loss": 0.1765, "step": 4760 }, { "epoch": 15.001898734177216, "grad_norm": 0.06267885118722916, "learning_rate": 3.878340365682138e-05, "loss": 0.0799, "step": 4770 }, { "epoch": 15.00253164556962, "grad_norm": 0.0465877391397953, "learning_rate": 3.874824191279888e-05, "loss": 0.0156, "step": 4780 }, { "epoch": 15.003164556962025, "grad_norm": 0.08922750502824783, "learning_rate": 3.8713080168776376e-05, "loss": 0.1283, "step": 4790 }, { "epoch": 15.00379746835443, "grad_norm": 3.4005231857299805, "learning_rate": 3.867791842475387e-05, "loss": 0.0989, "step": 4800 }, { "epoch": 15.004430379746836, "grad_norm": 0.07256881147623062, "learning_rate": 3.8642756680731365e-05, "loss": 0.099, "step": 4810 }, { "epoch": 15.00506329113924, "grad_norm": 0.05675378069281578, "learning_rate": 3.8607594936708864e-05, "loss": 0.1094, "step": 4820 }, { "epoch": 15.005696202531645, "grad_norm": 1.7418971061706543, "learning_rate": 3.8572433192686355e-05, "loss": 0.0712, "step": 4830 }, { "epoch": 15.00632911392405, "grad_norm": 0.034461941570043564, "learning_rate": 3.8537271448663854e-05, "loss": 0.1568, "step": 4840 }, { "epoch": 15.006962025316456, "grad_norm": 0.12834057211875916, "learning_rate": 3.850210970464135e-05, "loss": 0.071, "step": 4850 }, { "epoch": 15.00759493670886, "grad_norm": 2.435114622116089, "learning_rate": 3.846694796061885e-05, "loss": 0.0736, "step": 4860 }, { "epoch": 15.008227848101265, "grad_norm": 0.10457431524991989, "learning_rate": 3.843178621659634e-05, "loss": 0.0941, "step": 4870 }, { "epoch": 15.00886075949367, "grad_norm": 1.1757208108901978, "learning_rate": 3.839662447257384e-05, "loss": 0.089, "step": 4880 }, { "epoch": 15.009493670886076, "grad_norm": 0.11455094069242477, "learning_rate": 3.836146272855134e-05, "loss": 0.146, "step": 4890 }, { "epoch": 15.010126582278481, "grad_norm": 0.047735344618558884, "learning_rate": 3.832630098452884e-05, "loss": 0.0979, "step": 4900 }, { "epoch": 15.010759493670886, "grad_norm": 2.26092267036438, "learning_rate": 3.829113924050633e-05, "loss": 0.0598, "step": 4910 }, { "epoch": 15.01139240506329, "grad_norm": 0.04012997820973396, "learning_rate": 3.825597749648383e-05, "loss": 0.0603, "step": 4920 }, { "epoch": 15.012025316455697, "grad_norm": 0.060564495623111725, "learning_rate": 3.8220815752461325e-05, "loss": 0.0079, "step": 4930 }, { "epoch": 15.012658227848101, "grad_norm": 1.690059781074524, "learning_rate": 3.8185654008438823e-05, "loss": 0.1119, "step": 4940 }, { "epoch": 15.013291139240506, "grad_norm": 0.04470343515276909, "learning_rate": 3.8150492264416315e-05, "loss": 0.0421, "step": 4950 }, { "epoch": 15.01392405063291, "grad_norm": 2.4390032291412354, "learning_rate": 3.811533052039381e-05, "loss": 0.1052, "step": 4960 }, { "epoch": 15.014556962025317, "grad_norm": 0.034953054040670395, "learning_rate": 3.808016877637131e-05, "loss": 0.0869, "step": 4970 }, { "epoch": 15.015189873417722, "grad_norm": 1.8463102579116821, "learning_rate": 3.80450070323488e-05, "loss": 0.0601, "step": 4980 }, { "epoch": 15.015822784810126, "grad_norm": 0.025022268295288086, "learning_rate": 3.80098452883263e-05, "loss": 0.1042, "step": 4990 }, { "epoch": 15.01645569620253, "grad_norm": 2.0151307582855225, "learning_rate": 3.79746835443038e-05, "loss": 0.1235, "step": 5000 }, { "epoch": 15.017088607594937, "grad_norm": 2.130545139312744, "learning_rate": 3.79395218002813e-05, "loss": 0.1464, "step": 5010 }, { "epoch": 15.017721518987342, "grad_norm": 0.02256944589316845, "learning_rate": 3.790436005625879e-05, "loss": 0.057, "step": 5020 }, { "epoch": 15.018354430379746, "grad_norm": 0.2824922204017639, "learning_rate": 3.786919831223629e-05, "loss": 0.0738, "step": 5030 }, { "epoch": 15.018987341772151, "grad_norm": 0.2349025160074234, "learning_rate": 3.7834036568213786e-05, "loss": 0.125, "step": 5040 }, { "epoch": 15.019620253164558, "grad_norm": 1.1086318492889404, "learning_rate": 3.7798874824191285e-05, "loss": 0.0143, "step": 5050 }, { "epoch": 15.02, "eval_accuracy": 0.9692898272552783, "eval_loss": 0.061516787856817245, "eval_runtime": 919.3915, "eval_samples_per_second": 0.567, "eval_steps_per_second": 0.072, "step": 5056 }, { "epoch": 16.000253164556963, "grad_norm": 0.026827219873666763, "learning_rate": 3.7763713080168776e-05, "loss": 0.0445, "step": 5060 }, { "epoch": 16.000886075949367, "grad_norm": 0.04409428685903549, "learning_rate": 3.7728551336146275e-05, "loss": 0.0879, "step": 5070 }, { "epoch": 16.001518987341772, "grad_norm": 0.038987431675195694, "learning_rate": 3.769338959212377e-05, "loss": 0.0599, "step": 5080 }, { "epoch": 16.002151898734176, "grad_norm": 4.49123477935791, "learning_rate": 3.765822784810127e-05, "loss": 0.0703, "step": 5090 }, { "epoch": 16.00278481012658, "grad_norm": 0.03677366301417351, "learning_rate": 3.762306610407876e-05, "loss": 0.075, "step": 5100 }, { "epoch": 16.003417721518986, "grad_norm": 3.2981064319610596, "learning_rate": 3.758790436005626e-05, "loss": 0.1017, "step": 5110 }, { "epoch": 16.004050632911394, "grad_norm": 1.8348667621612549, "learning_rate": 3.755274261603376e-05, "loss": 0.0831, "step": 5120 }, { "epoch": 16.0046835443038, "grad_norm": 0.023824410513043404, "learning_rate": 3.751758087201125e-05, "loss": 0.0696, "step": 5130 }, { "epoch": 16.005316455696203, "grad_norm": 0.04150988906621933, "learning_rate": 3.748241912798875e-05, "loss": 0.0828, "step": 5140 }, { "epoch": 16.005949367088608, "grad_norm": 0.020287124440073967, "learning_rate": 3.744725738396625e-05, "loss": 0.0027, "step": 5150 }, { "epoch": 16.006582278481012, "grad_norm": 0.999756932258606, "learning_rate": 3.7412095639943746e-05, "loss": 0.0737, "step": 5160 }, { "epoch": 16.007215189873417, "grad_norm": 0.02667284570634365, "learning_rate": 3.737693389592124e-05, "loss": 0.0686, "step": 5170 }, { "epoch": 16.00784810126582, "grad_norm": 2.912930727005005, "learning_rate": 3.7341772151898736e-05, "loss": 0.0712, "step": 5180 }, { "epoch": 16.008481012658226, "grad_norm": 0.03943086788058281, "learning_rate": 3.730661040787623e-05, "loss": 0.1255, "step": 5190 }, { "epoch": 16.009113924050634, "grad_norm": 0.02671695314347744, "learning_rate": 3.727144866385373e-05, "loss": 0.1072, "step": 5200 }, { "epoch": 16.00974683544304, "grad_norm": 0.052541956305503845, "learning_rate": 3.7236286919831224e-05, "loss": 0.0429, "step": 5210 }, { "epoch": 16.010379746835444, "grad_norm": 1.465926170349121, "learning_rate": 3.720112517580872e-05, "loss": 0.0737, "step": 5220 }, { "epoch": 16.01101265822785, "grad_norm": 0.02939579077064991, "learning_rate": 3.7165963431786214e-05, "loss": 0.026, "step": 5230 }, { "epoch": 16.011645569620253, "grad_norm": 15.934955596923828, "learning_rate": 3.713080168776372e-05, "loss": 0.1409, "step": 5240 }, { "epoch": 16.012278481012657, "grad_norm": 0.3337193727493286, "learning_rate": 3.709563994374121e-05, "loss": 0.1064, "step": 5250 }, { "epoch": 16.012911392405062, "grad_norm": 1.509924292564392, "learning_rate": 3.706047819971871e-05, "loss": 0.1198, "step": 5260 }, { "epoch": 16.013544303797467, "grad_norm": 1.6039562225341797, "learning_rate": 3.70253164556962e-05, "loss": 0.1751, "step": 5270 }, { "epoch": 16.014177215189875, "grad_norm": 0.024508150294423103, "learning_rate": 3.6990154711673706e-05, "loss": 0.2054, "step": 5280 }, { "epoch": 16.01481012658228, "grad_norm": 0.10826098173856735, "learning_rate": 3.69549929676512e-05, "loss": 0.0953, "step": 5290 }, { "epoch": 16.015443037974684, "grad_norm": 0.1340492069721222, "learning_rate": 3.6919831223628695e-05, "loss": 0.1661, "step": 5300 }, { "epoch": 16.01607594936709, "grad_norm": 0.20521284639835358, "learning_rate": 3.688466947960619e-05, "loss": 0.0446, "step": 5310 }, { "epoch": 16.016708860759493, "grad_norm": 2.418673276901245, "learning_rate": 3.6849507735583685e-05, "loss": 0.1078, "step": 5320 }, { "epoch": 16.017341772151898, "grad_norm": 0.0307689867913723, "learning_rate": 3.6814345991561184e-05, "loss": 0.0744, "step": 5330 }, { "epoch": 16.017974683544303, "grad_norm": 12.458963394165039, "learning_rate": 3.6779184247538675e-05, "loss": 0.1081, "step": 5340 }, { "epoch": 16.018607594936707, "grad_norm": 0.07104455679655075, "learning_rate": 3.674402250351618e-05, "loss": 0.0946, "step": 5350 }, { "epoch": 16.019240506329115, "grad_norm": 0.0781354159116745, "learning_rate": 3.670886075949367e-05, "loss": 0.0375, "step": 5360 }, { "epoch": 16.01987341772152, "grad_norm": 0.041876938194036484, "learning_rate": 3.667369901547117e-05, "loss": 0.0621, "step": 5370 }, { "epoch": 16.02, "eval_accuracy": 0.9596928982725528, "eval_loss": 0.07680489867925644, "eval_runtime": 887.3026, "eval_samples_per_second": 0.587, "eval_steps_per_second": 0.074, "step": 5372 }, { "epoch": 17.000506329113925, "grad_norm": 0.03362284228205681, "learning_rate": 3.663853727144866e-05, "loss": 0.0939, "step": 5380 }, { "epoch": 17.00113924050633, "grad_norm": 2.931182384490967, "learning_rate": 3.660337552742617e-05, "loss": 0.0942, "step": 5390 }, { "epoch": 17.001772151898734, "grad_norm": 9.610499382019043, "learning_rate": 3.656821378340366e-05, "loss": 0.2421, "step": 5400 }, { "epoch": 17.00240506329114, "grad_norm": 2.01033091545105, "learning_rate": 3.653305203938116e-05, "loss": 0.0545, "step": 5410 }, { "epoch": 17.003037974683544, "grad_norm": 0.5791252851486206, "learning_rate": 3.649789029535865e-05, "loss": 0.0443, "step": 5420 }, { "epoch": 17.00367088607595, "grad_norm": 2.2349610328674316, "learning_rate": 3.646272855133615e-05, "loss": 0.1949, "step": 5430 }, { "epoch": 17.004303797468353, "grad_norm": 0.06409902125597, "learning_rate": 3.6427566807313645e-05, "loss": 0.0731, "step": 5440 }, { "epoch": 17.00493670886076, "grad_norm": 0.11862684041261673, "learning_rate": 3.639240506329114e-05, "loss": 0.0298, "step": 5450 }, { "epoch": 17.005569620253166, "grad_norm": 1.8723206520080566, "learning_rate": 3.6357243319268635e-05, "loss": 0.1438, "step": 5460 }, { "epoch": 17.00620253164557, "grad_norm": 0.04506813734769821, "learning_rate": 3.632208157524613e-05, "loss": 0.1927, "step": 5470 }, { "epoch": 17.006835443037975, "grad_norm": 0.07324240356683731, "learning_rate": 3.628691983122363e-05, "loss": 0.2084, "step": 5480 }, { "epoch": 17.00746835443038, "grad_norm": 16.583223342895508, "learning_rate": 3.625175808720113e-05, "loss": 0.1256, "step": 5490 }, { "epoch": 17.008101265822784, "grad_norm": 2.8780710697174072, "learning_rate": 3.621659634317862e-05, "loss": 0.1533, "step": 5500 }, { "epoch": 17.00873417721519, "grad_norm": 0.049451783299446106, "learning_rate": 3.618143459915612e-05, "loss": 0.0673, "step": 5510 }, { "epoch": 17.009367088607593, "grad_norm": 1.1349679231643677, "learning_rate": 3.614627285513362e-05, "loss": 0.009, "step": 5520 }, { "epoch": 17.01, "grad_norm": 0.08764008432626724, "learning_rate": 3.611111111111111e-05, "loss": 0.0392, "step": 5530 }, { "epoch": 17.010632911392406, "grad_norm": 0.04414183646440506, "learning_rate": 3.607594936708861e-05, "loss": 0.1909, "step": 5540 }, { "epoch": 17.01126582278481, "grad_norm": 2.099726676940918, "learning_rate": 3.6040787623066106e-05, "loss": 0.3287, "step": 5550 }, { "epoch": 17.011898734177215, "grad_norm": 0.1605502963066101, "learning_rate": 3.6005625879043604e-05, "loss": 0.1195, "step": 5560 }, { "epoch": 17.01253164556962, "grad_norm": 0.11115490645170212, "learning_rate": 3.5970464135021096e-05, "loss": 0.1011, "step": 5570 }, { "epoch": 17.013164556962025, "grad_norm": 1.932108998298645, "learning_rate": 3.5935302390998594e-05, "loss": 0.1557, "step": 5580 }, { "epoch": 17.01379746835443, "grad_norm": 0.022036854177713394, "learning_rate": 3.590014064697609e-05, "loss": 0.0526, "step": 5590 }, { "epoch": 17.014430379746834, "grad_norm": 0.0311493631452322, "learning_rate": 3.586497890295359e-05, "loss": 0.1158, "step": 5600 }, { "epoch": 17.015063291139242, "grad_norm": 0.37227359414100647, "learning_rate": 3.582981715893108e-05, "loss": 0.0607, "step": 5610 }, { "epoch": 17.015696202531647, "grad_norm": 0.047853514552116394, "learning_rate": 3.579465541490858e-05, "loss": 0.1834, "step": 5620 }, { "epoch": 17.01632911392405, "grad_norm": 0.798313558101654, "learning_rate": 3.575949367088608e-05, "loss": 0.0497, "step": 5630 }, { "epoch": 17.016962025316456, "grad_norm": 0.020088857039809227, "learning_rate": 3.572433192686358e-05, "loss": 0.0975, "step": 5640 }, { "epoch": 17.01759493670886, "grad_norm": 3.423915386199951, "learning_rate": 3.568917018284107e-05, "loss": 0.1255, "step": 5650 }, { "epoch": 17.018227848101265, "grad_norm": 2.1361560821533203, "learning_rate": 3.565400843881857e-05, "loss": 0.1294, "step": 5660 }, { "epoch": 17.01886075949367, "grad_norm": 3.7424023151397705, "learning_rate": 3.5618846694796066e-05, "loss": 0.21, "step": 5670 }, { "epoch": 17.019493670886074, "grad_norm": 0.9021095633506775, "learning_rate": 3.558368495077356e-05, "loss": 0.0597, "step": 5680 }, { "epoch": 17.02, "eval_accuracy": 0.963531669865643, "eval_loss": 0.08725160360336304, "eval_runtime": 887.9543, "eval_samples_per_second": 0.587, "eval_steps_per_second": 0.074, "step": 5688 }, { "epoch": 18.00012658227848, "grad_norm": 9.254701614379883, "learning_rate": 3.5548523206751056e-05, "loss": 0.0679, "step": 5690 }, { "epoch": 18.000759493670888, "grad_norm": 0.09168372303247452, "learning_rate": 3.551336146272855e-05, "loss": 0.0487, "step": 5700 }, { "epoch": 18.001392405063292, "grad_norm": 1.7651177644729614, "learning_rate": 3.547819971870605e-05, "loss": 0.1359, "step": 5710 }, { "epoch": 18.002025316455697, "grad_norm": 1.6308825016021729, "learning_rate": 3.5443037974683544e-05, "loss": 0.0374, "step": 5720 }, { "epoch": 18.0026582278481, "grad_norm": 0.029071198776364326, "learning_rate": 3.540787623066104e-05, "loss": 0.0867, "step": 5730 }, { "epoch": 18.003291139240506, "grad_norm": 1.1404354572296143, "learning_rate": 3.5372714486638534e-05, "loss": 0.1032, "step": 5740 }, { "epoch": 18.00392405063291, "grad_norm": 0.060701146721839905, "learning_rate": 3.533755274261604e-05, "loss": 0.0807, "step": 5750 }, { "epoch": 18.004556962025315, "grad_norm": 0.023883167654275894, "learning_rate": 3.530239099859353e-05, "loss": 0.0917, "step": 5760 }, { "epoch": 18.00518987341772, "grad_norm": 2.6466054916381836, "learning_rate": 3.526722925457103e-05, "loss": 0.078, "step": 5770 }, { "epoch": 18.005822784810128, "grad_norm": 0.2958976626396179, "learning_rate": 3.523206751054853e-05, "loss": 0.0468, "step": 5780 }, { "epoch": 18.006455696202533, "grad_norm": 0.08598524332046509, "learning_rate": 3.5196905766526025e-05, "loss": 0.0925, "step": 5790 }, { "epoch": 18.007088607594937, "grad_norm": 0.02307640202343464, "learning_rate": 3.516174402250352e-05, "loss": 0.0784, "step": 5800 }, { "epoch": 18.007721518987342, "grad_norm": 1.9384537935256958, "learning_rate": 3.5126582278481015e-05, "loss": 0.0645, "step": 5810 }, { "epoch": 18.008354430379747, "grad_norm": 0.06027079373598099, "learning_rate": 3.5091420534458513e-05, "loss": 0.1358, "step": 5820 }, { "epoch": 18.00898734177215, "grad_norm": 1.6383880376815796, "learning_rate": 3.505625879043601e-05, "loss": 0.0505, "step": 5830 }, { "epoch": 18.009620253164556, "grad_norm": 1.355348825454712, "learning_rate": 3.50210970464135e-05, "loss": 0.0574, "step": 5840 }, { "epoch": 18.01025316455696, "grad_norm": 2.113496780395508, "learning_rate": 3.4985935302391e-05, "loss": 0.1005, "step": 5850 }, { "epoch": 18.01088607594937, "grad_norm": 0.022542107850313187, "learning_rate": 3.49507735583685e-05, "loss": 0.0671, "step": 5860 }, { "epoch": 18.011518987341773, "grad_norm": 0.05946587771177292, "learning_rate": 3.491561181434599e-05, "loss": 0.0933, "step": 5870 }, { "epoch": 18.012151898734178, "grad_norm": 1.936155915260315, "learning_rate": 3.488045007032349e-05, "loss": 0.0975, "step": 5880 }, { "epoch": 18.012784810126583, "grad_norm": 0.03399858996272087, "learning_rate": 3.484528832630098e-05, "loss": 0.0909, "step": 5890 }, { "epoch": 18.013417721518987, "grad_norm": 0.030509311705827713, "learning_rate": 3.4810126582278487e-05, "loss": 0.0685, "step": 5900 }, { "epoch": 18.014050632911392, "grad_norm": 0.016794001683592796, "learning_rate": 3.477496483825598e-05, "loss": 0.0481, "step": 5910 }, { "epoch": 18.014683544303796, "grad_norm": 2.368962049484253, "learning_rate": 3.4739803094233476e-05, "loss": 0.2066, "step": 5920 }, { "epoch": 18.0153164556962, "grad_norm": 1.5589197874069214, "learning_rate": 3.470464135021097e-05, "loss": 0.0798, "step": 5930 }, { "epoch": 18.01594936708861, "grad_norm": 2.055568218231201, "learning_rate": 3.466947960618847e-05, "loss": 0.0384, "step": 5940 }, { "epoch": 18.016582278481014, "grad_norm": 2.212440013885498, "learning_rate": 3.4634317862165965e-05, "loss": 0.0979, "step": 5950 }, { "epoch": 18.01721518987342, "grad_norm": 0.022211147472262383, "learning_rate": 3.459915611814346e-05, "loss": 0.0575, "step": 5960 }, { "epoch": 18.017848101265823, "grad_norm": 2.110724687576294, "learning_rate": 3.4563994374120954e-05, "loss": 0.0568, "step": 5970 }, { "epoch": 18.018481012658228, "grad_norm": 0.025756409391760826, "learning_rate": 3.452883263009846e-05, "loss": 0.0429, "step": 5980 }, { "epoch": 18.019113924050632, "grad_norm": 2.1070611476898193, "learning_rate": 3.449367088607595e-05, "loss": 0.0821, "step": 5990 }, { "epoch": 18.019746835443037, "grad_norm": 0.043322015553712845, "learning_rate": 3.445850914205345e-05, "loss": 0.0696, "step": 6000 }, { "epoch": 18.02, "eval_accuracy": 0.9539347408829175, "eval_loss": 0.11077545583248138, "eval_runtime": 890.6431, "eval_samples_per_second": 0.585, "eval_steps_per_second": 0.074, "step": 6004 }, { "epoch": 19.000379746835442, "grad_norm": 1.7157703638076782, "learning_rate": 3.442334739803094e-05, "loss": 0.184, "step": 6010 }, { "epoch": 19.001012658227847, "grad_norm": 0.03436505049467087, "learning_rate": 3.438818565400844e-05, "loss": 0.1121, "step": 6020 }, { "epoch": 19.001645569620255, "grad_norm": 0.02937289886176586, "learning_rate": 3.435302390998594e-05, "loss": 0.1085, "step": 6030 }, { "epoch": 19.00227848101266, "grad_norm": 2.090529680252075, "learning_rate": 3.431786216596343e-05, "loss": 0.1076, "step": 6040 }, { "epoch": 19.002911392405064, "grad_norm": 2.621086835861206, "learning_rate": 3.428270042194093e-05, "loss": 0.0475, "step": 6050 }, { "epoch": 19.00354430379747, "grad_norm": 0.081081323325634, "learning_rate": 3.4247538677918426e-05, "loss": 0.1206, "step": 6060 }, { "epoch": 19.004177215189873, "grad_norm": 0.9470970034599304, "learning_rate": 3.4212376933895924e-05, "loss": 0.0197, "step": 6070 }, { "epoch": 19.004810126582278, "grad_norm": 0.6718599200248718, "learning_rate": 3.4177215189873416e-05, "loss": 0.0415, "step": 6080 }, { "epoch": 19.005443037974683, "grad_norm": 0.017539095133543015, "learning_rate": 3.4142053445850914e-05, "loss": 0.0924, "step": 6090 }, { "epoch": 19.006075949367087, "grad_norm": 0.04726846516132355, "learning_rate": 3.410689170182841e-05, "loss": 0.0091, "step": 6100 }, { "epoch": 19.006708860759495, "grad_norm": 1.5941773653030396, "learning_rate": 3.407172995780591e-05, "loss": 0.1435, "step": 6110 }, { "epoch": 19.0073417721519, "grad_norm": 2.2858848571777344, "learning_rate": 3.40365682137834e-05, "loss": 0.0776, "step": 6120 }, { "epoch": 19.007974683544305, "grad_norm": 0.03234981372952461, "learning_rate": 3.40014064697609e-05, "loss": 0.0483, "step": 6130 }, { "epoch": 19.00860759493671, "grad_norm": 0.07574064284563065, "learning_rate": 3.39662447257384e-05, "loss": 0.0784, "step": 6140 }, { "epoch": 19.009240506329114, "grad_norm": 0.016080401837825775, "learning_rate": 3.39310829817159e-05, "loss": 0.0479, "step": 6150 }, { "epoch": 19.00987341772152, "grad_norm": 0.546974241733551, "learning_rate": 3.389592123769339e-05, "loss": 0.0931, "step": 6160 }, { "epoch": 19.010506329113923, "grad_norm": 18.685945510864258, "learning_rate": 3.386075949367089e-05, "loss": 0.0846, "step": 6170 }, { "epoch": 19.011139240506328, "grad_norm": 3.7618796825408936, "learning_rate": 3.3825597749648385e-05, "loss": 0.1187, "step": 6180 }, { "epoch": 19.011772151898736, "grad_norm": 3.0678319931030273, "learning_rate": 3.3790436005625884e-05, "loss": 0.2192, "step": 6190 }, { "epoch": 19.01240506329114, "grad_norm": 0.021779673174023628, "learning_rate": 3.3755274261603375e-05, "loss": 0.0531, "step": 6200 }, { "epoch": 19.013037974683545, "grad_norm": 2.7021806240081787, "learning_rate": 3.3720112517580874e-05, "loss": 0.0932, "step": 6210 }, { "epoch": 19.01367088607595, "grad_norm": 2.251648426055908, "learning_rate": 3.368495077355837e-05, "loss": 0.1122, "step": 6220 }, { "epoch": 19.014303797468354, "grad_norm": 18.46161651611328, "learning_rate": 3.3649789029535864e-05, "loss": 0.2242, "step": 6230 }, { "epoch": 19.01493670886076, "grad_norm": 1.9724690914154053, "learning_rate": 3.361462728551336e-05, "loss": 0.1283, "step": 6240 }, { "epoch": 19.015569620253164, "grad_norm": 1.8774776458740234, "learning_rate": 3.357946554149086e-05, "loss": 0.0629, "step": 6250 }, { "epoch": 19.01620253164557, "grad_norm": 0.08820010721683502, "learning_rate": 3.354430379746836e-05, "loss": 0.0487, "step": 6260 }, { "epoch": 19.016835443037976, "grad_norm": 0.9228050708770752, "learning_rate": 3.350914205344585e-05, "loss": 0.1959, "step": 6270 }, { "epoch": 19.01746835443038, "grad_norm": 0.061690233647823334, "learning_rate": 3.347398030942335e-05, "loss": 0.1182, "step": 6280 }, { "epoch": 19.018101265822786, "grad_norm": 23.2619686126709, "learning_rate": 3.343881856540085e-05, "loss": 0.1069, "step": 6290 }, { "epoch": 19.01873417721519, "grad_norm": 0.0341399684548378, "learning_rate": 3.3403656821378345e-05, "loss": 0.1889, "step": 6300 }, { "epoch": 19.019367088607595, "grad_norm": 14.88992691040039, "learning_rate": 3.3368495077355837e-05, "loss": 0.1285, "step": 6310 }, { "epoch": 19.02, "grad_norm": 0.3174104392528534, "learning_rate": 3.3333333333333335e-05, "loss": 0.2761, "step": 6320 }, { "epoch": 19.02, "eval_accuracy": 0.9520153550863724, "eval_loss": 0.14126819372177124, "eval_runtime": 862.4572, "eval_samples_per_second": 0.604, "eval_steps_per_second": 0.077, "step": 6320 }, { "epoch": 20.000632911392405, "grad_norm": 15.443863868713379, "learning_rate": 3.329817158931083e-05, "loss": 0.2663, "step": 6330 }, { "epoch": 20.00126582278481, "grad_norm": 0.15935222804546356, "learning_rate": 3.326300984528833e-05, "loss": 0.0412, "step": 6340 }, { "epoch": 20.001898734177214, "grad_norm": 2.3951077461242676, "learning_rate": 3.322784810126582e-05, "loss": 0.1944, "step": 6350 }, { "epoch": 20.00253164556962, "grad_norm": 0.055507030338048935, "learning_rate": 3.319268635724332e-05, "loss": 0.0692, "step": 6360 }, { "epoch": 20.003164556962027, "grad_norm": 15.818922996520996, "learning_rate": 3.315752461322082e-05, "loss": 0.1544, "step": 6370 }, { "epoch": 20.00379746835443, "grad_norm": 0.0613020621240139, "learning_rate": 3.312236286919831e-05, "loss": 0.0584, "step": 6380 }, { "epoch": 20.004430379746836, "grad_norm": 1.5182740688323975, "learning_rate": 3.308720112517581e-05, "loss": 0.0507, "step": 6390 }, { "epoch": 20.00506329113924, "grad_norm": 0.17965848743915558, "learning_rate": 3.305203938115331e-05, "loss": 0.1409, "step": 6400 }, { "epoch": 20.005696202531645, "grad_norm": 13.088379859924316, "learning_rate": 3.3016877637130806e-05, "loss": 0.1933, "step": 6410 }, { "epoch": 20.00632911392405, "grad_norm": 3.075198173522949, "learning_rate": 3.29817158931083e-05, "loss": 0.0366, "step": 6420 }, { "epoch": 20.006962025316454, "grad_norm": 0.25585731863975525, "learning_rate": 3.2946554149085796e-05, "loss": 0.0869, "step": 6430 }, { "epoch": 20.00759493670886, "grad_norm": 1.9562506675720215, "learning_rate": 3.291139240506329e-05, "loss": 0.0486, "step": 6440 }, { "epoch": 20.008227848101267, "grad_norm": 1.8931715488433838, "learning_rate": 3.287623066104079e-05, "loss": 0.1142, "step": 6450 }, { "epoch": 20.008860759493672, "grad_norm": 1.9258003234863281, "learning_rate": 3.2841068917018284e-05, "loss": 0.0716, "step": 6460 }, { "epoch": 20.009493670886076, "grad_norm": 0.2476690262556076, "learning_rate": 3.280590717299578e-05, "loss": 0.0593, "step": 6470 }, { "epoch": 20.01012658227848, "grad_norm": 0.041350286453962326, "learning_rate": 3.2770745428973274e-05, "loss": 0.0521, "step": 6480 }, { "epoch": 20.010759493670886, "grad_norm": 6.784064769744873, "learning_rate": 3.273558368495078e-05, "loss": 0.0805, "step": 6490 }, { "epoch": 20.01139240506329, "grad_norm": 1.9818538427352905, "learning_rate": 3.270042194092827e-05, "loss": 0.0722, "step": 6500 }, { "epoch": 20.012025316455695, "grad_norm": 2.0829291343688965, "learning_rate": 3.266526019690577e-05, "loss": 0.1185, "step": 6510 }, { "epoch": 20.0126582278481, "grad_norm": 0.06409807503223419, "learning_rate": 3.263009845288326e-05, "loss": 0.0777, "step": 6520 }, { "epoch": 20.013291139240508, "grad_norm": 0.052046943455934525, "learning_rate": 3.2594936708860766e-05, "loss": 0.053, "step": 6530 }, { "epoch": 20.013924050632912, "grad_norm": 6.423180103302002, "learning_rate": 3.255977496483826e-05, "loss": 0.1333, "step": 6540 }, { "epoch": 20.014556962025317, "grad_norm": 0.04574073851108551, "learning_rate": 3.2524613220815756e-05, "loss": 0.1059, "step": 6550 }, { "epoch": 20.01518987341772, "grad_norm": 5.25015115737915, "learning_rate": 3.248945147679325e-05, "loss": 0.1471, "step": 6560 }, { "epoch": 20.015822784810126, "grad_norm": 0.036584123969078064, "learning_rate": 3.2454289732770746e-05, "loss": 0.2641, "step": 6570 }, { "epoch": 20.01645569620253, "grad_norm": 10.215600967407227, "learning_rate": 3.2419127988748244e-05, "loss": 0.204, "step": 6580 }, { "epoch": 20.017088607594935, "grad_norm": 0.7967722415924072, "learning_rate": 3.2383966244725736e-05, "loss": 0.1105, "step": 6590 }, { "epoch": 20.01772151898734, "grad_norm": 28.42278480529785, "learning_rate": 3.234880450070324e-05, "loss": 0.1647, "step": 6600 }, { "epoch": 20.01835443037975, "grad_norm": 0.14231210947036743, "learning_rate": 3.231364275668073e-05, "loss": 0.0553, "step": 6610 }, { "epoch": 20.018987341772153, "grad_norm": 0.05637258663773537, "learning_rate": 3.227848101265823e-05, "loss": 0.074, "step": 6620 }, { "epoch": 20.019620253164558, "grad_norm": 0.2689402401447296, "learning_rate": 3.224331926863572e-05, "loss": 0.129, "step": 6630 }, { "epoch": 20.02, "eval_accuracy": 0.9520153550863724, "eval_loss": 0.1470969319343567, "eval_runtime": 903.3982, "eval_samples_per_second": 0.577, "eval_steps_per_second": 0.073, "step": 6636 }, { "epoch": 21.000253164556963, "grad_norm": 1.2210781574249268, "learning_rate": 3.220815752461323e-05, "loss": 0.1749, "step": 6640 }, { "epoch": 21.000886075949367, "grad_norm": 1.918101191520691, "learning_rate": 3.217299578059072e-05, "loss": 0.109, "step": 6650 }, { "epoch": 21.001518987341772, "grad_norm": 1.1142789125442505, "learning_rate": 3.213783403656822e-05, "loss": 0.2533, "step": 6660 }, { "epoch": 21.002151898734176, "grad_norm": 2.3127224445343018, "learning_rate": 3.210267229254571e-05, "loss": 0.1336, "step": 6670 }, { "epoch": 21.00278481012658, "grad_norm": 2.306687116622925, "learning_rate": 3.2067510548523214e-05, "loss": 0.1947, "step": 6680 }, { "epoch": 21.003417721518986, "grad_norm": 0.024246342480182648, "learning_rate": 3.2032348804500705e-05, "loss": 0.0446, "step": 6690 }, { "epoch": 21.004050632911394, "grad_norm": 0.0923205316066742, "learning_rate": 3.1997187060478204e-05, "loss": 0.0261, "step": 6700 }, { "epoch": 21.0046835443038, "grad_norm": 0.020252803340554237, "learning_rate": 3.1962025316455695e-05, "loss": 0.0664, "step": 6710 }, { "epoch": 21.005316455696203, "grad_norm": 1.8104360103607178, "learning_rate": 3.1926863572433193e-05, "loss": 0.0501, "step": 6720 }, { "epoch": 21.005949367088608, "grad_norm": 0.13536378741264343, "learning_rate": 3.189170182841069e-05, "loss": 0.1107, "step": 6730 }, { "epoch": 21.006582278481012, "grad_norm": 0.023490697145462036, "learning_rate": 3.185654008438819e-05, "loss": 0.0581, "step": 6740 }, { "epoch": 21.007215189873417, "grad_norm": 0.011167807504534721, "learning_rate": 3.182137834036568e-05, "loss": 0.0835, "step": 6750 }, { "epoch": 21.00784810126582, "grad_norm": 2.162050485610962, "learning_rate": 3.178621659634318e-05, "loss": 0.0804, "step": 6760 }, { "epoch": 21.008481012658226, "grad_norm": 0.026973918080329895, "learning_rate": 3.175105485232068e-05, "loss": 0.0677, "step": 6770 }, { "epoch": 21.009113924050634, "grad_norm": 2.1330974102020264, "learning_rate": 3.171589310829817e-05, "loss": 0.1016, "step": 6780 }, { "epoch": 21.00974683544304, "grad_norm": 1.0756934881210327, "learning_rate": 3.168073136427567e-05, "loss": 0.0232, "step": 6790 }, { "epoch": 21.010379746835444, "grad_norm": 18.678178787231445, "learning_rate": 3.1645569620253167e-05, "loss": 0.0977, "step": 6800 }, { "epoch": 21.01101265822785, "grad_norm": 1.5976709127426147, "learning_rate": 3.1610407876230665e-05, "loss": 0.069, "step": 6810 }, { "epoch": 21.011645569620253, "grad_norm": 0.5849307775497437, "learning_rate": 3.1575246132208156e-05, "loss": 0.1015, "step": 6820 }, { "epoch": 21.012278481012657, "grad_norm": 3.8058197498321533, "learning_rate": 3.1540084388185655e-05, "loss": 0.1165, "step": 6830 }, { "epoch": 21.012911392405062, "grad_norm": 1.8044683933258057, "learning_rate": 3.150492264416315e-05, "loss": 0.0801, "step": 6840 }, { "epoch": 21.013544303797467, "grad_norm": 1.89737868309021, "learning_rate": 3.146976090014065e-05, "loss": 0.0453, "step": 6850 }, { "epoch": 21.014177215189875, "grad_norm": 2.135648250579834, "learning_rate": 3.143459915611814e-05, "loss": 0.0314, "step": 6860 }, { "epoch": 21.01481012658228, "grad_norm": 0.015429515391588211, "learning_rate": 3.139943741209564e-05, "loss": 0.1356, "step": 6870 }, { "epoch": 21.015443037974684, "grad_norm": 0.029538467526435852, "learning_rate": 3.136427566807314e-05, "loss": 0.0375, "step": 6880 }, { "epoch": 21.01607594936709, "grad_norm": 0.013729414902627468, "learning_rate": 3.132911392405064e-05, "loss": 0.154, "step": 6890 }, { "epoch": 21.016708860759493, "grad_norm": 1.6405558586120605, "learning_rate": 3.129395218002813e-05, "loss": 0.0856, "step": 6900 }, { "epoch": 21.017341772151898, "grad_norm": 0.02224159613251686, "learning_rate": 3.125879043600563e-05, "loss": 0.0482, "step": 6910 }, { "epoch": 21.017974683544303, "grad_norm": 0.024815354496240616, "learning_rate": 3.1223628691983126e-05, "loss": 0.0808, "step": 6920 }, { "epoch": 21.018607594936707, "grad_norm": 2.0164098739624023, "learning_rate": 3.118846694796062e-05, "loss": 0.0392, "step": 6930 }, { "epoch": 21.019240506329115, "grad_norm": 1.8977909088134766, "learning_rate": 3.1153305203938116e-05, "loss": 0.0415, "step": 6940 }, { "epoch": 21.01987341772152, "grad_norm": 2.1623473167419434, "learning_rate": 3.111814345991561e-05, "loss": 0.0828, "step": 6950 }, { "epoch": 21.02, "eval_accuracy": 0.9673704414587332, "eval_loss": 0.06080710142850876, "eval_runtime": 897.4486, "eval_samples_per_second": 0.581, "eval_steps_per_second": 0.074, "step": 6952 }, { "epoch": 22.000506329113925, "grad_norm": 1.7337490320205688, "learning_rate": 3.108298171589311e-05, "loss": 0.0741, "step": 6960 }, { "epoch": 22.00113924050633, "grad_norm": 0.029260125011205673, "learning_rate": 3.1047819971870604e-05, "loss": 0.0384, "step": 6970 }, { "epoch": 22.001772151898734, "grad_norm": 0.014634775929152966, "learning_rate": 3.10126582278481e-05, "loss": 0.0472, "step": 6980 }, { "epoch": 22.00240506329114, "grad_norm": 0.031179388985037804, "learning_rate": 3.0977496483825594e-05, "loss": 0.0498, "step": 6990 }, { "epoch": 22.003037974683544, "grad_norm": 2.017439126968384, "learning_rate": 3.09423347398031e-05, "loss": 0.0726, "step": 7000 }, { "epoch": 22.00367088607595, "grad_norm": 0.01198955811560154, "learning_rate": 3.090717299578059e-05, "loss": 0.0546, "step": 7010 }, { "epoch": 22.004303797468353, "grad_norm": 1.72632896900177, "learning_rate": 3.087201125175809e-05, "loss": 0.0686, "step": 7020 }, { "epoch": 22.00493670886076, "grad_norm": 0.013193360529839993, "learning_rate": 3.083684950773559e-05, "loss": 0.0543, "step": 7030 }, { "epoch": 22.005569620253166, "grad_norm": 0.2840830385684967, "learning_rate": 3.0801687763713086e-05, "loss": 0.0632, "step": 7040 }, { "epoch": 22.00620253164557, "grad_norm": 3.4681308269500732, "learning_rate": 3.076652601969058e-05, "loss": 0.1418, "step": 7050 }, { "epoch": 22.006835443037975, "grad_norm": 2.013949155807495, "learning_rate": 3.0731364275668076e-05, "loss": 0.1189, "step": 7060 }, { "epoch": 22.00746835443038, "grad_norm": 0.03054201602935791, "learning_rate": 3.0696202531645574e-05, "loss": 0.0485, "step": 7070 }, { "epoch": 22.008101265822784, "grad_norm": 0.01309112273156643, "learning_rate": 3.0661040787623065e-05, "loss": 0.0337, "step": 7080 }, { "epoch": 22.00873417721519, "grad_norm": 0.009347977116703987, "learning_rate": 3.0625879043600564e-05, "loss": 0.1198, "step": 7090 }, { "epoch": 22.009367088607593, "grad_norm": 1.491429328918457, "learning_rate": 3.059071729957806e-05, "loss": 0.0425, "step": 7100 }, { "epoch": 22.01, "grad_norm": 2.0944862365722656, "learning_rate": 3.055555555555556e-05, "loss": 0.0553, "step": 7110 }, { "epoch": 22.010632911392406, "grad_norm": 0.02638574317097664, "learning_rate": 3.052039381153305e-05, "loss": 0.034, "step": 7120 }, { "epoch": 22.01126582278481, "grad_norm": 0.053611740469932556, "learning_rate": 3.048523206751055e-05, "loss": 0.0355, "step": 7130 }, { "epoch": 22.011898734177215, "grad_norm": 0.020069239661097527, "learning_rate": 3.0450070323488045e-05, "loss": 0.0657, "step": 7140 }, { "epoch": 22.01253164556962, "grad_norm": 0.7463811635971069, "learning_rate": 3.0414908579465547e-05, "loss": 0.119, "step": 7150 }, { "epoch": 22.013164556962025, "grad_norm": 1.8718035221099854, "learning_rate": 3.0379746835443042e-05, "loss": 0.0826, "step": 7160 }, { "epoch": 22.01379746835443, "grad_norm": 1.7672007083892822, "learning_rate": 3.0344585091420537e-05, "loss": 0.0616, "step": 7170 }, { "epoch": 22.014430379746834, "grad_norm": 0.014087089337408543, "learning_rate": 3.0309423347398032e-05, "loss": 0.0753, "step": 7180 }, { "epoch": 22.015063291139242, "grad_norm": 2.0793707370758057, "learning_rate": 3.027426160337553e-05, "loss": 0.0782, "step": 7190 }, { "epoch": 22.015696202531647, "grad_norm": 0.040184978395700455, "learning_rate": 3.0239099859353025e-05, "loss": 0.0132, "step": 7200 }, { "epoch": 22.01632911392405, "grad_norm": 3.020726203918457, "learning_rate": 3.020393811533052e-05, "loss": 0.0371, "step": 7210 }, { "epoch": 22.016962025316456, "grad_norm": 0.02047577127814293, "learning_rate": 3.0168776371308015e-05, "loss": 0.0535, "step": 7220 }, { "epoch": 22.01759493670886, "grad_norm": 0.014698415994644165, "learning_rate": 3.0133614627285517e-05, "loss": 0.063, "step": 7230 }, { "epoch": 22.018227848101265, "grad_norm": 0.016574831679463387, "learning_rate": 3.009845288326301e-05, "loss": 0.0531, "step": 7240 }, { "epoch": 22.01886075949367, "grad_norm": 0.010280176065862179, "learning_rate": 3.0063291139240506e-05, "loss": 0.0952, "step": 7250 }, { "epoch": 22.019493670886074, "grad_norm": 0.03817346692085266, "learning_rate": 3.0028129395218e-05, "loss": 0.0544, "step": 7260 }, { "epoch": 22.02, "eval_accuracy": 0.9712092130518234, "eval_loss": 0.05330738425254822, "eval_runtime": 868.0763, "eval_samples_per_second": 0.6, "eval_steps_per_second": 0.076, "step": 7268 }, { "epoch": 23.00012658227848, "grad_norm": 0.014792737551033497, "learning_rate": 2.9992967651195503e-05, "loss": 0.0958, "step": 7270 }, { "epoch": 23.000759493670888, "grad_norm": 0.01068217121064663, "learning_rate": 2.9957805907172998e-05, "loss": 0.0747, "step": 7280 }, { "epoch": 23.001392405063292, "grad_norm": 1.8503497838974, "learning_rate": 2.9922644163150493e-05, "loss": 0.0916, "step": 7290 }, { "epoch": 23.002025316455697, "grad_norm": 1.8655881881713867, "learning_rate": 2.9887482419127988e-05, "loss": 0.0526, "step": 7300 }, { "epoch": 23.0026582278481, "grad_norm": 2.7641124725341797, "learning_rate": 2.985232067510549e-05, "loss": 0.0862, "step": 7310 }, { "epoch": 23.003291139240506, "grad_norm": 0.013922685757279396, "learning_rate": 2.9817158931082985e-05, "loss": 0.0557, "step": 7320 }, { "epoch": 23.00392405063291, "grad_norm": 1.8126795291900635, "learning_rate": 2.978199718706048e-05, "loss": 0.1077, "step": 7330 }, { "epoch": 23.004556962025315, "grad_norm": 0.016684675589203835, "learning_rate": 2.9746835443037974e-05, "loss": 0.1824, "step": 7340 }, { "epoch": 23.00518987341772, "grad_norm": 0.010096821002662182, "learning_rate": 2.9711673699015473e-05, "loss": 0.0616, "step": 7350 }, { "epoch": 23.005822784810128, "grad_norm": 0.015037915669381618, "learning_rate": 2.9676511954992968e-05, "loss": 0.029, "step": 7360 }, { "epoch": 23.006455696202533, "grad_norm": 0.34261658787727356, "learning_rate": 2.9641350210970466e-05, "loss": 0.0256, "step": 7370 }, { "epoch": 23.007088607594937, "grad_norm": 2.0921599864959717, "learning_rate": 2.960618846694796e-05, "loss": 0.0621, "step": 7380 }, { "epoch": 23.007721518987342, "grad_norm": 0.009613982401788235, "learning_rate": 2.957102672292546e-05, "loss": 0.0634, "step": 7390 }, { "epoch": 23.008354430379747, "grad_norm": 0.009403366595506668, "learning_rate": 2.9535864978902954e-05, "loss": 0.1097, "step": 7400 }, { "epoch": 23.00898734177215, "grad_norm": 0.009238903410732746, "learning_rate": 2.950070323488045e-05, "loss": 0.0313, "step": 7410 }, { "epoch": 23.009620253164556, "grad_norm": 0.01602065935730934, "learning_rate": 2.9465541490857944e-05, "loss": 0.0149, "step": 7420 }, { "epoch": 23.01025316455696, "grad_norm": 0.01591380685567856, "learning_rate": 2.9430379746835446e-05, "loss": 0.0474, "step": 7430 }, { "epoch": 23.01088607594937, "grad_norm": 1.879072666168213, "learning_rate": 2.939521800281294e-05, "loss": 0.0615, "step": 7440 }, { "epoch": 23.011518987341773, "grad_norm": 0.014801949262619019, "learning_rate": 2.9360056258790436e-05, "loss": 0.0654, "step": 7450 }, { "epoch": 23.012151898734178, "grad_norm": 0.05049045756459236, "learning_rate": 2.9324894514767937e-05, "loss": 0.025, "step": 7460 }, { "epoch": 23.012784810126583, "grad_norm": 1.6079883575439453, "learning_rate": 2.9289732770745432e-05, "loss": 0.0663, "step": 7470 }, { "epoch": 23.013417721518987, "grad_norm": 0.018271761015057564, "learning_rate": 2.9254571026722927e-05, "loss": 0.0164, "step": 7480 }, { "epoch": 23.014050632911392, "grad_norm": 0.9706446528434753, "learning_rate": 2.9219409282700422e-05, "loss": 0.0981, "step": 7490 }, { "epoch": 23.014683544303796, "grad_norm": 0.010271470062434673, "learning_rate": 2.9184247538677924e-05, "loss": 0.0293, "step": 7500 }, { "epoch": 23.0153164556962, "grad_norm": 1.9835656881332397, "learning_rate": 2.914908579465542e-05, "loss": 0.1089, "step": 7510 }, { "epoch": 23.01594936708861, "grad_norm": 0.01825304701924324, "learning_rate": 2.9113924050632914e-05, "loss": 0.0524, "step": 7520 }, { "epoch": 23.016582278481014, "grad_norm": 0.009788533672690392, "learning_rate": 2.907876230661041e-05, "loss": 0.065, "step": 7530 }, { "epoch": 23.01721518987342, "grad_norm": 2.2043983936309814, "learning_rate": 2.9043600562587907e-05, "loss": 0.1945, "step": 7540 }, { "epoch": 23.017848101265823, "grad_norm": 0.00593506870791316, "learning_rate": 2.9008438818565402e-05, "loss": 0.0846, "step": 7550 }, { "epoch": 23.018481012658228, "grad_norm": 0.016028795391321182, "learning_rate": 2.8973277074542897e-05, "loss": 0.0697, "step": 7560 }, { "epoch": 23.019113924050632, "grad_norm": 2.0164456367492676, "learning_rate": 2.8938115330520392e-05, "loss": 0.0579, "step": 7570 }, { "epoch": 23.019746835443037, "grad_norm": 0.009185468778014183, "learning_rate": 2.8902953586497894e-05, "loss": 0.0509, "step": 7580 }, { "epoch": 23.02, "eval_accuracy": 0.9750479846449136, "eval_loss": 0.04993312060832977, "eval_runtime": 884.1639, "eval_samples_per_second": 0.589, "eval_steps_per_second": 0.075, "step": 7584 }, { "epoch": 24.000379746835442, "grad_norm": 0.007606441620737314, "learning_rate": 2.886779184247539e-05, "loss": 0.086, "step": 7590 }, { "epoch": 24.001012658227847, "grad_norm": 1.587971806526184, "learning_rate": 2.8832630098452884e-05, "loss": 0.0666, "step": 7600 }, { "epoch": 24.001645569620255, "grad_norm": 0.06458862125873566, "learning_rate": 2.879746835443038e-05, "loss": 0.1167, "step": 7610 }, { "epoch": 24.00227848101266, "grad_norm": 0.008511193096637726, "learning_rate": 2.876230661040788e-05, "loss": 0.0582, "step": 7620 }, { "epoch": 24.002911392405064, "grad_norm": 0.01144248154014349, "learning_rate": 2.8727144866385375e-05, "loss": 0.0537, "step": 7630 }, { "epoch": 24.00354430379747, "grad_norm": 2.393695116043091, "learning_rate": 2.869198312236287e-05, "loss": 0.0521, "step": 7640 }, { "epoch": 24.004177215189873, "grad_norm": 0.010813113301992416, "learning_rate": 2.8656821378340365e-05, "loss": 0.0567, "step": 7650 }, { "epoch": 24.004810126582278, "grad_norm": 0.6152679920196533, "learning_rate": 2.8621659634317867e-05, "loss": 0.0387, "step": 7660 }, { "epoch": 24.005443037974683, "grad_norm": 1.79938542842865, "learning_rate": 2.858649789029536e-05, "loss": 0.0198, "step": 7670 }, { "epoch": 24.006075949367087, "grad_norm": 2.5060055255889893, "learning_rate": 2.8551336146272857e-05, "loss": 0.142, "step": 7680 }, { "epoch": 24.006708860759495, "grad_norm": 0.009323079138994217, "learning_rate": 2.851617440225035e-05, "loss": 0.0479, "step": 7690 }, { "epoch": 24.0073417721519, "grad_norm": 2.835371732711792, "learning_rate": 2.848101265822785e-05, "loss": 0.0361, "step": 7700 }, { "epoch": 24.007974683544305, "grad_norm": 2.476736545562744, "learning_rate": 2.8445850914205345e-05, "loss": 0.1028, "step": 7710 }, { "epoch": 24.00860759493671, "grad_norm": 0.011100457981228828, "learning_rate": 2.8410689170182843e-05, "loss": 0.109, "step": 7720 }, { "epoch": 24.009240506329114, "grad_norm": 0.009698892012238503, "learning_rate": 2.8375527426160338e-05, "loss": 0.0402, "step": 7730 }, { "epoch": 24.00987341772152, "grad_norm": 0.32084423303604126, "learning_rate": 2.8340365682137836e-05, "loss": 0.0261, "step": 7740 }, { "epoch": 24.010506329113923, "grad_norm": 0.03804817050695419, "learning_rate": 2.830520393811533e-05, "loss": 0.0519, "step": 7750 }, { "epoch": 24.011139240506328, "grad_norm": 0.02210908569395542, "learning_rate": 2.8270042194092826e-05, "loss": 0.1201, "step": 7760 }, { "epoch": 24.011772151898736, "grad_norm": 0.013488766737282276, "learning_rate": 2.823488045007032e-05, "loss": 0.1115, "step": 7770 }, { "epoch": 24.01240506329114, "grad_norm": 0.01102654542773962, "learning_rate": 2.8199718706047823e-05, "loss": 0.0377, "step": 7780 }, { "epoch": 24.013037974683545, "grad_norm": 2.595425844192505, "learning_rate": 2.8164556962025318e-05, "loss": 0.2162, "step": 7790 }, { "epoch": 24.01367088607595, "grad_norm": 0.3236597180366516, "learning_rate": 2.8129395218002813e-05, "loss": 0.107, "step": 7800 }, { "epoch": 24.014303797468354, "grad_norm": 0.769616961479187, "learning_rate": 2.8094233473980308e-05, "loss": 0.1602, "step": 7810 }, { "epoch": 24.01493670886076, "grad_norm": 0.10283089429140091, "learning_rate": 2.805907172995781e-05, "loss": 0.0909, "step": 7820 }, { "epoch": 24.015569620253164, "grad_norm": 0.02031938172876835, "learning_rate": 2.8023909985935304e-05, "loss": 0.1172, "step": 7830 }, { "epoch": 24.01620253164557, "grad_norm": 1.7063891887664795, "learning_rate": 2.79887482419128e-05, "loss": 0.0797, "step": 7840 }, { "epoch": 24.016835443037976, "grad_norm": 1.6002981662750244, "learning_rate": 2.7953586497890294e-05, "loss": 0.0447, "step": 7850 }, { "epoch": 24.01746835443038, "grad_norm": 1.945346474647522, "learning_rate": 2.7918424753867796e-05, "loss": 0.0461, "step": 7860 }, { "epoch": 24.018101265822786, "grad_norm": 2.0498995780944824, "learning_rate": 2.788326300984529e-05, "loss": 0.1525, "step": 7870 }, { "epoch": 24.01873417721519, "grad_norm": 0.018395045772194862, "learning_rate": 2.7848101265822786e-05, "loss": 0.1554, "step": 7880 }, { "epoch": 24.019367088607595, "grad_norm": 2.4818601608276367, "learning_rate": 2.7812939521800284e-05, "loss": 0.2115, "step": 7890 }, { "epoch": 24.02, "grad_norm": 0.01920860819518566, "learning_rate": 2.777777777777778e-05, "loss": 0.0308, "step": 7900 }, { "epoch": 24.02, "eval_accuracy": 0.9596928982725528, "eval_loss": 0.09555886685848236, "eval_runtime": 895.4045, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.074, "step": 7900 }, { "epoch": 25.000632911392405, "grad_norm": 0.9809036254882812, "learning_rate": 2.7742616033755274e-05, "loss": 0.0833, "step": 7910 }, { "epoch": 25.00126582278481, "grad_norm": 0.0848427340388298, "learning_rate": 2.770745428973277e-05, "loss": 0.04, "step": 7920 }, { "epoch": 25.001898734177214, "grad_norm": 1.635724663734436, "learning_rate": 2.767229254571027e-05, "loss": 0.1677, "step": 7930 }, { "epoch": 25.00253164556962, "grad_norm": 1.4175573587417603, "learning_rate": 2.7637130801687766e-05, "loss": 0.0733, "step": 7940 }, { "epoch": 25.003164556962027, "grad_norm": 2.555398464202881, "learning_rate": 2.760196905766526e-05, "loss": 0.0981, "step": 7950 }, { "epoch": 25.00379746835443, "grad_norm": 0.024168817326426506, "learning_rate": 2.7566807313642756e-05, "loss": 0.0517, "step": 7960 }, { "epoch": 25.004430379746836, "grad_norm": 2.4232094287872314, "learning_rate": 2.7531645569620257e-05, "loss": 0.1317, "step": 7970 }, { "epoch": 25.00506329113924, "grad_norm": 0.030434638261795044, "learning_rate": 2.7496483825597752e-05, "loss": 0.162, "step": 7980 }, { "epoch": 25.005696202531645, "grad_norm": 0.24577154219150543, "learning_rate": 2.7461322081575247e-05, "loss": 0.0346, "step": 7990 }, { "epoch": 25.00632911392405, "grad_norm": 0.009798028506338596, "learning_rate": 2.7426160337552742e-05, "loss": 0.0209, "step": 8000 }, { "epoch": 25.006962025316454, "grad_norm": 0.009709338657557964, "learning_rate": 2.7390998593530244e-05, "loss": 0.0329, "step": 8010 }, { "epoch": 25.00759493670886, "grad_norm": 0.025753848254680634, "learning_rate": 2.735583684950774e-05, "loss": 0.0368, "step": 8020 }, { "epoch": 25.008227848101267, "grad_norm": 1.9421815872192383, "learning_rate": 2.7320675105485234e-05, "loss": 0.0755, "step": 8030 }, { "epoch": 25.008860759493672, "grad_norm": 0.00867847353219986, "learning_rate": 2.728551336146273e-05, "loss": 0.0265, "step": 8040 }, { "epoch": 25.009493670886076, "grad_norm": 0.008214855566620827, "learning_rate": 2.7250351617440227e-05, "loss": 0.0698, "step": 8050 }, { "epoch": 25.01012658227848, "grad_norm": 0.013393707573413849, "learning_rate": 2.7215189873417722e-05, "loss": 0.1019, "step": 8060 }, { "epoch": 25.010759493670886, "grad_norm": 0.007839508354663849, "learning_rate": 2.718002812939522e-05, "loss": 0.0401, "step": 8070 }, { "epoch": 25.01139240506329, "grad_norm": 1.8051772117614746, "learning_rate": 2.7144866385372715e-05, "loss": 0.0971, "step": 8080 }, { "epoch": 25.012025316455695, "grad_norm": 1.6515543460845947, "learning_rate": 2.7109704641350213e-05, "loss": 0.097, "step": 8090 }, { "epoch": 25.0126582278481, "grad_norm": 1.732775330543518, "learning_rate": 2.707454289732771e-05, "loss": 0.0661, "step": 8100 }, { "epoch": 25.013291139240508, "grad_norm": 2.0589871406555176, "learning_rate": 2.7039381153305203e-05, "loss": 0.0727, "step": 8110 }, { "epoch": 25.013924050632912, "grad_norm": 0.017162639647722244, "learning_rate": 2.7004219409282698e-05, "loss": 0.0666, "step": 8120 }, { "epoch": 25.014556962025317, "grad_norm": 0.025243617594242096, "learning_rate": 2.69690576652602e-05, "loss": 0.1164, "step": 8130 }, { "epoch": 25.01518987341772, "grad_norm": 0.011467033065855503, "learning_rate": 2.6933895921237695e-05, "loss": 0.0974, "step": 8140 }, { "epoch": 25.015822784810126, "grad_norm": 2.5216453075408936, "learning_rate": 2.689873417721519e-05, "loss": 0.1147, "step": 8150 }, { "epoch": 25.01645569620253, "grad_norm": 0.011125218123197556, "learning_rate": 2.6863572433192685e-05, "loss": 0.011, "step": 8160 }, { "epoch": 25.017088607594935, "grad_norm": 0.0575890839099884, "learning_rate": 2.6828410689170186e-05, "loss": 0.0757, "step": 8170 }, { "epoch": 25.01772151898734, "grad_norm": 0.01529190968722105, "learning_rate": 2.679324894514768e-05, "loss": 0.1118, "step": 8180 }, { "epoch": 25.01835443037975, "grad_norm": 1.783360481262207, "learning_rate": 2.6758087201125176e-05, "loss": 0.0738, "step": 8190 }, { "epoch": 25.018987341772153, "grad_norm": 0.2645050585269928, "learning_rate": 2.672292545710267e-05, "loss": 0.0719, "step": 8200 }, { "epoch": 25.019620253164558, "grad_norm": 1.2059509754180908, "learning_rate": 2.6687763713080173e-05, "loss": 0.0729, "step": 8210 }, { "epoch": 25.02, "eval_accuracy": 0.9731285988483686, "eval_loss": 0.07531290501356125, "eval_runtime": 883.3121, "eval_samples_per_second": 0.59, "eval_steps_per_second": 0.075, "step": 8216 }, { "epoch": 26.000253164556963, "grad_norm": 0.11341290175914764, "learning_rate": 2.6652601969057668e-05, "loss": 0.1398, "step": 8220 }, { "epoch": 26.000886075949367, "grad_norm": 0.1513560563325882, "learning_rate": 2.6617440225035163e-05, "loss": 0.0599, "step": 8230 }, { "epoch": 26.001518987341772, "grad_norm": 0.008346611633896828, "learning_rate": 2.6582278481012658e-05, "loss": 0.0526, "step": 8240 }, { "epoch": 26.002151898734176, "grad_norm": 0.04121720790863037, "learning_rate": 2.6547116736990156e-05, "loss": 0.0438, "step": 8250 }, { "epoch": 26.00278481012658, "grad_norm": 0.019222108647227287, "learning_rate": 2.651195499296765e-05, "loss": 0.0679, "step": 8260 }, { "epoch": 26.003417721518986, "grad_norm": 2.1048424243927, "learning_rate": 2.6476793248945146e-05, "loss": 0.0947, "step": 8270 }, { "epoch": 26.004050632911394, "grad_norm": 2.955946683883667, "learning_rate": 2.6441631504922648e-05, "loss": 0.1254, "step": 8280 }, { "epoch": 26.0046835443038, "grad_norm": 0.013249853625893593, "learning_rate": 2.6406469760900143e-05, "loss": 0.0344, "step": 8290 }, { "epoch": 26.005316455696203, "grad_norm": 0.008436400443315506, "learning_rate": 2.6371308016877638e-05, "loss": 0.0611, "step": 8300 }, { "epoch": 26.005949367088608, "grad_norm": 1.8616793155670166, "learning_rate": 2.6336146272855133e-05, "loss": 0.0528, "step": 8310 }, { "epoch": 26.006582278481012, "grad_norm": 0.030170224606990814, "learning_rate": 2.6300984528832634e-05, "loss": 0.0981, "step": 8320 }, { "epoch": 26.007215189873417, "grad_norm": 0.011277982033789158, "learning_rate": 2.626582278481013e-05, "loss": 0.0216, "step": 8330 }, { "epoch": 26.00784810126582, "grad_norm": 0.009784871712327003, "learning_rate": 2.6230661040787624e-05, "loss": 0.0485, "step": 8340 }, { "epoch": 26.008481012658226, "grad_norm": 4.80241584777832, "learning_rate": 2.619549929676512e-05, "loss": 0.1301, "step": 8350 }, { "epoch": 26.009113924050634, "grad_norm": 2.4445245265960693, "learning_rate": 2.616033755274262e-05, "loss": 0.0571, "step": 8360 }, { "epoch": 26.00974683544304, "grad_norm": 0.009524138644337654, "learning_rate": 2.6125175808720116e-05, "loss": 0.0229, "step": 8370 }, { "epoch": 26.010379746835444, "grad_norm": 0.010364298708736897, "learning_rate": 2.609001406469761e-05, "loss": 0.0559, "step": 8380 }, { "epoch": 26.01101265822785, "grad_norm": 0.047712892293930054, "learning_rate": 2.6054852320675106e-05, "loss": 0.1409, "step": 8390 }, { "epoch": 26.011645569620253, "grad_norm": 0.010216879658401012, "learning_rate": 2.6019690576652604e-05, "loss": 0.0701, "step": 8400 }, { "epoch": 26.012278481012657, "grad_norm": 1.5211375951766968, "learning_rate": 2.5984528832630102e-05, "loss": 0.0774, "step": 8410 }, { "epoch": 26.012911392405062, "grad_norm": 4.259005546569824, "learning_rate": 2.5949367088607597e-05, "loss": 0.0846, "step": 8420 }, { "epoch": 26.013544303797467, "grad_norm": 0.008692068047821522, "learning_rate": 2.5914205344585092e-05, "loss": 0.2192, "step": 8430 }, { "epoch": 26.014177215189875, "grad_norm": 0.9112133383750916, "learning_rate": 2.587904360056259e-05, "loss": 0.112, "step": 8440 }, { "epoch": 26.01481012658228, "grad_norm": 1.4133566617965698, "learning_rate": 2.5843881856540085e-05, "loss": 0.0895, "step": 8450 }, { "epoch": 26.015443037974684, "grad_norm": 0.019907064735889435, "learning_rate": 2.580872011251758e-05, "loss": 0.0752, "step": 8460 }, { "epoch": 26.01607594936709, "grad_norm": 0.22535677254199982, "learning_rate": 2.5773558368495075e-05, "loss": 0.1745, "step": 8470 }, { "epoch": 26.016708860759493, "grad_norm": 0.013840797357261181, "learning_rate": 2.5738396624472577e-05, "loss": 0.0569, "step": 8480 }, { "epoch": 26.017341772151898, "grad_norm": 1.7949068546295166, "learning_rate": 2.5703234880450072e-05, "loss": 0.1005, "step": 8490 }, { "epoch": 26.017974683544303, "grad_norm": 5.203663349151611, "learning_rate": 2.5668073136427567e-05, "loss": 0.1344, "step": 8500 }, { "epoch": 26.018607594936707, "grad_norm": 1.7371711730957031, "learning_rate": 2.5632911392405062e-05, "loss": 0.049, "step": 8510 }, { "epoch": 26.019240506329115, "grad_norm": 0.03200330212712288, "learning_rate": 2.5597749648382564e-05, "loss": 0.0778, "step": 8520 }, { "epoch": 26.01987341772152, "grad_norm": 16.085002899169922, "learning_rate": 2.556258790436006e-05, "loss": 0.2328, "step": 8530 }, { "epoch": 26.02, "eval_accuracy": 0.9654510556621881, "eval_loss": 0.07737769931554794, "eval_runtime": 872.9767, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.076, "step": 8532 }, { "epoch": 27.000506329113925, "grad_norm": 2.101548194885254, "learning_rate": 2.5527426160337553e-05, "loss": 0.0209, "step": 8540 }, { "epoch": 27.00113924050633, "grad_norm": 2.257962942123413, "learning_rate": 2.549226441631505e-05, "loss": 0.2402, "step": 8550 }, { "epoch": 27.001772151898734, "grad_norm": 1.685132622718811, "learning_rate": 2.545710267229255e-05, "loss": 0.0636, "step": 8560 }, { "epoch": 27.00240506329114, "grad_norm": 1.7050256729125977, "learning_rate": 2.5421940928270045e-05, "loss": 0.0449, "step": 8570 }, { "epoch": 27.003037974683544, "grad_norm": 1.991898775100708, "learning_rate": 2.538677918424754e-05, "loss": 0.0455, "step": 8580 }, { "epoch": 27.00367088607595, "grad_norm": 1.5045628547668457, "learning_rate": 2.5351617440225035e-05, "loss": 0.0672, "step": 8590 }, { "epoch": 27.004303797468353, "grad_norm": 0.01319907046854496, "learning_rate": 2.5316455696202533e-05, "loss": 0.094, "step": 8600 }, { "epoch": 27.00493670886076, "grad_norm": 0.01158622931689024, "learning_rate": 2.5281293952180028e-05, "loss": 0.068, "step": 8610 }, { "epoch": 27.005569620253166, "grad_norm": 1.6995807886123657, "learning_rate": 2.5246132208157523e-05, "loss": 0.1678, "step": 8620 }, { "epoch": 27.00620253164557, "grad_norm": 0.012468636967241764, "learning_rate": 2.521097046413502e-05, "loss": 0.0623, "step": 8630 }, { "epoch": 27.006835443037975, "grad_norm": 1.8979823589324951, "learning_rate": 2.517580872011252e-05, "loss": 0.1167, "step": 8640 }, { "epoch": 27.00746835443038, "grad_norm": 0.03327177092432976, "learning_rate": 2.5140646976090015e-05, "loss": 0.1498, "step": 8650 }, { "epoch": 27.008101265822784, "grad_norm": 0.013360361568629742, "learning_rate": 2.510548523206751e-05, "loss": 0.149, "step": 8660 }, { "epoch": 27.00873417721519, "grad_norm": 0.04175138100981712, "learning_rate": 2.5070323488045005e-05, "loss": 0.0581, "step": 8670 }, { "epoch": 27.009367088607593, "grad_norm": 1.2819527387619019, "learning_rate": 2.5035161744022506e-05, "loss": 0.0572, "step": 8680 }, { "epoch": 27.01, "grad_norm": 3.946152448654175, "learning_rate": 2.5e-05, "loss": 0.1165, "step": 8690 }, { "epoch": 27.010632911392406, "grad_norm": 1.6609853506088257, "learning_rate": 2.49648382559775e-05, "loss": 0.0659, "step": 8700 }, { "epoch": 27.01126582278481, "grad_norm": 0.019263017922639847, "learning_rate": 2.4929676511954994e-05, "loss": 0.1063, "step": 8710 }, { "epoch": 27.011898734177215, "grad_norm": 0.01123301312327385, "learning_rate": 2.4894514767932493e-05, "loss": 0.0287, "step": 8720 }, { "epoch": 27.01253164556962, "grad_norm": 0.03187470883131027, "learning_rate": 2.4859353023909988e-05, "loss": 0.0525, "step": 8730 }, { "epoch": 27.013164556962025, "grad_norm": 0.030269309878349304, "learning_rate": 2.4824191279887486e-05, "loss": 0.0155, "step": 8740 }, { "epoch": 27.01379746835443, "grad_norm": 0.04713929817080498, "learning_rate": 2.478902953586498e-05, "loss": 0.103, "step": 8750 }, { "epoch": 27.014430379746834, "grad_norm": 0.007141268812119961, "learning_rate": 2.475386779184248e-05, "loss": 0.0338, "step": 8760 }, { "epoch": 27.015063291139242, "grad_norm": 0.008212805725634098, "learning_rate": 2.4718706047819974e-05, "loss": 0.0764, "step": 8770 }, { "epoch": 27.015696202531647, "grad_norm": 0.007574188522994518, "learning_rate": 2.468354430379747e-05, "loss": 0.0772, "step": 8780 }, { "epoch": 27.01632911392405, "grad_norm": 1.347557544708252, "learning_rate": 2.4648382559774964e-05, "loss": 0.0767, "step": 8790 }, { "epoch": 27.016962025316456, "grad_norm": 0.008004224859178066, "learning_rate": 2.4613220815752462e-05, "loss": 0.0405, "step": 8800 }, { "epoch": 27.01759493670886, "grad_norm": 0.06255421787500381, "learning_rate": 2.4578059071729957e-05, "loss": 0.0634, "step": 8810 }, { "epoch": 27.018227848101265, "grad_norm": 0.014694114215672016, "learning_rate": 2.4542897327707456e-05, "loss": 0.0704, "step": 8820 }, { "epoch": 27.01886075949367, "grad_norm": 0.0064834184013307095, "learning_rate": 2.450773558368495e-05, "loss": 0.0492, "step": 8830 }, { "epoch": 27.019493670886074, "grad_norm": 0.01545525249093771, "learning_rate": 2.447257383966245e-05, "loss": 0.1085, "step": 8840 }, { "epoch": 27.02, "eval_accuracy": 0.9692898272552783, "eval_loss": 0.06086457893252373, "eval_runtime": 940.0459, "eval_samples_per_second": 0.554, "eval_steps_per_second": 0.07, "step": 8848 }, { "epoch": 28.00012658227848, "grad_norm": 1.5009100437164307, "learning_rate": 2.4437412095639944e-05, "loss": 0.0116, "step": 8850 }, { "epoch": 28.000759493670888, "grad_norm": 1.8109427690505981, "learning_rate": 2.4402250351617442e-05, "loss": 0.1101, "step": 8860 }, { "epoch": 28.001392405063292, "grad_norm": 0.016067614778876305, "learning_rate": 2.4367088607594937e-05, "loss": 0.0632, "step": 8870 }, { "epoch": 28.002025316455697, "grad_norm": 1.9723069667816162, "learning_rate": 2.4331926863572436e-05, "loss": 0.0556, "step": 8880 }, { "epoch": 28.0026582278481, "grad_norm": 6.840577125549316, "learning_rate": 2.429676511954993e-05, "loss": 0.0614, "step": 8890 }, { "epoch": 28.003291139240506, "grad_norm": 0.00987607054412365, "learning_rate": 2.426160337552743e-05, "loss": 0.0619, "step": 8900 }, { "epoch": 28.00392405063291, "grad_norm": 1.5094603300094604, "learning_rate": 2.4226441631504924e-05, "loss": 0.0817, "step": 8910 }, { "epoch": 28.004556962025315, "grad_norm": 0.0056548151187598705, "learning_rate": 2.4191279887482422e-05, "loss": 0.0395, "step": 8920 }, { "epoch": 28.00518987341772, "grad_norm": 1.6165566444396973, "learning_rate": 2.4156118143459917e-05, "loss": 0.0196, "step": 8930 }, { "epoch": 28.005822784810128, "grad_norm": 0.011191487312316895, "learning_rate": 2.4120956399437415e-05, "loss": 0.0488, "step": 8940 }, { "epoch": 28.006455696202533, "grad_norm": 0.013476500287652016, "learning_rate": 2.408579465541491e-05, "loss": 0.077, "step": 8950 }, { "epoch": 28.007088607594937, "grad_norm": 0.01594310626387596, "learning_rate": 2.4050632911392405e-05, "loss": 0.0764, "step": 8960 }, { "epoch": 28.007721518987342, "grad_norm": 2.4658524990081787, "learning_rate": 2.40154711673699e-05, "loss": 0.0816, "step": 8970 }, { "epoch": 28.008354430379747, "grad_norm": 0.050282131880521774, "learning_rate": 2.39803094233474e-05, "loss": 0.0483, "step": 8980 }, { "epoch": 28.00898734177215, "grad_norm": 0.0738360807299614, "learning_rate": 2.3945147679324893e-05, "loss": 0.0325, "step": 8990 }, { "epoch": 28.009620253164556, "grad_norm": 2.087075710296631, "learning_rate": 2.3909985935302392e-05, "loss": 0.1095, "step": 9000 }, { "epoch": 28.01025316455696, "grad_norm": 0.012715667486190796, "learning_rate": 2.3874824191279887e-05, "loss": 0.0548, "step": 9010 }, { "epoch": 28.01088607594937, "grad_norm": 2.233593463897705, "learning_rate": 2.3839662447257385e-05, "loss": 0.0342, "step": 9020 }, { "epoch": 28.011518987341773, "grad_norm": 0.016205696389079094, "learning_rate": 2.380450070323488e-05, "loss": 0.0916, "step": 9030 }, { "epoch": 28.012151898734178, "grad_norm": 0.008019771426916122, "learning_rate": 2.3769338959212378e-05, "loss": 0.0851, "step": 9040 }, { "epoch": 28.012784810126583, "grad_norm": 1.6921287775039673, "learning_rate": 2.3734177215189873e-05, "loss": 0.0543, "step": 9050 }, { "epoch": 28.013417721518987, "grad_norm": 0.03432793170213699, "learning_rate": 2.369901547116737e-05, "loss": 0.1162, "step": 9060 }, { "epoch": 28.014050632911392, "grad_norm": 4.5309648513793945, "learning_rate": 2.3663853727144866e-05, "loss": 0.1159, "step": 9070 }, { "epoch": 28.014683544303796, "grad_norm": 1.7229171991348267, "learning_rate": 2.3628691983122365e-05, "loss": 0.0453, "step": 9080 }, { "epoch": 28.0153164556962, "grad_norm": 1.9000073671340942, "learning_rate": 2.359353023909986e-05, "loss": 0.0311, "step": 9090 }, { "epoch": 28.01594936708861, "grad_norm": 23.23397445678711, "learning_rate": 2.3558368495077358e-05, "loss": 0.1287, "step": 9100 }, { "epoch": 28.016582278481014, "grad_norm": 0.14218182861804962, "learning_rate": 2.3523206751054856e-05, "loss": 0.0783, "step": 9110 }, { "epoch": 28.01721518987342, "grad_norm": 1.8179298639297485, "learning_rate": 2.348804500703235e-05, "loss": 0.0228, "step": 9120 }, { "epoch": 28.017848101265823, "grad_norm": 0.01124257780611515, "learning_rate": 2.3452883263009846e-05, "loss": 0.0606, "step": 9130 }, { "epoch": 28.018481012658228, "grad_norm": 0.06378079950809479, "learning_rate": 2.341772151898734e-05, "loss": 0.0613, "step": 9140 }, { "epoch": 28.019113924050632, "grad_norm": 0.5730488896369934, "learning_rate": 2.338255977496484e-05, "loss": 0.0922, "step": 9150 }, { "epoch": 28.019746835443037, "grad_norm": 0.1582580953836441, "learning_rate": 2.3347398030942334e-05, "loss": 0.099, "step": 9160 }, { "epoch": 28.02, "eval_accuracy": 0.9673704414587332, "eval_loss": 0.06770172715187073, "eval_runtime": 874.5137, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.075, "step": 9164 }, { "epoch": 29.000379746835442, "grad_norm": 1.0581729412078857, "learning_rate": 2.3312236286919833e-05, "loss": 0.0999, "step": 9170 }, { "epoch": 29.001012658227847, "grad_norm": 1.9762561321258545, "learning_rate": 2.3277074542897328e-05, "loss": 0.0571, "step": 9180 }, { "epoch": 29.001645569620255, "grad_norm": 1.5621942281723022, "learning_rate": 2.3241912798874826e-05, "loss": 0.0549, "step": 9190 }, { "epoch": 29.00227848101266, "grad_norm": 0.007985897362232208, "learning_rate": 2.320675105485232e-05, "loss": 0.0897, "step": 9200 }, { "epoch": 29.002911392405064, "grad_norm": 1.7005807161331177, "learning_rate": 2.317158931082982e-05, "loss": 0.0953, "step": 9210 }, { "epoch": 29.00354430379747, "grad_norm": 0.01191030628979206, "learning_rate": 2.3136427566807314e-05, "loss": 0.0495, "step": 9220 }, { "epoch": 29.004177215189873, "grad_norm": 2.738387107849121, "learning_rate": 2.3101265822784813e-05, "loss": 0.0748, "step": 9230 }, { "epoch": 29.004810126582278, "grad_norm": 1.7861073017120361, "learning_rate": 2.3066104078762308e-05, "loss": 0.1279, "step": 9240 }, { "epoch": 29.005443037974683, "grad_norm": 2.4535388946533203, "learning_rate": 2.3030942334739806e-05, "loss": 0.1033, "step": 9250 }, { "epoch": 29.006075949367087, "grad_norm": 0.028340207412838936, "learning_rate": 2.29957805907173e-05, "loss": 0.2441, "step": 9260 }, { "epoch": 29.006708860759495, "grad_norm": 0.020006628707051277, "learning_rate": 2.29606188466948e-05, "loss": 0.1339, "step": 9270 }, { "epoch": 29.0073417721519, "grad_norm": 0.06801651418209076, "learning_rate": 2.2925457102672294e-05, "loss": 0.0383, "step": 9280 }, { "epoch": 29.007974683544305, "grad_norm": 0.03369883447885513, "learning_rate": 2.2890295358649792e-05, "loss": 0.0552, "step": 9290 }, { "epoch": 29.00860759493671, "grad_norm": 1.2586785554885864, "learning_rate": 2.2855133614627287e-05, "loss": 0.0745, "step": 9300 }, { "epoch": 29.009240506329114, "grad_norm": 0.008519505150616169, "learning_rate": 2.2819971870604782e-05, "loss": 0.1307, "step": 9310 }, { "epoch": 29.00987341772152, "grad_norm": 0.04472700506448746, "learning_rate": 2.278481012658228e-05, "loss": 0.0757, "step": 9320 }, { "epoch": 29.010506329113923, "grad_norm": 1.7951302528381348, "learning_rate": 2.2749648382559775e-05, "loss": 0.0504, "step": 9330 }, { "epoch": 29.011139240506328, "grad_norm": 7.45163106918335, "learning_rate": 2.271448663853727e-05, "loss": 0.0637, "step": 9340 }, { "epoch": 29.011772151898736, "grad_norm": 1.5808919668197632, "learning_rate": 2.267932489451477e-05, "loss": 0.0681, "step": 9350 }, { "epoch": 29.01240506329114, "grad_norm": 0.009355315938591957, "learning_rate": 2.2644163150492264e-05, "loss": 0.0789, "step": 9360 }, { "epoch": 29.013037974683545, "grad_norm": 36.12007522583008, "learning_rate": 2.2609001406469762e-05, "loss": 0.1642, "step": 9370 }, { "epoch": 29.01367088607595, "grad_norm": 0.007421193178743124, "learning_rate": 2.2573839662447257e-05, "loss": 0.0126, "step": 9380 }, { "epoch": 29.014303797468354, "grad_norm": 1.9807149171829224, "learning_rate": 2.2538677918424755e-05, "loss": 0.1069, "step": 9390 }, { "epoch": 29.01493670886076, "grad_norm": 0.016664525493979454, "learning_rate": 2.250351617440225e-05, "loss": 0.174, "step": 9400 }, { "epoch": 29.015569620253164, "grad_norm": 0.015380149707198143, "learning_rate": 2.246835443037975e-05, "loss": 0.0959, "step": 9410 }, { "epoch": 29.01620253164557, "grad_norm": 1.9330257177352905, "learning_rate": 2.2433192686357243e-05, "loss": 0.0604, "step": 9420 }, { "epoch": 29.016835443037976, "grad_norm": 55.1077880859375, "learning_rate": 2.2398030942334742e-05, "loss": 0.0694, "step": 9430 }, { "epoch": 29.01746835443038, "grad_norm": 1.9939329624176025, "learning_rate": 2.2362869198312237e-05, "loss": 0.1348, "step": 9440 }, { "epoch": 29.018101265822786, "grad_norm": 0.015108354389667511, "learning_rate": 2.2327707454289735e-05, "loss": 0.0449, "step": 9450 }, { "epoch": 29.01873417721519, "grad_norm": 12.801469802856445, "learning_rate": 2.229254571026723e-05, "loss": 0.1694, "step": 9460 }, { "epoch": 29.019367088607595, "grad_norm": 2.3264384269714355, "learning_rate": 2.225738396624473e-05, "loss": 0.044, "step": 9470 }, { "epoch": 29.02, "grad_norm": 0.13212990760803223, "learning_rate": 2.2222222222222223e-05, "loss": 0.1988, "step": 9480 }, { "epoch": 29.02, "eval_accuracy": 0.9558541266794626, "eval_loss": 0.14145594835281372, "eval_runtime": 928.9271, "eval_samples_per_second": 0.561, "eval_steps_per_second": 0.071, "step": 9480 }, { "epoch": 30.000632911392405, "grad_norm": 2.014655113220215, "learning_rate": 2.2187060478199718e-05, "loss": 0.0823, "step": 9490 }, { "epoch": 30.00126582278481, "grad_norm": 7.12656831741333, "learning_rate": 2.2151898734177217e-05, "loss": 0.0963, "step": 9500 }, { "epoch": 30.001898734177214, "grad_norm": 1.7992146015167236, "learning_rate": 2.211673699015471e-05, "loss": 0.1107, "step": 9510 }, { "epoch": 30.00253164556962, "grad_norm": 7.597923755645752, "learning_rate": 2.2081575246132206e-05, "loss": 0.102, "step": 9520 }, { "epoch": 30.003164556962027, "grad_norm": 0.026418205350637436, "learning_rate": 2.2046413502109705e-05, "loss": 0.1202, "step": 9530 }, { "epoch": 30.00379746835443, "grad_norm": 0.01834811083972454, "learning_rate": 2.2011251758087203e-05, "loss": 0.0728, "step": 9540 }, { "epoch": 30.004430379746836, "grad_norm": 0.02628573216497898, "learning_rate": 2.1976090014064698e-05, "loss": 0.0948, "step": 9550 }, { "epoch": 30.00506329113924, "grad_norm": 0.023844990879297256, "learning_rate": 2.1940928270042196e-05, "loss": 0.0601, "step": 9560 }, { "epoch": 30.005696202531645, "grad_norm": 0.019756905734539032, "learning_rate": 2.190576652601969e-05, "loss": 0.0759, "step": 9570 }, { "epoch": 30.00632911392405, "grad_norm": 6.277482986450195, "learning_rate": 2.187060478199719e-05, "loss": 0.053, "step": 9580 }, { "epoch": 30.006962025316454, "grad_norm": 0.5496724843978882, "learning_rate": 2.1835443037974685e-05, "loss": 0.0653, "step": 9590 }, { "epoch": 30.00759493670886, "grad_norm": 1.7114007472991943, "learning_rate": 2.1800281293952183e-05, "loss": 0.0466, "step": 9600 }, { "epoch": 30.008227848101267, "grad_norm": 2.283554792404175, "learning_rate": 2.1765119549929678e-05, "loss": 0.0608, "step": 9610 }, { "epoch": 30.008860759493672, "grad_norm": 4.336071014404297, "learning_rate": 2.1729957805907176e-05, "loss": 0.0702, "step": 9620 }, { "epoch": 30.009493670886076, "grad_norm": 0.2585821747779846, "learning_rate": 2.169479606188467e-05, "loss": 0.0716, "step": 9630 }, { "epoch": 30.01012658227848, "grad_norm": 2.0628931522369385, "learning_rate": 2.165963431786217e-05, "loss": 0.0975, "step": 9640 }, { "epoch": 30.010759493670886, "grad_norm": 0.9379414319992065, "learning_rate": 2.1624472573839664e-05, "loss": 0.0643, "step": 9650 }, { "epoch": 30.01139240506329, "grad_norm": 1.865263819694519, "learning_rate": 2.158931082981716e-05, "loss": 0.089, "step": 9660 }, { "epoch": 30.012025316455695, "grad_norm": 0.01576172187924385, "learning_rate": 2.1554149085794658e-05, "loss": 0.0863, "step": 9670 }, { "epoch": 30.0126582278481, "grad_norm": 8.34028434753418, "learning_rate": 2.1518987341772153e-05, "loss": 0.0266, "step": 9680 }, { "epoch": 30.013291139240508, "grad_norm": 2.5415472984313965, "learning_rate": 2.1483825597749647e-05, "loss": 0.1101, "step": 9690 }, { "epoch": 30.013924050632912, "grad_norm": 0.010155543684959412, "learning_rate": 2.1448663853727146e-05, "loss": 0.037, "step": 9700 }, { "epoch": 30.014556962025317, "grad_norm": 2.1569736003875732, "learning_rate": 2.141350210970464e-05, "loss": 0.03, "step": 9710 }, { "epoch": 30.01518987341772, "grad_norm": 2.8239643573760986, "learning_rate": 2.137834036568214e-05, "loss": 0.1167, "step": 9720 }, { "epoch": 30.015822784810126, "grad_norm": 1.6771701574325562, "learning_rate": 2.1343178621659634e-05, "loss": 0.1783, "step": 9730 }, { "epoch": 30.01645569620253, "grad_norm": 1.3176885843276978, "learning_rate": 2.1308016877637132e-05, "loss": 0.0518, "step": 9740 }, { "epoch": 30.017088607594935, "grad_norm": 2.2316536903381348, "learning_rate": 2.1272855133614627e-05, "loss": 0.0688, "step": 9750 }, { "epoch": 30.01772151898734, "grad_norm": 0.02535848319530487, "learning_rate": 2.1237693389592126e-05, "loss": 0.1062, "step": 9760 }, { "epoch": 30.01835443037975, "grad_norm": 0.026239003986120224, "learning_rate": 2.120253164556962e-05, "loss": 0.0434, "step": 9770 }, { "epoch": 30.018987341772153, "grad_norm": 0.008352408185601234, "learning_rate": 2.116736990154712e-05, "loss": 0.1434, "step": 9780 }, { "epoch": 30.019620253164558, "grad_norm": 0.008794278837740421, "learning_rate": 2.1132208157524614e-05, "loss": 0.0747, "step": 9790 }, { "epoch": 30.02, "eval_accuracy": 0.9712092130518234, "eval_loss": 0.058070629835128784, "eval_runtime": 882.2085, "eval_samples_per_second": 0.591, "eval_steps_per_second": 0.075, "step": 9796 }, { "epoch": 31.000253164556963, "grad_norm": 0.15315893292427063, "learning_rate": 2.1097046413502112e-05, "loss": 0.1227, "step": 9800 }, { "epoch": 31.000886075949367, "grad_norm": 0.007744469679892063, "learning_rate": 2.1061884669479607e-05, "loss": 0.0588, "step": 9810 }, { "epoch": 31.001518987341772, "grad_norm": 1.7301479578018188, "learning_rate": 2.1026722925457105e-05, "loss": 0.1045, "step": 9820 }, { "epoch": 31.002151898734176, "grad_norm": 0.016438119113445282, "learning_rate": 2.09915611814346e-05, "loss": 0.1089, "step": 9830 }, { "epoch": 31.00278481012658, "grad_norm": 2.2902982234954834, "learning_rate": 2.09563994374121e-05, "loss": 0.0569, "step": 9840 }, { "epoch": 31.003417721518986, "grad_norm": 0.01934031955897808, "learning_rate": 2.0921237693389594e-05, "loss": 0.0613, "step": 9850 }, { "epoch": 31.004050632911394, "grad_norm": 0.10376526415348053, "learning_rate": 2.088607594936709e-05, "loss": 0.0278, "step": 9860 }, { "epoch": 31.0046835443038, "grad_norm": 0.03141545131802559, "learning_rate": 2.0850914205344583e-05, "loss": 0.0557, "step": 9870 }, { "epoch": 31.005316455696203, "grad_norm": 0.024358389899134636, "learning_rate": 2.0815752461322082e-05, "loss": 0.0358, "step": 9880 }, { "epoch": 31.005949367088608, "grad_norm": 0.09153765439987183, "learning_rate": 2.0780590717299577e-05, "loss": 0.0682, "step": 9890 }, { "epoch": 31.006582278481012, "grad_norm": 0.012928824871778488, "learning_rate": 2.0745428973277075e-05, "loss": 0.044, "step": 9900 }, { "epoch": 31.007215189873417, "grad_norm": 0.02423839271068573, "learning_rate": 2.071026722925457e-05, "loss": 0.029, "step": 9910 }, { "epoch": 31.00784810126582, "grad_norm": 0.007467347197234631, "learning_rate": 2.067510548523207e-05, "loss": 0.1689, "step": 9920 }, { "epoch": 31.008481012658226, "grad_norm": 61.641075134277344, "learning_rate": 2.0639943741209563e-05, "loss": 0.0393, "step": 9930 }, { "epoch": 31.009113924050634, "grad_norm": 0.013753454200923443, "learning_rate": 2.060478199718706e-05, "loss": 0.0425, "step": 9940 }, { "epoch": 31.00974683544304, "grad_norm": 2.26615047454834, "learning_rate": 2.056962025316456e-05, "loss": 0.0482, "step": 9950 }, { "epoch": 31.010379746835444, "grad_norm": 17.814420700073242, "learning_rate": 2.0534458509142055e-05, "loss": 0.1064, "step": 9960 }, { "epoch": 31.01101265822785, "grad_norm": 0.0065340083092451096, "learning_rate": 2.0499296765119553e-05, "loss": 0.0222, "step": 9970 }, { "epoch": 31.011645569620253, "grad_norm": 0.004512071143835783, "learning_rate": 2.0464135021097048e-05, "loss": 0.0561, "step": 9980 }, { "epoch": 31.012278481012657, "grad_norm": 2.7562315464019775, "learning_rate": 2.0428973277074546e-05, "loss": 0.0865, "step": 9990 }, { "epoch": 31.012911392405062, "grad_norm": 0.01689945161342621, "learning_rate": 2.039381153305204e-05, "loss": 0.04, "step": 10000 }, { "epoch": 31.013544303797467, "grad_norm": 2.3072574138641357, "learning_rate": 2.0358649789029536e-05, "loss": 0.1108, "step": 10010 }, { "epoch": 31.014177215189875, "grad_norm": 2.39740252494812, "learning_rate": 2.0323488045007035e-05, "loss": 0.0603, "step": 10020 }, { "epoch": 31.01481012658228, "grad_norm": 2.2645068168640137, "learning_rate": 2.028832630098453e-05, "loss": 0.1595, "step": 10030 }, { "epoch": 31.015443037974684, "grad_norm": 0.020067648962140083, "learning_rate": 2.0253164556962025e-05, "loss": 0.0824, "step": 10040 }, { "epoch": 31.01607594936709, "grad_norm": 1.732828974723816, "learning_rate": 2.0218002812939523e-05, "loss": 0.0979, "step": 10050 }, { "epoch": 31.016708860759493, "grad_norm": 0.006261592730879784, "learning_rate": 2.0182841068917018e-05, "loss": 0.0294, "step": 10060 }, { "epoch": 31.017341772151898, "grad_norm": 0.009185651317238808, "learning_rate": 2.0147679324894516e-05, "loss": 0.0684, "step": 10070 }, { "epoch": 31.017974683544303, "grad_norm": 2.076765298843384, "learning_rate": 2.011251758087201e-05, "loss": 0.0763, "step": 10080 }, { "epoch": 31.018607594936707, "grad_norm": 0.010820266790688038, "learning_rate": 2.007735583684951e-05, "loss": 0.1151, "step": 10090 }, { "epoch": 31.019240506329115, "grad_norm": 0.9778680801391602, "learning_rate": 2.0042194092827004e-05, "loss": 0.0274, "step": 10100 }, { "epoch": 31.01987341772152, "grad_norm": 0.03660624101758003, "learning_rate": 2.0007032348804503e-05, "loss": 0.0556, "step": 10110 }, { "epoch": 31.02, "eval_accuracy": 0.9692898272552783, "eval_loss": 0.051862768828868866, "eval_runtime": 895.6108, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.074, "step": 10112 }, { "epoch": 32.000506329113925, "grad_norm": 0.014388811774551868, "learning_rate": 1.9971870604781998e-05, "loss": 0.0289, "step": 10120 }, { "epoch": 32.001139240506326, "grad_norm": 1.698613166809082, "learning_rate": 1.9936708860759496e-05, "loss": 0.0312, "step": 10130 }, { "epoch": 32.001772151898734, "grad_norm": 0.026271585375070572, "learning_rate": 1.990154711673699e-05, "loss": 0.0451, "step": 10140 }, { "epoch": 32.00240506329114, "grad_norm": 0.00675854692235589, "learning_rate": 1.986638537271449e-05, "loss": 0.0247, "step": 10150 }, { "epoch": 32.003037974683544, "grad_norm": 0.022877847775816917, "learning_rate": 1.9831223628691984e-05, "loss": 0.0235, "step": 10160 }, { "epoch": 32.00367088607595, "grad_norm": 0.008142875507473946, "learning_rate": 1.9796061884669482e-05, "loss": 0.0699, "step": 10170 }, { "epoch": 32.00430379746835, "grad_norm": 0.004865671973675489, "learning_rate": 1.9760900140646977e-05, "loss": 0.0645, "step": 10180 }, { "epoch": 32.00493670886076, "grad_norm": 0.05031589791178703, "learning_rate": 1.9725738396624476e-05, "loss": 0.0675, "step": 10190 }, { "epoch": 32.00556962025316, "grad_norm": 0.006223399192094803, "learning_rate": 1.969057665260197e-05, "loss": 0.0812, "step": 10200 }, { "epoch": 32.00620253164557, "grad_norm": 0.008529792539775372, "learning_rate": 1.9655414908579466e-05, "loss": 0.1027, "step": 10210 }, { "epoch": 32.00683544303797, "grad_norm": 0.004334003198891878, "learning_rate": 1.962025316455696e-05, "loss": 0.0511, "step": 10220 }, { "epoch": 32.00746835443038, "grad_norm": 0.004973968956619501, "learning_rate": 1.958509142053446e-05, "loss": 0.0556, "step": 10230 }, { "epoch": 32.00810126582279, "grad_norm": 0.005091523285955191, "learning_rate": 1.9549929676511954e-05, "loss": 0.0889, "step": 10240 }, { "epoch": 32.00873417721519, "grad_norm": 0.004862161818891764, "learning_rate": 1.9514767932489452e-05, "loss": 0.0728, "step": 10250 }, { "epoch": 32.0093670886076, "grad_norm": 0.43511274456977844, "learning_rate": 1.9479606188466947e-05, "loss": 0.0398, "step": 10260 }, { "epoch": 32.01, "grad_norm": 0.006799314171075821, "learning_rate": 1.9444444444444445e-05, "loss": 0.0543, "step": 10270 }, { "epoch": 32.010632911392406, "grad_norm": 2.4008827209472656, "learning_rate": 1.940928270042194e-05, "loss": 0.0623, "step": 10280 }, { "epoch": 32.01126582278481, "grad_norm": 0.00461224839091301, "learning_rate": 1.937412095639944e-05, "loss": 0.0434, "step": 10290 }, { "epoch": 32.011898734177215, "grad_norm": 0.004693826660513878, "learning_rate": 1.9338959212376934e-05, "loss": 0.0904, "step": 10300 }, { "epoch": 32.012531645569624, "grad_norm": 0.006937893573194742, "learning_rate": 1.9303797468354432e-05, "loss": 0.0769, "step": 10310 }, { "epoch": 32.013164556962025, "grad_norm": 0.005765580106526613, "learning_rate": 1.9268635724331927e-05, "loss": 0.0743, "step": 10320 }, { "epoch": 32.01379746835443, "grad_norm": 0.003979605156928301, "learning_rate": 1.9233473980309425e-05, "loss": 0.0406, "step": 10330 }, { "epoch": 32.014430379746834, "grad_norm": 1.9673800468444824, "learning_rate": 1.919831223628692e-05, "loss": 0.0928, "step": 10340 }, { "epoch": 32.01506329113924, "grad_norm": 2.0642683506011963, "learning_rate": 1.916315049226442e-05, "loss": 0.0872, "step": 10350 }, { "epoch": 32.01569620253164, "grad_norm": 2.0600333213806152, "learning_rate": 1.9127988748241913e-05, "loss": 0.0747, "step": 10360 }, { "epoch": 32.01632911392405, "grad_norm": 2.7668960094451904, "learning_rate": 1.9092827004219412e-05, "loss": 0.0496, "step": 10370 }, { "epoch": 32.01696202531645, "grad_norm": 0.0035948660224676132, "learning_rate": 1.9057665260196907e-05, "loss": 0.0713, "step": 10380 }, { "epoch": 32.01759493670886, "grad_norm": 1.5214300155639648, "learning_rate": 1.90225035161744e-05, "loss": 0.0854, "step": 10390 }, { "epoch": 32.01822784810127, "grad_norm": 1.6459559202194214, "learning_rate": 1.89873417721519e-05, "loss": 0.0765, "step": 10400 }, { "epoch": 32.01886075949367, "grad_norm": 1.9180864095687866, "learning_rate": 1.8952180028129395e-05, "loss": 0.0367, "step": 10410 }, { "epoch": 32.01949367088608, "grad_norm": 2.7413058280944824, "learning_rate": 1.8917018284106893e-05, "loss": 0.0763, "step": 10420 }, { "epoch": 32.02, "eval_accuracy": 0.9731285988483686, "eval_loss": 0.050615016371011734, "eval_runtime": 876.7902, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.075, "step": 10428 }, { "epoch": 33.00012658227848, "grad_norm": 0.003906856290996075, "learning_rate": 1.8881856540084388e-05, "loss": 0.0514, "step": 10430 }, { "epoch": 33.000759493670884, "grad_norm": 1.0877649784088135, "learning_rate": 1.8846694796061886e-05, "loss": 0.0264, "step": 10440 }, { "epoch": 33.00139240506329, "grad_norm": 0.0038005015812814236, "learning_rate": 1.881153305203938e-05, "loss": 0.1031, "step": 10450 }, { "epoch": 33.00202531645569, "grad_norm": 0.0038557087536901236, "learning_rate": 1.877637130801688e-05, "loss": 0.055, "step": 10460 }, { "epoch": 33.0026582278481, "grad_norm": 0.005424036644399166, "learning_rate": 1.8741209563994375e-05, "loss": 0.0734, "step": 10470 }, { "epoch": 33.00329113924051, "grad_norm": 0.003683975664898753, "learning_rate": 1.8706047819971873e-05, "loss": 0.0477, "step": 10480 }, { "epoch": 33.00392405063291, "grad_norm": 0.3991844952106476, "learning_rate": 1.8670886075949368e-05, "loss": 0.0014, "step": 10490 }, { "epoch": 33.00455696202532, "grad_norm": 1.7069263458251953, "learning_rate": 1.8635724331926866e-05, "loss": 0.0679, "step": 10500 }, { "epoch": 33.00518987341772, "grad_norm": 1.4248631000518799, "learning_rate": 1.860056258790436e-05, "loss": 0.0478, "step": 10510 }, { "epoch": 33.00582278481013, "grad_norm": 0.006337147206068039, "learning_rate": 1.856540084388186e-05, "loss": 0.0769, "step": 10520 }, { "epoch": 33.00645569620253, "grad_norm": 0.003814426949247718, "learning_rate": 1.8530239099859354e-05, "loss": 0.1142, "step": 10530 }, { "epoch": 33.00708860759494, "grad_norm": 0.003283862257376313, "learning_rate": 1.8495077355836853e-05, "loss": 0.0826, "step": 10540 }, { "epoch": 33.00772151898734, "grad_norm": 0.7462538480758667, "learning_rate": 1.8459915611814348e-05, "loss": 0.078, "step": 10550 }, { "epoch": 33.00835443037975, "grad_norm": 1.6937042474746704, "learning_rate": 1.8424753867791843e-05, "loss": 0.0805, "step": 10560 }, { "epoch": 33.008987341772155, "grad_norm": 2.729776382446289, "learning_rate": 1.8389592123769338e-05, "loss": 0.0982, "step": 10570 }, { "epoch": 33.009620253164556, "grad_norm": 0.005251895170658827, "learning_rate": 1.8354430379746836e-05, "loss": 0.0348, "step": 10580 }, { "epoch": 33.010253164556964, "grad_norm": 0.005169942043721676, "learning_rate": 1.831926863572433e-05, "loss": 0.1224, "step": 10590 }, { "epoch": 33.010886075949365, "grad_norm": 2.058396577835083, "learning_rate": 1.828410689170183e-05, "loss": 0.0732, "step": 10600 }, { "epoch": 33.01151898734177, "grad_norm": 0.004916292615234852, "learning_rate": 1.8248945147679324e-05, "loss": 0.0203, "step": 10610 }, { "epoch": 33.012151898734174, "grad_norm": 0.004748203791677952, "learning_rate": 1.8213783403656822e-05, "loss": 0.0726, "step": 10620 }, { "epoch": 33.01278481012658, "grad_norm": 0.004022388719022274, "learning_rate": 1.8178621659634317e-05, "loss": 0.0105, "step": 10630 }, { "epoch": 33.01341772151899, "grad_norm": 0.002989129861816764, "learning_rate": 1.8143459915611816e-05, "loss": 0.0093, "step": 10640 }, { "epoch": 33.01405063291139, "grad_norm": 0.0042097545228898525, "learning_rate": 1.810829817158931e-05, "loss": 0.0425, "step": 10650 }, { "epoch": 33.0146835443038, "grad_norm": 0.008987381123006344, "learning_rate": 1.807313642756681e-05, "loss": 0.0721, "step": 10660 }, { "epoch": 33.0153164556962, "grad_norm": 0.004144416656345129, "learning_rate": 1.8037974683544304e-05, "loss": 0.0474, "step": 10670 }, { "epoch": 33.01594936708861, "grad_norm": 0.0029516241047531366, "learning_rate": 1.8002812939521802e-05, "loss": 0.0501, "step": 10680 }, { "epoch": 33.01658227848101, "grad_norm": 0.00844433344900608, "learning_rate": 1.7967651195499297e-05, "loss": 0.0652, "step": 10690 }, { "epoch": 33.01721518987342, "grad_norm": 2.04638409614563, "learning_rate": 1.7932489451476795e-05, "loss": 0.084, "step": 10700 }, { "epoch": 33.01784810126582, "grad_norm": 0.004865339025855064, "learning_rate": 1.789732770745429e-05, "loss": 0.0698, "step": 10710 }, { "epoch": 33.01848101265823, "grad_norm": 2.136470079421997, "learning_rate": 1.786216596343179e-05, "loss": 0.0798, "step": 10720 }, { "epoch": 33.019113924050636, "grad_norm": 0.004824280273169279, "learning_rate": 1.7827004219409284e-05, "loss": 0.0244, "step": 10730 }, { "epoch": 33.01974683544304, "grad_norm": 2.0193657875061035, "learning_rate": 1.779184247538678e-05, "loss": 0.0635, "step": 10740 }, { "epoch": 33.02, "eval_accuracy": 0.9750479846449136, "eval_loss": 0.049176860600709915, "eval_runtime": 874.5591, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.075, "step": 10744 }, { "epoch": 34.00037974683544, "grad_norm": 0.004216327331960201, "learning_rate": 1.7756680731364274e-05, "loss": 0.0312, "step": 10750 }, { "epoch": 34.00101265822785, "grad_norm": 0.004103939048945904, "learning_rate": 1.7721518987341772e-05, "loss": 0.041, "step": 10760 }, { "epoch": 34.00164556962025, "grad_norm": 0.014023522846400738, "learning_rate": 1.7686357243319267e-05, "loss": 0.0759, "step": 10770 }, { "epoch": 34.00227848101266, "grad_norm": 0.003603527322411537, "learning_rate": 1.7651195499296765e-05, "loss": 0.0688, "step": 10780 }, { "epoch": 34.00291139240506, "grad_norm": 0.0033313506282866, "learning_rate": 1.7616033755274263e-05, "loss": 0.0465, "step": 10790 }, { "epoch": 34.00354430379747, "grad_norm": 1.8435585498809814, "learning_rate": 1.758087201125176e-05, "loss": 0.0949, "step": 10800 }, { "epoch": 34.00417721518988, "grad_norm": 1.6445870399475098, "learning_rate": 1.7545710267229257e-05, "loss": 0.1285, "step": 10810 }, { "epoch": 34.00481012658228, "grad_norm": 0.002922611776739359, "learning_rate": 1.751054852320675e-05, "loss": 0.0706, "step": 10820 }, { "epoch": 34.005443037974686, "grad_norm": 2.673938274383545, "learning_rate": 1.747538677918425e-05, "loss": 0.0942, "step": 10830 }, { "epoch": 34.00607594936709, "grad_norm": 0.004203031305223703, "learning_rate": 1.7440225035161745e-05, "loss": 0.0326, "step": 10840 }, { "epoch": 34.006708860759495, "grad_norm": 1.8749343156814575, "learning_rate": 1.7405063291139243e-05, "loss": 0.1136, "step": 10850 }, { "epoch": 34.0073417721519, "grad_norm": 0.004128487780690193, "learning_rate": 1.7369901547116738e-05, "loss": 0.0677, "step": 10860 }, { "epoch": 34.007974683544305, "grad_norm": 0.00483922241255641, "learning_rate": 1.7334739803094237e-05, "loss": 0.0719, "step": 10870 }, { "epoch": 34.008607594936706, "grad_norm": 1.308681607246399, "learning_rate": 1.729957805907173e-05, "loss": 0.0435, "step": 10880 }, { "epoch": 34.009240506329114, "grad_norm": 0.0035531967878341675, "learning_rate": 1.726441631504923e-05, "loss": 0.0415, "step": 10890 }, { "epoch": 34.00987341772152, "grad_norm": 1.937238335609436, "learning_rate": 1.7229254571026725e-05, "loss": 0.0518, "step": 10900 }, { "epoch": 34.01050632911392, "grad_norm": 0.0039803506806492805, "learning_rate": 1.719409282700422e-05, "loss": 0.065, "step": 10910 }, { "epoch": 34.01113924050633, "grad_norm": 1.8601768016815186, "learning_rate": 1.7158931082981715e-05, "loss": 0.0599, "step": 10920 }, { "epoch": 34.01177215189873, "grad_norm": 0.003120964393019676, "learning_rate": 1.7123769338959213e-05, "loss": 0.0747, "step": 10930 }, { "epoch": 34.01240506329114, "grad_norm": 2.323869228363037, "learning_rate": 1.7088607594936708e-05, "loss": 0.0448, "step": 10940 }, { "epoch": 34.01303797468354, "grad_norm": 0.007831540890038013, "learning_rate": 1.7053445850914206e-05, "loss": 0.1065, "step": 10950 }, { "epoch": 34.01367088607595, "grad_norm": 0.007483489811420441, "learning_rate": 1.70182841068917e-05, "loss": 0.057, "step": 10960 }, { "epoch": 34.01430379746836, "grad_norm": 1.7984932661056519, "learning_rate": 1.69831223628692e-05, "loss": 0.1047, "step": 10970 }, { "epoch": 34.01493670886076, "grad_norm": 0.011337379924952984, "learning_rate": 1.6947960618846694e-05, "loss": 0.0281, "step": 10980 }, { "epoch": 34.01556962025317, "grad_norm": 0.004797440022230148, "learning_rate": 1.6912798874824193e-05, "loss": 0.0295, "step": 10990 }, { "epoch": 34.01620253164557, "grad_norm": 0.0023510728497058153, "learning_rate": 1.6877637130801688e-05, "loss": 0.0391, "step": 11000 }, { "epoch": 34.01683544303798, "grad_norm": 0.015410896390676498, "learning_rate": 1.6842475386779186e-05, "loss": 0.0316, "step": 11010 }, { "epoch": 34.01746835443038, "grad_norm": 0.008044656366109848, "learning_rate": 1.680731364275668e-05, "loss": 0.036, "step": 11020 }, { "epoch": 34.018101265822786, "grad_norm": 0.004154231399297714, "learning_rate": 1.677215189873418e-05, "loss": 0.0852, "step": 11030 }, { "epoch": 34.01873417721519, "grad_norm": 0.006638134364038706, "learning_rate": 1.6736990154711674e-05, "loss": 0.0142, "step": 11040 }, { "epoch": 34.019367088607595, "grad_norm": 0.0038269851356744766, "learning_rate": 1.6701828410689173e-05, "loss": 0.0155, "step": 11050 }, { "epoch": 34.02, "grad_norm": 0.0027346955612301826, "learning_rate": 1.6666666666666667e-05, "loss": 0.0729, "step": 11060 }, { "epoch": 34.02, "eval_accuracy": 0.9692898272552783, "eval_loss": 0.048282425850629807, "eval_runtime": 866.1958, "eval_samples_per_second": 0.601, "eval_steps_per_second": 0.076, "step": 11060 }, { "epoch": 35.00063291139241, "grad_norm": 0.0029359341133385897, "learning_rate": 1.6631504922644166e-05, "loss": 0.0964, "step": 11070 }, { "epoch": 35.00126582278481, "grad_norm": 0.0031327735632658005, "learning_rate": 1.659634317862166e-05, "loss": 0.057, "step": 11080 }, { "epoch": 35.00189873417722, "grad_norm": 0.004360498860478401, "learning_rate": 1.6561181434599156e-05, "loss": 0.0264, "step": 11090 }, { "epoch": 35.00253164556962, "grad_norm": 0.013268140144646168, "learning_rate": 1.6526019690576654e-05, "loss": 0.0785, "step": 11100 }, { "epoch": 35.00316455696203, "grad_norm": 0.0029599510598927736, "learning_rate": 1.649085794655415e-05, "loss": 0.0914, "step": 11110 }, { "epoch": 35.00379746835443, "grad_norm": 0.00652879336848855, "learning_rate": 1.6455696202531644e-05, "loss": 0.0423, "step": 11120 }, { "epoch": 35.004430379746836, "grad_norm": 0.004876864142715931, "learning_rate": 1.6420534458509142e-05, "loss": 0.0367, "step": 11130 }, { "epoch": 35.00506329113924, "grad_norm": 0.0030262030195444822, "learning_rate": 1.6385372714486637e-05, "loss": 0.0579, "step": 11140 }, { "epoch": 35.005696202531645, "grad_norm": 0.005096379201859236, "learning_rate": 1.6350210970464135e-05, "loss": 0.0162, "step": 11150 }, { "epoch": 35.00632911392405, "grad_norm": 0.005958447232842445, "learning_rate": 1.631504922644163e-05, "loss": 0.0704, "step": 11160 }, { "epoch": 35.006962025316454, "grad_norm": 0.09998659044504166, "learning_rate": 1.627988748241913e-05, "loss": 0.0396, "step": 11170 }, { "epoch": 35.00759493670886, "grad_norm": 2.7944371700286865, "learning_rate": 1.6244725738396624e-05, "loss": 0.0813, "step": 11180 }, { "epoch": 35.008227848101264, "grad_norm": 0.00864170491695404, "learning_rate": 1.6209563994374122e-05, "loss": 0.069, "step": 11190 }, { "epoch": 35.00886075949367, "grad_norm": 0.008192246779799461, "learning_rate": 1.617440225035162e-05, "loss": 0.1284, "step": 11200 }, { "epoch": 35.00949367088607, "grad_norm": 0.007694208063185215, "learning_rate": 1.6139240506329115e-05, "loss": 0.0198, "step": 11210 }, { "epoch": 35.01012658227848, "grad_norm": 0.0043876878917217255, "learning_rate": 1.6104078762306614e-05, "loss": 0.0759, "step": 11220 }, { "epoch": 35.01075949367089, "grad_norm": 5.103017330169678, "learning_rate": 1.606891701828411e-05, "loss": 0.0263, "step": 11230 }, { "epoch": 35.01139240506329, "grad_norm": 0.038662366569042206, "learning_rate": 1.6033755274261607e-05, "loss": 0.1215, "step": 11240 }, { "epoch": 35.0120253164557, "grad_norm": 0.0065099457278847694, "learning_rate": 1.5998593530239102e-05, "loss": 0.1129, "step": 11250 }, { "epoch": 35.0126582278481, "grad_norm": 3.32816219329834, "learning_rate": 1.5963431786216597e-05, "loss": 0.1243, "step": 11260 }, { "epoch": 35.01329113924051, "grad_norm": 2.0049562454223633, "learning_rate": 1.5928270042194095e-05, "loss": 0.0641, "step": 11270 }, { "epoch": 35.01392405063291, "grad_norm": 0.7978969216346741, "learning_rate": 1.589310829817159e-05, "loss": 0.0625, "step": 11280 }, { "epoch": 35.01455696202532, "grad_norm": 0.004707363899797201, "learning_rate": 1.5857946554149085e-05, "loss": 0.064, "step": 11290 }, { "epoch": 35.01518987341772, "grad_norm": 0.0142443822696805, "learning_rate": 1.5822784810126583e-05, "loss": 0.0638, "step": 11300 }, { "epoch": 35.015822784810126, "grad_norm": 0.011079243384301662, "learning_rate": 1.5787623066104078e-05, "loss": 0.0004, "step": 11310 }, { "epoch": 35.016455696202534, "grad_norm": 2.3243215084075928, "learning_rate": 1.5752461322081577e-05, "loss": 0.0786, "step": 11320 }, { "epoch": 35.017088607594935, "grad_norm": 0.004174171946942806, "learning_rate": 1.571729957805907e-05, "loss": 0.0151, "step": 11330 }, { "epoch": 35.017721518987344, "grad_norm": 0.006861343514174223, "learning_rate": 1.568213783403657e-05, "loss": 0.1092, "step": 11340 }, { "epoch": 35.018354430379745, "grad_norm": 2.0469024181365967, "learning_rate": 1.5646976090014065e-05, "loss": 0.035, "step": 11350 }, { "epoch": 35.01898734177215, "grad_norm": 2.0664782524108887, "learning_rate": 1.5611814345991563e-05, "loss": 0.0619, "step": 11360 }, { "epoch": 35.019620253164554, "grad_norm": 0.004111067857593298, "learning_rate": 1.5576652601969058e-05, "loss": 0.0692, "step": 11370 }, { "epoch": 35.02, "eval_accuracy": 0.9750479846449136, "eval_loss": 0.04807919263839722, "eval_runtime": 894.7526, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.074, "step": 11376 }, { "epoch": 36.00025316455696, "grad_norm": 0.0060658580623567104, "learning_rate": 1.5541490857946556e-05, "loss": 0.0883, "step": 11380 }, { "epoch": 36.00088607594937, "grad_norm": 2.9155092239379883, "learning_rate": 1.550632911392405e-05, "loss": 0.0509, "step": 11390 }, { "epoch": 36.001518987341775, "grad_norm": 0.004734630696475506, "learning_rate": 1.547116736990155e-05, "loss": 0.0554, "step": 11400 }, { "epoch": 36.002151898734176, "grad_norm": 2.3105368614196777, "learning_rate": 1.5436005625879045e-05, "loss": 0.0768, "step": 11410 }, { "epoch": 36.002784810126585, "grad_norm": 0.0034461403265595436, "learning_rate": 1.5400843881856543e-05, "loss": 0.0336, "step": 11420 }, { "epoch": 36.003417721518986, "grad_norm": 2.3787004947662354, "learning_rate": 1.5365682137834038e-05, "loss": 0.0541, "step": 11430 }, { "epoch": 36.004050632911394, "grad_norm": 0.003510306589305401, "learning_rate": 1.5330520393811533e-05, "loss": 0.0132, "step": 11440 }, { "epoch": 36.004683544303795, "grad_norm": 2.900848627090454, "learning_rate": 1.529535864978903e-05, "loss": 0.0892, "step": 11450 }, { "epoch": 36.0053164556962, "grad_norm": 0.0045495242811739445, "learning_rate": 1.5260196905766526e-05, "loss": 0.0582, "step": 11460 }, { "epoch": 36.005949367088604, "grad_norm": 2.1150054931640625, "learning_rate": 1.5225035161744023e-05, "loss": 0.0567, "step": 11470 }, { "epoch": 36.00658227848101, "grad_norm": 2.303497791290283, "learning_rate": 1.5189873417721521e-05, "loss": 0.0471, "step": 11480 }, { "epoch": 36.00721518987342, "grad_norm": 1.5887198448181152, "learning_rate": 1.5154711673699016e-05, "loss": 0.0762, "step": 11490 }, { "epoch": 36.00784810126582, "grad_norm": 0.009701371192932129, "learning_rate": 1.5119549929676513e-05, "loss": 0.0994, "step": 11500 }, { "epoch": 36.00848101265823, "grad_norm": 1.875978708267212, "learning_rate": 1.5084388185654007e-05, "loss": 0.037, "step": 11510 }, { "epoch": 36.00911392405063, "grad_norm": 1.7351700067520142, "learning_rate": 1.5049226441631506e-05, "loss": 0.1036, "step": 11520 }, { "epoch": 36.00974683544304, "grad_norm": 2.9241271018981934, "learning_rate": 1.5014064697609e-05, "loss": 0.0546, "step": 11530 }, { "epoch": 36.01037974683544, "grad_norm": 0.00652047386392951, "learning_rate": 1.4978902953586499e-05, "loss": 0.0837, "step": 11540 }, { "epoch": 36.01101265822785, "grad_norm": 0.005075695458799601, "learning_rate": 1.4943741209563994e-05, "loss": 0.0413, "step": 11550 }, { "epoch": 36.011645569620256, "grad_norm": 0.005239785648882389, "learning_rate": 1.4908579465541492e-05, "loss": 0.0714, "step": 11560 }, { "epoch": 36.01227848101266, "grad_norm": 0.0033316484186798334, "learning_rate": 1.4873417721518987e-05, "loss": 0.0297, "step": 11570 }, { "epoch": 36.012911392405066, "grad_norm": 2.3135130405426025, "learning_rate": 1.4838255977496484e-05, "loss": 0.1521, "step": 11580 }, { "epoch": 36.01354430379747, "grad_norm": 2.01491117477417, "learning_rate": 1.480309423347398e-05, "loss": 0.0542, "step": 11590 }, { "epoch": 36.014177215189875, "grad_norm": 0.003011218272149563, "learning_rate": 1.4767932489451477e-05, "loss": 0.0605, "step": 11600 }, { "epoch": 36.014810126582276, "grad_norm": 0.003551116678863764, "learning_rate": 1.4732770745428972e-05, "loss": 0.0359, "step": 11610 }, { "epoch": 36.015443037974684, "grad_norm": 0.004775399807840586, "learning_rate": 1.469760900140647e-05, "loss": 0.0181, "step": 11620 }, { "epoch": 36.016075949367085, "grad_norm": 0.0036576108541339636, "learning_rate": 1.4662447257383969e-05, "loss": 0.1157, "step": 11630 }, { "epoch": 36.01670886075949, "grad_norm": 0.00402354309335351, "learning_rate": 1.4627285513361464e-05, "loss": 0.0341, "step": 11640 }, { "epoch": 36.0173417721519, "grad_norm": 0.003662722185254097, "learning_rate": 1.4592123769338962e-05, "loss": 0.0393, "step": 11650 }, { "epoch": 36.0179746835443, "grad_norm": 2.0367825031280518, "learning_rate": 1.4556962025316457e-05, "loss": 0.0609, "step": 11660 }, { "epoch": 36.01860759493671, "grad_norm": 0.4210270345211029, "learning_rate": 1.4521800281293954e-05, "loss": 0.0793, "step": 11670 }, { "epoch": 36.01924050632911, "grad_norm": 2.6557528972625732, "learning_rate": 1.4486638537271449e-05, "loss": 0.0707, "step": 11680 }, { "epoch": 36.01987341772152, "grad_norm": 2.917741537094116, "learning_rate": 1.4451476793248947e-05, "loss": 0.1023, "step": 11690 }, { "epoch": 36.02, "eval_accuracy": 0.9712092130518234, "eval_loss": 0.047843087464571, "eval_runtime": 935.2983, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.071, "step": 11692 }, { "epoch": 37.000506329113925, "grad_norm": 0.004461715929210186, "learning_rate": 1.4416315049226442e-05, "loss": 0.0132, "step": 11700 }, { "epoch": 37.001139240506326, "grad_norm": 2.282515525817871, "learning_rate": 1.438115330520394e-05, "loss": 0.097, "step": 11710 }, { "epoch": 37.001772151898734, "grad_norm": 1.2947165966033936, "learning_rate": 1.4345991561181435e-05, "loss": 0.0536, "step": 11720 }, { "epoch": 37.00240506329114, "grad_norm": 0.003223339095711708, "learning_rate": 1.4310829817158933e-05, "loss": 0.0295, "step": 11730 }, { "epoch": 37.003037974683544, "grad_norm": 0.003211404662579298, "learning_rate": 1.4275668073136428e-05, "loss": 0.0408, "step": 11740 }, { "epoch": 37.00367088607595, "grad_norm": 0.004511035978794098, "learning_rate": 1.4240506329113925e-05, "loss": 0.0594, "step": 11750 }, { "epoch": 37.00430379746835, "grad_norm": 0.9744784235954285, "learning_rate": 1.4205344585091422e-05, "loss": 0.0235, "step": 11760 }, { "epoch": 37.00493670886076, "grad_norm": 0.0027787466533482075, "learning_rate": 1.4170182841068918e-05, "loss": 0.0331, "step": 11770 }, { "epoch": 37.00556962025316, "grad_norm": 2.0447912216186523, "learning_rate": 1.4135021097046413e-05, "loss": 0.0689, "step": 11780 }, { "epoch": 37.00620253164557, "grad_norm": 0.0037356666289269924, "learning_rate": 1.4099859353023911e-05, "loss": 0.1165, "step": 11790 }, { "epoch": 37.00683544303797, "grad_norm": 0.00903794914484024, "learning_rate": 1.4064697609001406e-05, "loss": 0.0184, "step": 11800 }, { "epoch": 37.00746835443038, "grad_norm": 2.4592278003692627, "learning_rate": 1.4029535864978905e-05, "loss": 0.0623, "step": 11810 }, { "epoch": 37.00810126582279, "grad_norm": 30.086641311645508, "learning_rate": 1.39943741209564e-05, "loss": 0.0916, "step": 11820 }, { "epoch": 37.00873417721519, "grad_norm": 0.006698968820273876, "learning_rate": 1.3959212376933898e-05, "loss": 0.0514, "step": 11830 }, { "epoch": 37.0093670886076, "grad_norm": 2.0741055011749268, "learning_rate": 1.3924050632911393e-05, "loss": 0.1023, "step": 11840 }, { "epoch": 37.01, "grad_norm": 3.4092001914978027, "learning_rate": 1.388888888888889e-05, "loss": 0.1079, "step": 11850 }, { "epoch": 37.010632911392406, "grad_norm": 1.7051777839660645, "learning_rate": 1.3853727144866384e-05, "loss": 0.1146, "step": 11860 }, { "epoch": 37.01126582278481, "grad_norm": 0.007281308062374592, "learning_rate": 1.3818565400843883e-05, "loss": 0.0593, "step": 11870 }, { "epoch": 37.011898734177215, "grad_norm": 0.01118686143308878, "learning_rate": 1.3783403656821378e-05, "loss": 0.0342, "step": 11880 }, { "epoch": 37.012531645569624, "grad_norm": 0.014248767867684364, "learning_rate": 1.3748241912798876e-05, "loss": 0.0685, "step": 11890 }, { "epoch": 37.013164556962025, "grad_norm": 1.9155502319335938, "learning_rate": 1.3713080168776371e-05, "loss": 0.0562, "step": 11900 }, { "epoch": 37.01379746835443, "grad_norm": 2.529233694076538, "learning_rate": 1.367791842475387e-05, "loss": 0.0588, "step": 11910 }, { "epoch": 37.014430379746834, "grad_norm": 0.006225684192031622, "learning_rate": 1.3642756680731364e-05, "loss": 0.046, "step": 11920 }, { "epoch": 37.01506329113924, "grad_norm": 1.680939793586731, "learning_rate": 1.3607594936708861e-05, "loss": 0.0775, "step": 11930 }, { "epoch": 37.01569620253164, "grad_norm": 1.7170394659042358, "learning_rate": 1.3572433192686358e-05, "loss": 0.0498, "step": 11940 }, { "epoch": 37.01632911392405, "grad_norm": 5.160343170166016, "learning_rate": 1.3537271448663854e-05, "loss": 0.0707, "step": 11950 }, { "epoch": 37.01696202531645, "grad_norm": 2.1870486736297607, "learning_rate": 1.3502109704641349e-05, "loss": 0.0583, "step": 11960 }, { "epoch": 37.01759493670886, "grad_norm": 2.745607852935791, "learning_rate": 1.3466947960618847e-05, "loss": 0.0737, "step": 11970 }, { "epoch": 37.01822784810127, "grad_norm": 0.005020118784159422, "learning_rate": 1.3431786216596342e-05, "loss": 0.0867, "step": 11980 }, { "epoch": 37.01886075949367, "grad_norm": 2.2842442989349365, "learning_rate": 1.339662447257384e-05, "loss": 0.0794, "step": 11990 }, { "epoch": 37.01949367088608, "grad_norm": 0.011187170632183552, "learning_rate": 1.3361462728551336e-05, "loss": 0.0863, "step": 12000 }, { "epoch": 37.02, "eval_accuracy": 0.9750479846449136, "eval_loss": 0.04793384671211243, "eval_runtime": 885.4471, "eval_samples_per_second": 0.588, "eval_steps_per_second": 0.075, "step": 12008 }, { "epoch": 38.00012658227848, "grad_norm": 0.004787384066730738, "learning_rate": 1.3326300984528834e-05, "loss": 0.0004, "step": 12010 }, { "epoch": 38.000759493670884, "grad_norm": 0.006256919354200363, "learning_rate": 1.3291139240506329e-05, "loss": 0.0641, "step": 12020 }, { "epoch": 38.00139240506329, "grad_norm": 2.1292147636413574, "learning_rate": 1.3255977496483826e-05, "loss": 0.0635, "step": 12030 }, { "epoch": 38.00202531645569, "grad_norm": 1.8341985940933228, "learning_rate": 1.3220815752461324e-05, "loss": 0.0635, "step": 12040 }, { "epoch": 38.0026582278481, "grad_norm": 0.012860477901995182, "learning_rate": 1.3185654008438819e-05, "loss": 0.0439, "step": 12050 }, { "epoch": 38.00329113924051, "grad_norm": 0.005083575379103422, "learning_rate": 1.3150492264416317e-05, "loss": 0.0468, "step": 12060 }, { "epoch": 38.00392405063291, "grad_norm": 1.4542759656906128, "learning_rate": 1.3115330520393812e-05, "loss": 0.0388, "step": 12070 }, { "epoch": 38.00455696202532, "grad_norm": 1.959895372390747, "learning_rate": 1.308016877637131e-05, "loss": 0.0416, "step": 12080 }, { "epoch": 38.00518987341772, "grad_norm": 2.081799030303955, "learning_rate": 1.3045007032348805e-05, "loss": 0.1381, "step": 12090 }, { "epoch": 38.00582278481013, "grad_norm": 0.004892106633633375, "learning_rate": 1.3009845288326302e-05, "loss": 0.0568, "step": 12100 }, { "epoch": 38.00645569620253, "grad_norm": 0.005953035783022642, "learning_rate": 1.2974683544303799e-05, "loss": 0.0619, "step": 12110 }, { "epoch": 38.00708860759494, "grad_norm": 0.00476424815133214, "learning_rate": 1.2939521800281295e-05, "loss": 0.0487, "step": 12120 }, { "epoch": 38.00772151898734, "grad_norm": 0.004643861670047045, "learning_rate": 1.290436005625879e-05, "loss": 0.078, "step": 12130 }, { "epoch": 38.00835443037975, "grad_norm": 0.004535979591310024, "learning_rate": 1.2869198312236289e-05, "loss": 0.1059, "step": 12140 }, { "epoch": 38.008987341772155, "grad_norm": 0.9413653612136841, "learning_rate": 1.2834036568213783e-05, "loss": 0.0808, "step": 12150 }, { "epoch": 38.009620253164556, "grad_norm": 0.23469361662864685, "learning_rate": 1.2798874824191282e-05, "loss": 0.0552, "step": 12160 }, { "epoch": 38.010253164556964, "grad_norm": 9.23499870300293, "learning_rate": 1.2763713080168777e-05, "loss": 0.1727, "step": 12170 }, { "epoch": 38.010886075949365, "grad_norm": 2.0388455390930176, "learning_rate": 1.2728551336146275e-05, "loss": 0.1044, "step": 12180 }, { "epoch": 38.01151898734177, "grad_norm": 1.715149164199829, "learning_rate": 1.269338959212377e-05, "loss": 0.0649, "step": 12190 }, { "epoch": 38.012151898734174, "grad_norm": 0.0031002135947346687, "learning_rate": 1.2658227848101267e-05, "loss": 0.0792, "step": 12200 }, { "epoch": 38.01278481012658, "grad_norm": 0.00621592253446579, "learning_rate": 1.2623066104078762e-05, "loss": 0.0711, "step": 12210 }, { "epoch": 38.01341772151899, "grad_norm": 0.010905969887971878, "learning_rate": 1.258790436005626e-05, "loss": 0.05, "step": 12220 }, { "epoch": 38.01405063291139, "grad_norm": 0.014951992779970169, "learning_rate": 1.2552742616033755e-05, "loss": 0.0523, "step": 12230 }, { "epoch": 38.0146835443038, "grad_norm": 1.8830513954162598, "learning_rate": 1.2517580872011253e-05, "loss": 0.0341, "step": 12240 }, { "epoch": 38.0153164556962, "grad_norm": 2.3067173957824707, "learning_rate": 1.248241912798875e-05, "loss": 0.0422, "step": 12250 }, { "epoch": 38.01594936708861, "grad_norm": 0.008932225406169891, "learning_rate": 1.2447257383966246e-05, "loss": 0.0639, "step": 12260 }, { "epoch": 38.01658227848101, "grad_norm": 0.00562430452555418, "learning_rate": 1.2412095639943743e-05, "loss": 0.0559, "step": 12270 }, { "epoch": 38.01721518987342, "grad_norm": 0.003999199718236923, "learning_rate": 1.237693389592124e-05, "loss": 0.0529, "step": 12280 }, { "epoch": 38.01784810126582, "grad_norm": 0.01232972927391529, "learning_rate": 1.2341772151898735e-05, "loss": 0.0269, "step": 12290 }, { "epoch": 38.01848101265823, "grad_norm": 0.004044785164296627, "learning_rate": 1.2306610407876231e-05, "loss": 0.0154, "step": 12300 }, { "epoch": 38.019113924050636, "grad_norm": 0.0040607457049191, "learning_rate": 1.2271448663853728e-05, "loss": 0.0157, "step": 12310 }, { "epoch": 38.01974683544304, "grad_norm": 1.6415541172027588, "learning_rate": 1.2236286919831224e-05, "loss": 0.0934, "step": 12320 }, { "epoch": 38.02, "eval_accuracy": 0.9712092130518234, "eval_loss": 0.04641543701291084, "eval_runtime": 886.1854, "eval_samples_per_second": 0.588, "eval_steps_per_second": 0.074, "step": 12324 }, { "epoch": 39.00037974683544, "grad_norm": 1.787951111793518, "learning_rate": 1.2201125175808721e-05, "loss": 0.049, "step": 12330 }, { "epoch": 39.00101265822785, "grad_norm": 0.0033679732587188482, "learning_rate": 1.2165963431786218e-05, "loss": 0.0464, "step": 12340 }, { "epoch": 39.00164556962025, "grad_norm": 0.003388448618352413, "learning_rate": 1.2130801687763714e-05, "loss": 0.0743, "step": 12350 }, { "epoch": 39.00227848101266, "grad_norm": 0.004701158031821251, "learning_rate": 1.2095639943741211e-05, "loss": 0.0753, "step": 12360 }, { "epoch": 39.00291139240506, "grad_norm": 4.90346097946167, "learning_rate": 1.2060478199718708e-05, "loss": 0.0743, "step": 12370 }, { "epoch": 39.00354430379747, "grad_norm": 1.0210288763046265, "learning_rate": 1.2025316455696203e-05, "loss": 0.0239, "step": 12380 }, { "epoch": 39.00417721518988, "grad_norm": 0.0035583917051553726, "learning_rate": 1.19901547116737e-05, "loss": 0.0558, "step": 12390 }, { "epoch": 39.00481012658228, "grad_norm": 0.010518099181354046, "learning_rate": 1.1954992967651196e-05, "loss": 0.0485, "step": 12400 }, { "epoch": 39.005443037974686, "grad_norm": 1.8886620998382568, "learning_rate": 1.1919831223628692e-05, "loss": 0.0778, "step": 12410 }, { "epoch": 39.00607594936709, "grad_norm": 1.6945433616638184, "learning_rate": 1.1884669479606189e-05, "loss": 0.0958, "step": 12420 }, { "epoch": 39.006708860759495, "grad_norm": 0.005481211934238672, "learning_rate": 1.1849507735583686e-05, "loss": 0.072, "step": 12430 }, { "epoch": 39.0073417721519, "grad_norm": 0.009708485566079617, "learning_rate": 1.1814345991561182e-05, "loss": 0.0492, "step": 12440 }, { "epoch": 39.007974683544305, "grad_norm": 2.023264169692993, "learning_rate": 1.1779184247538679e-05, "loss": 0.0423, "step": 12450 }, { "epoch": 39.008607594936706, "grad_norm": 0.002309863455593586, "learning_rate": 1.1744022503516176e-05, "loss": 0.0424, "step": 12460 }, { "epoch": 39.009240506329114, "grad_norm": 3.2563633918762207, "learning_rate": 1.170886075949367e-05, "loss": 0.0603, "step": 12470 }, { "epoch": 39.00987341772152, "grad_norm": 0.005214280914515257, "learning_rate": 1.1673699015471167e-05, "loss": 0.0973, "step": 12480 }, { "epoch": 39.01050632911392, "grad_norm": 0.0032180920243263245, "learning_rate": 1.1638537271448664e-05, "loss": 0.0352, "step": 12490 }, { "epoch": 39.01113924050633, "grad_norm": 0.0023686427157372236, "learning_rate": 1.160337552742616e-05, "loss": 0.0459, "step": 12500 }, { "epoch": 39.01177215189873, "grad_norm": 0.0045258644968271255, "learning_rate": 1.1568213783403657e-05, "loss": 0.0683, "step": 12510 }, { "epoch": 39.01240506329114, "grad_norm": 0.0025700768455863, "learning_rate": 1.1533052039381154e-05, "loss": 0.0412, "step": 12520 }, { "epoch": 39.01303797468354, "grad_norm": 0.0044273072853684425, "learning_rate": 1.149789029535865e-05, "loss": 0.0338, "step": 12530 }, { "epoch": 39.01367088607595, "grad_norm": 0.007956129498779774, "learning_rate": 1.1462728551336147e-05, "loss": 0.0658, "step": 12540 }, { "epoch": 39.01430379746836, "grad_norm": 0.005596678238362074, "learning_rate": 1.1427566807313644e-05, "loss": 0.0658, "step": 12550 }, { "epoch": 39.01493670886076, "grad_norm": 0.003668359015136957, "learning_rate": 1.139240506329114e-05, "loss": 0.0757, "step": 12560 }, { "epoch": 39.01556962025317, "grad_norm": 0.0037127614486962557, "learning_rate": 1.1357243319268635e-05, "loss": 0.0477, "step": 12570 }, { "epoch": 39.01620253164557, "grad_norm": 1.9813635349273682, "learning_rate": 1.1322081575246132e-05, "loss": 0.0453, "step": 12580 }, { "epoch": 39.01683544303798, "grad_norm": 0.004934421740472317, "learning_rate": 1.1286919831223628e-05, "loss": 0.0507, "step": 12590 }, { "epoch": 39.01746835443038, "grad_norm": 0.0019371870439499617, "learning_rate": 1.1251758087201125e-05, "loss": 0.053, "step": 12600 }, { "epoch": 39.018101265822786, "grad_norm": 0.011108608916401863, "learning_rate": 1.1216596343178622e-05, "loss": 0.0283, "step": 12610 }, { "epoch": 39.01873417721519, "grad_norm": 2.060152530670166, "learning_rate": 1.1181434599156118e-05, "loss": 0.0567, "step": 12620 }, { "epoch": 39.019367088607595, "grad_norm": 0.002437671646475792, "learning_rate": 1.1146272855133615e-05, "loss": 0.0331, "step": 12630 }, { "epoch": 39.02, "grad_norm": 0.0036968346685171127, "learning_rate": 1.1111111111111112e-05, "loss": 0.0927, "step": 12640 }, { "epoch": 39.02, "eval_accuracy": 0.9712092130518234, "eval_loss": 0.04621642827987671, "eval_runtime": 901.4604, "eval_samples_per_second": 0.578, "eval_steps_per_second": 0.073, "step": 12640 }, { "epoch": 40.00063291139241, "grad_norm": 0.0033778073266148567, "learning_rate": 1.1075949367088608e-05, "loss": 0.0145, "step": 12650 }, { "epoch": 40.00126582278481, "grad_norm": 2.246983528137207, "learning_rate": 1.1040787623066103e-05, "loss": 0.0527, "step": 12660 }, { "epoch": 40.00189873417722, "grad_norm": 0.002830845071002841, "learning_rate": 1.1005625879043602e-05, "loss": 0.0437, "step": 12670 }, { "epoch": 40.00253164556962, "grad_norm": 1.5313416719436646, "learning_rate": 1.0970464135021098e-05, "loss": 0.0223, "step": 12680 }, { "epoch": 40.00316455696203, "grad_norm": 1.529272437095642, "learning_rate": 1.0935302390998595e-05, "loss": 0.0464, "step": 12690 }, { "epoch": 40.00379746835443, "grad_norm": 0.0037761295679956675, "learning_rate": 1.0900140646976091e-05, "loss": 0.0859, "step": 12700 }, { "epoch": 40.004430379746836, "grad_norm": 0.003269642125815153, "learning_rate": 1.0864978902953588e-05, "loss": 0.068, "step": 12710 }, { "epoch": 40.00506329113924, "grad_norm": 2.6645495891571045, "learning_rate": 1.0829817158931085e-05, "loss": 0.0689, "step": 12720 }, { "epoch": 40.005696202531645, "grad_norm": 2.0125551223754883, "learning_rate": 1.079465541490858e-05, "loss": 0.0139, "step": 12730 }, { "epoch": 40.00632911392405, "grad_norm": 0.002355450764298439, "learning_rate": 1.0759493670886076e-05, "loss": 0.0098, "step": 12740 }, { "epoch": 40.006962025316454, "grad_norm": 0.004568996839225292, "learning_rate": 1.0724331926863573e-05, "loss": 0.0692, "step": 12750 }, { "epoch": 40.00759493670886, "grad_norm": 0.020498152822256088, "learning_rate": 1.068917018284107e-05, "loss": 0.0148, "step": 12760 }, { "epoch": 40.008227848101264, "grad_norm": 0.006843405310064554, "learning_rate": 1.0654008438818566e-05, "loss": 0.0895, "step": 12770 }, { "epoch": 40.00886075949367, "grad_norm": 1.5150411128997803, "learning_rate": 1.0618846694796063e-05, "loss": 0.0739, "step": 12780 }, { "epoch": 40.00949367088607, "grad_norm": 0.003931212704628706, "learning_rate": 1.058368495077356e-05, "loss": 0.1223, "step": 12790 }, { "epoch": 40.01012658227848, "grad_norm": 0.007164576090872288, "learning_rate": 1.0548523206751056e-05, "loss": 0.0627, "step": 12800 }, { "epoch": 40.01075949367089, "grad_norm": 0.009334239177405834, "learning_rate": 1.0513361462728553e-05, "loss": 0.0764, "step": 12810 }, { "epoch": 40.01139240506329, "grad_norm": 5.856595516204834, "learning_rate": 1.047819971870605e-05, "loss": 0.1549, "step": 12820 }, { "epoch": 40.0120253164557, "grad_norm": 0.006400711834430695, "learning_rate": 1.0443037974683544e-05, "loss": 0.1071, "step": 12830 }, { "epoch": 40.0126582278481, "grad_norm": 0.005136122461408377, "learning_rate": 1.0407876230661041e-05, "loss": 0.0683, "step": 12840 }, { "epoch": 40.01329113924051, "grad_norm": 1.7356547117233276, "learning_rate": 1.0372714486638538e-05, "loss": 0.0767, "step": 12850 }, { "epoch": 40.01392405063291, "grad_norm": 0.008359666913747787, "learning_rate": 1.0337552742616034e-05, "loss": 0.0188, "step": 12860 }, { "epoch": 40.01455696202532, "grad_norm": 0.0056666117161512375, "learning_rate": 1.030239099859353e-05, "loss": 0.1222, "step": 12870 }, { "epoch": 40.01518987341772, "grad_norm": 3.0140154361724854, "learning_rate": 1.0267229254571027e-05, "loss": 0.0681, "step": 12880 }, { "epoch": 40.015822784810126, "grad_norm": 2.062131881713867, "learning_rate": 1.0232067510548524e-05, "loss": 0.1132, "step": 12890 }, { "epoch": 40.016455696202534, "grad_norm": 0.027009891346096992, "learning_rate": 1.019690576652602e-05, "loss": 0.0292, "step": 12900 }, { "epoch": 40.017088607594935, "grad_norm": 0.004601217340677977, "learning_rate": 1.0161744022503517e-05, "loss": 0.0366, "step": 12910 }, { "epoch": 40.017721518987344, "grad_norm": 0.004661747720092535, "learning_rate": 1.0126582278481012e-05, "loss": 0.0386, "step": 12920 }, { "epoch": 40.018354430379745, "grad_norm": 0.00786674115806818, "learning_rate": 1.0091420534458509e-05, "loss": 0.0872, "step": 12930 }, { "epoch": 40.01898734177215, "grad_norm": 0.00579045619815588, "learning_rate": 1.0056258790436006e-05, "loss": 0.0319, "step": 12940 }, { "epoch": 40.019620253164554, "grad_norm": 1.1140903234481812, "learning_rate": 1.0021097046413502e-05, "loss": 0.0254, "step": 12950 }, { "epoch": 40.02, "eval_accuracy": 0.9731285988483686, "eval_loss": 0.04482452943921089, "eval_runtime": 863.7445, "eval_samples_per_second": 0.603, "eval_steps_per_second": 0.076, "step": 12956 }, { "epoch": 41.00025316455696, "grad_norm": 1.9278745651245117, "learning_rate": 9.985935302390999e-06, "loss": 0.1063, "step": 12960 }, { "epoch": 41.00088607594937, "grad_norm": 0.011978060938417912, "learning_rate": 9.950773558368495e-06, "loss": 0.047, "step": 12970 }, { "epoch": 41.001518987341775, "grad_norm": 0.0027657151222229004, "learning_rate": 9.915611814345992e-06, "loss": 0.0534, "step": 12980 }, { "epoch": 41.002151898734176, "grad_norm": 0.003485744819045067, "learning_rate": 9.880450070323489e-06, "loss": 0.045, "step": 12990 }, { "epoch": 41.002784810126585, "grad_norm": 0.019399795681238174, "learning_rate": 9.845288326300985e-06, "loss": 0.0372, "step": 13000 }, { "epoch": 41.003417721518986, "grad_norm": 2.4368183612823486, "learning_rate": 9.81012658227848e-06, "loss": 0.0384, "step": 13010 }, { "epoch": 41.004050632911394, "grad_norm": 0.0074830991216003895, "learning_rate": 9.774964838255977e-06, "loss": 0.0164, "step": 13020 }, { "epoch": 41.004683544303795, "grad_norm": 1.7207592725753784, "learning_rate": 9.739803094233474e-06, "loss": 0.025, "step": 13030 }, { "epoch": 41.0053164556962, "grad_norm": 2.901007652282715, "learning_rate": 9.70464135021097e-06, "loss": 0.0496, "step": 13040 }, { "epoch": 41.005949367088604, "grad_norm": 2.028984785079956, "learning_rate": 9.669479606188467e-06, "loss": 0.0475, "step": 13050 }, { "epoch": 41.00658227848101, "grad_norm": 2.09635066986084, "learning_rate": 9.634317862165963e-06, "loss": 0.1207, "step": 13060 }, { "epoch": 41.00721518987342, "grad_norm": 1.6683522462844849, "learning_rate": 9.59915611814346e-06, "loss": 0.0922, "step": 13070 }, { "epoch": 41.00784810126582, "grad_norm": 5.185677528381348, "learning_rate": 9.563994374120957e-06, "loss": 0.0592, "step": 13080 }, { "epoch": 41.00848101265823, "grad_norm": 1.2516071796417236, "learning_rate": 9.528832630098453e-06, "loss": 0.0768, "step": 13090 }, { "epoch": 41.00911392405063, "grad_norm": 0.0021934949327260256, "learning_rate": 9.49367088607595e-06, "loss": 0.1049, "step": 13100 }, { "epoch": 41.00974683544304, "grad_norm": 0.0028072514105588198, "learning_rate": 9.458509142053447e-06, "loss": 0.0724, "step": 13110 }, { "epoch": 41.01037974683544, "grad_norm": 0.0021622704807668924, "learning_rate": 9.423347398030943e-06, "loss": 0.05, "step": 13120 }, { "epoch": 41.01101265822785, "grad_norm": 0.0022336526308208704, "learning_rate": 9.38818565400844e-06, "loss": 0.0391, "step": 13130 }, { "epoch": 41.011645569620256, "grad_norm": 3.3134849071502686, "learning_rate": 9.353023909985936e-06, "loss": 0.0497, "step": 13140 }, { "epoch": 41.01227848101266, "grad_norm": 2.5143470764160156, "learning_rate": 9.317862165963433e-06, "loss": 0.0941, "step": 13150 }, { "epoch": 41.012911392405066, "grad_norm": 0.003348015947267413, "learning_rate": 9.28270042194093e-06, "loss": 0.0363, "step": 13160 }, { "epoch": 41.01354430379747, "grad_norm": 0.00388367404229939, "learning_rate": 9.247538677918426e-06, "loss": 0.0579, "step": 13170 }, { "epoch": 41.014177215189875, "grad_norm": 2.559396266937256, "learning_rate": 9.212376933895921e-06, "loss": 0.0345, "step": 13180 }, { "epoch": 41.014810126582276, "grad_norm": 0.0016660054679960012, "learning_rate": 9.177215189873418e-06, "loss": 0.0792, "step": 13190 }, { "epoch": 41.015443037974684, "grad_norm": 1.8110253810882568, "learning_rate": 9.142053445850915e-06, "loss": 0.0731, "step": 13200 }, { "epoch": 41.016075949367085, "grad_norm": 1.7615784406661987, "learning_rate": 9.106891701828411e-06, "loss": 0.0562, "step": 13210 }, { "epoch": 41.01670886075949, "grad_norm": 0.002290283562615514, "learning_rate": 9.071729957805908e-06, "loss": 0.0677, "step": 13220 }, { "epoch": 41.0173417721519, "grad_norm": 1.738335132598877, "learning_rate": 9.036568213783404e-06, "loss": 0.0458, "step": 13230 }, { "epoch": 41.0179746835443, "grad_norm": 0.003094849642366171, "learning_rate": 9.001406469760901e-06, "loss": 0.0521, "step": 13240 }, { "epoch": 41.01860759493671, "grad_norm": 2.013655662536621, "learning_rate": 8.966244725738398e-06, "loss": 0.0521, "step": 13250 }, { "epoch": 41.01924050632911, "grad_norm": 0.0036079958081245422, "learning_rate": 8.931082981715894e-06, "loss": 0.07, "step": 13260 }, { "epoch": 41.01987341772152, "grad_norm": 0.0022296609822660685, "learning_rate": 8.89592123769339e-06, "loss": 0.043, "step": 13270 }, { "epoch": 41.02, "eval_accuracy": 0.9750479846449136, "eval_loss": 0.044986799359321594, "eval_runtime": 875.8809, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.075, "step": 13272 }, { "epoch": 42.000506329113925, "grad_norm": 0.004048566333949566, "learning_rate": 8.860759493670886e-06, "loss": 0.0185, "step": 13280 }, { "epoch": 42.001139240506326, "grad_norm": 0.003146181348711252, "learning_rate": 8.825597749648383e-06, "loss": 0.0577, "step": 13290 }, { "epoch": 42.001772151898734, "grad_norm": 1.8363380432128906, "learning_rate": 8.79043600562588e-06, "loss": 0.0858, "step": 13300 }, { "epoch": 42.00240506329114, "grad_norm": 0.010127292014658451, "learning_rate": 8.755274261603376e-06, "loss": 0.0343, "step": 13310 }, { "epoch": 42.003037974683544, "grad_norm": 0.003768622875213623, "learning_rate": 8.720112517580872e-06, "loss": 0.0568, "step": 13320 }, { "epoch": 42.00367088607595, "grad_norm": 4.169876575469971, "learning_rate": 8.684950773558369e-06, "loss": 0.0695, "step": 13330 }, { "epoch": 42.00430379746835, "grad_norm": 1.0784434080123901, "learning_rate": 8.649789029535866e-06, "loss": 0.0755, "step": 13340 }, { "epoch": 42.00493670886076, "grad_norm": 2.58248233795166, "learning_rate": 8.614627285513362e-06, "loss": 0.051, "step": 13350 }, { "epoch": 42.00556962025316, "grad_norm": 0.001903594471514225, "learning_rate": 8.579465541490857e-06, "loss": 0.0168, "step": 13360 }, { "epoch": 42.00620253164557, "grad_norm": 0.0027827962767332792, "learning_rate": 8.544303797468354e-06, "loss": 0.015, "step": 13370 }, { "epoch": 42.00683544303797, "grad_norm": 1.1959846019744873, "learning_rate": 8.50914205344585e-06, "loss": 0.0271, "step": 13380 }, { "epoch": 42.00746835443038, "grad_norm": 0.0016241382109001279, "learning_rate": 8.473980309423347e-06, "loss": 0.0164, "step": 13390 }, { "epoch": 42.00810126582279, "grad_norm": 1.522375226020813, "learning_rate": 8.438818565400844e-06, "loss": 0.0359, "step": 13400 }, { "epoch": 42.00873417721519, "grad_norm": 1.8420836925506592, "learning_rate": 8.40365682137834e-06, "loss": 0.0759, "step": 13410 }, { "epoch": 42.0093670886076, "grad_norm": 0.0029541512485593557, "learning_rate": 8.368495077355837e-06, "loss": 0.0002, "step": 13420 }, { "epoch": 42.01, "grad_norm": 1.9524579048156738, "learning_rate": 8.333333333333334e-06, "loss": 0.0445, "step": 13430 }, { "epoch": 42.010632911392406, "grad_norm": 1.539594292640686, "learning_rate": 8.29817158931083e-06, "loss": 0.1011, "step": 13440 }, { "epoch": 42.01126582278481, "grad_norm": 2.301950693130493, "learning_rate": 8.263009845288327e-06, "loss": 0.1041, "step": 13450 }, { "epoch": 42.011898734177215, "grad_norm": 0.004067489877343178, "learning_rate": 8.227848101265822e-06, "loss": 0.086, "step": 13460 }, { "epoch": 42.012531645569624, "grad_norm": 2.2229349613189697, "learning_rate": 8.192686357243319e-06, "loss": 0.1346, "step": 13470 }, { "epoch": 42.013164556962025, "grad_norm": 0.005787982139736414, "learning_rate": 8.157524613220815e-06, "loss": 0.0442, "step": 13480 }, { "epoch": 42.01379746835443, "grad_norm": 0.0025440517347306013, "learning_rate": 8.122362869198312e-06, "loss": 0.0091, "step": 13490 }, { "epoch": 42.014430379746834, "grad_norm": 3.941728115081787, "learning_rate": 8.08720112517581e-06, "loss": 0.0812, "step": 13500 }, { "epoch": 42.01506329113924, "grad_norm": 0.6516626477241516, "learning_rate": 8.052039381153307e-06, "loss": 0.0892, "step": 13510 }, { "epoch": 42.01569620253164, "grad_norm": 0.002771928673610091, "learning_rate": 8.016877637130803e-06, "loss": 0.0256, "step": 13520 }, { "epoch": 42.01632911392405, "grad_norm": 1.8547269105911255, "learning_rate": 7.981715893108298e-06, "loss": 0.0839, "step": 13530 }, { "epoch": 42.01696202531645, "grad_norm": 0.0014696965226903558, "learning_rate": 7.946554149085795e-06, "loss": 0.0356, "step": 13540 }, { "epoch": 42.01759493670886, "grad_norm": 1.8392219543457031, "learning_rate": 7.911392405063292e-06, "loss": 0.0614, "step": 13550 }, { "epoch": 42.01822784810127, "grad_norm": 0.0020308021921664476, "learning_rate": 7.876230661040788e-06, "loss": 0.0192, "step": 13560 }, { "epoch": 42.01886075949367, "grad_norm": 1.8742437362670898, "learning_rate": 7.841068917018285e-06, "loss": 0.0993, "step": 13570 }, { "epoch": 42.01949367088608, "grad_norm": 2.1382617950439453, "learning_rate": 7.805907172995782e-06, "loss": 0.0695, "step": 13580 }, { "epoch": 42.02, "eval_accuracy": 0.9750479846449136, "eval_loss": 0.04481815919280052, "eval_runtime": 864.6404, "eval_samples_per_second": 0.603, "eval_steps_per_second": 0.076, "step": 13588 }, { "epoch": 43.00012658227848, "grad_norm": 0.0016333996318280697, "learning_rate": 7.770745428973278e-06, "loss": 0.0536, "step": 13590 }, { "epoch": 43.000759493670884, "grad_norm": 2.334437131881714, "learning_rate": 7.735583684950775e-06, "loss": 0.0156, "step": 13600 }, { "epoch": 43.00139240506329, "grad_norm": 1.4097012281417847, "learning_rate": 7.700421940928271e-06, "loss": 0.0638, "step": 13610 }, { "epoch": 43.00202531645569, "grad_norm": 0.0024437035899609327, "learning_rate": 7.665260196905766e-06, "loss": 0.0184, "step": 13620 }, { "epoch": 43.0026582278481, "grad_norm": 1.7008681297302246, "learning_rate": 7.630098452883263e-06, "loss": 0.0664, "step": 13630 }, { "epoch": 43.00329113924051, "grad_norm": 0.0019239940447732806, "learning_rate": 7.5949367088607605e-06, "loss": 0.1075, "step": 13640 }, { "epoch": 43.00392405063291, "grad_norm": 0.007515426259487867, "learning_rate": 7.559774964838256e-06, "loss": 0.0423, "step": 13650 }, { "epoch": 43.00455696202532, "grad_norm": 2.043806552886963, "learning_rate": 7.524613220815753e-06, "loss": 0.057, "step": 13660 }, { "epoch": 43.00518987341772, "grad_norm": 1.7065999507904053, "learning_rate": 7.4894514767932495e-06, "loss": 0.0867, "step": 13670 }, { "epoch": 43.00582278481013, "grad_norm": 0.02369196154177189, "learning_rate": 7.454289732770746e-06, "loss": 0.035, "step": 13680 }, { "epoch": 43.00645569620253, "grad_norm": 2.008984088897705, "learning_rate": 7.419127988748242e-06, "loss": 0.0754, "step": 13690 }, { "epoch": 43.00708860759494, "grad_norm": 0.002092506969347596, "learning_rate": 7.3839662447257386e-06, "loss": 0.0776, "step": 13700 }, { "epoch": 43.00772151898734, "grad_norm": 2.8015787601470947, "learning_rate": 7.348804500703235e-06, "loss": 0.0597, "step": 13710 }, { "epoch": 43.00835443037975, "grad_norm": 0.002701831515878439, "learning_rate": 7.313642756680732e-06, "loss": 0.0401, "step": 13720 }, { "epoch": 43.008987341772155, "grad_norm": 0.002448461716994643, "learning_rate": 7.2784810126582285e-06, "loss": 0.061, "step": 13730 }, { "epoch": 43.009620253164556, "grad_norm": 0.001818045973777771, "learning_rate": 7.243319268635724e-06, "loss": 0.0198, "step": 13740 }, { "epoch": 43.010253164556964, "grad_norm": 1.9535473585128784, "learning_rate": 7.208157524613221e-06, "loss": 0.1018, "step": 13750 }, { "epoch": 43.010886075949365, "grad_norm": 0.0013298611156642437, "learning_rate": 7.1729957805907175e-06, "loss": 0.0232, "step": 13760 }, { "epoch": 43.01151898734177, "grad_norm": 0.002159226918593049, "learning_rate": 7.137834036568214e-06, "loss": 0.0652, "step": 13770 }, { "epoch": 43.012151898734174, "grad_norm": 2.004695177078247, "learning_rate": 7.102672292545711e-06, "loss": 0.0787, "step": 13780 }, { "epoch": 43.01278481012658, "grad_norm": 0.002241474576294422, "learning_rate": 7.0675105485232066e-06, "loss": 0.0564, "step": 13790 }, { "epoch": 43.01341772151899, "grad_norm": 0.0015954429982230067, "learning_rate": 7.032348804500703e-06, "loss": 0.0715, "step": 13800 }, { "epoch": 43.01405063291139, "grad_norm": 0.04250750690698624, "learning_rate": 6.9971870604782e-06, "loss": 0.0477, "step": 13810 }, { "epoch": 43.0146835443038, "grad_norm": 0.002124907448887825, "learning_rate": 6.9620253164556965e-06, "loss": 0.0152, "step": 13820 }, { "epoch": 43.0153164556962, "grad_norm": 0.00806543417274952, "learning_rate": 6.926863572433192e-06, "loss": 0.0002, "step": 13830 }, { "epoch": 43.01594936708861, "grad_norm": 0.0016234181821346283, "learning_rate": 6.891701828410689e-06, "loss": 0.0443, "step": 13840 }, { "epoch": 43.01658227848101, "grad_norm": 0.0016976363258436322, "learning_rate": 6.8565400843881855e-06, "loss": 0.0811, "step": 13850 }, { "epoch": 43.01721518987342, "grad_norm": 0.002512180246412754, "learning_rate": 6.821378340365682e-06, "loss": 0.0635, "step": 13860 }, { "epoch": 43.01784810126582, "grad_norm": 0.014850087463855743, "learning_rate": 6.786216596343179e-06, "loss": 0.1057, "step": 13870 }, { "epoch": 43.01848101265823, "grad_norm": 1.433249831199646, "learning_rate": 6.7510548523206746e-06, "loss": 0.0757, "step": 13880 }, { "epoch": 43.019113924050636, "grad_norm": 0.0025778491981327534, "learning_rate": 6.715893108298171e-06, "loss": 0.01, "step": 13890 }, { "epoch": 43.01974683544304, "grad_norm": 0.0015471552032977343, "learning_rate": 6.680731364275668e-06, "loss": 0.0398, "step": 13900 }, { "epoch": 43.02, "eval_accuracy": 0.9769673704414588, "eval_loss": 0.04403243213891983, "eval_runtime": 918.4271, "eval_samples_per_second": 0.567, "eval_steps_per_second": 0.072, "step": 13904 }, { "epoch": 44.00037974683544, "grad_norm": 0.002836668398231268, "learning_rate": 6.6455696202531645e-06, "loss": 0.0786, "step": 13910 }, { "epoch": 44.00101265822785, "grad_norm": 1.8331170082092285, "learning_rate": 6.610407876230662e-06, "loss": 0.0305, "step": 13920 }, { "epoch": 44.00164556962025, "grad_norm": 0.002194299828261137, "learning_rate": 6.5752461322081586e-06, "loss": 0.032, "step": 13930 }, { "epoch": 44.00227848101266, "grad_norm": 0.19766800105571747, "learning_rate": 6.540084388185655e-06, "loss": 0.0536, "step": 13940 }, { "epoch": 44.00291139240506, "grad_norm": 0.0020373559091240168, "learning_rate": 6.504922644163151e-06, "loss": 0.0457, "step": 13950 }, { "epoch": 44.00354430379747, "grad_norm": 1.5892828702926636, "learning_rate": 6.469760900140648e-06, "loss": 0.091, "step": 13960 }, { "epoch": 44.00417721518988, "grad_norm": 0.002311758464202285, "learning_rate": 6.434599156118144e-06, "loss": 0.0358, "step": 13970 }, { "epoch": 44.00481012658228, "grad_norm": 1.6445496082305908, "learning_rate": 6.399437412095641e-06, "loss": 0.063, "step": 13980 }, { "epoch": 44.005443037974686, "grad_norm": 0.012705209665000439, "learning_rate": 6.3642756680731375e-06, "loss": 0.0365, "step": 13990 }, { "epoch": 44.00607594936709, "grad_norm": 2.040945053100586, "learning_rate": 6.329113924050633e-06, "loss": 0.094, "step": 14000 }, { "epoch": 44.006708860759495, "grad_norm": 0.0017350486014038324, "learning_rate": 6.29395218002813e-06, "loss": 0.0668, "step": 14010 }, { "epoch": 44.0073417721519, "grad_norm": 1.4773507118225098, "learning_rate": 6.2587904360056266e-06, "loss": 0.0389, "step": 14020 }, { "epoch": 44.007974683544305, "grad_norm": 1.701064109802246, "learning_rate": 6.223628691983123e-06, "loss": 0.0213, "step": 14030 }, { "epoch": 44.008607594936706, "grad_norm": 2.707864284515381, "learning_rate": 6.18846694796062e-06, "loss": 0.1098, "step": 14040 }, { "epoch": 44.009240506329114, "grad_norm": 0.10958682745695114, "learning_rate": 6.153305203938116e-06, "loss": 0.1058, "step": 14050 }, { "epoch": 44.00987341772152, "grad_norm": 1.8932132720947266, "learning_rate": 6.118143459915612e-06, "loss": 0.0379, "step": 14060 }, { "epoch": 44.01050632911392, "grad_norm": 0.0022401248570531607, "learning_rate": 6.082981715893109e-06, "loss": 0.0445, "step": 14070 }, { "epoch": 44.01113924050633, "grad_norm": 0.00331929256208241, "learning_rate": 6.0478199718706055e-06, "loss": 0.0807, "step": 14080 }, { "epoch": 44.01177215189873, "grad_norm": 1.2681527137756348, "learning_rate": 6.012658227848101e-06, "loss": 0.0283, "step": 14090 }, { "epoch": 44.01240506329114, "grad_norm": 0.0029717902652919292, "learning_rate": 5.977496483825598e-06, "loss": 0.0867, "step": 14100 }, { "epoch": 44.01303797468354, "grad_norm": 1.9497371912002563, "learning_rate": 5.9423347398030946e-06, "loss": 0.0512, "step": 14110 }, { "epoch": 44.01367088607595, "grad_norm": 0.0076379780657589436, "learning_rate": 5.907172995780591e-06, "loss": 0.0647, "step": 14120 }, { "epoch": 44.01430379746836, "grad_norm": 0.0024882254656404257, "learning_rate": 5.872011251758088e-06, "loss": 0.0205, "step": 14130 }, { "epoch": 44.01493670886076, "grad_norm": 0.0016955797327682376, "learning_rate": 5.836849507735584e-06, "loss": 0.0855, "step": 14140 }, { "epoch": 44.01556962025317, "grad_norm": 1.9731398820877075, "learning_rate": 5.80168776371308e-06, "loss": 0.0623, "step": 14150 }, { "epoch": 44.01620253164557, "grad_norm": 0.0020470027811825275, "learning_rate": 5.766526019690577e-06, "loss": 0.0151, "step": 14160 }, { "epoch": 44.01683544303798, "grad_norm": 0.0018880012212321162, "learning_rate": 5.7313642756680735e-06, "loss": 0.0393, "step": 14170 }, { "epoch": 44.01746835443038, "grad_norm": 0.003355634631589055, "learning_rate": 5.69620253164557e-06, "loss": 0.024, "step": 14180 }, { "epoch": 44.018101265822786, "grad_norm": 0.0030694929882884026, "learning_rate": 5.661040787623066e-06, "loss": 0.0285, "step": 14190 }, { "epoch": 44.01873417721519, "grad_norm": 0.001790488138794899, "learning_rate": 5.6258790436005626e-06, "loss": 0.0361, "step": 14200 }, { "epoch": 44.019367088607595, "grad_norm": 0.0015849280171096325, "learning_rate": 5.590717299578059e-06, "loss": 0.0828, "step": 14210 }, { "epoch": 44.02, "grad_norm": 1.4295841455459595, "learning_rate": 5.555555555555556e-06, "loss": 0.0455, "step": 14220 }, { "epoch": 44.02, "eval_accuracy": 0.9769673704414588, "eval_loss": 0.04359356313943863, "eval_runtime": 920.1088, "eval_samples_per_second": 0.566, "eval_steps_per_second": 0.072, "step": 14220 }, { "epoch": 45.00063291139241, "grad_norm": 0.006928425282239914, "learning_rate": 5.520393811533052e-06, "loss": 0.0489, "step": 14230 }, { "epoch": 45.00126582278481, "grad_norm": 0.0019474881701171398, "learning_rate": 5.485232067510549e-06, "loss": 0.0236, "step": 14240 }, { "epoch": 45.00189873417722, "grad_norm": 1.7010918855667114, "learning_rate": 5.450070323488046e-06, "loss": 0.0395, "step": 14250 }, { "epoch": 45.00253164556962, "grad_norm": 0.0026303452905267477, "learning_rate": 5.414908579465542e-06, "loss": 0.0223, "step": 14260 }, { "epoch": 45.00316455696203, "grad_norm": 0.0020397694315761328, "learning_rate": 5.379746835443038e-06, "loss": 0.0431, "step": 14270 }, { "epoch": 45.00379746835443, "grad_norm": 2.7060606479644775, "learning_rate": 5.344585091420535e-06, "loss": 0.1383, "step": 14280 }, { "epoch": 45.004430379746836, "grad_norm": 0.0019224517745897174, "learning_rate": 5.309423347398031e-06, "loss": 0.0401, "step": 14290 }, { "epoch": 45.00506329113924, "grad_norm": 1.5191059112548828, "learning_rate": 5.274261603375528e-06, "loss": 0.0563, "step": 14300 }, { "epoch": 45.005696202531645, "grad_norm": 1.7753527164459229, "learning_rate": 5.239099859353025e-06, "loss": 0.0452, "step": 14310 }, { "epoch": 45.00632911392405, "grad_norm": 0.0027101896703243256, "learning_rate": 5.2039381153305205e-06, "loss": 0.0733, "step": 14320 }, { "epoch": 45.006962025316454, "grad_norm": 0.0018217426259070635, "learning_rate": 5.168776371308017e-06, "loss": 0.0601, "step": 14330 }, { "epoch": 45.00759493670886, "grad_norm": 1.4243252277374268, "learning_rate": 5.133614627285514e-06, "loss": 0.0995, "step": 14340 }, { "epoch": 45.008227848101264, "grad_norm": 0.007520051673054695, "learning_rate": 5.09845288326301e-06, "loss": 0.0881, "step": 14350 }, { "epoch": 45.00886075949367, "grad_norm": 1.565136194229126, "learning_rate": 5.063291139240506e-06, "loss": 0.0648, "step": 14360 }, { "epoch": 45.00949367088607, "grad_norm": 0.0016765049658715725, "learning_rate": 5.028129395218003e-06, "loss": 0.0751, "step": 14370 }, { "epoch": 45.01012658227848, "grad_norm": 0.0014801392098888755, "learning_rate": 4.992967651195499e-06, "loss": 0.0246, "step": 14380 }, { "epoch": 45.01075949367089, "grad_norm": 0.0016110733849927783, "learning_rate": 4.957805907172996e-06, "loss": 0.0396, "step": 14390 }, { "epoch": 45.01139240506329, "grad_norm": 0.002271553035825491, "learning_rate": 4.922644163150493e-06, "loss": 0.0268, "step": 14400 }, { "epoch": 45.0120253164557, "grad_norm": 0.002145587233826518, "learning_rate": 4.8874824191279884e-06, "loss": 0.077, "step": 14410 }, { "epoch": 45.0126582278481, "grad_norm": 0.001231012400239706, "learning_rate": 4.852320675105485e-06, "loss": 0.0633, "step": 14420 }, { "epoch": 45.01329113924051, "grad_norm": 2.76226544380188, "learning_rate": 4.817158931082982e-06, "loss": 0.0718, "step": 14430 }, { "epoch": 45.01392405063291, "grad_norm": 0.002031494863331318, "learning_rate": 4.781997187060478e-06, "loss": 0.0752, "step": 14440 }, { "epoch": 45.01455696202532, "grad_norm": 2.092021942138672, "learning_rate": 4.746835443037975e-06, "loss": 0.0301, "step": 14450 }, { "epoch": 45.01518987341772, "grad_norm": 0.0018518840661272407, "learning_rate": 4.711673699015472e-06, "loss": 0.0203, "step": 14460 }, { "epoch": 45.015822784810126, "grad_norm": 0.0020814728923141956, "learning_rate": 4.676511954992968e-06, "loss": 0.0284, "step": 14470 }, { "epoch": 45.016455696202534, "grad_norm": 0.001710868556983769, "learning_rate": 4.641350210970465e-06, "loss": 0.0455, "step": 14480 }, { "epoch": 45.017088607594935, "grad_norm": 0.002616223180666566, "learning_rate": 4.606188466947961e-06, "loss": 0.0875, "step": 14490 }, { "epoch": 45.017721518987344, "grad_norm": 0.0022610658779740334, "learning_rate": 4.571026722925457e-06, "loss": 0.0313, "step": 14500 }, { "epoch": 45.018354430379745, "grad_norm": 2.640284776687622, "learning_rate": 4.535864978902954e-06, "loss": 0.073, "step": 14510 }, { "epoch": 45.01898734177215, "grad_norm": 0.0019336823606863618, "learning_rate": 4.5007032348804506e-06, "loss": 0.0416, "step": 14520 }, { "epoch": 45.019620253164554, "grad_norm": 0.0023440378718078136, "learning_rate": 4.465541490857947e-06, "loss": 0.0423, "step": 14530 }, { "epoch": 45.02, "eval_accuracy": 0.9750479846449136, "eval_loss": 0.043731071054935455, "eval_runtime": 852.3259, "eval_samples_per_second": 0.611, "eval_steps_per_second": 0.077, "step": 14536 }, { "epoch": 46.00025316455696, "grad_norm": 0.0025269004981964827, "learning_rate": 4.430379746835443e-06, "loss": 0.0336, "step": 14540 }, { "epoch": 46.00088607594937, "grad_norm": 0.0019006689544767141, "learning_rate": 4.39521800281294e-06, "loss": 0.006, "step": 14550 }, { "epoch": 46.001518987341775, "grad_norm": 0.0017826940165832639, "learning_rate": 4.360056258790436e-06, "loss": 0.0263, "step": 14560 }, { "epoch": 46.002151898734176, "grad_norm": 0.0013272733194753528, "learning_rate": 4.324894514767933e-06, "loss": 0.0522, "step": 14570 }, { "epoch": 46.002784810126585, "grad_norm": 1.84543776512146, "learning_rate": 4.289732770745429e-06, "loss": 0.0337, "step": 14580 }, { "epoch": 46.003417721518986, "grad_norm": 2.3933355808258057, "learning_rate": 4.254571026722925e-06, "loss": 0.0667, "step": 14590 }, { "epoch": 46.004050632911394, "grad_norm": 1.3483564853668213, "learning_rate": 4.219409282700422e-06, "loss": 0.0358, "step": 14600 }, { "epoch": 46.004683544303795, "grad_norm": 0.0019236382795497775, "learning_rate": 4.1842475386779186e-06, "loss": 0.105, "step": 14610 }, { "epoch": 46.0053164556962, "grad_norm": 1.8564695119857788, "learning_rate": 4.149085794655415e-06, "loss": 0.0618, "step": 14620 }, { "epoch": 46.005949367088604, "grad_norm": 0.0019053075229749084, "learning_rate": 4.113924050632911e-06, "loss": 0.0308, "step": 14630 }, { "epoch": 46.00658227848101, "grad_norm": 0.888851523399353, "learning_rate": 4.078762306610408e-06, "loss": 0.0046, "step": 14640 }, { "epoch": 46.00721518987342, "grad_norm": 0.0015094269765540957, "learning_rate": 4.043600562587905e-06, "loss": 0.0597, "step": 14650 }, { "epoch": 46.00784810126582, "grad_norm": 0.0023652520030736923, "learning_rate": 4.008438818565402e-06, "loss": 0.0719, "step": 14660 }, { "epoch": 46.00848101265823, "grad_norm": 0.0014642721507698298, "learning_rate": 3.9732770745428975e-06, "loss": 0.0657, "step": 14670 }, { "epoch": 46.00911392405063, "grad_norm": 2.2750301361083984, "learning_rate": 3.938115330520394e-06, "loss": 0.0324, "step": 14680 }, { "epoch": 46.00974683544304, "grad_norm": 0.007223762571811676, "learning_rate": 3.902953586497891e-06, "loss": 0.0269, "step": 14690 }, { "epoch": 46.01037974683544, "grad_norm": 0.0021259100176393986, "learning_rate": 3.867791842475387e-06, "loss": 0.0067, "step": 14700 }, { "epoch": 46.01101265822785, "grad_norm": 0.00181837088894099, "learning_rate": 3.832630098452883e-06, "loss": 0.0759, "step": 14710 }, { "epoch": 46.011645569620256, "grad_norm": 0.006951657589524984, "learning_rate": 3.7974683544303802e-06, "loss": 0.0472, "step": 14720 }, { "epoch": 46.01227848101266, "grad_norm": 0.0019946214742958546, "learning_rate": 3.7623066104078764e-06, "loss": 0.0398, "step": 14730 }, { "epoch": 46.012911392405066, "grad_norm": 0.0024334420450031757, "learning_rate": 3.727144866385373e-06, "loss": 0.0486, "step": 14740 }, { "epoch": 46.01354430379747, "grad_norm": 1.671848177909851, "learning_rate": 3.6919831223628693e-06, "loss": 0.0479, "step": 14750 }, { "epoch": 46.014177215189875, "grad_norm": 0.0020218833815306425, "learning_rate": 3.656821378340366e-06, "loss": 0.0696, "step": 14760 }, { "epoch": 46.014810126582276, "grad_norm": 2.070733070373535, "learning_rate": 3.621659634317862e-06, "loss": 0.0623, "step": 14770 }, { "epoch": 46.015443037974684, "grad_norm": 0.00212723552249372, "learning_rate": 3.5864978902953588e-06, "loss": 0.0344, "step": 14780 }, { "epoch": 46.016075949367085, "grad_norm": 0.007334825582802296, "learning_rate": 3.5513361462728554e-06, "loss": 0.0943, "step": 14790 }, { "epoch": 46.01670886075949, "grad_norm": 0.00477689690887928, "learning_rate": 3.5161744022503516e-06, "loss": 0.0391, "step": 14800 }, { "epoch": 46.0173417721519, "grad_norm": 0.0014756337041035295, "learning_rate": 3.4810126582278482e-06, "loss": 0.0578, "step": 14810 }, { "epoch": 46.0179746835443, "grad_norm": 0.0018279188079759479, "learning_rate": 3.4458509142053444e-06, "loss": 0.1147, "step": 14820 }, { "epoch": 46.01860759493671, "grad_norm": 2.317160129547119, "learning_rate": 3.410689170182841e-06, "loss": 0.0934, "step": 14830 }, { "epoch": 46.01924050632911, "grad_norm": 0.007261245045810938, "learning_rate": 3.3755274261603373e-06, "loss": 0.0732, "step": 14840 }, { "epoch": 46.01987341772152, "grad_norm": 2.7471020221710205, "learning_rate": 3.340365682137834e-06, "loss": 0.0602, "step": 14850 }, { "epoch": 46.02, "eval_accuracy": 0.9769673704414588, "eval_loss": 0.0437638945877552, "eval_runtime": 913.7302, "eval_samples_per_second": 0.57, "eval_steps_per_second": 0.072, "step": 14852 }, { "epoch": 47.000506329113925, "grad_norm": 1.6660641431808472, "learning_rate": 3.305203938115331e-06, "loss": 0.0939, "step": 14860 }, { "epoch": 47.001139240506326, "grad_norm": 1.8603593111038208, "learning_rate": 3.2700421940928276e-06, "loss": 0.1073, "step": 14870 }, { "epoch": 47.001772151898734, "grad_norm": 0.0009881872683763504, "learning_rate": 3.234880450070324e-06, "loss": 0.059, "step": 14880 }, { "epoch": 47.00240506329114, "grad_norm": 0.0015653116861358285, "learning_rate": 3.1997187060478204e-06, "loss": 0.0548, "step": 14890 }, { "epoch": 47.003037974683544, "grad_norm": 1.5998444557189941, "learning_rate": 3.1645569620253167e-06, "loss": 0.0465, "step": 14900 }, { "epoch": 47.00367088607595, "grad_norm": 0.0016954662278294563, "learning_rate": 3.1293952180028133e-06, "loss": 0.0684, "step": 14910 }, { "epoch": 47.00430379746835, "grad_norm": 1.6929208040237427, "learning_rate": 3.09423347398031e-06, "loss": 0.0489, "step": 14920 }, { "epoch": 47.00493670886076, "grad_norm": 1.6551687717437744, "learning_rate": 3.059071729957806e-06, "loss": 0.0799, "step": 14930 }, { "epoch": 47.00556962025316, "grad_norm": 0.0021378027740865946, "learning_rate": 3.0239099859353028e-06, "loss": 0.0199, "step": 14940 }, { "epoch": 47.00620253164557, "grad_norm": 0.001334396773017943, "learning_rate": 2.988748241912799e-06, "loss": 0.0181, "step": 14950 }, { "epoch": 47.00683544303797, "grad_norm": 0.007229403592646122, "learning_rate": 2.9535864978902956e-06, "loss": 0.0588, "step": 14960 }, { "epoch": 47.00746835443038, "grad_norm": 0.0016747723566368222, "learning_rate": 2.918424753867792e-06, "loss": 0.034, "step": 14970 }, { "epoch": 47.00810126582279, "grad_norm": 0.8091289401054382, "learning_rate": 2.8832630098452884e-06, "loss": 0.0476, "step": 14980 }, { "epoch": 47.00873417721519, "grad_norm": 1.8234349489212036, "learning_rate": 2.848101265822785e-06, "loss": 0.0711, "step": 14990 }, { "epoch": 47.0093670886076, "grad_norm": 1.9740657806396484, "learning_rate": 2.8129395218002813e-06, "loss": 0.0338, "step": 15000 }, { "epoch": 47.01, "grad_norm": 0.0016077395994216204, "learning_rate": 2.777777777777778e-06, "loss": 0.0454, "step": 15010 }, { "epoch": 47.010632911392406, "grad_norm": 0.004215624183416367, "learning_rate": 2.7426160337552745e-06, "loss": 0.0438, "step": 15020 }, { "epoch": 47.01126582278481, "grad_norm": 0.0015969170490279794, "learning_rate": 2.707454289732771e-06, "loss": 0.0418, "step": 15030 }, { "epoch": 47.011898734177215, "grad_norm": 1.9868484735488892, "learning_rate": 2.6722925457102674e-06, "loss": 0.0777, "step": 15040 }, { "epoch": 47.012531645569624, "grad_norm": 0.006545115727931261, "learning_rate": 2.637130801687764e-06, "loss": 0.0402, "step": 15050 }, { "epoch": 47.013164556962025, "grad_norm": 0.008419793099164963, "learning_rate": 2.6019690576652602e-06, "loss": 0.0617, "step": 15060 }, { "epoch": 47.01379746835443, "grad_norm": 1.8613364696502686, "learning_rate": 2.566807313642757e-06, "loss": 0.0477, "step": 15070 }, { "epoch": 47.014430379746834, "grad_norm": 1.6971709728240967, "learning_rate": 2.531645569620253e-06, "loss": 0.0393, "step": 15080 }, { "epoch": 47.01506329113924, "grad_norm": 0.0011392009910196066, "learning_rate": 2.4964838255977497e-06, "loss": 0.0481, "step": 15090 }, { "epoch": 47.01569620253164, "grad_norm": 2.0638561248779297, "learning_rate": 2.4613220815752463e-06, "loss": 0.0648, "step": 15100 }, { "epoch": 47.01632911392405, "grad_norm": 0.007941323332488537, "learning_rate": 2.4261603375527425e-06, "loss": 0.07, "step": 15110 }, { "epoch": 47.01696202531645, "grad_norm": 0.0015764615964144468, "learning_rate": 2.390998593530239e-06, "loss": 0.0537, "step": 15120 }, { "epoch": 47.01759493670886, "grad_norm": 1.6305996179580688, "learning_rate": 2.355836849507736e-06, "loss": 0.0408, "step": 15130 }, { "epoch": 47.01822784810127, "grad_norm": 1.698041558265686, "learning_rate": 2.3206751054852324e-06, "loss": 0.0421, "step": 15140 }, { "epoch": 47.01886075949367, "grad_norm": 1.406259536743164, "learning_rate": 2.2855133614627286e-06, "loss": 0.0367, "step": 15150 }, { "epoch": 47.01949367088608, "grad_norm": 0.0014187191845849156, "learning_rate": 2.2503516174402253e-06, "loss": 0.0407, "step": 15160 }, { "epoch": 47.02, "eval_accuracy": 0.9750479846449136, "eval_loss": 0.04366590827703476, "eval_runtime": 877.8986, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.075, "step": 15168 }, { "epoch": 48.00012658227848, "grad_norm": 0.0016180831007659435, "learning_rate": 2.2151898734177215e-06, "loss": 0.0144, "step": 15170 }, { "epoch": 48.000759493670884, "grad_norm": 1.924263596534729, "learning_rate": 2.180028129395218e-06, "loss": 0.0882, "step": 15180 }, { "epoch": 48.00139240506329, "grad_norm": 1.9763191938400269, "learning_rate": 2.1448663853727143e-06, "loss": 0.051, "step": 15190 }, { "epoch": 48.00202531645569, "grad_norm": 2.0363032817840576, "learning_rate": 2.109704641350211e-06, "loss": 0.048, "step": 15200 }, { "epoch": 48.0026582278481, "grad_norm": 1.593593716621399, "learning_rate": 2.0745428973277076e-06, "loss": 0.0373, "step": 15210 }, { "epoch": 48.00329113924051, "grad_norm": 1.4322104454040527, "learning_rate": 2.039381153305204e-06, "loss": 0.0681, "step": 15220 }, { "epoch": 48.00392405063291, "grad_norm": 0.0023278342559933662, "learning_rate": 2.004219409282701e-06, "loss": 0.0085, "step": 15230 }, { "epoch": 48.00455696202532, "grad_norm": 2.485316514968872, "learning_rate": 1.969057665260197e-06, "loss": 0.0902, "step": 15240 }, { "epoch": 48.00518987341772, "grad_norm": 0.0025991136208176613, "learning_rate": 1.9338959212376937e-06, "loss": 0.0845, "step": 15250 }, { "epoch": 48.00582278481013, "grad_norm": 0.001590245054103434, "learning_rate": 1.8987341772151901e-06, "loss": 0.0218, "step": 15260 }, { "epoch": 48.00645569620253, "grad_norm": 0.001682686386629939, "learning_rate": 1.8635724331926865e-06, "loss": 0.0623, "step": 15270 }, { "epoch": 48.00708860759494, "grad_norm": 1.803706169128418, "learning_rate": 1.828410689170183e-06, "loss": 0.0568, "step": 15280 }, { "epoch": 48.00772151898734, "grad_norm": 0.003340956987813115, "learning_rate": 1.7932489451476794e-06, "loss": 0.0419, "step": 15290 }, { "epoch": 48.00835443037975, "grad_norm": 0.0032536042854189873, "learning_rate": 1.7580872011251758e-06, "loss": 0.0861, "step": 15300 }, { "epoch": 48.008987341772155, "grad_norm": 1.4462651014328003, "learning_rate": 1.7229254571026722e-06, "loss": 0.0639, "step": 15310 }, { "epoch": 48.009620253164556, "grad_norm": 0.0021976635325700045, "learning_rate": 1.6877637130801686e-06, "loss": 0.0624, "step": 15320 }, { "epoch": 48.010253164556964, "grad_norm": 0.07273641228675842, "learning_rate": 1.6526019690576655e-06, "loss": 0.0662, "step": 15330 }, { "epoch": 48.010886075949365, "grad_norm": 0.0021416894160211086, "learning_rate": 1.617440225035162e-06, "loss": 0.0562, "step": 15340 }, { "epoch": 48.01151898734177, "grad_norm": 1.788458228111267, "learning_rate": 1.5822784810126583e-06, "loss": 0.047, "step": 15350 }, { "epoch": 48.012151898734174, "grad_norm": 0.03488551825284958, "learning_rate": 1.547116736990155e-06, "loss": 0.0406, "step": 15360 }, { "epoch": 48.01278481012658, "grad_norm": 0.002074979245662689, "learning_rate": 1.5119549929676514e-06, "loss": 0.0363, "step": 15370 }, { "epoch": 48.01341772151899, "grad_norm": 0.00142192211933434, "learning_rate": 1.4767932489451478e-06, "loss": 0.0473, "step": 15380 }, { "epoch": 48.01405063291139, "grad_norm": 1.6816853284835815, "learning_rate": 1.4416315049226442e-06, "loss": 0.0692, "step": 15390 }, { "epoch": 48.0146835443038, "grad_norm": 0.0019629504531621933, "learning_rate": 1.4064697609001406e-06, "loss": 0.0566, "step": 15400 }, { "epoch": 48.0153164556962, "grad_norm": 1.898903250694275, "learning_rate": 1.3713080168776373e-06, "loss": 0.0501, "step": 15410 }, { "epoch": 48.01594936708861, "grad_norm": 0.0017211647937074304, "learning_rate": 1.3361462728551337e-06, "loss": 0.0268, "step": 15420 }, { "epoch": 48.01658227848101, "grad_norm": 0.0072492752224206924, "learning_rate": 1.3009845288326301e-06, "loss": 0.0376, "step": 15430 }, { "epoch": 48.01721518987342, "grad_norm": 0.0022765600588172674, "learning_rate": 1.2658227848101265e-06, "loss": 0.0423, "step": 15440 }, { "epoch": 48.01784810126582, "grad_norm": 0.001874367124401033, "learning_rate": 1.2306610407876232e-06, "loss": 0.0247, "step": 15450 }, { "epoch": 48.01848101265823, "grad_norm": 0.0012237579794600606, "learning_rate": 1.1954992967651196e-06, "loss": 0.0355, "step": 15460 }, { "epoch": 48.019113924050636, "grad_norm": 1.6173341274261475, "learning_rate": 1.1603375527426162e-06, "loss": 0.07, "step": 15470 }, { "epoch": 48.01974683544304, "grad_norm": 0.001786409760825336, "learning_rate": 1.1251758087201126e-06, "loss": 0.0435, "step": 15480 }, { "epoch": 48.02, "eval_accuracy": 0.9769673704414588, "eval_loss": 0.043503157794475555, "eval_runtime": 856.2754, "eval_samples_per_second": 0.608, "eval_steps_per_second": 0.077, "step": 15484 }, { "epoch": 49.00037974683544, "grad_norm": 1.8200874328613281, "learning_rate": 1.090014064697609e-06, "loss": 0.0552, "step": 15490 }, { "epoch": 49.00101265822785, "grad_norm": 0.002268126467242837, "learning_rate": 1.0548523206751055e-06, "loss": 0.0194, "step": 15500 }, { "epoch": 49.00164556962025, "grad_norm": 0.7153782844543457, "learning_rate": 1.019690576652602e-06, "loss": 0.104, "step": 15510 }, { "epoch": 49.00227848101266, "grad_norm": 0.0011382299708202481, "learning_rate": 9.845288326300985e-07, "loss": 0.0602, "step": 15520 }, { "epoch": 49.00291139240506, "grad_norm": 1.8290748596191406, "learning_rate": 9.493670886075951e-07, "loss": 0.0151, "step": 15530 }, { "epoch": 49.00354430379747, "grad_norm": 0.0012674119789153337, "learning_rate": 9.142053445850915e-07, "loss": 0.0333, "step": 15540 }, { "epoch": 49.00417721518988, "grad_norm": 1.7633377313613892, "learning_rate": 8.790436005625879e-07, "loss": 0.0616, "step": 15550 }, { "epoch": 49.00481012658228, "grad_norm": 1.8792239427566528, "learning_rate": 8.438818565400843e-07, "loss": 0.0621, "step": 15560 }, { "epoch": 49.005443037974686, "grad_norm": 0.002730746753513813, "learning_rate": 8.08720112517581e-07, "loss": 0.0706, "step": 15570 }, { "epoch": 49.00607594936709, "grad_norm": 0.0031078814063221216, "learning_rate": 7.735583684950775e-07, "loss": 0.0644, "step": 15580 }, { "epoch": 49.006708860759495, "grad_norm": 0.00169693015050143, "learning_rate": 7.383966244725739e-07, "loss": 0.0387, "step": 15590 }, { "epoch": 49.0073417721519, "grad_norm": 0.0015373198548331857, "learning_rate": 7.032348804500703e-07, "loss": 0.0601, "step": 15600 }, { "epoch": 49.007974683544305, "grad_norm": 0.008119100704789162, "learning_rate": 6.680731364275668e-07, "loss": 0.0289, "step": 15610 }, { "epoch": 49.008607594936706, "grad_norm": 2.2671926021575928, "learning_rate": 6.329113924050633e-07, "loss": 0.0466, "step": 15620 }, { "epoch": 49.009240506329114, "grad_norm": 0.0015134834684431553, "learning_rate": 5.977496483825598e-07, "loss": 0.0606, "step": 15630 }, { "epoch": 49.00987341772152, "grad_norm": 1.3951467275619507, "learning_rate": 5.625879043600563e-07, "loss": 0.0563, "step": 15640 }, { "epoch": 49.01050632911392, "grad_norm": 0.0020348818507045507, "learning_rate": 5.274261603375527e-07, "loss": 0.0662, "step": 15650 }, { "epoch": 49.01113924050633, "grad_norm": 2.6892151832580566, "learning_rate": 4.922644163150493e-07, "loss": 0.0594, "step": 15660 }, { "epoch": 49.01177215189873, "grad_norm": 0.0018980724271386862, "learning_rate": 4.5710267229254574e-07, "loss": 0.0116, "step": 15670 }, { "epoch": 49.01240506329114, "grad_norm": 0.0024212722200900316, "learning_rate": 4.2194092827004216e-07, "loss": 0.0489, "step": 15680 }, { "epoch": 49.01303797468354, "grad_norm": 0.0014871886232867837, "learning_rate": 3.8677918424753874e-07, "loss": 0.0409, "step": 15690 }, { "epoch": 49.01367088607595, "grad_norm": 0.0018424575682729483, "learning_rate": 3.5161744022503516e-07, "loss": 0.0436, "step": 15700 }, { "epoch": 49.01430379746836, "grad_norm": 0.0016427412629127502, "learning_rate": 3.1645569620253163e-07, "loss": 0.0623, "step": 15710 }, { "epoch": 49.01493670886076, "grad_norm": 0.00207283697091043, "learning_rate": 2.8129395218002816e-07, "loss": 0.099, "step": 15720 }, { "epoch": 49.01556962025317, "grad_norm": 0.001088398857973516, "learning_rate": 2.4613220815752463e-07, "loss": 0.0557, "step": 15730 }, { "epoch": 49.01620253164557, "grad_norm": 1.5488436222076416, "learning_rate": 2.1097046413502108e-07, "loss": 0.0514, "step": 15740 }, { "epoch": 49.01683544303798, "grad_norm": 0.0019001478794962168, "learning_rate": 1.7580872011251758e-07, "loss": 0.0426, "step": 15750 }, { "epoch": 49.01746835443038, "grad_norm": 0.0017688849475234747, "learning_rate": 1.4064697609001408e-07, "loss": 0.0131, "step": 15760 }, { "epoch": 49.018101265822786, "grad_norm": 0.0013972530141472816, "learning_rate": 1.0548523206751054e-07, "loss": 0.0471, "step": 15770 }, { "epoch": 49.01873417721519, "grad_norm": 0.0011928863823413849, "learning_rate": 7.032348804500704e-08, "loss": 0.0233, "step": 15780 }, { "epoch": 49.019367088607595, "grad_norm": 0.0023855739273130894, "learning_rate": 3.516174402250352e-08, "loss": 0.0767, "step": 15790 }, { "epoch": 49.02, "grad_norm": 0.0015424606390297413, "learning_rate": 0.0, "loss": 0.0463, "step": 15800 }, { "epoch": 49.02, "eval_accuracy": 0.9769673704414588, "eval_loss": 0.04361777752637863, "eval_runtime": 919.5227, "eval_samples_per_second": 0.567, "eval_steps_per_second": 0.072, "step": 15800 }, { "epoch": 49.02, "step": 15800, "total_flos": 1.5772028175840707e+20, "train_loss": 0.3727006455846838, "train_runtime": 274155.4834, "train_samples_per_second": 0.461, "train_steps_per_second": 0.058 }, { "epoch": 49.02, "eval_accuracy": 0.8875, "eval_loss": 0.4278368353843689, "eval_runtime": 1123.5362, "eval_samples_per_second": 0.57, "eval_steps_per_second": 0.071, "step": 15800 }, { "epoch": 49.02, "eval_accuracy": 0.8875, "eval_loss": 0.4278368353843689, "eval_runtime": 1123.0215, "eval_samples_per_second": 0.57, "eval_steps_per_second": 0.071, "step": 15800 } ], "logging_steps": 10, "max_steps": 15800, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "total_flos": 1.5772028175840707e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }