{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 13459, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007429972509101717, "grad_norm": 0.00010385477071395144, "learning_rate": 7.998613003095975e-06, "loss": 0.0006, "step": 10 }, { "epoch": 0.0014859945018203433, "grad_norm": 4.563978267714411e-15, "learning_rate": 7.996631578947367e-06, "loss": 0.005, "step": 20 }, { "epoch": 0.0022289917527305147, "grad_norm": 0.004395724274218082, "learning_rate": 7.99465015479876e-06, "loss": 0.0001, "step": 30 }, { "epoch": 0.0029719890036406867, "grad_norm": 9.154265717370436e-05, "learning_rate": 7.992668730650154e-06, "loss": 0.0019, "step": 40 }, { "epoch": 0.0037149862545508582, "grad_norm": 6.284219011831738e-08, "learning_rate": 7.990687306501548e-06, "loss": 0.0, "step": 50 }, { "epoch": 0.004457983505461029, "grad_norm": 2.8876378536224365, "learning_rate": 7.988705882352941e-06, "loss": 0.0004, "step": 60 }, { "epoch": 0.005200980756371202, "grad_norm": 3.022668124685879e-07, "learning_rate": 7.986724458204335e-06, "loss": 0.0102, "step": 70 }, { "epoch": 0.005943978007281373, "grad_norm": 2.450720355817726e-11, "learning_rate": 7.984743034055728e-06, "loss": 0.0004, "step": 80 }, { "epoch": 0.006686975258191545, "grad_norm": 0.0042392173781991005, "learning_rate": 7.98276160990712e-06, "loss": 0.0, "step": 90 }, { "epoch": 0.0074299725091017165, "grad_norm": 5.379395133786602e-07, "learning_rate": 7.980780185758513e-06, "loss": 0.6625, "step": 100 }, { "epoch": 0.008172969760011887, "grad_norm": 1.2094908952713013, "learning_rate": 7.978798761609907e-06, "loss": 0.0003, "step": 110 }, { "epoch": 0.008915967010922059, "grad_norm": 1.9755477807243115e-16, "learning_rate": 7.9768173374613e-06, "loss": 0.0006, "step": 120 }, { "epoch": 0.009658964261832232, "grad_norm": 0.0009601509082131088, "learning_rate": 7.974835913312694e-06, "loss": 0.0, "step": 130 }, { "epoch": 0.010401961512742404, "grad_norm": 4.124756287637865e-07, "learning_rate": 7.972854489164087e-06, "loss": 0.0001, "step": 140 }, { "epoch": 0.011144958763652575, "grad_norm": 2.0438874344108626e-06, "learning_rate": 7.97087306501548e-06, "loss": 0.0, "step": 150 }, { "epoch": 0.011887956014562747, "grad_norm": 0.004855783190578222, "learning_rate": 7.968891640866872e-06, "loss": 0.0, "step": 160 }, { "epoch": 0.012630953265472918, "grad_norm": 9.68717372984429e-08, "learning_rate": 7.966910216718266e-06, "loss": 0.0027, "step": 170 }, { "epoch": 0.01337395051638309, "grad_norm": 2.21520691330232e-13, "learning_rate": 7.96492879256966e-06, "loss": 0.0, "step": 180 }, { "epoch": 0.014116947767293261, "grad_norm": 0.0012627718970179558, "learning_rate": 7.962947368421053e-06, "loss": 0.0, "step": 190 }, { "epoch": 0.014859945018203433, "grad_norm": 0.03864503651857376, "learning_rate": 7.960965944272446e-06, "loss": 0.2508, "step": 200 }, { "epoch": 0.015602942269113605, "grad_norm": 0.004445089027285576, "learning_rate": 7.958984520123838e-06, "loss": 0.0, "step": 210 }, { "epoch": 0.016345939520023774, "grad_norm": 2.5098437217430103e-10, "learning_rate": 7.957003095975232e-06, "loss": 0.0083, "step": 220 }, { "epoch": 0.017088936770933948, "grad_norm": 1.1035450331320362e-09, "learning_rate": 7.955021671826625e-06, "loss": 0.2813, "step": 230 }, { "epoch": 0.017831934021844117, "grad_norm": 2.1936466044255448e-14, "learning_rate": 7.953040247678018e-06, "loss": 0.0, "step": 240 }, { "epoch": 0.01857493127275429, "grad_norm": 1.7874631907943694e-07, "learning_rate": 7.951058823529412e-06, "loss": 0.0, "step": 250 }, { "epoch": 0.019317928523664464, "grad_norm": 3.844103775918484e-05, "learning_rate": 7.949077399380804e-06, "loss": 0.0, "step": 260 }, { "epoch": 0.020060925774574634, "grad_norm": 3.3935991723410552e-06, "learning_rate": 7.947095975232197e-06, "loss": 0.0, "step": 270 }, { "epoch": 0.020803923025484807, "grad_norm": 9.538988349211408e-11, "learning_rate": 7.94511455108359e-06, "loss": 0.0101, "step": 280 }, { "epoch": 0.021546920276394977, "grad_norm": 19.88799476623535, "learning_rate": 7.943133126934984e-06, "loss": 0.0056, "step": 290 }, { "epoch": 0.02228991752730515, "grad_norm": 3.265330406065914e-07, "learning_rate": 7.941151702786378e-06, "loss": 0.0031, "step": 300 }, { "epoch": 0.02303291477821532, "grad_norm": 0.00042152145761065185, "learning_rate": 7.93917027863777e-06, "loss": 0.0009, "step": 310 }, { "epoch": 0.023775912029125493, "grad_norm": 0.0003180626081302762, "learning_rate": 7.937188854489163e-06, "loss": 0.0001, "step": 320 }, { "epoch": 0.024518909280035663, "grad_norm": 15.94360065460205, "learning_rate": 7.935207430340556e-06, "loss": 0.0042, "step": 330 }, { "epoch": 0.025261906530945837, "grad_norm": 0.00019892396812792867, "learning_rate": 7.93322600619195e-06, "loss": 0.0, "step": 340 }, { "epoch": 0.026004903781856006, "grad_norm": 1.282918549172507e-09, "learning_rate": 7.931244582043343e-06, "loss": 0.0003, "step": 350 }, { "epoch": 0.02674790103276618, "grad_norm": 0.0013202573172748089, "learning_rate": 7.929263157894737e-06, "loss": 0.0, "step": 360 }, { "epoch": 0.02749089828367635, "grad_norm": 6.613396053012366e-09, "learning_rate": 7.92728173374613e-06, "loss": 0.0024, "step": 370 }, { "epoch": 0.028233895534586523, "grad_norm": 4.278405754121195e-07, "learning_rate": 7.925300309597524e-06, "loss": 0.0001, "step": 380 }, { "epoch": 0.028976892785496693, "grad_norm": 1.5300608557922146e-09, "learning_rate": 7.923318885448915e-06, "loss": 0.0, "step": 390 }, { "epoch": 0.029719890036406866, "grad_norm": 0.00011159482528455555, "learning_rate": 7.921337461300309e-06, "loss": 0.0, "step": 400 }, { "epoch": 0.030462887287317036, "grad_norm": 3.2086046530821477e-07, "learning_rate": 7.919356037151702e-06, "loss": 0.0, "step": 410 }, { "epoch": 0.03120588453822721, "grad_norm": 1.5088854610212366e-09, "learning_rate": 7.917374613003096e-06, "loss": 0.0016, "step": 420 }, { "epoch": 0.03194888178913738, "grad_norm": 1.0666635977263361e-11, "learning_rate": 7.91539318885449e-06, "loss": 0.0, "step": 430 }, { "epoch": 0.03269187904004755, "grad_norm": 0.054007645696401596, "learning_rate": 7.913411764705883e-06, "loss": 0.0055, "step": 440 }, { "epoch": 0.033434876290957725, "grad_norm": 3.2980437825180786e-10, "learning_rate": 7.911430340557276e-06, "loss": 0.0, "step": 450 }, { "epoch": 0.034177873541867895, "grad_norm": 2.243474227725528e-06, "learning_rate": 7.909448916408668e-06, "loss": 0.0, "step": 460 }, { "epoch": 0.034920870792778065, "grad_norm": 9.300708825321635e-08, "learning_rate": 7.907467492260061e-06, "loss": 0.0, "step": 470 }, { "epoch": 0.035663868043688235, "grad_norm": 4.2875363703842595e-08, "learning_rate": 7.905486068111455e-06, "loss": 0.0, "step": 480 }, { "epoch": 0.03640686529459841, "grad_norm": 2.9632696296744143e-10, "learning_rate": 7.903504643962848e-06, "loss": 0.0, "step": 490 }, { "epoch": 0.03714986254550858, "grad_norm": 1.3973850288701017e-12, "learning_rate": 7.901523219814242e-06, "loss": 0.0001, "step": 500 }, { "epoch": 0.03789285979641875, "grad_norm": 1.6329781971080592e-08, "learning_rate": 7.899541795665634e-06, "loss": 0.0, "step": 510 }, { "epoch": 0.03863585704732893, "grad_norm": 0.03805861622095108, "learning_rate": 7.897560371517027e-06, "loss": 0.0, "step": 520 }, { "epoch": 0.0393788542982391, "grad_norm": 0.0034207587596029043, "learning_rate": 7.89557894736842e-06, "loss": 0.0085, "step": 530 }, { "epoch": 0.04012185154914927, "grad_norm": 2.4390969883825164e-06, "learning_rate": 7.893597523219814e-06, "loss": 0.0, "step": 540 }, { "epoch": 0.04086484880005944, "grad_norm": 0.48387411236763, "learning_rate": 7.891616099071208e-06, "loss": 0.0001, "step": 550 }, { "epoch": 0.041607846050969614, "grad_norm": 0.6060431003570557, "learning_rate": 7.8896346749226e-06, "loss": 0.0001, "step": 560 }, { "epoch": 0.042350843301879784, "grad_norm": 1.1747123456265496e-12, "learning_rate": 7.887653250773993e-06, "loss": 0.0, "step": 570 }, { "epoch": 0.043093840552789954, "grad_norm": 1.5409673892463616e-07, "learning_rate": 7.885671826625386e-06, "loss": 0.0, "step": 580 }, { "epoch": 0.043836837803700124, "grad_norm": 0.006107355002313852, "learning_rate": 7.88369040247678e-06, "loss": 0.0, "step": 590 }, { "epoch": 0.0445798350546103, "grad_norm": 0.05192660912871361, "learning_rate": 7.881708978328173e-06, "loss": 0.432, "step": 600 }, { "epoch": 0.04532283230552047, "grad_norm": 0.015981607139110565, "learning_rate": 7.879727554179565e-06, "loss": 0.0, "step": 610 }, { "epoch": 0.04606582955643064, "grad_norm": 0.5638545751571655, "learning_rate": 7.877746130030958e-06, "loss": 0.0003, "step": 620 }, { "epoch": 0.04680882680734081, "grad_norm": 0.010829356499016285, "learning_rate": 7.875764705882352e-06, "loss": 0.0002, "step": 630 }, { "epoch": 0.04755182405825099, "grad_norm": 3.3820993228239615e-10, "learning_rate": 7.873783281733745e-06, "loss": 0.0, "step": 640 }, { "epoch": 0.04829482130916116, "grad_norm": 0.0002504562435206026, "learning_rate": 7.871801857585139e-06, "loss": 0.0, "step": 650 }, { "epoch": 0.049037818560071327, "grad_norm": 0.0005545741878449917, "learning_rate": 7.869820433436532e-06, "loss": 0.0762, "step": 660 }, { "epoch": 0.049780815810981496, "grad_norm": 1.4491934776306152, "learning_rate": 7.867839009287926e-06, "loss": 0.0008, "step": 670 }, { "epoch": 0.05052381306189167, "grad_norm": 3.143786955206451e-07, "learning_rate": 7.86585758513932e-06, "loss": 0.0, "step": 680 }, { "epoch": 0.05126681031280184, "grad_norm": 2.5064883857339737e-07, "learning_rate": 7.863876160990711e-06, "loss": 0.0002, "step": 690 }, { "epoch": 0.05200980756371201, "grad_norm": 0.18320602178573608, "learning_rate": 7.861894736842105e-06, "loss": 0.0001, "step": 700 }, { "epoch": 0.05275280481462218, "grad_norm": 9.015591008118551e-14, "learning_rate": 7.859913312693498e-06, "loss": 0.0001, "step": 710 }, { "epoch": 0.05349580206553236, "grad_norm": 1.2313448678469285e-05, "learning_rate": 7.857931888544891e-06, "loss": 0.0019, "step": 720 }, { "epoch": 0.05423879931644253, "grad_norm": 0.0003290008462499827, "learning_rate": 7.855950464396285e-06, "loss": 0.0, "step": 730 }, { "epoch": 0.0549817965673527, "grad_norm": 1.2707545238299645e-06, "learning_rate": 7.853969040247678e-06, "loss": 0.0, "step": 740 }, { "epoch": 0.055724793818262876, "grad_norm": 3.6495288670453097e-13, "learning_rate": 7.851987616099072e-06, "loss": 0.0, "step": 750 }, { "epoch": 0.056467791069173046, "grad_norm": 5.683506856257736e-07, "learning_rate": 7.850006191950464e-06, "loss": 0.0, "step": 760 }, { "epoch": 0.057210788320083215, "grad_norm": 2.9714978154515848e-05, "learning_rate": 7.848024767801857e-06, "loss": 0.0, "step": 770 }, { "epoch": 0.057953785570993385, "grad_norm": 0.0001725340262055397, "learning_rate": 7.84604334365325e-06, "loss": 0.0002, "step": 780 }, { "epoch": 0.05869678282190356, "grad_norm": 1.278101402490961e-09, "learning_rate": 7.844061919504644e-06, "loss": 0.0017, "step": 790 }, { "epoch": 0.05943978007281373, "grad_norm": 7.755837032163981e-06, "learning_rate": 7.842080495356038e-06, "loss": 0.0131, "step": 800 }, { "epoch": 0.0601827773237239, "grad_norm": 3.5995307712255453e-07, "learning_rate": 7.840099071207431e-06, "loss": 0.0, "step": 810 }, { "epoch": 0.06092577457463407, "grad_norm": 2.9060316819595755e-07, "learning_rate": 7.838117647058823e-06, "loss": 0.0015, "step": 820 }, { "epoch": 0.06166877182554425, "grad_norm": 4.868264680268908e-10, "learning_rate": 7.836136222910216e-06, "loss": 0.0, "step": 830 }, { "epoch": 0.06241176907645442, "grad_norm": 1.069614654625184e-06, "learning_rate": 7.83415479876161e-06, "loss": 0.0003, "step": 840 }, { "epoch": 0.0631547663273646, "grad_norm": 1.7572237150805355e-16, "learning_rate": 7.832173374613003e-06, "loss": 0.005, "step": 850 }, { "epoch": 0.06389776357827476, "grad_norm": 3.3068809557335044e-07, "learning_rate": 7.830191950464397e-06, "loss": 0.0, "step": 860 }, { "epoch": 0.06464076082918493, "grad_norm": 2.708250013139235e-10, "learning_rate": 7.828210526315788e-06, "loss": 0.0, "step": 870 }, { "epoch": 0.0653837580800951, "grad_norm": 2.331471316328144e-15, "learning_rate": 7.826229102167182e-06, "loss": 0.0, "step": 880 }, { "epoch": 0.06612675533100527, "grad_norm": 2.6286988258361816, "learning_rate": 7.824247678018575e-06, "loss": 0.0004, "step": 890 }, { "epoch": 0.06686975258191545, "grad_norm": 41.682403564453125, "learning_rate": 7.822266253869969e-06, "loss": 0.006, "step": 900 }, { "epoch": 0.06761274983282561, "grad_norm": 4.3656709749484435e-06, "learning_rate": 7.820284829721362e-06, "loss": 0.0001, "step": 910 }, { "epoch": 0.06835574708373579, "grad_norm": 1.1170855906961208e-10, "learning_rate": 7.818303405572754e-06, "loss": 0.0002, "step": 920 }, { "epoch": 0.06909874433464597, "grad_norm": 0.00012403882283251733, "learning_rate": 7.816321981424148e-06, "loss": 0.0002, "step": 930 }, { "epoch": 0.06984174158555613, "grad_norm": 4.688263857133279e-07, "learning_rate": 7.814340557275541e-06, "loss": 0.0, "step": 940 }, { "epoch": 0.07058473883646631, "grad_norm": 1.400913607341181e-08, "learning_rate": 7.812359133126934e-06, "loss": 0.0, "step": 950 }, { "epoch": 0.07132773608737647, "grad_norm": 0.2331230193376541, "learning_rate": 7.810377708978328e-06, "loss": 0.0, "step": 960 }, { "epoch": 0.07207073333828665, "grad_norm": 1.1808974804961014e-11, "learning_rate": 7.808396284829721e-06, "loss": 0.447, "step": 970 }, { "epoch": 0.07281373058919682, "grad_norm": 1.6760554055750276e-09, "learning_rate": 7.806414860681113e-06, "loss": 0.0012, "step": 980 }, { "epoch": 0.07355672784010699, "grad_norm": 3.625258315764768e-08, "learning_rate": 7.804433436532507e-06, "loss": 0.0024, "step": 990 }, { "epoch": 0.07429972509101716, "grad_norm": 1.1911829744803981e-07, "learning_rate": 7.8024520123839e-06, "loss": 0.0007, "step": 1000 }, { "epoch": 0.07504272234192734, "grad_norm": 0.028750954195857048, "learning_rate": 7.800470588235294e-06, "loss": 0.0002, "step": 1010 }, { "epoch": 0.0757857195928375, "grad_norm": 4.465448455448495e-08, "learning_rate": 7.798489164086687e-06, "loss": 0.0009, "step": 1020 }, { "epoch": 0.07652871684374768, "grad_norm": 0.009596824645996094, "learning_rate": 7.79650773993808e-06, "loss": 0.0, "step": 1030 }, { "epoch": 0.07727171409465786, "grad_norm": 7.303714824047347e-07, "learning_rate": 7.794526315789474e-06, "loss": 0.0394, "step": 1040 }, { "epoch": 0.07801471134556802, "grad_norm": 2.3878586219439057e-08, "learning_rate": 7.792544891640867e-06, "loss": 0.001, "step": 1050 }, { "epoch": 0.0787577085964782, "grad_norm": 5.425561653282784e-07, "learning_rate": 7.79056346749226e-06, "loss": 0.0, "step": 1060 }, { "epoch": 0.07950070584738836, "grad_norm": 6.217923242957113e-08, "learning_rate": 7.788582043343653e-06, "loss": 0.0, "step": 1070 }, { "epoch": 0.08024370309829854, "grad_norm": 3.1601064875985685e-08, "learning_rate": 7.786600619195046e-06, "loss": 0.494, "step": 1080 }, { "epoch": 0.08098670034920871, "grad_norm": 1.6355619116004139e-12, "learning_rate": 7.78461919504644e-06, "loss": 0.0, "step": 1090 }, { "epoch": 0.08172969760011888, "grad_norm": 0.0006804847507737577, "learning_rate": 7.782637770897833e-06, "loss": 0.0, "step": 1100 }, { "epoch": 0.08247269485102905, "grad_norm": 0.001234274241141975, "learning_rate": 7.780656346749227e-06, "loss": 0.0001, "step": 1110 }, { "epoch": 0.08321569210193923, "grad_norm": 4.684539817390032e-05, "learning_rate": 7.778674922600618e-06, "loss": 0.0, "step": 1120 }, { "epoch": 0.08395868935284939, "grad_norm": 1.5053738877668366e-07, "learning_rate": 7.776693498452012e-06, "loss": 0.0, "step": 1130 }, { "epoch": 0.08470168660375957, "grad_norm": 0.03478812798857689, "learning_rate": 7.774712074303405e-06, "loss": 0.0, "step": 1140 }, { "epoch": 0.08544468385466973, "grad_norm": 7.189959433162585e-06, "learning_rate": 7.772730650154799e-06, "loss": 0.0, "step": 1150 }, { "epoch": 0.08618768110557991, "grad_norm": 0.027123047038912773, "learning_rate": 7.770749226006192e-06, "loss": 0.0, "step": 1160 }, { "epoch": 0.08693067835649008, "grad_norm": 5.199531916133537e-09, "learning_rate": 7.768767801857584e-06, "loss": 0.0, "step": 1170 }, { "epoch": 0.08767367560740025, "grad_norm": 0.003123380709439516, "learning_rate": 7.766786377708978e-06, "loss": 0.0003, "step": 1180 }, { "epoch": 0.08841667285831042, "grad_norm": 0.021892402321100235, "learning_rate": 7.764804953560371e-06, "loss": 0.0008, "step": 1190 }, { "epoch": 0.0891596701092206, "grad_norm": 0.005974195431917906, "learning_rate": 7.762823529411764e-06, "loss": 0.0001, "step": 1200 }, { "epoch": 0.08990266736013076, "grad_norm": 0.001358061213977635, "learning_rate": 7.760842105263158e-06, "loss": 0.0, "step": 1210 }, { "epoch": 0.09064566461104094, "grad_norm": 0.0023142509162425995, "learning_rate": 7.75886068111455e-06, "loss": 0.0, "step": 1220 }, { "epoch": 0.09138866186195112, "grad_norm": 1.2186575077066664e-05, "learning_rate": 7.756879256965943e-06, "loss": 0.0, "step": 1230 }, { "epoch": 0.09213165911286128, "grad_norm": 0.014682281762361526, "learning_rate": 7.754897832817337e-06, "loss": 0.0, "step": 1240 }, { "epoch": 0.09287465636377146, "grad_norm": 1.2789274237627428e-14, "learning_rate": 7.75291640866873e-06, "loss": 0.0, "step": 1250 }, { "epoch": 0.09361765361468162, "grad_norm": 1.6032302379608154, "learning_rate": 7.750934984520124e-06, "loss": 0.0002, "step": 1260 }, { "epoch": 0.0943606508655918, "grad_norm": 3.663368053352656e-09, "learning_rate": 7.748953560371517e-06, "loss": 0.0, "step": 1270 }, { "epoch": 0.09510364811650197, "grad_norm": 1.1438256564133553e-07, "learning_rate": 7.746972136222909e-06, "loss": 0.0007, "step": 1280 }, { "epoch": 0.09584664536741214, "grad_norm": 3.3626159634536634e-09, "learning_rate": 7.744990712074302e-06, "loss": 0.0, "step": 1290 }, { "epoch": 0.09658964261832231, "grad_norm": 0.005567495245486498, "learning_rate": 7.743009287925696e-06, "loss": 0.0194, "step": 1300 }, { "epoch": 0.09733263986923249, "grad_norm": 2.046826466539642e-06, "learning_rate": 7.74102786377709e-06, "loss": 0.0, "step": 1310 }, { "epoch": 0.09807563712014265, "grad_norm": 0.0006443510064855218, "learning_rate": 7.739046439628483e-06, "loss": 0.0007, "step": 1320 }, { "epoch": 0.09881863437105283, "grad_norm": 0.015108049847185612, "learning_rate": 7.737065015479876e-06, "loss": 0.0, "step": 1330 }, { "epoch": 0.09956163162196299, "grad_norm": 7.848172806079745e-11, "learning_rate": 7.73508359133127e-06, "loss": 0.0002, "step": 1340 }, { "epoch": 0.10030462887287317, "grad_norm": 0.09598542749881744, "learning_rate": 7.733102167182663e-06, "loss": 0.0, "step": 1350 }, { "epoch": 0.10104762612378335, "grad_norm": 5.5748870408933726e-08, "learning_rate": 7.731120743034055e-06, "loss": 0.0004, "step": 1360 }, { "epoch": 0.10179062337469351, "grad_norm": 0.0037470445968210697, "learning_rate": 7.729139318885448e-06, "loss": 0.0, "step": 1370 }, { "epoch": 0.10253362062560369, "grad_norm": 6.308543682098389, "learning_rate": 7.727157894736842e-06, "loss": 0.0013, "step": 1380 }, { "epoch": 0.10327661787651386, "grad_norm": 1.3679978039249363e-08, "learning_rate": 7.725176470588235e-06, "loss": 0.1172, "step": 1390 }, { "epoch": 0.10401961512742403, "grad_norm": 9.08298261492746e-06, "learning_rate": 7.723195046439629e-06, "loss": 0.0001, "step": 1400 }, { "epoch": 0.1047626123783342, "grad_norm": 0.005799862556159496, "learning_rate": 7.721213622291022e-06, "loss": 0.0, "step": 1410 }, { "epoch": 0.10550560962924437, "grad_norm": 0.08250636607408524, "learning_rate": 7.719232198142416e-06, "loss": 0.0, "step": 1420 }, { "epoch": 0.10624860688015454, "grad_norm": 5.5853923916493464e-12, "learning_rate": 7.717250773993807e-06, "loss": 0.0, "step": 1430 }, { "epoch": 0.10699160413106472, "grad_norm": 0.00015770482423249632, "learning_rate": 7.715269349845201e-06, "loss": 0.0, "step": 1440 }, { "epoch": 0.10773460138197488, "grad_norm": 4.323689409974074e-12, "learning_rate": 7.713287925696594e-06, "loss": 0.0, "step": 1450 }, { "epoch": 0.10847759863288506, "grad_norm": 5.947108002146706e-05, "learning_rate": 7.711306501547988e-06, "loss": 0.0001, "step": 1460 }, { "epoch": 0.10922059588379524, "grad_norm": 9.074782592044528e-11, "learning_rate": 7.709325077399381e-06, "loss": 0.0, "step": 1470 }, { "epoch": 0.1099635931347054, "grad_norm": 5.806957454377004e-17, "learning_rate": 7.707343653250773e-06, "loss": 0.0, "step": 1480 }, { "epoch": 0.11070659038561557, "grad_norm": 6.07878937444184e-05, "learning_rate": 7.705362229102167e-06, "loss": 0.0101, "step": 1490 }, { "epoch": 0.11144958763652575, "grad_norm": 4.540555892162956e-05, "learning_rate": 7.70338080495356e-06, "loss": 0.0, "step": 1500 }, { "epoch": 0.11219258488743591, "grad_norm": 0.056265659630298615, "learning_rate": 7.701399380804954e-06, "loss": 0.0, "step": 1510 }, { "epoch": 0.11293558213834609, "grad_norm": 0.0005196752608753741, "learning_rate": 7.699417956656347e-06, "loss": 0.0, "step": 1520 }, { "epoch": 0.11367857938925625, "grad_norm": 0.03548453003168106, "learning_rate": 7.697436532507739e-06, "loss": 0.0, "step": 1530 }, { "epoch": 0.11442157664016643, "grad_norm": 4.1336588765261695e-05, "learning_rate": 7.695455108359132e-06, "loss": 0.0002, "step": 1540 }, { "epoch": 0.11516457389107661, "grad_norm": 2.133566425754907e-07, "learning_rate": 7.693473684210526e-06, "loss": 0.0, "step": 1550 }, { "epoch": 0.11590757114198677, "grad_norm": 2.0231007979987226e-08, "learning_rate": 7.69149226006192e-06, "loss": 0.0002, "step": 1560 }, { "epoch": 0.11665056839289695, "grad_norm": 0.0052889259532094, "learning_rate": 7.689510835913313e-06, "loss": 0.0005, "step": 1570 }, { "epoch": 0.11739356564380712, "grad_norm": 1.7077846337087976e-07, "learning_rate": 7.687529411764704e-06, "loss": 0.0015, "step": 1580 }, { "epoch": 0.11813656289471729, "grad_norm": 1.012412896572723e-11, "learning_rate": 7.685547987616098e-06, "loss": 0.1523, "step": 1590 }, { "epoch": 0.11887956014562746, "grad_norm": 4.0092782001011074e-05, "learning_rate": 7.683566563467491e-06, "loss": 0.0031, "step": 1600 }, { "epoch": 0.11962255739653763, "grad_norm": 3.0688421247759834e-05, "learning_rate": 7.681585139318885e-06, "loss": 0.0, "step": 1610 }, { "epoch": 0.1203655546474478, "grad_norm": 3.730522113443158e-09, "learning_rate": 7.679603715170278e-06, "loss": 0.0, "step": 1620 }, { "epoch": 0.12110855189835798, "grad_norm": 3.952808037865907e-05, "learning_rate": 7.677622291021672e-06, "loss": 0.0001, "step": 1630 }, { "epoch": 0.12185154914926814, "grad_norm": 1.1576513259115018e-07, "learning_rate": 7.675640866873065e-06, "loss": 0.0016, "step": 1640 }, { "epoch": 0.12259454640017832, "grad_norm": 86.52330780029297, "learning_rate": 7.673659442724457e-06, "loss": 0.0127, "step": 1650 }, { "epoch": 0.1233375436510885, "grad_norm": 0.00014943482528906316, "learning_rate": 7.67167801857585e-06, "loss": 0.0001, "step": 1660 }, { "epoch": 0.12408054090199866, "grad_norm": 1.1372956976174464e-07, "learning_rate": 7.669696594427244e-06, "loss": 0.0, "step": 1670 }, { "epoch": 0.12482353815290884, "grad_norm": 0.00010248956823488697, "learning_rate": 7.667715170278637e-06, "loss": 0.0, "step": 1680 }, { "epoch": 0.125566535403819, "grad_norm": 3.712263822555542, "learning_rate": 7.665733746130031e-06, "loss": 0.0085, "step": 1690 }, { "epoch": 0.1263095326547292, "grad_norm": 5.190120688780553e-08, "learning_rate": 7.663752321981424e-06, "loss": 0.0, "step": 1700 }, { "epoch": 0.12705252990563934, "grad_norm": 1.343535586784128e-06, "learning_rate": 7.661770897832818e-06, "loss": 0.0, "step": 1710 }, { "epoch": 0.12779552715654952, "grad_norm": 1.0656785320861673e-07, "learning_rate": 7.659789473684211e-06, "loss": 0.0002, "step": 1720 }, { "epoch": 0.1285385244074597, "grad_norm": 9.045345722435982e-11, "learning_rate": 7.657808049535603e-06, "loss": 0.0, "step": 1730 }, { "epoch": 0.12928152165836987, "grad_norm": 448.4335021972656, "learning_rate": 7.655826625386997e-06, "loss": 0.3125, "step": 1740 }, { "epoch": 0.13002451890928005, "grad_norm": 1.0219524781973632e-08, "learning_rate": 7.65384520123839e-06, "loss": 0.1119, "step": 1750 }, { "epoch": 0.1307675161601902, "grad_norm": 2.662309483980607e-09, "learning_rate": 7.651863777089784e-06, "loss": 0.0703, "step": 1760 }, { "epoch": 0.13151051341110037, "grad_norm": 0.00010581756941974163, "learning_rate": 7.649882352941177e-06, "loss": 0.0, "step": 1770 }, { "epoch": 0.13225351066201055, "grad_norm": 0.0008816443150863051, "learning_rate": 7.647900928792569e-06, "loss": 0.0, "step": 1780 }, { "epoch": 0.13299650791292073, "grad_norm": 7.215510827718319e-11, "learning_rate": 7.645919504643962e-06, "loss": 0.0, "step": 1790 }, { "epoch": 0.1337395051638309, "grad_norm": 9.388672879140358e-06, "learning_rate": 7.643938080495356e-06, "loss": 0.0008, "step": 1800 }, { "epoch": 0.13448250241474108, "grad_norm": 1.2932947627327523e-13, "learning_rate": 7.64195665634675e-06, "loss": 0.0, "step": 1810 }, { "epoch": 0.13522549966565123, "grad_norm": 1.3407135384113644e-06, "learning_rate": 7.639975232198143e-06, "loss": 0.0238, "step": 1820 }, { "epoch": 0.1359684969165614, "grad_norm": 1.5387153625488281, "learning_rate": 7.637993808049534e-06, "loss": 0.0003, "step": 1830 }, { "epoch": 0.13671149416747158, "grad_norm": 8.180334407370538e-05, "learning_rate": 7.636012383900928e-06, "loss": 0.0, "step": 1840 }, { "epoch": 0.13745449141838176, "grad_norm": 2.7656782774858855e-14, "learning_rate": 7.634030959752321e-06, "loss": 0.0006, "step": 1850 }, { "epoch": 0.13819748866929193, "grad_norm": 3.9247534004971385e-05, "learning_rate": 7.632049535603715e-06, "loss": 0.0, "step": 1860 }, { "epoch": 0.13894048592020208, "grad_norm": 4.855285595795067e-08, "learning_rate": 7.630068111455108e-06, "loss": 0.0, "step": 1870 }, { "epoch": 0.13968348317111226, "grad_norm": 7.705154736470377e-09, "learning_rate": 7.628086687306501e-06, "loss": 0.0, "step": 1880 }, { "epoch": 0.14042648042202244, "grad_norm": 4.911003461532948e-12, "learning_rate": 7.626105263157894e-06, "loss": 0.0001, "step": 1890 }, { "epoch": 0.14116947767293261, "grad_norm": 0.00017059467791114002, "learning_rate": 7.624123839009287e-06, "loss": 0.0, "step": 1900 }, { "epoch": 0.1419124749238428, "grad_norm": 0.00011203711619600654, "learning_rate": 7.6221424148606805e-06, "loss": 0.0, "step": 1910 }, { "epoch": 0.14265547217475294, "grad_norm": 1.9092891712091387e-09, "learning_rate": 7.620160990712074e-06, "loss": 0.0, "step": 1920 }, { "epoch": 0.14339846942566312, "grad_norm": 8.958792022895068e-06, "learning_rate": 7.6181795665634674e-06, "loss": 0.0, "step": 1930 }, { "epoch": 0.1441414666765733, "grad_norm": 1.2491418033278023e-07, "learning_rate": 7.616198142414861e-06, "loss": 0.0171, "step": 1940 }, { "epoch": 0.14488446392748347, "grad_norm": 5.721012712456286e-06, "learning_rate": 7.614216718266253e-06, "loss": 0.0239, "step": 1950 }, { "epoch": 0.14562746117839365, "grad_norm": 0.0054746391251683235, "learning_rate": 7.612235294117646e-06, "loss": 0.0301, "step": 1960 }, { "epoch": 0.14637045842930382, "grad_norm": 3.511092927510617e-08, "learning_rate": 7.61025386996904e-06, "loss": 0.0, "step": 1970 }, { "epoch": 0.14711345568021397, "grad_norm": 1.8267021228979274e-10, "learning_rate": 7.608272445820433e-06, "loss": 0.0, "step": 1980 }, { "epoch": 0.14785645293112415, "grad_norm": 5.355617395252921e-05, "learning_rate": 7.6062910216718266e-06, "loss": 0.0004, "step": 1990 }, { "epoch": 0.14859945018203433, "grad_norm": 2.2129184595875984e-10, "learning_rate": 7.60430959752322e-06, "loss": 0.0034, "step": 2000 }, { "epoch": 0.1493424474329445, "grad_norm": 5.593456080532633e-05, "learning_rate": 7.602328173374613e-06, "loss": 0.0, "step": 2010 }, { "epoch": 0.15008544468385468, "grad_norm": 1.159690896201937e-06, "learning_rate": 7.600346749226006e-06, "loss": 0.0001, "step": 2020 }, { "epoch": 0.15082844193476483, "grad_norm": 6.678877980448306e-05, "learning_rate": 7.598365325077399e-06, "loss": 0.0, "step": 2030 }, { "epoch": 0.151571439185675, "grad_norm": 0.003582374192774296, "learning_rate": 7.596383900928792e-06, "loss": 0.0, "step": 2040 }, { "epoch": 0.15231443643658518, "grad_norm": 4.533937669748411e-08, "learning_rate": 7.594402476780186e-06, "loss": 0.0, "step": 2050 }, { "epoch": 0.15305743368749536, "grad_norm": 6.575902600225447e-10, "learning_rate": 7.592421052631578e-06, "loss": 0.0013, "step": 2060 }, { "epoch": 0.15380043093840554, "grad_norm": 0.00012366347073111683, "learning_rate": 7.590439628482972e-06, "loss": 0.0003, "step": 2070 }, { "epoch": 0.1545434281893157, "grad_norm": 0.0004162261029705405, "learning_rate": 7.588458204334365e-06, "loss": 0.0, "step": 2080 }, { "epoch": 0.15528642544022586, "grad_norm": 2.139036041626241e-06, "learning_rate": 7.586476780185759e-06, "loss": 0.0, "step": 2090 }, { "epoch": 0.15602942269113604, "grad_norm": 1.136829723691335e-05, "learning_rate": 7.584495356037151e-06, "loss": 0.0, "step": 2100 }, { "epoch": 0.15677241994204622, "grad_norm": 1.0184769694276952e-10, "learning_rate": 7.582513931888544e-06, "loss": 0.0034, "step": 2110 }, { "epoch": 0.1575154171929564, "grad_norm": 2.0259339805761556e-07, "learning_rate": 7.5805325077399375e-06, "loss": 0.0, "step": 2120 }, { "epoch": 0.15825841444386657, "grad_norm": 3.384361377811729e-07, "learning_rate": 7.578551083591331e-06, "loss": 0.0, "step": 2130 }, { "epoch": 0.15900141169477672, "grad_norm": 9.992221494670162e-10, "learning_rate": 7.576569659442724e-06, "loss": 0.0, "step": 2140 }, { "epoch": 0.1597444089456869, "grad_norm": 3.0385496341267526e-09, "learning_rate": 7.574588235294118e-06, "loss": 0.0001, "step": 2150 }, { "epoch": 0.16048740619659707, "grad_norm": 0.037069354206323624, "learning_rate": 7.5726068111455105e-06, "loss": 0.0, "step": 2160 }, { "epoch": 0.16123040344750725, "grad_norm": 4.4423256739811734e-11, "learning_rate": 7.570625386996904e-06, "loss": 0.0, "step": 2170 }, { "epoch": 0.16197340069841742, "grad_norm": 1.3337089512788225e-05, "learning_rate": 7.5686439628482966e-06, "loss": 0.0001, "step": 2180 }, { "epoch": 0.16271639794932757, "grad_norm": 1.3611509075417416e-07, "learning_rate": 7.56666253869969e-06, "loss": 0.0, "step": 2190 }, { "epoch": 0.16345939520023775, "grad_norm": 1.0967656782767676e-09, "learning_rate": 7.5646811145510835e-06, "loss": 0.0, "step": 2200 }, { "epoch": 0.16420239245114793, "grad_norm": 3.9811261842714885e-08, "learning_rate": 7.562699690402476e-06, "loss": 0.0004, "step": 2210 }, { "epoch": 0.1649453897020581, "grad_norm": 1.1965091461021982e-12, "learning_rate": 7.56071826625387e-06, "loss": 0.0486, "step": 2220 }, { "epoch": 0.16568838695296828, "grad_norm": 1.3393683673257328e-07, "learning_rate": 7.558736842105263e-06, "loss": 0.0, "step": 2230 }, { "epoch": 0.16643138420387846, "grad_norm": 0.04837249964475632, "learning_rate": 7.5567554179566565e-06, "loss": 0.0001, "step": 2240 }, { "epoch": 0.1671743814547886, "grad_norm": 5.5592218328737175e-11, "learning_rate": 7.554773993808049e-06, "loss": 0.0, "step": 2250 }, { "epoch": 0.16791737870569878, "grad_norm": 4.1985980914205356e-08, "learning_rate": 7.552792569659442e-06, "loss": 0.0, "step": 2260 }, { "epoch": 0.16866037595660896, "grad_norm": 7.395916290422327e-14, "learning_rate": 7.550811145510835e-06, "loss": 0.0001, "step": 2270 }, { "epoch": 0.16940337320751914, "grad_norm": 0.00016479882469866425, "learning_rate": 7.548829721362229e-06, "loss": 0.0, "step": 2280 }, { "epoch": 0.1701463704584293, "grad_norm": 2.2074048144915537e-10, "learning_rate": 7.546848297213622e-06, "loss": 0.0003, "step": 2290 }, { "epoch": 0.17088936770933946, "grad_norm": 3.0681096063744917e-07, "learning_rate": 7.544866873065016e-06, "loss": 0.1289, "step": 2300 }, { "epoch": 0.17163236496024964, "grad_norm": 0.0030301350634545088, "learning_rate": 7.542885448916408e-06, "loss": 0.0, "step": 2310 }, { "epoch": 0.17237536221115982, "grad_norm": 0.0003774948127102107, "learning_rate": 7.540904024767801e-06, "loss": 0.0, "step": 2320 }, { "epoch": 0.17311835946207, "grad_norm": 0.004968355875462294, "learning_rate": 7.538922600619194e-06, "loss": 0.0, "step": 2330 }, { "epoch": 0.17386135671298017, "grad_norm": 0.008657529018819332, "learning_rate": 7.536941176470588e-06, "loss": 0.0346, "step": 2340 }, { "epoch": 0.17460435396389035, "grad_norm": 7.136741149221457e-14, "learning_rate": 7.534959752321981e-06, "loss": 0.0009, "step": 2350 }, { "epoch": 0.1753473512148005, "grad_norm": 9.943051537675274e-08, "learning_rate": 7.532978328173374e-06, "loss": 0.0002, "step": 2360 }, { "epoch": 0.17609034846571067, "grad_norm": 8.572484055946461e-09, "learning_rate": 7.5309969040247674e-06, "loss": 0.0001, "step": 2370 }, { "epoch": 0.17683334571662085, "grad_norm": 1.3230641116024344e-07, "learning_rate": 7.529015479876161e-06, "loss": 0.0012, "step": 2380 }, { "epoch": 0.17757634296753103, "grad_norm": 0.01425761729478836, "learning_rate": 7.527034055727554e-06, "loss": 0.0, "step": 2390 }, { "epoch": 0.1783193402184412, "grad_norm": 0.00020573021902237087, "learning_rate": 7.525052631578947e-06, "loss": 0.0002, "step": 2400 }, { "epoch": 0.17906233746935135, "grad_norm": 2.427914369117906e-13, "learning_rate": 7.52307120743034e-06, "loss": 0.0031, "step": 2410 }, { "epoch": 0.17980533472026153, "grad_norm": 0.018485544249415398, "learning_rate": 7.521089783281733e-06, "loss": 0.0, "step": 2420 }, { "epoch": 0.1805483319711717, "grad_norm": 1.0412462518182182e-14, "learning_rate": 7.5191083591331266e-06, "loss": 0.0, "step": 2430 }, { "epoch": 0.18129132922208188, "grad_norm": 1.20909349199394e-09, "learning_rate": 7.51712693498452e-06, "loss": 0.0001, "step": 2440 }, { "epoch": 0.18203432647299206, "grad_norm": 1.2379430813780345e-08, "learning_rate": 7.5151455108359135e-06, "loss": 0.0, "step": 2450 }, { "epoch": 0.18277732372390224, "grad_norm": 0.0006929032388143241, "learning_rate": 7.513164086687306e-06, "loss": 0.0, "step": 2460 }, { "epoch": 0.18352032097481238, "grad_norm": 0.0043249172158539295, "learning_rate": 7.511182662538699e-06, "loss": 0.0002, "step": 2470 }, { "epoch": 0.18426331822572256, "grad_norm": 0.00012627607793547213, "learning_rate": 7.509201238390092e-06, "loss": 0.0, "step": 2480 }, { "epoch": 0.18500631547663274, "grad_norm": 6.282176036620513e-05, "learning_rate": 7.507219814241486e-06, "loss": 0.0017, "step": 2490 }, { "epoch": 0.18574931272754291, "grad_norm": 0.0003875169495586306, "learning_rate": 7.505238390092879e-06, "loss": 0.0, "step": 2500 }, { "epoch": 0.1864923099784531, "grad_norm": 1.4061700312595349e-05, "learning_rate": 7.503256965944272e-06, "loss": 0.0003, "step": 2510 }, { "epoch": 0.18723530722936324, "grad_norm": 6.303745525571003e-09, "learning_rate": 7.501275541795665e-06, "loss": 0.0, "step": 2520 }, { "epoch": 0.18797830448027342, "grad_norm": 2.0919899270666065e-06, "learning_rate": 7.499294117647059e-06, "loss": 0.0007, "step": 2530 }, { "epoch": 0.1887213017311836, "grad_norm": 6.483439340954078e-10, "learning_rate": 7.497312693498452e-06, "loss": 0.0001, "step": 2540 }, { "epoch": 0.18946429898209377, "grad_norm": 5.77274875013245e-07, "learning_rate": 7.495331269349845e-06, "loss": 0.2125, "step": 2550 }, { "epoch": 0.19020729623300395, "grad_norm": 2.5007325987758122e-08, "learning_rate": 7.4933498452012374e-06, "loss": 0.0009, "step": 2560 }, { "epoch": 0.1909502934839141, "grad_norm": 2.8974034648854285e-05, "learning_rate": 7.491368421052631e-06, "loss": 0.0, "step": 2570 }, { "epoch": 0.19169329073482427, "grad_norm": 1.454089783692325e-06, "learning_rate": 7.489386996904024e-06, "loss": 0.008, "step": 2580 }, { "epoch": 0.19243628798573445, "grad_norm": 2.5754836769920075e-06, "learning_rate": 7.487405572755418e-06, "loss": 0.0, "step": 2590 }, { "epoch": 0.19317928523664463, "grad_norm": 2.4697496883163694e-06, "learning_rate": 7.485424148606811e-06, "loss": 0.0002, "step": 2600 }, { "epoch": 0.1939222824875548, "grad_norm": 3.207186338727297e-08, "learning_rate": 7.483442724458204e-06, "loss": 0.0002, "step": 2610 }, { "epoch": 0.19466527973846498, "grad_norm": 4.193345670699955e-09, "learning_rate": 7.4814613003095966e-06, "loss": 0.0012, "step": 2620 }, { "epoch": 0.19540827698937513, "grad_norm": 0.00033508302294649184, "learning_rate": 7.47947987616099e-06, "loss": 0.0002, "step": 2630 }, { "epoch": 0.1961512742402853, "grad_norm": 0.00018775548960547894, "learning_rate": 7.4774984520123835e-06, "loss": 0.0022, "step": 2640 }, { "epoch": 0.19689427149119548, "grad_norm": 0.4688859283924103, "learning_rate": 7.475517027863777e-06, "loss": 0.0001, "step": 2650 }, { "epoch": 0.19763726874210566, "grad_norm": 2.2857616407273573e-12, "learning_rate": 7.47353560371517e-06, "loss": 0.0, "step": 2660 }, { "epoch": 0.19838026599301584, "grad_norm": 0.0008000964298844337, "learning_rate": 7.471554179566563e-06, "loss": 0.0, "step": 2670 }, { "epoch": 0.19912326324392599, "grad_norm": 1.2022927364796487e-07, "learning_rate": 7.4695727554179565e-06, "loss": 0.0052, "step": 2680 }, { "epoch": 0.19986626049483616, "grad_norm": 3.823735096375458e-05, "learning_rate": 7.46759133126935e-06, "loss": 0.0, "step": 2690 }, { "epoch": 0.20060925774574634, "grad_norm": 2.2583767744777106e-08, "learning_rate": 7.465609907120743e-06, "loss": 0.0, "step": 2700 }, { "epoch": 0.20135225499665652, "grad_norm": 1.20020843041857e-06, "learning_rate": 7.463628482972135e-06, "loss": 0.0001, "step": 2710 }, { "epoch": 0.2020952522475667, "grad_norm": 2.3552229611034647e-11, "learning_rate": 7.461647058823529e-06, "loss": 0.0002, "step": 2720 }, { "epoch": 0.20283824949847687, "grad_norm": 9.126129851821929e-10, "learning_rate": 7.459665634674922e-06, "loss": 0.0, "step": 2730 }, { "epoch": 0.20358124674938702, "grad_norm": 0.00043995113810524344, "learning_rate": 7.457684210526316e-06, "loss": 0.0, "step": 2740 }, { "epoch": 0.2043242440002972, "grad_norm": 8.676198604007368e-08, "learning_rate": 7.455702786377709e-06, "loss": 0.0063, "step": 2750 }, { "epoch": 0.20506724125120737, "grad_norm": 4.0740233089309186e-05, "learning_rate": 7.453721362229103e-06, "loss": 0.0011, "step": 2760 }, { "epoch": 0.20581023850211755, "grad_norm": 0.000288845767499879, "learning_rate": 7.451739938080494e-06, "loss": 0.0003, "step": 2770 }, { "epoch": 0.20655323575302773, "grad_norm": 4.202624026561352e-09, "learning_rate": 7.449758513931888e-06, "loss": 0.0, "step": 2780 }, { "epoch": 0.20729623300393787, "grad_norm": 103.76226043701172, "learning_rate": 7.447777089783281e-06, "loss": 0.0193, "step": 2790 }, { "epoch": 0.20803923025484805, "grad_norm": 9.665535571912187e-08, "learning_rate": 7.445795665634675e-06, "loss": 0.0108, "step": 2800 }, { "epoch": 0.20878222750575823, "grad_norm": 0.00030493756639771163, "learning_rate": 7.443814241486068e-06, "loss": 0.0018, "step": 2810 }, { "epoch": 0.2095252247566684, "grad_norm": 0.003484158543869853, "learning_rate": 7.441832817337461e-06, "loss": 0.0, "step": 2820 }, { "epoch": 0.21026822200757858, "grad_norm": 5.778713226318359, "learning_rate": 7.439851393188854e-06, "loss": 0.0014, "step": 2830 }, { "epoch": 0.21101121925848873, "grad_norm": 8.40686981834482e-12, "learning_rate": 7.437869969040248e-06, "loss": 0.0, "step": 2840 }, { "epoch": 0.2117542165093989, "grad_norm": 2.305590093842902e-11, "learning_rate": 7.4358885448916404e-06, "loss": 0.0003, "step": 2850 }, { "epoch": 0.21249721376030908, "grad_norm": 331.42401123046875, "learning_rate": 7.433907120743034e-06, "loss": 0.1063, "step": 2860 }, { "epoch": 0.21324021101121926, "grad_norm": 1.6884000864081372e-09, "learning_rate": 7.4319256965944265e-06, "loss": 0.0, "step": 2870 }, { "epoch": 0.21398320826212944, "grad_norm": 0.001587887411005795, "learning_rate": 7.42994427244582e-06, "loss": 0.4975, "step": 2880 }, { "epoch": 0.21472620551303961, "grad_norm": 10.196301460266113, "learning_rate": 7.4279628482972135e-06, "loss": 0.0917, "step": 2890 }, { "epoch": 0.21546920276394976, "grad_norm": 8.437519127824089e-09, "learning_rate": 7.425981424148607e-06, "loss": 0.0, "step": 2900 }, { "epoch": 0.21621220001485994, "grad_norm": 1.5240602806443349e-06, "learning_rate": 7.424e-06, "loss": 0.0015, "step": 2910 }, { "epoch": 0.21695519726577012, "grad_norm": 9.368338214699179e-05, "learning_rate": 7.422018575851392e-06, "loss": 0.0, "step": 2920 }, { "epoch": 0.2176981945166803, "grad_norm": 6.325107278826181e-06, "learning_rate": 7.420037151702786e-06, "loss": 0.0, "step": 2930 }, { "epoch": 0.21844119176759047, "grad_norm": 0.0021217784378677607, "learning_rate": 7.418055727554179e-06, "loss": 0.0, "step": 2940 }, { "epoch": 0.21918418901850062, "grad_norm": 2.9190502166748047, "learning_rate": 7.416074303405573e-06, "loss": 0.0005, "step": 2950 }, { "epoch": 0.2199271862694108, "grad_norm": 7.694290538040605e-09, "learning_rate": 7.414092879256966e-06, "loss": 0.0, "step": 2960 }, { "epoch": 0.22067018352032097, "grad_norm": 8.037525311976879e-09, "learning_rate": 7.412111455108359e-06, "loss": 0.0, "step": 2970 }, { "epoch": 0.22141318077123115, "grad_norm": 3.8374295234680176, "learning_rate": 7.410130030959752e-06, "loss": 0.0012, "step": 2980 }, { "epoch": 0.22215617802214133, "grad_norm": 5.6447107404933305e-11, "learning_rate": 7.408148606811146e-06, "loss": 0.0, "step": 2990 }, { "epoch": 0.2228991752730515, "grad_norm": 0.0012383813736960292, "learning_rate": 7.406167182662538e-06, "loss": 0.0, "step": 3000 }, { "epoch": 0.22364217252396165, "grad_norm": 0.00030792522011324763, "learning_rate": 7.404185758513932e-06, "loss": 0.322, "step": 3010 }, { "epoch": 0.22438516977487183, "grad_norm": 1.200901067477389e-07, "learning_rate": 7.402204334365324e-06, "loss": 0.005, "step": 3020 }, { "epoch": 0.225128167025782, "grad_norm": 6.967216563680267e-07, "learning_rate": 7.400222910216718e-06, "loss": 0.0, "step": 3030 }, { "epoch": 0.22587116427669218, "grad_norm": 0.0004759286530315876, "learning_rate": 7.398241486068111e-06, "loss": 0.0064, "step": 3040 }, { "epoch": 0.22661416152760236, "grad_norm": 1.3682764254951962e-08, "learning_rate": 7.396260061919505e-06, "loss": 0.0001, "step": 3050 }, { "epoch": 0.2273571587785125, "grad_norm": 4.955604993028828e-08, "learning_rate": 7.394278637770898e-06, "loss": 0.0193, "step": 3060 }, { "epoch": 0.22810015602942268, "grad_norm": 4.805761363968486e-06, "learning_rate": 7.39229721362229e-06, "loss": 0.0, "step": 3070 }, { "epoch": 0.22884315328033286, "grad_norm": 3.040829142264556e-06, "learning_rate": 7.3903157894736835e-06, "loss": 0.0007, "step": 3080 }, { "epoch": 0.22958615053124304, "grad_norm": 2.702171514101792e-06, "learning_rate": 7.388334365325077e-06, "loss": 0.0001, "step": 3090 }, { "epoch": 0.23032914778215322, "grad_norm": 0.11783476173877716, "learning_rate": 7.38635294117647e-06, "loss": 0.0, "step": 3100 }, { "epoch": 0.23107214503306336, "grad_norm": 0.0003163772926200181, "learning_rate": 7.384371517027864e-06, "loss": 0.0001, "step": 3110 }, { "epoch": 0.23181514228397354, "grad_norm": 1.135177444666624e-05, "learning_rate": 7.3823900928792565e-06, "loss": 0.0001, "step": 3120 }, { "epoch": 0.23255813953488372, "grad_norm": 9.65922936302377e-07, "learning_rate": 7.38040866873065e-06, "loss": 0.0126, "step": 3130 }, { "epoch": 0.2333011367857939, "grad_norm": 1.5842469480373267e-12, "learning_rate": 7.378427244582043e-06, "loss": 0.0005, "step": 3140 }, { "epoch": 0.23404413403670407, "grad_norm": 6.334481383873936e-08, "learning_rate": 7.376445820433436e-06, "loss": 0.0, "step": 3150 }, { "epoch": 0.23478713128761425, "grad_norm": 3.8505598354277026e-07, "learning_rate": 7.3744643962848295e-06, "loss": 0.0045, "step": 3160 }, { "epoch": 0.2355301285385244, "grad_norm": 2.5923464264110407e-08, "learning_rate": 7.372482972136222e-06, "loss": 0.0, "step": 3170 }, { "epoch": 0.23627312578943457, "grad_norm": 8.035100584891097e-09, "learning_rate": 7.370501547987616e-06, "loss": 0.0, "step": 3180 }, { "epoch": 0.23701612304034475, "grad_norm": 3.3618693123571575e-05, "learning_rate": 7.368520123839009e-06, "loss": 0.0156, "step": 3190 }, { "epoch": 0.23775912029125493, "grad_norm": 1.1407921048783853e-14, "learning_rate": 7.3665386996904026e-06, "loss": 0.0, "step": 3200 }, { "epoch": 0.2385021175421651, "grad_norm": 4.4759764250557055e-07, "learning_rate": 7.364557275541796e-06, "loss": 0.0, "step": 3210 }, { "epoch": 0.23924511479307525, "grad_norm": 5.56491686065641e-10, "learning_rate": 7.362575851393188e-06, "loss": 0.0, "step": 3220 }, { "epoch": 0.23998811204398543, "grad_norm": 6.132345065773537e-13, "learning_rate": 7.360594427244581e-06, "loss": 0.0002, "step": 3230 }, { "epoch": 0.2407311092948956, "grad_norm": 6.728667813149514e-06, "learning_rate": 7.358613003095975e-06, "loss": 0.0, "step": 3240 }, { "epoch": 0.24147410654580578, "grad_norm": 7.179497885090314e-08, "learning_rate": 7.356631578947368e-06, "loss": 0.0, "step": 3250 }, { "epoch": 0.24221710379671596, "grad_norm": 8.062368550554311e-08, "learning_rate": 7.354650154798762e-06, "loss": 0.0, "step": 3260 }, { "epoch": 0.24296010104762614, "grad_norm": 0.1792200654745102, "learning_rate": 7.352668730650154e-06, "loss": 0.0001, "step": 3270 }, { "epoch": 0.24370309829853629, "grad_norm": 2.1773093744625882e-11, "learning_rate": 7.350687306501548e-06, "loss": 0.0, "step": 3280 }, { "epoch": 0.24444609554944646, "grad_norm": 7.176682174758753e-06, "learning_rate": 7.34870588235294e-06, "loss": 0.0046, "step": 3290 }, { "epoch": 0.24518909280035664, "grad_norm": 2.61812864721378e-08, "learning_rate": 7.346724458204334e-06, "loss": 0.0, "step": 3300 }, { "epoch": 0.24593209005126682, "grad_norm": 8.351117575244871e-10, "learning_rate": 7.344743034055727e-06, "loss": 0.0001, "step": 3310 }, { "epoch": 0.246675087302177, "grad_norm": 5.397264612838626e-05, "learning_rate": 7.34276160990712e-06, "loss": 0.0, "step": 3320 }, { "epoch": 0.24741808455308714, "grad_norm": 1.0713654319260968e-06, "learning_rate": 7.3407801857585134e-06, "loss": 0.0157, "step": 3330 }, { "epoch": 0.24816108180399732, "grad_norm": 5.793017454625016e-11, "learning_rate": 7.338798761609907e-06, "loss": 0.075, "step": 3340 }, { "epoch": 0.2489040790549075, "grad_norm": 0.0012511357199400663, "learning_rate": 7.3368173374613e-06, "loss": 0.0, "step": 3350 }, { "epoch": 0.24964707630581767, "grad_norm": 0.08630578964948654, "learning_rate": 7.334835913312694e-06, "loss": 0.0, "step": 3360 }, { "epoch": 0.2503900735567278, "grad_norm": 0.32965099811553955, "learning_rate": 7.332854489164086e-06, "loss": 0.0001, "step": 3370 }, { "epoch": 0.251133070807638, "grad_norm": 3.0208922030539043e-09, "learning_rate": 7.330873065015479e-06, "loss": 0.1127, "step": 3380 }, { "epoch": 0.2518760680585482, "grad_norm": 2.5396579284020504e-10, "learning_rate": 7.3288916408668726e-06, "loss": 0.0001, "step": 3390 }, { "epoch": 0.2526190653094584, "grad_norm": 3.82775144913694e-09, "learning_rate": 7.326910216718266e-06, "loss": 0.0001, "step": 3400 }, { "epoch": 0.25336206256036853, "grad_norm": 1.0872491795765882e-09, "learning_rate": 7.3249287925696595e-06, "loss": 0.0, "step": 3410 }, { "epoch": 0.2541050598112787, "grad_norm": 5.411799914450954e-12, "learning_rate": 7.322947368421052e-06, "loss": 0.0009, "step": 3420 }, { "epoch": 0.2548480570621889, "grad_norm": 1.2066847521197133e-09, "learning_rate": 7.320965944272446e-06, "loss": 0.0, "step": 3430 }, { "epoch": 0.25559105431309903, "grad_norm": 0.00037910256651230156, "learning_rate": 7.318984520123838e-06, "loss": 0.0005, "step": 3440 }, { "epoch": 0.25633405156400924, "grad_norm": 2.2202501989698753e-11, "learning_rate": 7.317003095975232e-06, "loss": 0.0, "step": 3450 }, { "epoch": 0.2570770488149194, "grad_norm": 3.7975578237592345e-08, "learning_rate": 7.315021671826625e-06, "loss": 0.0, "step": 3460 }, { "epoch": 0.25782004606582953, "grad_norm": 0.00020925186981912702, "learning_rate": 7.313040247678018e-06, "loss": 0.0287, "step": 3470 }, { "epoch": 0.25856304331673974, "grad_norm": 3.655731461549294e-06, "learning_rate": 7.311058823529411e-06, "loss": 0.0001, "step": 3480 }, { "epoch": 0.2593060405676499, "grad_norm": 1.9072173174095042e-08, "learning_rate": 7.309077399380805e-06, "loss": 0.0, "step": 3490 }, { "epoch": 0.2600490378185601, "grad_norm": 2.2218841877474915e-07, "learning_rate": 7.307095975232198e-06, "loss": 0.0, "step": 3500 }, { "epoch": 0.26079203506947024, "grad_norm": 2.4029860767882383e-08, "learning_rate": 7.305114551083592e-06, "loss": 0.0, "step": 3510 }, { "epoch": 0.2615350323203804, "grad_norm": 4.358967089501675e-06, "learning_rate": 7.3031331269349834e-06, "loss": 0.0, "step": 3520 }, { "epoch": 0.2622780295712906, "grad_norm": 0.0002249205717816949, "learning_rate": 7.301151702786377e-06, "loss": 0.0006, "step": 3530 }, { "epoch": 0.26302102682220074, "grad_norm": 0.00798623263835907, "learning_rate": 7.29917027863777e-06, "loss": 0.0, "step": 3540 }, { "epoch": 0.26376402407311095, "grad_norm": 2.6784143447875977, "learning_rate": 7.297188854489164e-06, "loss": 0.0004, "step": 3550 }, { "epoch": 0.2645070213240211, "grad_norm": 0.02445812337100506, "learning_rate": 7.295207430340557e-06, "loss": 0.0, "step": 3560 }, { "epoch": 0.26525001857493125, "grad_norm": 0.00033516684197820723, "learning_rate": 7.293226006191951e-06, "loss": 0.0022, "step": 3570 }, { "epoch": 0.26599301582584145, "grad_norm": 9.361514941019777e-08, "learning_rate": 7.291244582043343e-06, "loss": 0.0, "step": 3580 }, { "epoch": 0.2667360130767516, "grad_norm": 4.4283584429649636e-05, "learning_rate": 7.289263157894736e-06, "loss": 0.0001, "step": 3590 }, { "epoch": 0.2674790103276618, "grad_norm": 0.00022894414723850787, "learning_rate": 7.2872817337461295e-06, "loss": 0.0, "step": 3600 }, { "epoch": 0.26822200757857195, "grad_norm": 0.00011575646203709766, "learning_rate": 7.285300309597523e-06, "loss": 0.0, "step": 3610 }, { "epoch": 0.26896500482948216, "grad_norm": 2.3543227598565863e-06, "learning_rate": 7.2833188854489165e-06, "loss": 0.0, "step": 3620 }, { "epoch": 0.2697080020803923, "grad_norm": 1.1656931420134242e-08, "learning_rate": 7.281337461300309e-06, "loss": 0.0856, "step": 3630 }, { "epoch": 0.27045099933130246, "grad_norm": 3.938530426239595e-07, "learning_rate": 7.2793560371517025e-06, "loss": 0.0, "step": 3640 }, { "epoch": 0.27119399658221266, "grad_norm": 8.80844897022115e-10, "learning_rate": 7.277374613003096e-06, "loss": 0.0, "step": 3650 }, { "epoch": 0.2719369938331228, "grad_norm": 3.0943141609895974e-05, "learning_rate": 7.2753931888544895e-06, "loss": 0.0, "step": 3660 }, { "epoch": 0.272679991084033, "grad_norm": 2.1428675651550293, "learning_rate": 7.273411764705882e-06, "loss": 0.0005, "step": 3670 }, { "epoch": 0.27342298833494316, "grad_norm": 2.741811158557539e-06, "learning_rate": 7.271430340557275e-06, "loss": 0.0002, "step": 3680 }, { "epoch": 0.2741659855858533, "grad_norm": 4.492115230139704e-14, "learning_rate": 7.269448916408668e-06, "loss": 0.0193, "step": 3690 }, { "epoch": 0.2749089828367635, "grad_norm": 1.6595483884884743e-06, "learning_rate": 7.267467492260062e-06, "loss": 0.0019, "step": 3700 }, { "epoch": 0.27565198008767366, "grad_norm": 0.0002628306392580271, "learning_rate": 7.265486068111455e-06, "loss": 0.0012, "step": 3710 }, { "epoch": 0.27639497733858387, "grad_norm": 1.2856664344251811e-10, "learning_rate": 7.263504643962849e-06, "loss": 0.0, "step": 3720 }, { "epoch": 0.277137974589494, "grad_norm": 2.224500306180488e-10, "learning_rate": 7.261523219814241e-06, "loss": 0.0, "step": 3730 }, { "epoch": 0.27788097184040417, "grad_norm": 3.624904543197971e-10, "learning_rate": 7.259541795665634e-06, "loss": 0.0157, "step": 3740 }, { "epoch": 0.27862396909131437, "grad_norm": 0.001769436290487647, "learning_rate": 7.257560371517027e-06, "loss": 0.0, "step": 3750 }, { "epoch": 0.2793669663422245, "grad_norm": 0.010282271541655064, "learning_rate": 7.255578947368421e-06, "loss": 0.0351, "step": 3760 }, { "epoch": 0.2801099635931347, "grad_norm": 0.00011507143790367991, "learning_rate": 7.253597523219814e-06, "loss": 0.0003, "step": 3770 }, { "epoch": 0.2808529608440449, "grad_norm": 3.043537089020276e-10, "learning_rate": 7.251616099071207e-06, "loss": 0.0003, "step": 3780 }, { "epoch": 0.281595958094955, "grad_norm": 0.01486095879226923, "learning_rate": 7.2496346749226e-06, "loss": 0.0, "step": 3790 }, { "epoch": 0.28233895534586523, "grad_norm": 4.221281798503368e-13, "learning_rate": 7.247653250773994e-06, "loss": 0.0004, "step": 3800 }, { "epoch": 0.2830819525967754, "grad_norm": 4.941714593087454e-08, "learning_rate": 7.2456718266253865e-06, "loss": 0.0, "step": 3810 }, { "epoch": 0.2838249498476856, "grad_norm": 1.3407380240748612e-11, "learning_rate": 7.24369040247678e-06, "loss": 0.0287, "step": 3820 }, { "epoch": 0.28456794709859573, "grad_norm": 9.786994269234128e-06, "learning_rate": 7.2417089783281726e-06, "loss": 0.0003, "step": 3830 }, { "epoch": 0.2853109443495059, "grad_norm": 9.791513086443437e-12, "learning_rate": 7.239727554179566e-06, "loss": 0.0, "step": 3840 }, { "epoch": 0.2860539416004161, "grad_norm": 0.002707146806642413, "learning_rate": 7.2377461300309595e-06, "loss": 0.6876, "step": 3850 }, { "epoch": 0.28679693885132623, "grad_norm": 0.003479096107184887, "learning_rate": 7.235764705882353e-06, "loss": 0.004, "step": 3860 }, { "epoch": 0.28753993610223644, "grad_norm": 0.00021903423476032913, "learning_rate": 7.2337832817337464e-06, "loss": 0.0025, "step": 3870 }, { "epoch": 0.2882829333531466, "grad_norm": 1.5418025967051108e-08, "learning_rate": 7.231801857585139e-06, "loss": 0.0003, "step": 3880 }, { "epoch": 0.2890259306040568, "grad_norm": 0.0002156332047889009, "learning_rate": 7.229820433436532e-06, "loss": 0.0096, "step": 3890 }, { "epoch": 0.28976892785496694, "grad_norm": 2.9010775506321806e-06, "learning_rate": 7.227839009287925e-06, "loss": 0.0, "step": 3900 }, { "epoch": 0.2905119251058771, "grad_norm": 3.130305736842587e-10, "learning_rate": 7.225857585139319e-06, "loss": 0.0001, "step": 3910 }, { "epoch": 0.2912549223567873, "grad_norm": 1.0393840966571588e-05, "learning_rate": 7.223876160990712e-06, "loss": 0.0, "step": 3920 }, { "epoch": 0.29199791960769744, "grad_norm": 2.190824588410578e-08, "learning_rate": 7.221894736842105e-06, "loss": 0.0, "step": 3930 }, { "epoch": 0.29274091685860765, "grad_norm": 1.211483038332517e-07, "learning_rate": 7.219913312693498e-06, "loss": 0.008, "step": 3940 }, { "epoch": 0.2934839141095178, "grad_norm": 1.393378965985903e-06, "learning_rate": 7.217931888544892e-06, "loss": 0.0007, "step": 3950 }, { "epoch": 0.29422691136042795, "grad_norm": 0.0003949375241063535, "learning_rate": 7.215950464396284e-06, "loss": 0.0, "step": 3960 }, { "epoch": 0.29496990861133815, "grad_norm": 7.934625873007306e-11, "learning_rate": 7.213969040247678e-06, "loss": 0.0, "step": 3970 }, { "epoch": 0.2957129058622483, "grad_norm": 3.1010600650915876e-05, "learning_rate": 7.21198761609907e-06, "loss": 0.0064, "step": 3980 }, { "epoch": 0.2964559031131585, "grad_norm": 1.9996752517736915e-11, "learning_rate": 7.210006191950464e-06, "loss": 0.0077, "step": 3990 }, { "epoch": 0.29719890036406865, "grad_norm": 0.025723395869135857, "learning_rate": 7.208024767801857e-06, "loss": 0.0, "step": 4000 }, { "epoch": 0.2979418976149788, "grad_norm": 5.5564622347681336e-11, "learning_rate": 7.206043343653251e-06, "loss": 0.0002, "step": 4010 }, { "epoch": 0.298684894865889, "grad_norm": 3.570405127106824e-08, "learning_rate": 7.204061919504644e-06, "loss": 0.0, "step": 4020 }, { "epoch": 0.29942789211679915, "grad_norm": 0.021746689453721046, "learning_rate": 7.202080495356037e-06, "loss": 0.0066, "step": 4030 }, { "epoch": 0.30017088936770936, "grad_norm": 7.729664019962001e-09, "learning_rate": 7.2000990712074295e-06, "loss": 0.0063, "step": 4040 }, { "epoch": 0.3009138866186195, "grad_norm": 5.0783500668560855e-12, "learning_rate": 7.198117647058823e-06, "loss": 0.0001, "step": 4050 }, { "epoch": 0.30165688386952966, "grad_norm": 1.9478371541481465e-05, "learning_rate": 7.1961362229102164e-06, "loss": 0.0, "step": 4060 }, { "epoch": 0.30239988112043986, "grad_norm": 3.861693130602362e-06, "learning_rate": 7.19415479876161e-06, "loss": 0.0, "step": 4070 }, { "epoch": 0.30314287837135, "grad_norm": 5.184643470101946e-08, "learning_rate": 7.1921733746130025e-06, "loss": 0.0, "step": 4080 }, { "epoch": 0.3038858756222602, "grad_norm": 0.0015780467074364424, "learning_rate": 7.190191950464396e-06, "loss": 0.0059, "step": 4090 }, { "epoch": 0.30462887287317036, "grad_norm": 1.5719375312528427e-07, "learning_rate": 7.1882105263157895e-06, "loss": 0.0006, "step": 4100 }, { "epoch": 0.3053718701240805, "grad_norm": 2.565387235975436e-09, "learning_rate": 7.186229102167182e-06, "loss": 0.0, "step": 4110 }, { "epoch": 0.3061148673749907, "grad_norm": 2.202836220724369e-12, "learning_rate": 7.1842476780185756e-06, "loss": 0.0002, "step": 4120 }, { "epoch": 0.30685786462590087, "grad_norm": 4.890848686045501e-07, "learning_rate": 7.182266253869968e-06, "loss": 0.0001, "step": 4130 }, { "epoch": 0.30760086187681107, "grad_norm": 0.00040632340824231505, "learning_rate": 7.180284829721362e-06, "loss": 0.0, "step": 4140 }, { "epoch": 0.3083438591277212, "grad_norm": 9.83693971647881e-05, "learning_rate": 7.178303405572755e-06, "loss": 0.0, "step": 4150 }, { "epoch": 0.3090868563786314, "grad_norm": 1.2538682980789417e-08, "learning_rate": 7.176321981424149e-06, "loss": 0.0, "step": 4160 }, { "epoch": 0.3098298536295416, "grad_norm": 8.935343203120283e-07, "learning_rate": 7.174340557275542e-06, "loss": 0.004, "step": 4170 }, { "epoch": 0.3105728508804517, "grad_norm": 1.339043575490867e-15, "learning_rate": 7.172359133126935e-06, "loss": 0.0, "step": 4180 }, { "epoch": 0.3113158481313619, "grad_norm": 0.12025681138038635, "learning_rate": 7.170377708978327e-06, "loss": 0.0, "step": 4190 }, { "epoch": 0.3120588453822721, "grad_norm": 5.727154306356397e-09, "learning_rate": 7.168396284829721e-06, "loss": 0.0, "step": 4200 }, { "epoch": 0.3128018426331823, "grad_norm": 1.4451013718996819e-08, "learning_rate": 7.166414860681114e-06, "loss": 0.0008, "step": 4210 }, { "epoch": 0.31354483988409243, "grad_norm": 4.175752565060975e-06, "learning_rate": 7.164433436532508e-06, "loss": 0.0, "step": 4220 }, { "epoch": 0.3142878371350026, "grad_norm": 2.3422212520191055e-12, "learning_rate": 7.1624520123839e-06, "loss": 0.0, "step": 4230 }, { "epoch": 0.3150308343859128, "grad_norm": 3.7076246371725574e-05, "learning_rate": 7.160470588235294e-06, "loss": 0.0, "step": 4240 }, { "epoch": 0.31577383163682293, "grad_norm": 16.01180648803711, "learning_rate": 7.158489164086687e-06, "loss": 0.0035, "step": 4250 }, { "epoch": 0.31651682888773314, "grad_norm": 9.703868819599393e-10, "learning_rate": 7.15650773993808e-06, "loss": 0.0024, "step": 4260 }, { "epoch": 0.3172598261386433, "grad_norm": 2.3315933503909037e-06, "learning_rate": 7.154526315789473e-06, "loss": 0.0001, "step": 4270 }, { "epoch": 0.31800282338955344, "grad_norm": 0.0010988021967932582, "learning_rate": 7.152544891640866e-06, "loss": 0.0, "step": 4280 }, { "epoch": 0.31874582064046364, "grad_norm": 4.942657324136235e-05, "learning_rate": 7.1505634674922595e-06, "loss": 0.0, "step": 4290 }, { "epoch": 0.3194888178913738, "grad_norm": 1.976124986032262e-14, "learning_rate": 7.148582043343653e-06, "loss": 0.0, "step": 4300 }, { "epoch": 0.320231815142284, "grad_norm": 3.5629429362415976e-07, "learning_rate": 7.146600619195046e-06, "loss": 0.0, "step": 4310 }, { "epoch": 0.32097481239319414, "grad_norm": 1.67028418873727e-10, "learning_rate": 7.14461919504644e-06, "loss": 0.0006, "step": 4320 }, { "epoch": 0.3217178096441043, "grad_norm": 0.0003845445462502539, "learning_rate": 7.142637770897833e-06, "loss": 0.0, "step": 4330 }, { "epoch": 0.3224608068950145, "grad_norm": 0.0002894588396884501, "learning_rate": 7.140656346749225e-06, "loss": 0.0024, "step": 4340 }, { "epoch": 0.32320380414592464, "grad_norm": 1.587673250469379e-05, "learning_rate": 7.138674922600619e-06, "loss": 0.0, "step": 4350 }, { "epoch": 0.32394680139683485, "grad_norm": 2.5997421104761997e-08, "learning_rate": 7.136693498452012e-06, "loss": 0.008, "step": 4360 }, { "epoch": 0.324689798647745, "grad_norm": 1.2604004950844683e-07, "learning_rate": 7.1347120743034055e-06, "loss": 0.0, "step": 4370 }, { "epoch": 0.32543279589865515, "grad_norm": 6.384315853757982e-10, "learning_rate": 7.132730650154798e-06, "loss": 0.0487, "step": 4380 }, { "epoch": 0.32617579314956535, "grad_norm": 4.527398414211348e-05, "learning_rate": 7.130749226006192e-06, "loss": 0.0, "step": 4390 }, { "epoch": 0.3269187904004755, "grad_norm": 0.02074258029460907, "learning_rate": 7.128767801857585e-06, "loss": 0.0741, "step": 4400 }, { "epoch": 0.3276617876513857, "grad_norm": 4.103547325939871e-06, "learning_rate": 7.126786377708978e-06, "loss": 0.0001, "step": 4410 }, { "epoch": 0.32840478490229585, "grad_norm": 1.9218396318798625e-10, "learning_rate": 7.124804953560371e-06, "loss": 0.0051, "step": 4420 }, { "epoch": 0.32914778215320606, "grad_norm": 1.8832202464125203e-09, "learning_rate": 7.122823529411764e-06, "loss": 0.0, "step": 4430 }, { "epoch": 0.3298907794041162, "grad_norm": 2.8247484906707143e-10, "learning_rate": 7.120842105263157e-06, "loss": 0.0, "step": 4440 }, { "epoch": 0.33063377665502636, "grad_norm": 0.002574792131781578, "learning_rate": 7.118860681114551e-06, "loss": 0.0002, "step": 4450 }, { "epoch": 0.33137677390593656, "grad_norm": 4.308467032387853e-05, "learning_rate": 7.116879256965944e-06, "loss": 0.0001, "step": 4460 }, { "epoch": 0.3321197711568467, "grad_norm": 1.8344776991419988e-14, "learning_rate": 7.114897832817338e-06, "loss": 0.0, "step": 4470 }, { "epoch": 0.3328627684077569, "grad_norm": 2.9208226948185256e-08, "learning_rate": 7.112916408668731e-06, "loss": 0.0001, "step": 4480 }, { "epoch": 0.33360576565866706, "grad_norm": 0.0019464093493297696, "learning_rate": 7.110934984520123e-06, "loss": 0.0, "step": 4490 }, { "epoch": 0.3343487629095772, "grad_norm": 9.83138237486969e-10, "learning_rate": 7.108953560371516e-06, "loss": 0.0001, "step": 4500 }, { "epoch": 0.3350917601604874, "grad_norm": 3.0732514311182513e-09, "learning_rate": 7.10697213622291e-06, "loss": 0.0087, "step": 4510 }, { "epoch": 0.33583475741139757, "grad_norm": 0.003226591506972909, "learning_rate": 7.104990712074303e-06, "loss": 0.0002, "step": 4520 }, { "epoch": 0.33657775466230777, "grad_norm": 1.969762841724787e-08, "learning_rate": 7.103009287925697e-06, "loss": 0.0031, "step": 4530 }, { "epoch": 0.3373207519132179, "grad_norm": 0.0002061890554614365, "learning_rate": 7.1010278637770894e-06, "loss": 0.0012, "step": 4540 }, { "epoch": 0.33806374916412807, "grad_norm": 1.1964850727963494e-06, "learning_rate": 7.099046439628483e-06, "loss": 0.0, "step": 4550 }, { "epoch": 0.3388067464150383, "grad_norm": 6.131295776867773e-06, "learning_rate": 7.0970650154798755e-06, "loss": 0.0004, "step": 4560 }, { "epoch": 0.3395497436659484, "grad_norm": 9.627686949897907e-07, "learning_rate": 7.095083591331269e-06, "loss": 0.0, "step": 4570 }, { "epoch": 0.3402927409168586, "grad_norm": 2.2394174015172297e-10, "learning_rate": 7.0931021671826625e-06, "loss": 0.0, "step": 4580 }, { "epoch": 0.3410357381677688, "grad_norm": 9.344408908873447e-07, "learning_rate": 7.091120743034055e-06, "loss": 0.0, "step": 4590 }, { "epoch": 0.3417787354186789, "grad_norm": 1.1174482779097161e-06, "learning_rate": 7.0891393188854486e-06, "loss": 0.0001, "step": 4600 }, { "epoch": 0.34252173266958913, "grad_norm": 1.1277123332376071e-11, "learning_rate": 7.087157894736842e-06, "loss": 0.0, "step": 4610 }, { "epoch": 0.3432647299204993, "grad_norm": 2.5332178665848915e-07, "learning_rate": 7.0851764705882355e-06, "loss": 0.0, "step": 4620 }, { "epoch": 0.3440077271714095, "grad_norm": 0.0018959958106279373, "learning_rate": 7.083195046439628e-06, "loss": 0.0, "step": 4630 }, { "epoch": 0.34475072442231963, "grad_norm": 263.5782470703125, "learning_rate": 7.081213622291021e-06, "loss": 0.1407, "step": 4640 }, { "epoch": 0.34549372167322984, "grad_norm": 0.0006825370364822447, "learning_rate": 7.079232198142414e-06, "loss": 0.0001, "step": 4650 }, { "epoch": 0.34623671892414, "grad_norm": 2.5293205908383243e-05, "learning_rate": 7.077250773993808e-06, "loss": 0.0, "step": 4660 }, { "epoch": 0.34697971617505013, "grad_norm": 1.2313051689361032e-09, "learning_rate": 7.075269349845201e-06, "loss": 0.0091, "step": 4670 }, { "epoch": 0.34772271342596034, "grad_norm": 36.351505279541016, "learning_rate": 7.073287925696595e-06, "loss": 0.0071, "step": 4680 }, { "epoch": 0.3484657106768705, "grad_norm": 1.8500725218473235e-06, "learning_rate": 7.071306501547987e-06, "loss": 0.0, "step": 4690 }, { "epoch": 0.3492087079277807, "grad_norm": 6.325044112596989e-13, "learning_rate": 7.069325077399381e-06, "loss": 0.0, "step": 4700 }, { "epoch": 0.34995170517869084, "grad_norm": 0.004471507389098406, "learning_rate": 7.067343653250773e-06, "loss": 0.0081, "step": 4710 }, { "epoch": 0.350694702429601, "grad_norm": 0.09461221098899841, "learning_rate": 7.065362229102167e-06, "loss": 0.0, "step": 4720 }, { "epoch": 0.3514376996805112, "grad_norm": 1.9487440567900194e-06, "learning_rate": 7.06338080495356e-06, "loss": 0.1672, "step": 4730 }, { "epoch": 0.35218069693142134, "grad_norm": 1.4805665374262311e-12, "learning_rate": 7.061399380804953e-06, "loss": 0.0002, "step": 4740 }, { "epoch": 0.35292369418233155, "grad_norm": 0.0320793017745018, "learning_rate": 7.059417956656346e-06, "loss": 0.0, "step": 4750 }, { "epoch": 0.3536666914332417, "grad_norm": 2.7217818114877446e-06, "learning_rate": 7.05743653250774e-06, "loss": 0.0, "step": 4760 }, { "epoch": 0.35440968868415185, "grad_norm": 1.9429008091265132e-07, "learning_rate": 7.055455108359133e-06, "loss": 0.0096, "step": 4770 }, { "epoch": 0.35515268593506205, "grad_norm": 4.879334714047445e-10, "learning_rate": 7.053473684210526e-06, "loss": 0.0, "step": 4780 }, { "epoch": 0.3558956831859722, "grad_norm": 3.141030546771617e-09, "learning_rate": 7.0514922600619186e-06, "loss": 0.0, "step": 4790 }, { "epoch": 0.3566386804368824, "grad_norm": 0.0020067994482815266, "learning_rate": 7.049510835913312e-06, "loss": 0.0, "step": 4800 }, { "epoch": 0.35738167768779255, "grad_norm": 0.0001746823254507035, "learning_rate": 7.0475294117647055e-06, "loss": 0.0012, "step": 4810 }, { "epoch": 0.3581246749387027, "grad_norm": 2.1653225878992544e-09, "learning_rate": 7.045547987616099e-06, "loss": 0.0, "step": 4820 }, { "epoch": 0.3588676721896129, "grad_norm": 0.0002915837103500962, "learning_rate": 7.0435665634674925e-06, "loss": 0.0, "step": 4830 }, { "epoch": 0.35961066944052306, "grad_norm": 16.670625686645508, "learning_rate": 7.041585139318885e-06, "loss": 0.0033, "step": 4840 }, { "epoch": 0.36035366669143326, "grad_norm": 0.00013888382818549871, "learning_rate": 7.0396037151702785e-06, "loss": 0.2469, "step": 4850 }, { "epoch": 0.3610966639423434, "grad_norm": 8.168048876155964e-11, "learning_rate": 7.037622291021671e-06, "loss": 0.0101, "step": 4860 }, { "epoch": 0.36183966119325356, "grad_norm": 7.678132760702283e-07, "learning_rate": 7.035640866873065e-06, "loss": 0.0002, "step": 4870 }, { "epoch": 0.36258265844416376, "grad_norm": 0.9612240195274353, "learning_rate": 7.033659442724458e-06, "loss": 0.0002, "step": 4880 }, { "epoch": 0.3633256556950739, "grad_norm": 4.064975023254647e-09, "learning_rate": 7.031678018575851e-06, "loss": 0.0006, "step": 4890 }, { "epoch": 0.3640686529459841, "grad_norm": 0.002011739881709218, "learning_rate": 7.029696594427244e-06, "loss": 0.0012, "step": 4900 }, { "epoch": 0.36481165019689427, "grad_norm": 0.0014305433724075556, "learning_rate": 7.027715170278638e-06, "loss": 0.0019, "step": 4910 }, { "epoch": 0.36555464744780447, "grad_norm": 9.859310921456199e-06, "learning_rate": 7.025733746130031e-06, "loss": 0.0008, "step": 4920 }, { "epoch": 0.3662976446987146, "grad_norm": 0.0002149749343516305, "learning_rate": 7.023752321981424e-06, "loss": 0.0, "step": 4930 }, { "epoch": 0.36704064194962477, "grad_norm": 1.698508620262146, "learning_rate": 7.021770897832816e-06, "loss": 0.0, "step": 4940 }, { "epoch": 0.367783639200535, "grad_norm": 3.907800419256091e-06, "learning_rate": 7.01978947368421e-06, "loss": 0.1772, "step": 4950 }, { "epoch": 0.3685266364514451, "grad_norm": 2.663685734205501e-07, "learning_rate": 7.017808049535603e-06, "loss": 0.0001, "step": 4960 }, { "epoch": 0.3692696337023553, "grad_norm": 0.15934020280838013, "learning_rate": 7.015826625386997e-06, "loss": 0.0001, "step": 4970 }, { "epoch": 0.3700126309532655, "grad_norm": 0.00096990040037781, "learning_rate": 7.01384520123839e-06, "loss": 0.0, "step": 4980 }, { "epoch": 0.3707556282041756, "grad_norm": 3.473046850110961e-11, "learning_rate": 7.011863777089783e-06, "loss": 0.0019, "step": 4990 }, { "epoch": 0.37149862545508583, "grad_norm": 1.712970314638601e-13, "learning_rate": 7.009882352941176e-06, "loss": 0.0, "step": 5000 }, { "epoch": 0.372241622705996, "grad_norm": 1.5438759159991378e-09, "learning_rate": 7.007900928792569e-06, "loss": 0.0, "step": 5010 }, { "epoch": 0.3729846199569062, "grad_norm": 1.08165924134207e-11, "learning_rate": 7.0059195046439625e-06, "loss": 0.0, "step": 5020 }, { "epoch": 0.37372761720781633, "grad_norm": 3.3489581596768403e-07, "learning_rate": 7.003938080495356e-06, "loss": 0.0001, "step": 5030 }, { "epoch": 0.3744706144587265, "grad_norm": 3.380467292809358e-13, "learning_rate": 7.0019566563467485e-06, "loss": 0.0, "step": 5040 }, { "epoch": 0.3752136117096367, "grad_norm": 0.00491594010964036, "learning_rate": 6.999975232198142e-06, "loss": 0.0, "step": 5050 }, { "epoch": 0.37595660896054683, "grad_norm": 1.1983046777075401e-09, "learning_rate": 6.9979938080495355e-06, "loss": 0.0, "step": 5060 }, { "epoch": 0.37669960621145704, "grad_norm": 7.54259644963895e-06, "learning_rate": 6.996012383900929e-06, "loss": 0.0, "step": 5070 }, { "epoch": 0.3774426034623672, "grad_norm": 17.51377296447754, "learning_rate": 6.994030959752322e-06, "loss": 0.0921, "step": 5080 }, { "epoch": 0.37818560071327734, "grad_norm": 0.0025829547084867954, "learning_rate": 6.992049535603714e-06, "loss": 0.0, "step": 5090 }, { "epoch": 0.37892859796418754, "grad_norm": 5.726691654217575e-08, "learning_rate": 6.990068111455108e-06, "loss": 0.0, "step": 5100 }, { "epoch": 0.3796715952150977, "grad_norm": 3.653300106876145e-09, "learning_rate": 6.988086687306501e-06, "loss": 0.0, "step": 5110 }, { "epoch": 0.3804145924660079, "grad_norm": 0.00018701088265515864, "learning_rate": 6.986105263157895e-06, "loss": 0.8, "step": 5120 }, { "epoch": 0.38115758971691804, "grad_norm": 2.7478718038764782e-06, "learning_rate": 6.984123839009288e-06, "loss": 0.0, "step": 5130 }, { "epoch": 0.3819005869678282, "grad_norm": 0.6950036287307739, "learning_rate": 6.982142414860681e-06, "loss": 0.0002, "step": 5140 }, { "epoch": 0.3826435842187384, "grad_norm": 6.326157745206729e-05, "learning_rate": 6.980160990712074e-06, "loss": 0.0001, "step": 5150 }, { "epoch": 0.38338658146964855, "grad_norm": 0.8677387833595276, "learning_rate": 6.978179566563467e-06, "loss": 0.1311, "step": 5160 }, { "epoch": 0.38412957872055875, "grad_norm": 3.761034895433113e-05, "learning_rate": 6.97619814241486e-06, "loss": 0.0, "step": 5170 }, { "epoch": 0.3848725759714689, "grad_norm": 7.776556594762951e-05, "learning_rate": 6.974216718266254e-06, "loss": 1.5582, "step": 5180 }, { "epoch": 0.3856155732223791, "grad_norm": 0.0033169640228152275, "learning_rate": 6.972235294117646e-06, "loss": 0.0001, "step": 5190 }, { "epoch": 0.38635857047328925, "grad_norm": 1.2518678316197906e-09, "learning_rate": 6.97025386996904e-06, "loss": 0.0, "step": 5200 }, { "epoch": 0.3871015677241994, "grad_norm": 0.10035420209169388, "learning_rate": 6.968272445820433e-06, "loss": 0.0, "step": 5210 }, { "epoch": 0.3878445649751096, "grad_norm": 0.004997360520064831, "learning_rate": 6.966291021671827e-06, "loss": 0.0, "step": 5220 }, { "epoch": 0.38858756222601976, "grad_norm": 7.0449887061840855e-06, "learning_rate": 6.964309597523219e-06, "loss": 0.0001, "step": 5230 }, { "epoch": 0.38933055947692996, "grad_norm": 0.014547607861459255, "learning_rate": 6.962328173374612e-06, "loss": 0.0003, "step": 5240 }, { "epoch": 0.3900735567278401, "grad_norm": 1.9456061295386462e-07, "learning_rate": 6.9603467492260055e-06, "loss": 0.0, "step": 5250 }, { "epoch": 0.39081655397875026, "grad_norm": 5.10857767763509e-14, "learning_rate": 6.958365325077399e-06, "loss": 0.0002, "step": 5260 }, { "epoch": 0.39155955122966046, "grad_norm": 0.026187723502516747, "learning_rate": 6.9563839009287924e-06, "loss": 0.0001, "step": 5270 }, { "epoch": 0.3923025484805706, "grad_norm": 6.517245492432266e-05, "learning_rate": 6.954402476780186e-06, "loss": 0.0, "step": 5280 }, { "epoch": 0.3930455457314808, "grad_norm": 2.26207362175046e-06, "learning_rate": 6.952421052631579e-06, "loss": 0.0, "step": 5290 }, { "epoch": 0.39378854298239097, "grad_norm": 1.4322849892778322e-05, "learning_rate": 6.950439628482971e-06, "loss": 0.0, "step": 5300 }, { "epoch": 0.3945315402333011, "grad_norm": 4.22254945107157e-15, "learning_rate": 6.948458204334365e-06, "loss": 0.0005, "step": 5310 }, { "epoch": 0.3952745374842113, "grad_norm": 3.7941536024099776e-10, "learning_rate": 6.946476780185758e-06, "loss": 0.0002, "step": 5320 }, { "epoch": 0.39601753473512147, "grad_norm": 1.2225576107027791e-09, "learning_rate": 6.9444953560371516e-06, "loss": 0.0, "step": 5330 }, { "epoch": 0.3967605319860317, "grad_norm": 1.4432759432025932e-09, "learning_rate": 6.942513931888545e-06, "loss": 0.0, "step": 5340 }, { "epoch": 0.3975035292369418, "grad_norm": 1.8727871520241024e-06, "learning_rate": 6.940532507739938e-06, "loss": 0.1063, "step": 5350 }, { "epoch": 0.39824652648785197, "grad_norm": 7.686650276184082, "learning_rate": 6.938551083591331e-06, "loss": 0.0012, "step": 5360 }, { "epoch": 0.3989895237387622, "grad_norm": 5.906994893223327e-10, "learning_rate": 6.936569659442725e-06, "loss": 0.0, "step": 5370 }, { "epoch": 0.3997325209896723, "grad_norm": 3.396497805230325e-10, "learning_rate": 6.934588235294117e-06, "loss": 0.4313, "step": 5380 }, { "epoch": 0.40047551824058253, "grad_norm": 0.0007529680733568966, "learning_rate": 6.932606811145511e-06, "loss": 0.0, "step": 5390 }, { "epoch": 0.4012185154914927, "grad_norm": 22.393823623657227, "learning_rate": 6.930625386996903e-06, "loss": 0.0031, "step": 5400 }, { "epoch": 0.4019615127424028, "grad_norm": 1.4750337868463248e-05, "learning_rate": 6.928643962848297e-06, "loss": 0.0, "step": 5410 }, { "epoch": 0.40270450999331303, "grad_norm": 6.947700512682786e-07, "learning_rate": 6.92666253869969e-06, "loss": 0.0, "step": 5420 }, { "epoch": 0.4034475072442232, "grad_norm": 1.853780418059614e-06, "learning_rate": 6.924681114551084e-06, "loss": 0.0287, "step": 5430 }, { "epoch": 0.4041905044951334, "grad_norm": 2.6784264042789552e-11, "learning_rate": 6.922699690402477e-06, "loss": 0.0003, "step": 5440 }, { "epoch": 0.40493350174604353, "grad_norm": 1.7792588550946675e-05, "learning_rate": 6.920718266253869e-06, "loss": 0.0004, "step": 5450 }, { "epoch": 0.40567649899695374, "grad_norm": 3.525993663711091e-10, "learning_rate": 6.9187368421052624e-06, "loss": 0.0002, "step": 5460 }, { "epoch": 0.4064194962478639, "grad_norm": 2.765553341887994e-09, "learning_rate": 6.916755417956656e-06, "loss": 0.0, "step": 5470 }, { "epoch": 0.40716249349877404, "grad_norm": 6.824161374874294e-11, "learning_rate": 6.914773993808049e-06, "loss": 0.0006, "step": 5480 }, { "epoch": 0.40790549074968424, "grad_norm": 1.957044105438399e-06, "learning_rate": 6.912792569659443e-06, "loss": 0.0, "step": 5490 }, { "epoch": 0.4086484880005944, "grad_norm": 1.2777289448706597e-08, "learning_rate": 6.9108111455108355e-06, "loss": 0.0091, "step": 5500 }, { "epoch": 0.4093914852515046, "grad_norm": 2.3734808562991816e-10, "learning_rate": 6.908829721362229e-06, "loss": 0.0, "step": 5510 }, { "epoch": 0.41013448250241474, "grad_norm": 8.059724762587983e-13, "learning_rate": 6.906848297213622e-06, "loss": 0.0, "step": 5520 }, { "epoch": 0.4108774797533249, "grad_norm": 0.00029414703021757305, "learning_rate": 6.904866873065015e-06, "loss": 0.0, "step": 5530 }, { "epoch": 0.4116204770042351, "grad_norm": 1.9257813121953404e-09, "learning_rate": 6.9028854489164085e-06, "loss": 0.0, "step": 5540 }, { "epoch": 0.41236347425514525, "grad_norm": 1.3367191968427505e-05, "learning_rate": 6.900904024767801e-06, "loss": 0.0002, "step": 5550 }, { "epoch": 0.41310647150605545, "grad_norm": 3.3565922876732657e-06, "learning_rate": 6.898922600619195e-06, "loss": 0.0, "step": 5560 }, { "epoch": 0.4138494687569656, "grad_norm": 3.3529643133078935e-06, "learning_rate": 6.896941176470588e-06, "loss": 0.0, "step": 5570 }, { "epoch": 0.41459246600787575, "grad_norm": 1.233872626471566e-05, "learning_rate": 6.8949597523219815e-06, "loss": 0.0013, "step": 5580 }, { "epoch": 0.41533546325878595, "grad_norm": 0.002580970758572221, "learning_rate": 6.892978328173375e-06, "loss": 0.0, "step": 5590 }, { "epoch": 0.4160784605096961, "grad_norm": 74.71837615966797, "learning_rate": 6.890996904024767e-06, "loss": 0.0134, "step": 5600 }, { "epoch": 0.4168214577606063, "grad_norm": 0.0019390626111999154, "learning_rate": 6.88901547987616e-06, "loss": 0.0001, "step": 5610 }, { "epoch": 0.41756445501151646, "grad_norm": 2.1002911125833634e-06, "learning_rate": 6.887034055727554e-06, "loss": 0.0576, "step": 5620 }, { "epoch": 0.4183074522624266, "grad_norm": 2.0986158233426977e-06, "learning_rate": 6.885052631578947e-06, "loss": 0.0, "step": 5630 }, { "epoch": 0.4190504495133368, "grad_norm": 0.0003597211907617748, "learning_rate": 6.883071207430341e-06, "loss": 0.0, "step": 5640 }, { "epoch": 0.41979344676424696, "grad_norm": 0.010736849159002304, "learning_rate": 6.881089783281733e-06, "loss": 0.0, "step": 5650 }, { "epoch": 0.42053644401515716, "grad_norm": 1.336089462711243e-05, "learning_rate": 6.879108359133127e-06, "loss": 0.0004, "step": 5660 }, { "epoch": 0.4212794412660673, "grad_norm": 1.8295757270303525e-11, "learning_rate": 6.87712693498452e-06, "loss": 0.0001, "step": 5670 }, { "epoch": 0.42202243851697746, "grad_norm": 0.006875059101730585, "learning_rate": 6.875145510835913e-06, "loss": 0.0003, "step": 5680 }, { "epoch": 0.42276543576788767, "grad_norm": 5.821933427796466e-06, "learning_rate": 6.873164086687306e-06, "loss": 0.1422, "step": 5690 }, { "epoch": 0.4235084330187978, "grad_norm": 0.00013570136798080057, "learning_rate": 6.871182662538699e-06, "loss": 0.0, "step": 5700 }, { "epoch": 0.424251430269708, "grad_norm": 1.257210373878479, "learning_rate": 6.869201238390092e-06, "loss": 0.0002, "step": 5710 }, { "epoch": 0.42499442752061817, "grad_norm": 3.070278253858305e-09, "learning_rate": 6.867219814241486e-06, "loss": 0.1172, "step": 5720 }, { "epoch": 0.42573742477152837, "grad_norm": 3.802934952545911e-05, "learning_rate": 6.865238390092879e-06, "loss": 0.0, "step": 5730 }, { "epoch": 0.4264804220224385, "grad_norm": 1.4289688139365597e-12, "learning_rate": 6.863256965944273e-06, "loss": 0.0001, "step": 5740 }, { "epoch": 0.42722341927334867, "grad_norm": 0.10254601389169693, "learning_rate": 6.861275541795665e-06, "loss": 0.0002, "step": 5750 }, { "epoch": 0.4279664165242589, "grad_norm": 1.0660895668479498e-06, "learning_rate": 6.859294117647058e-06, "loss": 0.0001, "step": 5760 }, { "epoch": 0.428709413775169, "grad_norm": 1.01205168689944e-08, "learning_rate": 6.8573126934984515e-06, "loss": 0.0001, "step": 5770 }, { "epoch": 0.42945241102607923, "grad_norm": 0.0004645303124561906, "learning_rate": 6.855331269349845e-06, "loss": 0.0006, "step": 5780 }, { "epoch": 0.4301954082769894, "grad_norm": 5.209638498371305e-09, "learning_rate": 6.8533498452012385e-06, "loss": 0.0001, "step": 5790 }, { "epoch": 0.4309384055278995, "grad_norm": 0.6885000467300415, "learning_rate": 6.851368421052631e-06, "loss": 0.0026, "step": 5800 }, { "epoch": 0.43168140277880973, "grad_norm": 2.8175127975062914e-15, "learning_rate": 6.8493869969040246e-06, "loss": 0.0008, "step": 5810 }, { "epoch": 0.4324244000297199, "grad_norm": 5.283308013304122e-09, "learning_rate": 6.847405572755418e-06, "loss": 0.0, "step": 5820 }, { "epoch": 0.4331673972806301, "grad_norm": 9.203136031032955e-09, "learning_rate": 6.845424148606811e-06, "loss": 0.0041, "step": 5830 }, { "epoch": 0.43391039453154023, "grad_norm": 4.921022878079384e-07, "learning_rate": 6.843442724458204e-06, "loss": 0.0, "step": 5840 }, { "epoch": 0.4346533917824504, "grad_norm": 7.141732094595454e-09, "learning_rate": 6.841461300309597e-06, "loss": 0.2125, "step": 5850 }, { "epoch": 0.4353963890333606, "grad_norm": 3.595269859602013e-08, "learning_rate": 6.83947987616099e-06, "loss": 0.0034, "step": 5860 }, { "epoch": 0.43613938628427074, "grad_norm": 0.17007999122142792, "learning_rate": 6.837498452012384e-06, "loss": 0.0005, "step": 5870 }, { "epoch": 0.43688238353518094, "grad_norm": 0.00031011132523417473, "learning_rate": 6.835517027863777e-06, "loss": 0.0, "step": 5880 }, { "epoch": 0.4376253807860911, "grad_norm": 0.0007888266700319946, "learning_rate": 6.833535603715171e-06, "loss": 0.0003, "step": 5890 }, { "epoch": 0.43836837803700124, "grad_norm": 3.0393576366805064e-07, "learning_rate": 6.831554179566562e-06, "loss": 0.0, "step": 5900 }, { "epoch": 0.43911137528791144, "grad_norm": 2.520565658414853e-06, "learning_rate": 6.829572755417956e-06, "loss": 0.0, "step": 5910 }, { "epoch": 0.4398543725388216, "grad_norm": 2.70310107630678e-10, "learning_rate": 6.827591331269349e-06, "loss": 0.0214, "step": 5920 }, { "epoch": 0.4405973697897318, "grad_norm": 2.0473902521273013e-12, "learning_rate": 6.825609907120743e-06, "loss": 0.0003, "step": 5930 }, { "epoch": 0.44134036704064195, "grad_norm": 2.9224142394923547e-07, "learning_rate": 6.823628482972136e-06, "loss": 0.0006, "step": 5940 }, { "epoch": 0.4420833642915521, "grad_norm": 0.003711557714268565, "learning_rate": 6.821647058823529e-06, "loss": 0.0001, "step": 5950 }, { "epoch": 0.4428263615424623, "grad_norm": 0.3994269073009491, "learning_rate": 6.819665634674922e-06, "loss": 0.0002, "step": 5960 }, { "epoch": 0.44356935879337245, "grad_norm": 1.4680202583064461e-12, "learning_rate": 6.817684210526315e-06, "loss": 0.0102, "step": 5970 }, { "epoch": 0.44431235604428265, "grad_norm": 0.00024169511743821204, "learning_rate": 6.8157027863777085e-06, "loss": 0.0004, "step": 5980 }, { "epoch": 0.4450553532951928, "grad_norm": 2.7734412497441765e-15, "learning_rate": 6.813721362229102e-06, "loss": 0.0, "step": 5990 }, { "epoch": 0.445798350546103, "grad_norm": 4.2833755287574604e-05, "learning_rate": 6.8117399380804946e-06, "loss": 0.0001, "step": 6000 }, { "epoch": 0.44654134779701316, "grad_norm": 6.269217465160182e-07, "learning_rate": 6.809758513931888e-06, "loss": 0.0114, "step": 6010 }, { "epoch": 0.4472843450479233, "grad_norm": 4.9403010052628815e-05, "learning_rate": 6.8077770897832815e-06, "loss": 0.0, "step": 6020 }, { "epoch": 0.4480273422988335, "grad_norm": 1469.22412109375, "learning_rate": 6.805795665634675e-06, "loss": 0.2021, "step": 6030 }, { "epoch": 0.44877033954974366, "grad_norm": 0.12646107375621796, "learning_rate": 6.8038142414860684e-06, "loss": 0.0, "step": 6040 }, { "epoch": 0.44951333680065386, "grad_norm": 0.01065218634903431, "learning_rate": 6.80183281733746e-06, "loss": 0.0, "step": 6050 }, { "epoch": 0.450256334051564, "grad_norm": 3.4290242867385246e-10, "learning_rate": 6.799851393188854e-06, "loss": 0.1641, "step": 6060 }, { "epoch": 0.45099933130247416, "grad_norm": 1.0118425564087374e-07, "learning_rate": 6.797869969040247e-06, "loss": 0.0, "step": 6070 }, { "epoch": 0.45174232855338436, "grad_norm": 0.0007906552054919302, "learning_rate": 6.795888544891641e-06, "loss": 0.0486, "step": 6080 }, { "epoch": 0.4524853258042945, "grad_norm": 3.751475742319599e-05, "learning_rate": 6.793907120743034e-06, "loss": 0.0, "step": 6090 }, { "epoch": 0.4532283230552047, "grad_norm": 6.846178735031572e-07, "learning_rate": 6.7919256965944276e-06, "loss": 0.0, "step": 6100 }, { "epoch": 0.45397132030611487, "grad_norm": 7.500820942141218e-08, "learning_rate": 6.78994427244582e-06, "loss": 0.0001, "step": 6110 }, { "epoch": 0.454714317557025, "grad_norm": 1.576177954673767, "learning_rate": 6.787962848297213e-06, "loss": 0.0003, "step": 6120 }, { "epoch": 0.4554573148079352, "grad_norm": 3.63332947017625e-05, "learning_rate": 6.785981424148606e-06, "loss": 0.0002, "step": 6130 }, { "epoch": 0.45620031205884537, "grad_norm": 6.951583406133999e-12, "learning_rate": 6.784e-06, "loss": 0.0, "step": 6140 }, { "epoch": 0.4569433093097556, "grad_norm": 0.4472675323486328, "learning_rate": 6.782018575851393e-06, "loss": 0.0001, "step": 6150 }, { "epoch": 0.4576863065606657, "grad_norm": 0.003570985049009323, "learning_rate": 6.780037151702786e-06, "loss": 0.0081, "step": 6160 }, { "epoch": 0.4584293038115759, "grad_norm": 3.4591221553803564e-10, "learning_rate": 6.778055727554179e-06, "loss": 0.0, "step": 6170 }, { "epoch": 0.4591723010624861, "grad_norm": 0.0007409548852592707, "learning_rate": 6.776074303405573e-06, "loss": 0.0, "step": 6180 }, { "epoch": 0.4599152983133962, "grad_norm": 3.6000451473228168e-06, "learning_rate": 6.774092879256966e-06, "loss": 0.0012, "step": 6190 }, { "epoch": 0.46065829556430643, "grad_norm": 8.330457745842068e-08, "learning_rate": 6.772111455108358e-06, "loss": 0.0, "step": 6200 }, { "epoch": 0.4614012928152166, "grad_norm": 2.8183758331579156e-06, "learning_rate": 6.7701300309597515e-06, "loss": 0.0, "step": 6210 }, { "epoch": 0.46214429006612673, "grad_norm": 3.6684799852082506e-05, "learning_rate": 6.768148606811145e-06, "loss": 0.4219, "step": 6220 }, { "epoch": 0.46288728731703693, "grad_norm": 3.2216822543063017e-09, "learning_rate": 6.7661671826625385e-06, "loss": 0.0004, "step": 6230 }, { "epoch": 0.4636302845679471, "grad_norm": 9.26785673982522e-07, "learning_rate": 6.764185758513932e-06, "loss": 0.0016, "step": 6240 }, { "epoch": 0.4643732818188573, "grad_norm": 0.044501885771751404, "learning_rate": 6.762204334365325e-06, "loss": 0.0, "step": 6250 }, { "epoch": 0.46511627906976744, "grad_norm": 2.947477639736462e-07, "learning_rate": 6.760222910216718e-06, "loss": 0.1016, "step": 6260 }, { "epoch": 0.46585927632067764, "grad_norm": 4.6588407712988555e-05, "learning_rate": 6.758241486068111e-06, "loss": 0.0012, "step": 6270 }, { "epoch": 0.4666022735715878, "grad_norm": 4.834658895447319e-08, "learning_rate": 6.756260061919504e-06, "loss": 0.0, "step": 6280 }, { "epoch": 0.46734527082249794, "grad_norm": 1.4594309050153242e-07, "learning_rate": 6.754278637770898e-06, "loss": 0.0, "step": 6290 }, { "epoch": 0.46808826807340814, "grad_norm": 1.2910660416309838e-06, "learning_rate": 6.752297213622291e-06, "loss": 0.0001, "step": 6300 }, { "epoch": 0.4688312653243183, "grad_norm": 4.898447438146138e-15, "learning_rate": 6.750315789473684e-06, "loss": 0.0, "step": 6310 }, { "epoch": 0.4695742625752285, "grad_norm": 7.288541326033737e-08, "learning_rate": 6.748334365325077e-06, "loss": 0.0, "step": 6320 }, { "epoch": 0.47031725982613865, "grad_norm": 0.13941597938537598, "learning_rate": 6.746352941176471e-06, "loss": 0.0, "step": 6330 }, { "epoch": 0.4710602570770488, "grad_norm": 7.502061038361374e-13, "learning_rate": 6.744371517027864e-06, "loss": 0.0, "step": 6340 }, { "epoch": 0.471803254327959, "grad_norm": 3.5279601462434584e-09, "learning_rate": 6.742390092879257e-06, "loss": 0.0878, "step": 6350 }, { "epoch": 0.47254625157886915, "grad_norm": 0.0094557274132967, "learning_rate": 6.740408668730649e-06, "loss": 0.0005, "step": 6360 }, { "epoch": 0.47328924882977935, "grad_norm": 1.2584079911448498e-07, "learning_rate": 6.738427244582043e-06, "loss": 0.0, "step": 6370 }, { "epoch": 0.4740322460806895, "grad_norm": 1.3283269612290177e-13, "learning_rate": 6.736445820433436e-06, "loss": 0.0, "step": 6380 }, { "epoch": 0.47477524333159965, "grad_norm": 3.707304949140955e-11, "learning_rate": 6.73446439628483e-06, "loss": 0.0, "step": 6390 }, { "epoch": 0.47551824058250985, "grad_norm": 9.16432253976647e-14, "learning_rate": 6.732482972136223e-06, "loss": 0.0, "step": 6400 }, { "epoch": 0.47626123783342, "grad_norm": 0.0001218633187818341, "learning_rate": 6.730501547987616e-06, "loss": 0.0, "step": 6410 }, { "epoch": 0.4770042350843302, "grad_norm": 3.55296720044862e-06, "learning_rate": 6.7285201238390085e-06, "loss": 0.0, "step": 6420 }, { "epoch": 0.47774723233524036, "grad_norm": 1.7938030618183665e-12, "learning_rate": 6.726538699690402e-06, "loss": 0.0, "step": 6430 }, { "epoch": 0.4784902295861505, "grad_norm": 5.849318313266849e-06, "learning_rate": 6.724557275541795e-06, "loss": 0.0, "step": 6440 }, { "epoch": 0.4792332268370607, "grad_norm": 0.29423731565475464, "learning_rate": 6.722575851393189e-06, "loss": 0.0004, "step": 6450 }, { "epoch": 0.47997622408797086, "grad_norm": 5.883005727014279e-08, "learning_rate": 6.7205944272445815e-06, "loss": 0.0024, "step": 6460 }, { "epoch": 0.48071922133888106, "grad_norm": 1.4346632269734982e-05, "learning_rate": 6.718613003095975e-06, "loss": 0.0001, "step": 6470 }, { "epoch": 0.4814622185897912, "grad_norm": 2.7425844928430365e-10, "learning_rate": 6.7166315789473684e-06, "loss": 0.0, "step": 6480 }, { "epoch": 0.48220521584070136, "grad_norm": 0.0009846725733950734, "learning_rate": 6.714650154798762e-06, "loss": 0.0, "step": 6490 }, { "epoch": 0.48294821309161157, "grad_norm": 1.8820895775206736e-07, "learning_rate": 6.7126687306501545e-06, "loss": 0.0, "step": 6500 }, { "epoch": 0.4836912103425217, "grad_norm": 0.00012058517313562334, "learning_rate": 6.710687306501547e-06, "loss": 0.0004, "step": 6510 }, { "epoch": 0.4844342075934319, "grad_norm": 0.027900589630007744, "learning_rate": 6.708705882352941e-06, "loss": 0.0, "step": 6520 }, { "epoch": 0.48517720484434207, "grad_norm": 7.882667851788483e-09, "learning_rate": 6.706724458204334e-06, "loss": 0.0, "step": 6530 }, { "epoch": 0.4859202020952523, "grad_norm": 2.724511915089999e-16, "learning_rate": 6.7047430340557276e-06, "loss": 0.0, "step": 6540 }, { "epoch": 0.4866631993461624, "grad_norm": 1.6570087524314658e-09, "learning_rate": 6.702761609907121e-06, "loss": 0.0, "step": 6550 }, { "epoch": 0.48740619659707257, "grad_norm": 4.709846401773632e-11, "learning_rate": 6.700780185758514e-06, "loss": 0.0001, "step": 6560 }, { "epoch": 0.4881491938479828, "grad_norm": 2.0699836511539615e-08, "learning_rate": 6.698798761609906e-06, "loss": 0.0045, "step": 6570 }, { "epoch": 0.4888921910988929, "grad_norm": 2.465218074121367e-07, "learning_rate": 6.6968173374613e-06, "loss": 0.045, "step": 6580 }, { "epoch": 0.48963518834980313, "grad_norm": 8.000343143521604e-08, "learning_rate": 6.694835913312693e-06, "loss": 0.0001, "step": 6590 }, { "epoch": 0.4903781856007133, "grad_norm": 1.5706042177043855e-05, "learning_rate": 6.692854489164087e-06, "loss": 0.0, "step": 6600 }, { "epoch": 0.49112118285162343, "grad_norm": 0.009890247136354446, "learning_rate": 6.690873065015479e-06, "loss": 0.1289, "step": 6610 }, { "epoch": 0.49186418010253363, "grad_norm": 8.138137386204392e-10, "learning_rate": 6.688891640866873e-06, "loss": 0.0001, "step": 6620 }, { "epoch": 0.4926071773534438, "grad_norm": 4.947295565216336e-06, "learning_rate": 6.686910216718266e-06, "loss": 0.041, "step": 6630 }, { "epoch": 0.493350174604354, "grad_norm": 0.00032445151009596884, "learning_rate": 6.68492879256966e-06, "loss": 0.0001, "step": 6640 }, { "epoch": 0.49409317185526413, "grad_norm": 0.09742851555347443, "learning_rate": 6.682947368421052e-06, "loss": 0.0, "step": 6650 }, { "epoch": 0.4948361691061743, "grad_norm": 286.12921142578125, "learning_rate": 6.680965944272445e-06, "loss": 0.2375, "step": 6660 }, { "epoch": 0.4955791663570845, "grad_norm": 0.6281022429466248, "learning_rate": 6.6789845201238384e-06, "loss": 0.0001, "step": 6670 }, { "epoch": 0.49632216360799464, "grad_norm": 1.925554382609107e-09, "learning_rate": 6.677003095975232e-06, "loss": 0.0, "step": 6680 }, { "epoch": 0.49706516085890484, "grad_norm": 2.0597525463017519e-07, "learning_rate": 6.675021671826625e-06, "loss": 0.166, "step": 6690 }, { "epoch": 0.497808158109815, "grad_norm": 1.5587842128184093e-10, "learning_rate": 6.673040247678019e-06, "loss": 0.0141, "step": 6700 }, { "epoch": 0.49855115536072514, "grad_norm": 2.9893923056079075e-05, "learning_rate": 6.6710588235294115e-06, "loss": 0.0, "step": 6710 }, { "epoch": 0.49929415261163534, "grad_norm": 3.63746949005872e-05, "learning_rate": 6.669077399380804e-06, "loss": 0.0001, "step": 6720 }, { "epoch": 0.5000371498625455, "grad_norm": 5.945497832726687e-06, "learning_rate": 6.6670959752321976e-06, "loss": 0.0, "step": 6730 }, { "epoch": 0.5007801471134556, "grad_norm": 1.9269871245342074e-06, "learning_rate": 6.665114551083591e-06, "loss": 0.0, "step": 6740 }, { "epoch": 0.5015231443643658, "grad_norm": 2.9682192689506337e-05, "learning_rate": 6.6631331269349845e-06, "loss": 0.0013, "step": 6750 }, { "epoch": 0.502266141615276, "grad_norm": 1.8240900345745104e-08, "learning_rate": 6.661151702786377e-06, "loss": 0.0, "step": 6760 }, { "epoch": 0.5030091388661861, "grad_norm": 1.822359990910627e-06, "learning_rate": 6.659170278637771e-06, "loss": 0.0, "step": 6770 }, { "epoch": 0.5037521361170963, "grad_norm": 1.3937438980704542e-09, "learning_rate": 6.657188854489164e-06, "loss": 0.0001, "step": 6780 }, { "epoch": 0.5044951333680066, "grad_norm": 1.1711266722017055e-12, "learning_rate": 6.655207430340557e-06, "loss": 0.0, "step": 6790 }, { "epoch": 0.5052381306189168, "grad_norm": 2.1927473942487397e-14, "learning_rate": 6.65322600619195e-06, "loss": 0.0, "step": 6800 }, { "epoch": 0.5059811278698269, "grad_norm": 4.244204099279614e-09, "learning_rate": 6.651244582043343e-06, "loss": 0.0, "step": 6810 }, { "epoch": 0.5067241251207371, "grad_norm": 1.6244641187768138e-09, "learning_rate": 6.649263157894736e-06, "loss": 0.0, "step": 6820 }, { "epoch": 0.5074671223716473, "grad_norm": 1.811256083783519e-06, "learning_rate": 6.64728173374613e-06, "loss": 0.0, "step": 6830 }, { "epoch": 0.5082101196225574, "grad_norm": 0.0016115694306790829, "learning_rate": 6.645300309597523e-06, "loss": 0.0, "step": 6840 }, { "epoch": 0.5089531168734676, "grad_norm": 1.7542394914471515e-07, "learning_rate": 6.643318885448917e-06, "loss": 0.0002, "step": 6850 }, { "epoch": 0.5096961141243778, "grad_norm": 4.117651997148641e-09, "learning_rate": 6.641337461300309e-06, "loss": 0.0, "step": 6860 }, { "epoch": 0.5104391113752879, "grad_norm": 1.200024524194987e-10, "learning_rate": 6.639356037151702e-06, "loss": 0.0015, "step": 6870 }, { "epoch": 0.5111821086261981, "grad_norm": 0.0003677874046843499, "learning_rate": 6.637374613003095e-06, "loss": 0.0, "step": 6880 }, { "epoch": 0.5119251058771083, "grad_norm": 2.9272594019857934e-06, "learning_rate": 6.635393188854489e-06, "loss": 0.0, "step": 6890 }, { "epoch": 0.5126681031280185, "grad_norm": 1.3174949344829656e-06, "learning_rate": 6.633411764705882e-06, "loss": 0.0002, "step": 6900 }, { "epoch": 0.5134111003789286, "grad_norm": 9.046566664186462e-14, "learning_rate": 6.631430340557275e-06, "loss": 0.0021, "step": 6910 }, { "epoch": 0.5141540976298388, "grad_norm": 1.222252699051296e-08, "learning_rate": 6.629448916408668e-06, "loss": 0.0, "step": 6920 }, { "epoch": 0.514897094880749, "grad_norm": 1.251371713806293e-06, "learning_rate": 6.627467492260062e-06, "loss": 0.0, "step": 6930 }, { "epoch": 0.5156400921316591, "grad_norm": 1.3245629304492468e-07, "learning_rate": 6.6254860681114545e-06, "loss": 0.0, "step": 6940 }, { "epoch": 0.5163830893825693, "grad_norm": 0.01259794645011425, "learning_rate": 6.623504643962848e-06, "loss": 0.0, "step": 6950 }, { "epoch": 0.5171260866334795, "grad_norm": 1.3491393247022643e-07, "learning_rate": 6.621523219814241e-06, "loss": 0.0, "step": 6960 }, { "epoch": 0.5178690838843897, "grad_norm": 2.9449500971168163e-07, "learning_rate": 6.619541795665634e-06, "loss": 0.001, "step": 6970 }, { "epoch": 0.5186120811352998, "grad_norm": 1.7297703607255244e-06, "learning_rate": 6.6175603715170275e-06, "loss": 0.0, "step": 6980 }, { "epoch": 0.51935507838621, "grad_norm": 0.0009799367981031537, "learning_rate": 6.615578947368421e-06, "loss": 0.0015, "step": 6990 }, { "epoch": 0.5200980756371202, "grad_norm": 0.03194209188222885, "learning_rate": 6.6135975232198145e-06, "loss": 0.0, "step": 7000 }, { "epoch": 0.5208410728880303, "grad_norm": 0.6720610857009888, "learning_rate": 6.611616099071208e-06, "loss": 0.0021, "step": 7010 }, { "epoch": 0.5215840701389405, "grad_norm": 2.619980404502953e-12, "learning_rate": 6.6096346749226e-06, "loss": 0.0001, "step": 7020 }, { "epoch": 0.5223270673898507, "grad_norm": 1.4625645875930786, "learning_rate": 6.607653250773993e-06, "loss": 0.0003, "step": 7030 }, { "epoch": 0.5230700646407608, "grad_norm": 2.0288728475037487e-09, "learning_rate": 6.605671826625387e-06, "loss": 0.0, "step": 7040 }, { "epoch": 0.523813061891671, "grad_norm": 2.0943677347950995e-11, "learning_rate": 6.60369040247678e-06, "loss": 0.0001, "step": 7050 }, { "epoch": 0.5245560591425812, "grad_norm": 2.5892422971310336e-13, "learning_rate": 6.601708978328174e-06, "loss": 0.0, "step": 7060 }, { "epoch": 0.5252990563934914, "grad_norm": 7.075490367214332e-13, "learning_rate": 6.599727554179566e-06, "loss": 0.0001, "step": 7070 }, { "epoch": 0.5260420536444015, "grad_norm": 3.7365855405369075e-06, "learning_rate": 6.59774613003096e-06, "loss": 0.0101, "step": 7080 }, { "epoch": 0.5267850508953117, "grad_norm": 0.00039109590579755604, "learning_rate": 6.595764705882352e-06, "loss": 0.0, "step": 7090 }, { "epoch": 0.5275280481462219, "grad_norm": 3.791037215705728e-06, "learning_rate": 6.593783281733746e-06, "loss": 0.0193, "step": 7100 }, { "epoch": 0.528271045397132, "grad_norm": 4.051130417792592e-06, "learning_rate": 6.591801857585139e-06, "loss": 0.0004, "step": 7110 }, { "epoch": 0.5290140426480422, "grad_norm": 5.169516043679323e-06, "learning_rate": 6.589820433436532e-06, "loss": 0.0, "step": 7120 }, { "epoch": 0.5297570398989524, "grad_norm": 1.3115494766680058e-05, "learning_rate": 6.587839009287925e-06, "loss": 0.0, "step": 7130 }, { "epoch": 0.5305000371498625, "grad_norm": 3.2502930480404757e-06, "learning_rate": 6.585857585139319e-06, "loss": 0.0003, "step": 7140 }, { "epoch": 0.5312430344007727, "grad_norm": 4.68359517835637e-10, "learning_rate": 6.583876160990712e-06, "loss": 0.0001, "step": 7150 }, { "epoch": 0.5319860316516829, "grad_norm": 8.31640122435709e-14, "learning_rate": 6.581894736842106e-06, "loss": 0.0, "step": 7160 }, { "epoch": 0.5327290289025931, "grad_norm": 9.484141969551274e-07, "learning_rate": 6.5799133126934975e-06, "loss": 0.0001, "step": 7170 }, { "epoch": 0.5334720261535032, "grad_norm": 3.77766446035821e-05, "learning_rate": 6.577931888544891e-06, "loss": 0.0, "step": 7180 }, { "epoch": 0.5342150234044134, "grad_norm": 1.5792127783242904e-07, "learning_rate": 6.5759504643962845e-06, "loss": 0.0064, "step": 7190 }, { "epoch": 0.5349580206553236, "grad_norm": 0.0004631171759683639, "learning_rate": 6.573969040247678e-06, "loss": 0.0, "step": 7200 }, { "epoch": 0.5357010179062337, "grad_norm": 0.006241864059120417, "learning_rate": 6.571987616099071e-06, "loss": 0.0016, "step": 7210 }, { "epoch": 0.5364440151571439, "grad_norm": 0.000653514638543129, "learning_rate": 6.570006191950464e-06, "loss": 0.0, "step": 7220 }, { "epoch": 0.5371870124080541, "grad_norm": 0.000969011103734374, "learning_rate": 6.5680247678018575e-06, "loss": 0.0, "step": 7230 }, { "epoch": 0.5379300096589643, "grad_norm": 7.111579380936206e-11, "learning_rate": 6.56604334365325e-06, "loss": 0.0, "step": 7240 }, { "epoch": 0.5386730069098744, "grad_norm": 8.403596439165995e-05, "learning_rate": 6.564061919504644e-06, "loss": 0.0, "step": 7250 }, { "epoch": 0.5394160041607846, "grad_norm": 2.31096244363016e-08, "learning_rate": 6.562080495356037e-06, "loss": 0.0, "step": 7260 }, { "epoch": 0.5401590014116948, "grad_norm": 3.426975681009026e-08, "learning_rate": 6.56009907120743e-06, "loss": 0.0412, "step": 7270 }, { "epoch": 0.5409019986626049, "grad_norm": 0.0019007496302947402, "learning_rate": 6.558117647058823e-06, "loss": 0.0, "step": 7280 }, { "epoch": 0.5416449959135151, "grad_norm": 2.3429200801672323e-09, "learning_rate": 6.556136222910217e-06, "loss": 0.0, "step": 7290 }, { "epoch": 0.5423879931644253, "grad_norm": 7.970580639948821e-08, "learning_rate": 6.55415479876161e-06, "loss": 0.0, "step": 7300 }, { "epoch": 0.5431309904153354, "grad_norm": 3.526418268506859e-08, "learning_rate": 6.5521733746130036e-06, "loss": 0.0, "step": 7310 }, { "epoch": 0.5438739876662456, "grad_norm": 1.103866296148226e-07, "learning_rate": 6.550191950464395e-06, "loss": 0.0, "step": 7320 }, { "epoch": 0.5446169849171558, "grad_norm": 2.9826600551605225, "learning_rate": 6.548210526315789e-06, "loss": 0.0009, "step": 7330 }, { "epoch": 0.545359982168066, "grad_norm": 8.243339379987447e-07, "learning_rate": 6.546229102167182e-06, "loss": 0.0, "step": 7340 }, { "epoch": 0.5461029794189761, "grad_norm": 2.262101894690005e-10, "learning_rate": 6.544247678018576e-06, "loss": 0.0, "step": 7350 }, { "epoch": 0.5468459766698863, "grad_norm": 4.169824023847468e-07, "learning_rate": 6.542266253869969e-06, "loss": 0.0, "step": 7360 }, { "epoch": 0.5475889739207965, "grad_norm": 2.662619291715629e-10, "learning_rate": 6.540284829721362e-06, "loss": 0.0, "step": 7370 }, { "epoch": 0.5483319711717066, "grad_norm": 2.2780800463806372e-06, "learning_rate": 6.538303405572755e-06, "loss": 0.0, "step": 7380 }, { "epoch": 0.5490749684226168, "grad_norm": 2.3555922837315002e-11, "learning_rate": 6.536321981424148e-06, "loss": 0.0005, "step": 7390 }, { "epoch": 0.549817965673527, "grad_norm": 1.2454486053581348e-10, "learning_rate": 6.534340557275541e-06, "loss": 0.0, "step": 7400 }, { "epoch": 0.5505609629244371, "grad_norm": 3.204684510382111e-13, "learning_rate": 6.532359133126935e-06, "loss": 0.1064, "step": 7410 }, { "epoch": 0.5513039601753473, "grad_norm": 8.245896054859259e-08, "learning_rate": 6.5303777089783275e-06, "loss": 0.0, "step": 7420 }, { "epoch": 0.5520469574262575, "grad_norm": 4.138836084166542e-05, "learning_rate": 6.528396284829721e-06, "loss": 0.0002, "step": 7430 }, { "epoch": 0.5527899546771677, "grad_norm": 3.50336108567717e-07, "learning_rate": 6.5264148606811144e-06, "loss": 0.0, "step": 7440 }, { "epoch": 0.5535329519280778, "grad_norm": 6.963732175790938e-06, "learning_rate": 6.524433436532508e-06, "loss": 0.0, "step": 7450 }, { "epoch": 0.554275949178988, "grad_norm": 0.02392680197954178, "learning_rate": 6.5224520123839005e-06, "loss": 0.0, "step": 7460 }, { "epoch": 0.5550189464298982, "grad_norm": 1.0736823081970215, "learning_rate": 6.520470588235293e-06, "loss": 0.0002, "step": 7470 }, { "epoch": 0.5557619436808083, "grad_norm": 1.4198141116139595e-06, "learning_rate": 6.518489164086687e-06, "loss": 0.0, "step": 7480 }, { "epoch": 0.5565049409317185, "grad_norm": 8.396304984614744e-09, "learning_rate": 6.51650773993808e-06, "loss": 0.0, "step": 7490 }, { "epoch": 0.5572479381826287, "grad_norm": 0.26944324374198914, "learning_rate": 6.5145263157894736e-06, "loss": 0.0, "step": 7500 }, { "epoch": 0.557990935433539, "grad_norm": 0.0027476013638079166, "learning_rate": 6.512544891640867e-06, "loss": 0.0187, "step": 7510 }, { "epoch": 0.558733932684449, "grad_norm": 1.4518497664539609e-05, "learning_rate": 6.51056346749226e-06, "loss": 0.0019, "step": 7520 }, { "epoch": 0.5594769299353592, "grad_norm": 1.9529774419879686e-07, "learning_rate": 6.508582043343653e-06, "loss": 0.0001, "step": 7530 }, { "epoch": 0.5602199271862695, "grad_norm": 0.019307488575577736, "learning_rate": 6.506600619195046e-06, "loss": 0.0002, "step": 7540 }, { "epoch": 0.5609629244371795, "grad_norm": 5.1547067414503545e-05, "learning_rate": 6.504619195046439e-06, "loss": 0.0, "step": 7550 }, { "epoch": 0.5617059216880897, "grad_norm": 4.6475885362440295e-09, "learning_rate": 6.502637770897833e-06, "loss": 0.0, "step": 7560 }, { "epoch": 0.562448918939, "grad_norm": 6.795319495722651e-05, "learning_rate": 6.500656346749225e-06, "loss": 0.1641, "step": 7570 }, { "epoch": 0.56319191618991, "grad_norm": 0.00038371336995624006, "learning_rate": 6.498674922600619e-06, "loss": 0.009, "step": 7580 }, { "epoch": 0.5639349134408203, "grad_norm": 1.8073613786140363e-11, "learning_rate": 6.496693498452012e-06, "loss": 0.3375, "step": 7590 }, { "epoch": 0.5646779106917305, "grad_norm": 2.039313271202936e-13, "learning_rate": 6.494712074303406e-06, "loss": 0.0002, "step": 7600 }, { "epoch": 0.5654209079426407, "grad_norm": 0.0026209026109427214, "learning_rate": 6.492730650154798e-06, "loss": 0.0, "step": 7610 }, { "epoch": 0.5661639051935508, "grad_norm": 2.0994968963350402e-07, "learning_rate": 6.490749226006191e-06, "loss": 0.0019, "step": 7620 }, { "epoch": 0.566906902444461, "grad_norm": 0.00016428482194896787, "learning_rate": 6.4887678018575844e-06, "loss": 0.0, "step": 7630 }, { "epoch": 0.5676498996953712, "grad_norm": 3.3536443311277253e-07, "learning_rate": 6.486786377708978e-06, "loss": 0.001, "step": 7640 }, { "epoch": 0.5683928969462813, "grad_norm": 0.12914980947971344, "learning_rate": 6.484804953560371e-06, "loss": 0.0071, "step": 7650 }, { "epoch": 0.5691358941971915, "grad_norm": 4.699152850662358e-05, "learning_rate": 6.482823529411765e-06, "loss": 0.0, "step": 7660 }, { "epoch": 0.5698788914481017, "grad_norm": 0.030384084209799767, "learning_rate": 6.4808421052631575e-06, "loss": 0.0004, "step": 7670 }, { "epoch": 0.5706218886990118, "grad_norm": 3.967921477499026e-12, "learning_rate": 6.478860681114551e-06, "loss": 0.0, "step": 7680 }, { "epoch": 0.571364885949922, "grad_norm": 1.981400110651066e-08, "learning_rate": 6.4768792569659436e-06, "loss": 0.0, "step": 7690 }, { "epoch": 0.5721078832008322, "grad_norm": 5.568028427660465e-06, "learning_rate": 6.474897832817337e-06, "loss": 0.0, "step": 7700 }, { "epoch": 0.5728508804517424, "grad_norm": 7.124887360987486e-06, "learning_rate": 6.4729164086687305e-06, "loss": 0.0, "step": 7710 }, { "epoch": 0.5735938777026525, "grad_norm": 3.9266315070563e-06, "learning_rate": 6.470934984520123e-06, "loss": 0.0004, "step": 7720 }, { "epoch": 0.5743368749535627, "grad_norm": 0.0002633326512295753, "learning_rate": 6.468953560371517e-06, "loss": 0.0, "step": 7730 }, { "epoch": 0.5750798722044729, "grad_norm": 4.659702623754924e-10, "learning_rate": 6.46697213622291e-06, "loss": 0.0001, "step": 7740 }, { "epoch": 0.575822869455383, "grad_norm": 7.636138222395436e-14, "learning_rate": 6.4649907120743035e-06, "loss": 0.0001, "step": 7750 }, { "epoch": 0.5765658667062932, "grad_norm": 0.016019893810153008, "learning_rate": 6.463009287925696e-06, "loss": 0.0, "step": 7760 }, { "epoch": 0.5773088639572034, "grad_norm": 0.10692372918128967, "learning_rate": 6.461027863777089e-06, "loss": 0.0, "step": 7770 }, { "epoch": 0.5780518612081136, "grad_norm": 1.2054341596012819e-06, "learning_rate": 6.459046439628482e-06, "loss": 0.1063, "step": 7780 }, { "epoch": 0.5787948584590237, "grad_norm": 0.001959781628102064, "learning_rate": 6.457065015479876e-06, "loss": 0.0001, "step": 7790 }, { "epoch": 0.5795378557099339, "grad_norm": 0.0018165361834689975, "learning_rate": 6.455083591331269e-06, "loss": 0.0001, "step": 7800 }, { "epoch": 0.5802808529608441, "grad_norm": 9.433304512640461e-06, "learning_rate": 6.453102167182663e-06, "loss": 0.0002, "step": 7810 }, { "epoch": 0.5810238502117542, "grad_norm": 3.025428171876676e-11, "learning_rate": 6.451120743034056e-06, "loss": 0.0, "step": 7820 }, { "epoch": 0.5817668474626644, "grad_norm": 0.0015475014224648476, "learning_rate": 6.449139318885449e-06, "loss": 0.0, "step": 7830 }, { "epoch": 0.5825098447135746, "grad_norm": 0.0002484378346707672, "learning_rate": 6.447157894736841e-06, "loss": 0.2629, "step": 7840 }, { "epoch": 0.5832528419644847, "grad_norm": 1.275467231831584e-10, "learning_rate": 6.445176470588235e-06, "loss": 0.0001, "step": 7850 }, { "epoch": 0.5839958392153949, "grad_norm": 0.002090283203870058, "learning_rate": 6.443195046439628e-06, "loss": 0.0, "step": 7860 }, { "epoch": 0.5847388364663051, "grad_norm": 1.461653709411621, "learning_rate": 6.441213622291022e-06, "loss": 0.0026, "step": 7870 }, { "epoch": 0.5854818337172153, "grad_norm": 1.1970661262239446e-07, "learning_rate": 6.4392321981424144e-06, "loss": 0.0, "step": 7880 }, { "epoch": 0.5862248309681254, "grad_norm": 1.3728321590633641e-08, "learning_rate": 6.437250773993808e-06, "loss": 0.0157, "step": 7890 }, { "epoch": 0.5869678282190356, "grad_norm": 1.6242016620537925e-09, "learning_rate": 6.435269349845201e-06, "loss": 0.0, "step": 7900 }, { "epoch": 0.5877108254699458, "grad_norm": 1.4378976231910201e-08, "learning_rate": 6.433287925696594e-06, "loss": 0.0, "step": 7910 }, { "epoch": 0.5884538227208559, "grad_norm": 0.008077711798250675, "learning_rate": 6.4313065015479875e-06, "loss": 0.0, "step": 7920 }, { "epoch": 0.5891968199717661, "grad_norm": 0.0002544414601288736, "learning_rate": 6.42932507739938e-06, "loss": 0.005, "step": 7930 }, { "epoch": 0.5899398172226763, "grad_norm": 5.384289541998655e-11, "learning_rate": 6.4273436532507736e-06, "loss": 0.0, "step": 7940 }, { "epoch": 0.5906828144735864, "grad_norm": 2.9039101678840495e-10, "learning_rate": 6.425362229102167e-06, "loss": 0.0003, "step": 7950 }, { "epoch": 0.5914258117244966, "grad_norm": 1.050093567656063e-09, "learning_rate": 6.4233808049535605e-06, "loss": 0.0, "step": 7960 }, { "epoch": 0.5921688089754068, "grad_norm": 0.005444488953799009, "learning_rate": 6.421399380804954e-06, "loss": 0.0002, "step": 7970 }, { "epoch": 0.592911806226317, "grad_norm": 7.657840290706264e-13, "learning_rate": 6.419417956656347e-06, "loss": 0.0, "step": 7980 }, { "epoch": 0.5936548034772271, "grad_norm": 3.0229304570639215e-07, "learning_rate": 6.417436532507739e-06, "loss": 0.0, "step": 7990 }, { "epoch": 0.5943978007281373, "grad_norm": 6.853522063465789e-05, "learning_rate": 6.415455108359133e-06, "loss": 0.0, "step": 8000 }, { "epoch": 0.5951407979790475, "grad_norm": 5.9747443199157715, "learning_rate": 6.413473684210526e-06, "loss": 0.0009, "step": 8010 }, { "epoch": 0.5958837952299576, "grad_norm": 3.2566595253857855e-11, "learning_rate": 6.41149226006192e-06, "loss": 0.0, "step": 8020 }, { "epoch": 0.5966267924808678, "grad_norm": 6.817396647207374e-10, "learning_rate": 6.409510835913312e-06, "loss": 0.0, "step": 8030 }, { "epoch": 0.597369789731778, "grad_norm": 1.0248624171538268e-10, "learning_rate": 6.407529411764706e-06, "loss": 0.0, "step": 8040 }, { "epoch": 0.5981127869826882, "grad_norm": 7.406639301876794e-09, "learning_rate": 6.405547987616099e-06, "loss": 0.0, "step": 8050 }, { "epoch": 0.5988557842335983, "grad_norm": 1.1266004795729145e-09, "learning_rate": 6.403566563467492e-06, "loss": 0.0, "step": 8060 }, { "epoch": 0.5995987814845085, "grad_norm": 6.674009637208655e-05, "learning_rate": 6.401585139318885e-06, "loss": 0.0, "step": 8070 }, { "epoch": 0.6003417787354187, "grad_norm": 0.034099988639354706, "learning_rate": 6.399603715170278e-06, "loss": 0.0003, "step": 8080 }, { "epoch": 0.6010847759863288, "grad_norm": 4.5263641368364915e-06, "learning_rate": 6.397622291021671e-06, "loss": 0.0006, "step": 8090 }, { "epoch": 0.601827773237239, "grad_norm": 6.958866265449615e-07, "learning_rate": 6.395640866873065e-06, "loss": 0.0165, "step": 8100 }, { "epoch": 0.6025707704881492, "grad_norm": 1.8344485652743714e-11, "learning_rate": 6.393659442724458e-06, "loss": 0.0, "step": 8110 }, { "epoch": 0.6033137677390593, "grad_norm": 1.8987422745198046e-09, "learning_rate": 6.391678018575852e-06, "loss": 0.0003, "step": 8120 }, { "epoch": 0.6040567649899695, "grad_norm": 2.7988012334390078e-06, "learning_rate": 6.389696594427244e-06, "loss": 0.0, "step": 8130 }, { "epoch": 0.6047997622408797, "grad_norm": 1.4109196854406036e-05, "learning_rate": 6.387715170278637e-06, "loss": 0.0, "step": 8140 }, { "epoch": 0.6055427594917899, "grad_norm": 0.003218319732695818, "learning_rate": 6.3857337461300305e-06, "loss": 0.0003, "step": 8150 }, { "epoch": 0.6062857567427, "grad_norm": 1.9810070170933614e-06, "learning_rate": 6.383752321981424e-06, "loss": 0.005, "step": 8160 }, { "epoch": 0.6070287539936102, "grad_norm": 0.0005217504221946001, "learning_rate": 6.3817708978328174e-06, "loss": 0.0, "step": 8170 }, { "epoch": 0.6077717512445204, "grad_norm": 0.0014107367023825645, "learning_rate": 6.37978947368421e-06, "loss": 0.0, "step": 8180 }, { "epoch": 0.6085147484954305, "grad_norm": 4.821816901312559e-07, "learning_rate": 6.3778080495356035e-06, "loss": 0.0, "step": 8190 }, { "epoch": 0.6092577457463407, "grad_norm": 1.7456792955616862e-11, "learning_rate": 6.375826625386997e-06, "loss": 0.0024, "step": 8200 }, { "epoch": 0.6100007429972509, "grad_norm": 0.02231399156153202, "learning_rate": 6.37384520123839e-06, "loss": 0.0657, "step": 8210 }, { "epoch": 0.610743740248161, "grad_norm": 1.3426299095153809, "learning_rate": 6.371863777089783e-06, "loss": 0.0002, "step": 8220 }, { "epoch": 0.6114867374990712, "grad_norm": 4.833533129300349e-09, "learning_rate": 6.369882352941176e-06, "loss": 0.0009, "step": 8230 }, { "epoch": 0.6122297347499814, "grad_norm": 8.332702918778523e-07, "learning_rate": 6.367900928792569e-06, "loss": 0.0, "step": 8240 }, { "epoch": 0.6129727320008916, "grad_norm": 2.5480322296544955e-09, "learning_rate": 6.365919504643963e-06, "loss": 0.0, "step": 8250 }, { "epoch": 0.6137157292518017, "grad_norm": 6.379737854003906, "learning_rate": 6.363938080495356e-06, "loss": 0.0496, "step": 8260 }, { "epoch": 0.6144587265027119, "grad_norm": 1.203751344291959e-05, "learning_rate": 6.36195665634675e-06, "loss": 0.0, "step": 8270 }, { "epoch": 0.6152017237536221, "grad_norm": 0.00012066869385307655, "learning_rate": 6.359975232198141e-06, "loss": 0.0, "step": 8280 }, { "epoch": 0.6159447210045322, "grad_norm": 0.15650011599063873, "learning_rate": 6.357993808049535e-06, "loss": 0.0, "step": 8290 }, { "epoch": 0.6166877182554424, "grad_norm": 1.2971264687355877e-12, "learning_rate": 6.356012383900928e-06, "loss": 0.0, "step": 8300 }, { "epoch": 0.6174307155063526, "grad_norm": 1.7732956214544515e-09, "learning_rate": 6.354030959752322e-06, "loss": 0.0, "step": 8310 }, { "epoch": 0.6181737127572628, "grad_norm": 2.9527835732068297e-10, "learning_rate": 6.352049535603715e-06, "loss": 0.0031, "step": 8320 }, { "epoch": 0.6189167100081729, "grad_norm": 9.507188224233687e-06, "learning_rate": 6.350068111455108e-06, "loss": 0.0002, "step": 8330 }, { "epoch": 0.6196597072590831, "grad_norm": 1.4581835330318427e-06, "learning_rate": 6.348086687306501e-06, "loss": 0.0007, "step": 8340 }, { "epoch": 0.6204027045099934, "grad_norm": 8.147990229190327e-07, "learning_rate": 6.346105263157895e-06, "loss": 0.0002, "step": 8350 }, { "epoch": 0.6211457017609034, "grad_norm": 642.1492309570312, "learning_rate": 6.3441238390092874e-06, "loss": 0.5625, "step": 8360 }, { "epoch": 0.6218886990118137, "grad_norm": 2.1766984446003335e-06, "learning_rate": 6.342142414860681e-06, "loss": 0.0, "step": 8370 }, { "epoch": 0.6226316962627239, "grad_norm": 2.0561956262099557e-05, "learning_rate": 6.3401609907120735e-06, "loss": 0.0, "step": 8380 }, { "epoch": 0.623374693513634, "grad_norm": 6.153625011444092, "learning_rate": 6.338179566563467e-06, "loss": 0.0008, "step": 8390 }, { "epoch": 0.6241176907645442, "grad_norm": 1.4457930319622392e-07, "learning_rate": 6.3361981424148605e-06, "loss": 0.0001, "step": 8400 }, { "epoch": 0.6248606880154544, "grad_norm": 0.00044177414383739233, "learning_rate": 6.334216718266254e-06, "loss": 0.0902, "step": 8410 }, { "epoch": 0.6256036852663646, "grad_norm": 2.8457465273135085e-09, "learning_rate": 6.332235294117647e-06, "loss": 0.0, "step": 8420 }, { "epoch": 0.6263466825172747, "grad_norm": 4.810104655916803e-06, "learning_rate": 6.330253869969039e-06, "loss": 0.0009, "step": 8430 }, { "epoch": 0.6270896797681849, "grad_norm": 0.00031204777769744396, "learning_rate": 6.328272445820433e-06, "loss": 0.0001, "step": 8440 }, { "epoch": 0.6278326770190951, "grad_norm": 0.0003991861594840884, "learning_rate": 6.326291021671826e-06, "loss": 0.0, "step": 8450 }, { "epoch": 0.6285756742700052, "grad_norm": 5.799013888463378e-05, "learning_rate": 6.32430959752322e-06, "loss": 0.0004, "step": 8460 }, { "epoch": 0.6293186715209154, "grad_norm": 0.12285811454057693, "learning_rate": 6.322328173374613e-06, "loss": 0.0014, "step": 8470 }, { "epoch": 0.6300616687718256, "grad_norm": 0.006039340980350971, "learning_rate": 6.320346749226006e-06, "loss": 0.0031, "step": 8480 }, { "epoch": 0.6308046660227357, "grad_norm": 3.530200298751396e-10, "learning_rate": 6.318365325077399e-06, "loss": 0.0, "step": 8490 }, { "epoch": 0.6315476632736459, "grad_norm": 2.8169969468194722e-08, "learning_rate": 6.316383900928793e-06, "loss": 0.0, "step": 8500 }, { "epoch": 0.6322906605245561, "grad_norm": 0.00012049041833961383, "learning_rate": 6.314402476780185e-06, "loss": 0.0014, "step": 8510 }, { "epoch": 0.6330336577754663, "grad_norm": 5.898574784168886e-08, "learning_rate": 6.312421052631579e-06, "loss": 0.0, "step": 8520 }, { "epoch": 0.6337766550263764, "grad_norm": 5.0912523875012994e-05, "learning_rate": 6.310439628482971e-06, "loss": 0.0003, "step": 8530 }, { "epoch": 0.6345196522772866, "grad_norm": 2.341688398743713e-09, "learning_rate": 6.308458204334365e-06, "loss": 0.0007, "step": 8540 }, { "epoch": 0.6352626495281968, "grad_norm": 0.0011061724508181214, "learning_rate": 6.306476780185758e-06, "loss": 0.0031, "step": 8550 }, { "epoch": 0.6360056467791069, "grad_norm": 0.00010917489271378145, "learning_rate": 6.304495356037152e-06, "loss": 0.0, "step": 8560 }, { "epoch": 0.6367486440300171, "grad_norm": 0.0019877322483807802, "learning_rate": 6.302513931888545e-06, "loss": 0.005, "step": 8570 }, { "epoch": 0.6374916412809273, "grad_norm": 3.2404634566773893e-06, "learning_rate": 6.300532507739937e-06, "loss": 0.0, "step": 8580 }, { "epoch": 0.6382346385318375, "grad_norm": 0.004815362393856049, "learning_rate": 6.2985510835913305e-06, "loss": 0.0063, "step": 8590 }, { "epoch": 0.6389776357827476, "grad_norm": 2.0056767358056504e-09, "learning_rate": 6.296569659442724e-06, "loss": 0.0001, "step": 8600 }, { "epoch": 0.6397206330336578, "grad_norm": 4.7629350774514023e-07, "learning_rate": 6.294588235294117e-06, "loss": 0.0, "step": 8610 }, { "epoch": 0.640463630284568, "grad_norm": 1.8592794859273454e-08, "learning_rate": 6.292606811145511e-06, "loss": 0.0, "step": 8620 }, { "epoch": 0.6412066275354781, "grad_norm": 1.3886802818774413e-09, "learning_rate": 6.290625386996904e-06, "loss": 0.0, "step": 8630 }, { "epoch": 0.6419496247863883, "grad_norm": 8.195591760795651e-08, "learning_rate": 6.288643962848297e-06, "loss": 0.0001, "step": 8640 }, { "epoch": 0.6426926220372985, "grad_norm": 2.905402968992471e-14, "learning_rate": 6.2866625386996904e-06, "loss": 0.0, "step": 8650 }, { "epoch": 0.6434356192882086, "grad_norm": 3.501263927319087e-05, "learning_rate": 6.284681114551083e-06, "loss": 0.0, "step": 8660 }, { "epoch": 0.6441786165391188, "grad_norm": 1.4161552486235962e-10, "learning_rate": 6.2826996904024765e-06, "loss": 0.0, "step": 8670 }, { "epoch": 0.644921613790029, "grad_norm": 6.586337235603423e-07, "learning_rate": 6.280718266253869e-06, "loss": 0.0003, "step": 8680 }, { "epoch": 0.6456646110409392, "grad_norm": 8.782965466025416e-08, "learning_rate": 6.278736842105263e-06, "loss": 0.0033, "step": 8690 }, { "epoch": 0.6464076082918493, "grad_norm": 5.031603359384462e-05, "learning_rate": 6.276755417956656e-06, "loss": 0.0001, "step": 8700 }, { "epoch": 0.6471506055427595, "grad_norm": 0.0007284189341589808, "learning_rate": 6.2747739938080496e-06, "loss": 0.0032, "step": 8710 }, { "epoch": 0.6478936027936697, "grad_norm": 4.621444077201886e-06, "learning_rate": 6.272792569659443e-06, "loss": 0.0, "step": 8720 }, { "epoch": 0.6486366000445798, "grad_norm": 9.700257301330566, "learning_rate": 6.270811145510835e-06, "loss": 0.0012, "step": 8730 }, { "epoch": 0.64937959729549, "grad_norm": 2.238198817394732e-07, "learning_rate": 6.268829721362228e-06, "loss": 0.0053, "step": 8740 }, { "epoch": 0.6501225945464002, "grad_norm": 7.751730368710241e-10, "learning_rate": 6.266848297213622e-06, "loss": 0.0408, "step": 8750 }, { "epoch": 0.6508655917973103, "grad_norm": 4.933460573397497e-08, "learning_rate": 6.264866873065015e-06, "loss": 0.0001, "step": 8760 }, { "epoch": 0.6516085890482205, "grad_norm": 7.965945769683458e-06, "learning_rate": 6.262885448916409e-06, "loss": 0.0, "step": 8770 }, { "epoch": 0.6523515862991307, "grad_norm": 2.9053885555185843e-06, "learning_rate": 6.260904024767802e-06, "loss": 0.0038, "step": 8780 }, { "epoch": 0.6530945835500409, "grad_norm": 9.609207154426258e-06, "learning_rate": 6.258922600619195e-06, "loss": 0.0, "step": 8790 }, { "epoch": 0.653837580800951, "grad_norm": 0.05899185687303543, "learning_rate": 6.256941176470588e-06, "loss": 0.0, "step": 8800 }, { "epoch": 0.6545805780518612, "grad_norm": 0.003531279042363167, "learning_rate": 6.254959752321981e-06, "loss": 0.0, "step": 8810 }, { "epoch": 0.6553235753027714, "grad_norm": 1.3975181900605094e-05, "learning_rate": 6.252978328173374e-06, "loss": 0.0002, "step": 8820 }, { "epoch": 0.6560665725536815, "grad_norm": 5.629171369037067e-07, "learning_rate": 6.250996904024768e-06, "loss": 0.0, "step": 8830 }, { "epoch": 0.6568095698045917, "grad_norm": 0.0003512411785777658, "learning_rate": 6.2490154798761604e-06, "loss": 0.0, "step": 8840 }, { "epoch": 0.6575525670555019, "grad_norm": 2.8467117552111176e-10, "learning_rate": 6.247034055727554e-06, "loss": 0.0, "step": 8850 }, { "epoch": 0.6582955643064121, "grad_norm": 2.804869947414801e-10, "learning_rate": 6.245052631578947e-06, "loss": 0.0, "step": 8860 }, { "epoch": 0.6590385615573222, "grad_norm": 5.392179064367397e-11, "learning_rate": 6.243071207430341e-06, "loss": 0.0, "step": 8870 }, { "epoch": 0.6597815588082324, "grad_norm": 1.9226559634741985e-13, "learning_rate": 6.2410897832817335e-06, "loss": 0.0, "step": 8880 }, { "epoch": 0.6605245560591426, "grad_norm": 2.4125051498413086, "learning_rate": 6.239108359133126e-06, "loss": 0.0028, "step": 8890 }, { "epoch": 0.6612675533100527, "grad_norm": 2.0695937564596534e-05, "learning_rate": 6.2371269349845196e-06, "loss": 0.0, "step": 8900 }, { "epoch": 0.6620105505609629, "grad_norm": 1.349458784716262e-06, "learning_rate": 6.235145510835913e-06, "loss": 0.0, "step": 8910 }, { "epoch": 0.6627535478118731, "grad_norm": 3.6755844895486334e-11, "learning_rate": 6.2331640866873065e-06, "loss": 0.0085, "step": 8920 }, { "epoch": 0.6634965450627832, "grad_norm": 4.353251642896794e-05, "learning_rate": 6.2311826625387e-06, "loss": 0.0, "step": 8930 }, { "epoch": 0.6642395423136934, "grad_norm": 1.1283654011151611e-08, "learning_rate": 6.229201238390093e-06, "loss": 0.0009, "step": 8940 }, { "epoch": 0.6649825395646036, "grad_norm": 0.505535900592804, "learning_rate": 6.227219814241485e-06, "loss": 0.0036, "step": 8950 }, { "epoch": 0.6657255368155138, "grad_norm": 0.05293703451752663, "learning_rate": 6.225238390092879e-06, "loss": 0.0, "step": 8960 }, { "epoch": 0.6664685340664239, "grad_norm": 1.1248572491240338e-06, "learning_rate": 6.223256965944272e-06, "loss": 0.0, "step": 8970 }, { "epoch": 0.6672115313173341, "grad_norm": 0.0006259474903345108, "learning_rate": 6.221275541795666e-06, "loss": 0.0001, "step": 8980 }, { "epoch": 0.6679545285682443, "grad_norm": 2.1951751180693257e-11, "learning_rate": 6.219294117647058e-06, "loss": 0.0, "step": 8990 }, { "epoch": 0.6686975258191544, "grad_norm": 7.785555499140173e-05, "learning_rate": 6.217312693498452e-06, "loss": 0.0, "step": 9000 }, { "epoch": 0.6694405230700646, "grad_norm": 0.0022915415465831757, "learning_rate": 6.215331269349845e-06, "loss": 0.0111, "step": 9010 }, { "epoch": 0.6701835203209748, "grad_norm": 0.0005879870732314885, "learning_rate": 6.213349845201239e-06, "loss": 0.0, "step": 9020 }, { "epoch": 0.670926517571885, "grad_norm": 0.028795717284083366, "learning_rate": 6.211368421052631e-06, "loss": 0.0033, "step": 9030 }, { "epoch": 0.6716695148227951, "grad_norm": 1.6599516899917788e-12, "learning_rate": 6.209386996904024e-06, "loss": 0.0001, "step": 9040 }, { "epoch": 0.6724125120737053, "grad_norm": 5.275196031107043e-07, "learning_rate": 6.207405572755417e-06, "loss": 0.0025, "step": 9050 }, { "epoch": 0.6731555093246155, "grad_norm": 0.2497449517250061, "learning_rate": 6.205424148606811e-06, "loss": 0.0001, "step": 9060 }, { "epoch": 0.6738985065755256, "grad_norm": 2.0690038127213484e-06, "learning_rate": 6.203442724458204e-06, "loss": 0.0, "step": 9070 }, { "epoch": 0.6746415038264358, "grad_norm": 4.6172323776849655e-12, "learning_rate": 6.201461300309598e-06, "loss": 0.0, "step": 9080 }, { "epoch": 0.675384501077346, "grad_norm": 0.034210335463285446, "learning_rate": 6.19947987616099e-06, "loss": 0.0, "step": 9090 }, { "epoch": 0.6761274983282561, "grad_norm": 0.002168006729334593, "learning_rate": 6.197498452012383e-06, "loss": 0.0, "step": 9100 }, { "epoch": 0.6768704955791663, "grad_norm": 2.967438984423018e-14, "learning_rate": 6.1955170278637765e-06, "loss": 0.0, "step": 9110 }, { "epoch": 0.6776134928300765, "grad_norm": 4.1828279790934175e-05, "learning_rate": 6.19353560371517e-06, "loss": 0.0012, "step": 9120 }, { "epoch": 0.6783564900809868, "grad_norm": 1.3192060291378205e-11, "learning_rate": 6.1915541795665635e-06, "loss": 0.0, "step": 9130 }, { "epoch": 0.6790994873318968, "grad_norm": 0.00048436567885801196, "learning_rate": 6.189572755417956e-06, "loss": 0.0, "step": 9140 }, { "epoch": 0.679842484582807, "grad_norm": 0.0018687432166188955, "learning_rate": 6.1875913312693495e-06, "loss": 0.0003, "step": 9150 }, { "epoch": 0.6805854818337173, "grad_norm": 2.106462365647488e-10, "learning_rate": 6.185609907120743e-06, "loss": 0.0001, "step": 9160 }, { "epoch": 0.6813284790846273, "grad_norm": 0.028425035998225212, "learning_rate": 6.1836284829721365e-06, "loss": 0.0002, "step": 9170 }, { "epoch": 0.6820714763355376, "grad_norm": 0.010357068851590157, "learning_rate": 6.181647058823529e-06, "loss": 0.0015, "step": 9180 }, { "epoch": 0.6828144735864478, "grad_norm": 0.00011436029308242723, "learning_rate": 6.179665634674922e-06, "loss": 0.0, "step": 9190 }, { "epoch": 0.6835574708373579, "grad_norm": 6.341998037839858e-10, "learning_rate": 6.177684210526315e-06, "loss": 0.0002, "step": 9200 }, { "epoch": 0.684300468088268, "grad_norm": 1.6868138118297793e-05, "learning_rate": 6.175702786377709e-06, "loss": 0.0, "step": 9210 }, { "epoch": 0.6850434653391783, "grad_norm": 6.5795107140331766e-09, "learning_rate": 6.173721362229102e-06, "loss": 0.0, "step": 9220 }, { "epoch": 0.6857864625900885, "grad_norm": 0.0005434908089227974, "learning_rate": 6.171739938080496e-06, "loss": 0.0, "step": 9230 }, { "epoch": 0.6865294598409986, "grad_norm": 9.726455027703196e-05, "learning_rate": 6.169758513931888e-06, "loss": 0.0, "step": 9240 }, { "epoch": 0.6872724570919088, "grad_norm": 1.3084656025341701e-08, "learning_rate": 6.167777089783281e-06, "loss": 0.0, "step": 9250 }, { "epoch": 0.688015454342819, "grad_norm": 0.0019909602124243975, "learning_rate": 6.165795665634674e-06, "loss": 0.0, "step": 9260 }, { "epoch": 0.6887584515937291, "grad_norm": 0.004965466447174549, "learning_rate": 6.163814241486068e-06, "loss": 0.0003, "step": 9270 }, { "epoch": 0.6895014488446393, "grad_norm": 1.300118981362175e-07, "learning_rate": 6.161832817337461e-06, "loss": 0.0, "step": 9280 }, { "epoch": 0.6902444460955495, "grad_norm": 2.1760074560006615e-06, "learning_rate": 6.159851393188854e-06, "loss": 0.0, "step": 9290 }, { "epoch": 0.6909874433464597, "grad_norm": 1.2454665011318866e-05, "learning_rate": 6.157869969040247e-06, "loss": 0.0008, "step": 9300 }, { "epoch": 0.6917304405973698, "grad_norm": 2.2068299585953355e-05, "learning_rate": 6.155888544891641e-06, "loss": 0.0, "step": 9310 }, { "epoch": 0.69247343784828, "grad_norm": 0.00019262022397015244, "learning_rate": 6.153907120743034e-06, "loss": 0.0004, "step": 9320 }, { "epoch": 0.6932164350991902, "grad_norm": 4.30234058512724e-06, "learning_rate": 6.151925696594427e-06, "loss": 0.0, "step": 9330 }, { "epoch": 0.6939594323501003, "grad_norm": 7.078849080244254e-07, "learning_rate": 6.1499442724458195e-06, "loss": 0.0, "step": 9340 }, { "epoch": 0.6947024296010105, "grad_norm": 0.0007000958430580795, "learning_rate": 6.147962848297213e-06, "loss": 0.008, "step": 9350 }, { "epoch": 0.6954454268519207, "grad_norm": 0.0003875796392094344, "learning_rate": 6.1459814241486065e-06, "loss": 0.0, "step": 9360 }, { "epoch": 0.6961884241028308, "grad_norm": 0.00012762853293679655, "learning_rate": 6.144e-06, "loss": 0.0, "step": 9370 }, { "epoch": 0.696931421353741, "grad_norm": 3.217312416481377e-09, "learning_rate": 6.1420185758513934e-06, "loss": 0.0, "step": 9380 }, { "epoch": 0.6976744186046512, "grad_norm": 0.00989949144423008, "learning_rate": 6.140037151702786e-06, "loss": 0.0, "step": 9390 }, { "epoch": 0.6984174158555614, "grad_norm": 0.00033113628160208464, "learning_rate": 6.138055727554179e-06, "loss": 0.0031, "step": 9400 }, { "epoch": 0.6991604131064715, "grad_norm": 0.0008172902744263411, "learning_rate": 6.136074303405572e-06, "loss": 0.0, "step": 9410 }, { "epoch": 0.6999034103573817, "grad_norm": 7.38310773158446e-05, "learning_rate": 6.134092879256966e-06, "loss": 0.0, "step": 9420 }, { "epoch": 0.7006464076082919, "grad_norm": 0.013618280179798603, "learning_rate": 6.132111455108359e-06, "loss": 0.004, "step": 9430 }, { "epoch": 0.701389404859202, "grad_norm": 0.006338709965348244, "learning_rate": 6.130130030959752e-06, "loss": 0.0, "step": 9440 }, { "epoch": 0.7021324021101122, "grad_norm": 0.027454465627670288, "learning_rate": 6.128148606811145e-06, "loss": 0.0, "step": 9450 }, { "epoch": 0.7028753993610224, "grad_norm": 5.7003412406997356e-11, "learning_rate": 6.126167182662539e-06, "loss": 0.0004, "step": 9460 }, { "epoch": 0.7036183966119325, "grad_norm": 0.11014649271965027, "learning_rate": 6.124185758513932e-06, "loss": 0.1289, "step": 9470 }, { "epoch": 0.7043613938628427, "grad_norm": 0.003714525606483221, "learning_rate": 6.122204334365325e-06, "loss": 0.0, "step": 9480 }, { "epoch": 0.7051043911137529, "grad_norm": 0.0006915118428878486, "learning_rate": 6.120222910216717e-06, "loss": 0.0, "step": 9490 }, { "epoch": 0.7058473883646631, "grad_norm": 0.030915629118680954, "learning_rate": 6.118241486068111e-06, "loss": 0.0, "step": 9500 }, { "epoch": 0.7065903856155732, "grad_norm": 0.000529183482285589, "learning_rate": 6.116260061919504e-06, "loss": 0.0, "step": 9510 }, { "epoch": 0.7073333828664834, "grad_norm": 3.6305945627645997e-07, "learning_rate": 6.114278637770898e-06, "loss": 0.0, "step": 9520 }, { "epoch": 0.7080763801173936, "grad_norm": 2.7834119453729045e-09, "learning_rate": 6.112297213622291e-06, "loss": 0.0, "step": 9530 }, { "epoch": 0.7088193773683037, "grad_norm": 4.1636467358330265e-05, "learning_rate": 6.110315789473685e-06, "loss": 0.0, "step": 9540 }, { "epoch": 0.7095623746192139, "grad_norm": 0.0001103381800930947, "learning_rate": 6.1083343653250765e-06, "loss": 0.0, "step": 9550 }, { "epoch": 0.7103053718701241, "grad_norm": 0.00013358589785639197, "learning_rate": 6.10635294117647e-06, "loss": 0.0001, "step": 9560 }, { "epoch": 0.7110483691210343, "grad_norm": 0.0002271732228109613, "learning_rate": 6.1043715170278634e-06, "loss": 0.0021, "step": 9570 }, { "epoch": 0.7117913663719444, "grad_norm": 5.604282705462538e-06, "learning_rate": 6.102390092879257e-06, "loss": 0.0002, "step": 9580 }, { "epoch": 0.7125343636228546, "grad_norm": 4.5418968852573016e-07, "learning_rate": 6.10040866873065e-06, "loss": 0.0, "step": 9590 }, { "epoch": 0.7132773608737648, "grad_norm": 2.7547335072763968e-12, "learning_rate": 6.098427244582043e-06, "loss": 0.0, "step": 9600 }, { "epoch": 0.7140203581246749, "grad_norm": 7.073685170597144e-13, "learning_rate": 6.0964458204334365e-06, "loss": 0.0, "step": 9610 }, { "epoch": 0.7147633553755851, "grad_norm": 2.2641354069374842e-10, "learning_rate": 6.094464396284829e-06, "loss": 0.0003, "step": 9620 }, { "epoch": 0.7155063526264953, "grad_norm": 0.0013932279543951154, "learning_rate": 6.0924829721362226e-06, "loss": 0.0009, "step": 9630 }, { "epoch": 0.7162493498774054, "grad_norm": 1.1341384151819511e-06, "learning_rate": 6.090501547987616e-06, "loss": 0.0, "step": 9640 }, { "epoch": 0.7169923471283156, "grad_norm": 2.2406553066948998e-13, "learning_rate": 6.088520123839009e-06, "loss": 0.0, "step": 9650 }, { "epoch": 0.7177353443792258, "grad_norm": 2.2642272501371963e-09, "learning_rate": 6.086538699690402e-06, "loss": 0.0, "step": 9660 }, { "epoch": 0.718478341630136, "grad_norm": 3.519062700932423e-16, "learning_rate": 6.084557275541796e-06, "loss": 0.0007, "step": 9670 }, { "epoch": 0.7192213388810461, "grad_norm": 3.240882506361231e-05, "learning_rate": 6.082575851393189e-06, "loss": 0.0, "step": 9680 }, { "epoch": 0.7199643361319563, "grad_norm": 4.70983432023786e-05, "learning_rate": 6.0805944272445825e-06, "loss": 0.0, "step": 9690 }, { "epoch": 0.7207073333828665, "grad_norm": 0.04641414061188698, "learning_rate": 6.078613003095974e-06, "loss": 0.0, "step": 9700 }, { "epoch": 0.7214503306337766, "grad_norm": 3.0081785951097118e-09, "learning_rate": 6.076631578947368e-06, "loss": 0.0014, "step": 9710 }, { "epoch": 0.7221933278846868, "grad_norm": 1.1809425437547816e-08, "learning_rate": 6.074650154798761e-06, "loss": 0.0, "step": 9720 }, { "epoch": 0.722936325135597, "grad_norm": 4.4722103624550424e-12, "learning_rate": 6.072668730650155e-06, "loss": 0.0, "step": 9730 }, { "epoch": 0.7236793223865071, "grad_norm": 5.773491125182773e-07, "learning_rate": 6.070687306501548e-06, "loss": 0.0002, "step": 9740 }, { "epoch": 0.7244223196374173, "grad_norm": 4.262565767021442e-08, "learning_rate": 6.068705882352941e-06, "loss": 0.3, "step": 9750 }, { "epoch": 0.7251653168883275, "grad_norm": 6.834246168117841e-13, "learning_rate": 6.066724458204334e-06, "loss": 0.0144, "step": 9760 }, { "epoch": 0.7259083141392377, "grad_norm": 0.6011549234390259, "learning_rate": 6.064743034055727e-06, "loss": 0.0001, "step": 9770 }, { "epoch": 0.7266513113901478, "grad_norm": 0.0003689719014801085, "learning_rate": 6.06276160990712e-06, "loss": 0.0016, "step": 9780 }, { "epoch": 0.727394308641058, "grad_norm": 0.000223283568629995, "learning_rate": 6.060780185758514e-06, "loss": 0.0, "step": 9790 }, { "epoch": 0.7281373058919682, "grad_norm": 0.00011272892152192071, "learning_rate": 6.0587987616099065e-06, "loss": 0.0, "step": 9800 }, { "epoch": 0.7288803031428783, "grad_norm": 4.4774513957568374e-13, "learning_rate": 6.0568173374613e-06, "loss": 0.0, "step": 9810 }, { "epoch": 0.7296233003937885, "grad_norm": 2.4415814040111363e-17, "learning_rate": 6.054835913312693e-06, "loss": 0.0, "step": 9820 }, { "epoch": 0.7303662976446987, "grad_norm": 9.66356550335945e-10, "learning_rate": 6.052854489164087e-06, "loss": 0.0, "step": 9830 }, { "epoch": 0.7311092948956089, "grad_norm": 6.973965849255137e-11, "learning_rate": 6.05087306501548e-06, "loss": 0.0, "step": 9840 }, { "epoch": 0.731852292146519, "grad_norm": 0.004760006442666054, "learning_rate": 6.048891640866872e-06, "loss": 0.0101, "step": 9850 }, { "epoch": 0.7325952893974292, "grad_norm": 0.013162006624042988, "learning_rate": 6.046910216718266e-06, "loss": 0.0006, "step": 9860 }, { "epoch": 0.7333382866483394, "grad_norm": 6.011569553265872e-07, "learning_rate": 6.044928792569659e-06, "loss": 0.0, "step": 9870 }, { "epoch": 0.7340812838992495, "grad_norm": 1.2240430805832148e-05, "learning_rate": 6.0429473684210525e-06, "loss": 0.0, "step": 9880 }, { "epoch": 0.7348242811501597, "grad_norm": 5.5759937822585925e-05, "learning_rate": 6.040965944272446e-06, "loss": 0.0, "step": 9890 }, { "epoch": 0.73556727840107, "grad_norm": 9.928947664050725e-10, "learning_rate": 6.038984520123839e-06, "loss": 0.009, "step": 9900 }, { "epoch": 0.73631027565198, "grad_norm": 1.0722930028350675e-06, "learning_rate": 6.037003095975232e-06, "loss": 0.0, "step": 9910 }, { "epoch": 0.7370532729028902, "grad_norm": 5.0526843551779166e-05, "learning_rate": 6.035021671826625e-06, "loss": 0.0003, "step": 9920 }, { "epoch": 0.7377962701538004, "grad_norm": 0.013250918127596378, "learning_rate": 6.033040247678018e-06, "loss": 0.0, "step": 9930 }, { "epoch": 0.7385392674047107, "grad_norm": 1.5018439626146574e-05, "learning_rate": 6.031058823529412e-06, "loss": 0.0, "step": 9940 }, { "epoch": 0.7392822646556207, "grad_norm": 7.957583392226297e-08, "learning_rate": 6.029077399380804e-06, "loss": 0.0, "step": 9950 }, { "epoch": 0.740025261906531, "grad_norm": 0.001931144972331822, "learning_rate": 6.027095975232198e-06, "loss": 0.0, "step": 9960 }, { "epoch": 0.7407682591574412, "grad_norm": 4.055039971717633e-05, "learning_rate": 6.025114551083591e-06, "loss": 0.0, "step": 9970 }, { "epoch": 0.7415112564083512, "grad_norm": 0.0013619080418720841, "learning_rate": 6.023133126934985e-06, "loss": 0.0852, "step": 9980 }, { "epoch": 0.7422542536592615, "grad_norm": 4.8992864321917295e-05, "learning_rate": 6.021151702786378e-06, "loss": 0.0, "step": 9990 }, { "epoch": 0.7429972509101717, "grad_norm": 2.094691080856137e-05, "learning_rate": 6.01917027863777e-06, "loss": 0.0, "step": 10000 }, { "epoch": 0.7437402481610818, "grad_norm": 4.288490890758112e-05, "learning_rate": 6.017188854489163e-06, "loss": 0.0004, "step": 10010 }, { "epoch": 0.744483245411992, "grad_norm": 0.0004932682495564222, "learning_rate": 6.015207430340557e-06, "loss": 0.0, "step": 10020 }, { "epoch": 0.7452262426629022, "grad_norm": 3.2169389214686817e-06, "learning_rate": 6.01322600619195e-06, "loss": 0.0001, "step": 10030 }, { "epoch": 0.7459692399138124, "grad_norm": 0.000751955434679985, "learning_rate": 6.011244582043344e-06, "loss": 0.0, "step": 10040 }, { "epoch": 0.7467122371647225, "grad_norm": 1.5851906511607852e-14, "learning_rate": 6.0092631578947364e-06, "loss": 0.0, "step": 10050 }, { "epoch": 0.7474552344156327, "grad_norm": 2.5171432677068917e-15, "learning_rate": 6.00728173374613e-06, "loss": 0.0001, "step": 10060 }, { "epoch": 0.7481982316665429, "grad_norm": 0.000306547008221969, "learning_rate": 6.0053003095975225e-06, "loss": 0.0, "step": 10070 }, { "epoch": 0.748941228917453, "grad_norm": 0.00045611447421833873, "learning_rate": 6.003318885448916e-06, "loss": 0.0, "step": 10080 }, { "epoch": 0.7496842261683632, "grad_norm": 5.669852271239506e-06, "learning_rate": 6.0013374613003095e-06, "loss": 0.0, "step": 10090 }, { "epoch": 0.7504272234192734, "grad_norm": 1.3221281847108912e-09, "learning_rate": 5.999356037151702e-06, "loss": 0.0049, "step": 10100 }, { "epoch": 0.7511702206701836, "grad_norm": 8.873130696653866e-16, "learning_rate": 5.9973746130030956e-06, "loss": 0.0, "step": 10110 }, { "epoch": 0.7519132179210937, "grad_norm": 0.019446171820163727, "learning_rate": 5.995393188854489e-06, "loss": 0.0006, "step": 10120 }, { "epoch": 0.7526562151720039, "grad_norm": 9.032380399176532e-11, "learning_rate": 5.9934117647058825e-06, "loss": 0.0, "step": 10130 }, { "epoch": 0.7533992124229141, "grad_norm": 8.339765372511465e-06, "learning_rate": 5.991430340557276e-06, "loss": 0.0, "step": 10140 }, { "epoch": 0.7541422096738242, "grad_norm": 1.0213769066624856e-11, "learning_rate": 5.989448916408668e-06, "loss": 0.008, "step": 10150 }, { "epoch": 0.7548852069247344, "grad_norm": 7.481716579604836e-09, "learning_rate": 5.987467492260061e-06, "loss": 0.0, "step": 10160 }, { "epoch": 0.7556282041756446, "grad_norm": 1.6884021307797537e-12, "learning_rate": 5.985486068111455e-06, "loss": 0.0, "step": 10170 }, { "epoch": 0.7563712014265547, "grad_norm": 6.375951488735154e-05, "learning_rate": 5.983504643962848e-06, "loss": 0.2203, "step": 10180 }, { "epoch": 0.7571141986774649, "grad_norm": 8.779186266849592e-09, "learning_rate": 5.981523219814242e-06, "loss": 0.0, "step": 10190 }, { "epoch": 0.7578571959283751, "grad_norm": 9.987869906136204e-13, "learning_rate": 5.979541795665634e-06, "loss": 0.0, "step": 10200 }, { "epoch": 0.7586001931792853, "grad_norm": 0.34047019481658936, "learning_rate": 5.977560371517028e-06, "loss": 0.0001, "step": 10210 }, { "epoch": 0.7593431904301954, "grad_norm": 1.6523547401448013e-06, "learning_rate": 5.97557894736842e-06, "loss": 0.0001, "step": 10220 }, { "epoch": 0.7600861876811056, "grad_norm": 0.0001123379115597345, "learning_rate": 5.973597523219814e-06, "loss": 0.0, "step": 10230 }, { "epoch": 0.7608291849320158, "grad_norm": 0.013509380631148815, "learning_rate": 5.971616099071207e-06, "loss": 0.0, "step": 10240 }, { "epoch": 0.7615721821829259, "grad_norm": 2.6726630153461883e-07, "learning_rate": 5.9696346749226e-06, "loss": 0.0, "step": 10250 }, { "epoch": 0.7623151794338361, "grad_norm": 3.484597854708227e-08, "learning_rate": 5.967653250773993e-06, "loss": 0.0, "step": 10260 }, { "epoch": 0.7630581766847463, "grad_norm": 1.7349294694213313e-06, "learning_rate": 5.965671826625387e-06, "loss": 0.0, "step": 10270 }, { "epoch": 0.7638011739356564, "grad_norm": 93.26669311523438, "learning_rate": 5.96369040247678e-06, "loss": 0.0168, "step": 10280 }, { "epoch": 0.7645441711865666, "grad_norm": 3.1056784921545955e-10, "learning_rate": 5.961708978328174e-06, "loss": 0.0281, "step": 10290 }, { "epoch": 0.7652871684374768, "grad_norm": 1.740884414402899e-07, "learning_rate": 5.9597275541795656e-06, "loss": 0.0, "step": 10300 }, { "epoch": 0.766030165688387, "grad_norm": 1.5263782188412733e-05, "learning_rate": 5.957746130030959e-06, "loss": 0.0, "step": 10310 }, { "epoch": 0.7667731629392971, "grad_norm": 0.0001345279160887003, "learning_rate": 5.9557647058823525e-06, "loss": 0.0, "step": 10320 }, { "epoch": 0.7675161601902073, "grad_norm": 4.905694007873535, "learning_rate": 5.953783281733746e-06, "loss": 0.0008, "step": 10330 }, { "epoch": 0.7682591574411175, "grad_norm": 0.000262371584540233, "learning_rate": 5.9518018575851395e-06, "loss": 0.0, "step": 10340 }, { "epoch": 0.7690021546920276, "grad_norm": 6.5353679019608535e-06, "learning_rate": 5.949820433436533e-06, "loss": 0.0, "step": 10350 }, { "epoch": 0.7697451519429378, "grad_norm": 0.002605338580906391, "learning_rate": 5.9478390092879255e-06, "loss": 0.0002, "step": 10360 }, { "epoch": 0.770488149193848, "grad_norm": 0.0002481119299773127, "learning_rate": 5.945857585139318e-06, "loss": 0.0001, "step": 10370 }, { "epoch": 0.7712311464447582, "grad_norm": 1.7162235174672524e-11, "learning_rate": 5.943876160990712e-06, "loss": 0.0486, "step": 10380 }, { "epoch": 0.7719741436956683, "grad_norm": 0.0064726173877716064, "learning_rate": 5.941894736842105e-06, "loss": 0.0002, "step": 10390 }, { "epoch": 0.7727171409465785, "grad_norm": 0.0002854816266335547, "learning_rate": 5.939913312693499e-06, "loss": 0.0004, "step": 10400 }, { "epoch": 0.7734601381974887, "grad_norm": 4.106884262000676e-06, "learning_rate": 5.937931888544891e-06, "loss": 0.0, "step": 10410 }, { "epoch": 0.7742031354483988, "grad_norm": 3.1875710959639036e-08, "learning_rate": 5.935950464396285e-06, "loss": 0.0001, "step": 10420 }, { "epoch": 0.774946132699309, "grad_norm": 5.016238091570813e-08, "learning_rate": 5.933969040247678e-06, "loss": 0.0, "step": 10430 }, { "epoch": 0.7756891299502192, "grad_norm": 2.544232047263506e-10, "learning_rate": 5.931987616099071e-06, "loss": 0.0001, "step": 10440 }, { "epoch": 0.7764321272011293, "grad_norm": 2.140318429155741e-05, "learning_rate": 5.930006191950464e-06, "loss": 0.0, "step": 10450 }, { "epoch": 0.7771751244520395, "grad_norm": 4.0058351080674015e-10, "learning_rate": 5.928024767801857e-06, "loss": 0.0, "step": 10460 }, { "epoch": 0.7779181217029497, "grad_norm": 1.1032379176878493e-10, "learning_rate": 5.92604334365325e-06, "loss": 0.0004, "step": 10470 }, { "epoch": 0.7786611189538599, "grad_norm": 3.829148664813431e-10, "learning_rate": 5.924061919504644e-06, "loss": 0.042, "step": 10480 }, { "epoch": 0.77940411620477, "grad_norm": 1.2686622312685358e-06, "learning_rate": 5.922080495356037e-06, "loss": 0.0141, "step": 10490 }, { "epoch": 0.7801471134556802, "grad_norm": 2.108249352872349e-09, "learning_rate": 5.920099071207431e-06, "loss": 0.0037, "step": 10500 }, { "epoch": 0.7808901107065904, "grad_norm": 0.16543616354465485, "learning_rate": 5.918117647058823e-06, "loss": 0.0002, "step": 10510 }, { "epoch": 0.7816331079575005, "grad_norm": 0.00403450895100832, "learning_rate": 5.916136222910216e-06, "loss": 0.0, "step": 10520 }, { "epoch": 0.7823761052084107, "grad_norm": 9.077193681150675e-05, "learning_rate": 5.9141547987616095e-06, "loss": 0.0, "step": 10530 }, { "epoch": 0.7831191024593209, "grad_norm": 5.860864007445343e-07, "learning_rate": 5.912173374613003e-06, "loss": 0.0006, "step": 10540 }, { "epoch": 0.783862099710231, "grad_norm": 4.437504941279258e-08, "learning_rate": 5.910191950464396e-06, "loss": 0.0001, "step": 10550 }, { "epoch": 0.7846050969611412, "grad_norm": 3.0640151874039567e-12, "learning_rate": 5.908210526315789e-06, "loss": 0.0, "step": 10560 }, { "epoch": 0.7853480942120514, "grad_norm": 0.5533859729766846, "learning_rate": 5.9062291021671825e-06, "loss": 0.0002, "step": 10570 }, { "epoch": 0.7860910914629616, "grad_norm": 9.649340881878743e-07, "learning_rate": 5.904247678018576e-06, "loss": 0.0, "step": 10580 }, { "epoch": 0.7868340887138717, "grad_norm": 2.8701084083935407e-16, "learning_rate": 5.902266253869969e-06, "loss": 0.0, "step": 10590 }, { "epoch": 0.7875770859647819, "grad_norm": 1.0866012445376327e-07, "learning_rate": 5.900284829721362e-06, "loss": 0.0001, "step": 10600 }, { "epoch": 0.7883200832156921, "grad_norm": 1.5580184481223114e-05, "learning_rate": 5.898303405572755e-06, "loss": 0.0, "step": 10610 }, { "epoch": 0.7890630804666022, "grad_norm": 0.006102435290813446, "learning_rate": 5.896321981424148e-06, "loss": 0.0, "step": 10620 }, { "epoch": 0.7898060777175124, "grad_norm": 4.692683069151826e-05, "learning_rate": 5.894340557275542e-06, "loss": 0.0, "step": 10630 }, { "epoch": 0.7905490749684226, "grad_norm": 1.9621255376023328e-07, "learning_rate": 5.892359133126935e-06, "loss": 0.0, "step": 10640 }, { "epoch": 0.7912920722193328, "grad_norm": 5.27158178220688e-11, "learning_rate": 5.8903777089783286e-06, "loss": 0.0, "step": 10650 }, { "epoch": 0.7920350694702429, "grad_norm": 2.445755542535011e-10, "learning_rate": 5.888396284829721e-06, "loss": 0.0, "step": 10660 }, { "epoch": 0.7927780667211531, "grad_norm": 1.1992368698120117, "learning_rate": 5.886414860681114e-06, "loss": 0.0002, "step": 10670 }, { "epoch": 0.7935210639720633, "grad_norm": 2.795533422805807e-11, "learning_rate": 5.884433436532507e-06, "loss": 0.0, "step": 10680 }, { "epoch": 0.7942640612229734, "grad_norm": 5.9136356867384166e-05, "learning_rate": 5.882452012383901e-06, "loss": 0.3, "step": 10690 }, { "epoch": 0.7950070584738836, "grad_norm": 3.889761137543246e-05, "learning_rate": 5.880470588235294e-06, "loss": 0.0418, "step": 10700 }, { "epoch": 0.7957500557247938, "grad_norm": 9.50610147747806e-16, "learning_rate": 5.878489164086687e-06, "loss": 0.0003, "step": 10710 }, { "epoch": 0.7964930529757039, "grad_norm": 6.581990508180979e-11, "learning_rate": 5.87650773993808e-06, "loss": 0.0, "step": 10720 }, { "epoch": 0.7972360502266141, "grad_norm": 2.836115697846253e-07, "learning_rate": 5.874526315789474e-06, "loss": 0.0005, "step": 10730 }, { "epoch": 0.7979790474775244, "grad_norm": 0.00040881423046812415, "learning_rate": 5.872544891640866e-06, "loss": 0.0, "step": 10740 }, { "epoch": 0.7987220447284346, "grad_norm": 0.008089880459010601, "learning_rate": 5.87056346749226e-06, "loss": 0.0004, "step": 10750 }, { "epoch": 0.7994650419793446, "grad_norm": 1.151008433529499e-12, "learning_rate": 5.8685820433436525e-06, "loss": 0.0066, "step": 10760 }, { "epoch": 0.8002080392302549, "grad_norm": 0.00018136223661713302, "learning_rate": 5.866600619195046e-06, "loss": 0.0, "step": 10770 }, { "epoch": 0.8009510364811651, "grad_norm": 2.2577885394614583e-10, "learning_rate": 5.8646191950464394e-06, "loss": 0.0, "step": 10780 }, { "epoch": 0.8016940337320752, "grad_norm": 2.3669249458180275e-06, "learning_rate": 5.862637770897833e-06, "loss": 0.0007, "step": 10790 }, { "epoch": 0.8024370309829854, "grad_norm": 0.16951783001422882, "learning_rate": 5.860656346749226e-06, "loss": 0.0, "step": 10800 }, { "epoch": 0.8031800282338956, "grad_norm": 0.0015784946735948324, "learning_rate": 5.858674922600619e-06, "loss": 0.0002, "step": 10810 }, { "epoch": 0.8039230254848057, "grad_norm": 5.818985565749987e-14, "learning_rate": 5.856693498452012e-06, "loss": 0.0003, "step": 10820 }, { "epoch": 0.8046660227357159, "grad_norm": 37.37020492553711, "learning_rate": 5.854712074303405e-06, "loss": 0.005, "step": 10830 }, { "epoch": 0.8054090199866261, "grad_norm": 0.0047941552475094795, "learning_rate": 5.8527306501547986e-06, "loss": 0.0, "step": 10840 }, { "epoch": 0.8061520172375363, "grad_norm": 8.270126841125602e-07, "learning_rate": 5.850749226006192e-06, "loss": 0.0002, "step": 10850 }, { "epoch": 0.8068950144884464, "grad_norm": 2.08947039936902e-06, "learning_rate": 5.848767801857585e-06, "loss": 0.0, "step": 10860 }, { "epoch": 0.8076380117393566, "grad_norm": 3.418485403060913, "learning_rate": 5.846786377708978e-06, "loss": 0.0005, "step": 10870 }, { "epoch": 0.8083810089902668, "grad_norm": 1.1189797352595399e-11, "learning_rate": 5.844804953560372e-06, "loss": 0.0012, "step": 10880 }, { "epoch": 0.8091240062411769, "grad_norm": 2.1537405539562293e-11, "learning_rate": 5.842823529411764e-06, "loss": 0.0316, "step": 10890 }, { "epoch": 0.8098670034920871, "grad_norm": 0.0014064647257328033, "learning_rate": 5.840842105263158e-06, "loss": 0.0, "step": 10900 }, { "epoch": 0.8106100007429973, "grad_norm": 6.045461177825928, "learning_rate": 5.83886068111455e-06, "loss": 0.0008, "step": 10910 }, { "epoch": 0.8113529979939075, "grad_norm": 4.053626980748959e-05, "learning_rate": 5.836879256965944e-06, "loss": 0.0, "step": 10920 }, { "epoch": 0.8120959952448176, "grad_norm": 1.2029997831231043e-10, "learning_rate": 5.834897832817337e-06, "loss": 0.0, "step": 10930 }, { "epoch": 0.8128389924957278, "grad_norm": 3.3816519029450376e-11, "learning_rate": 5.832916408668731e-06, "loss": 0.0, "step": 10940 }, { "epoch": 0.813581989746638, "grad_norm": 8.234967197040532e-08, "learning_rate": 5.830934984520124e-06, "loss": 0.0001, "step": 10950 }, { "epoch": 0.8143249869975481, "grad_norm": 0.0002453875495120883, "learning_rate": 5.828953560371517e-06, "loss": 0.0, "step": 10960 }, { "epoch": 0.8150679842484583, "grad_norm": 4.88361440176277e-09, "learning_rate": 5.8269721362229094e-06, "loss": 0.0195, "step": 10970 }, { "epoch": 0.8158109814993685, "grad_norm": 0.0026716694701462984, "learning_rate": 5.824990712074303e-06, "loss": 0.0081, "step": 10980 }, { "epoch": 0.8165539787502786, "grad_norm": 0.03051856718957424, "learning_rate": 5.823009287925696e-06, "loss": 0.0215, "step": 10990 }, { "epoch": 0.8172969760011888, "grad_norm": 0.000513051578309387, "learning_rate": 5.82102786377709e-06, "loss": 0.0001, "step": 11000 }, { "epoch": 0.818039973252099, "grad_norm": 3.8865754364847405e-11, "learning_rate": 5.8190464396284825e-06, "loss": 0.0, "step": 11010 }, { "epoch": 0.8187829705030092, "grad_norm": 2.3711514973712156e-09, "learning_rate": 5.817065015479876e-06, "loss": 0.0, "step": 11020 }, { "epoch": 0.8195259677539193, "grad_norm": 2.6273863527974926e-16, "learning_rate": 5.815083591331269e-06, "loss": 0.1234, "step": 11030 }, { "epoch": 0.8202689650048295, "grad_norm": 1.9006922424846806e-10, "learning_rate": 5.813102167182662e-06, "loss": 0.0001, "step": 11040 }, { "epoch": 0.8210119622557397, "grad_norm": 6.339902908214512e-11, "learning_rate": 5.8111207430340555e-06, "loss": 0.0, "step": 11050 }, { "epoch": 0.8217549595066498, "grad_norm": 6.933805613007493e-11, "learning_rate": 5.809139318885448e-06, "loss": 0.0, "step": 11060 }, { "epoch": 0.82249795675756, "grad_norm": 5.179937033972237e-06, "learning_rate": 5.807157894736842e-06, "loss": 0.0656, "step": 11070 }, { "epoch": 0.8232409540084702, "grad_norm": 1.015270569837412e-07, "learning_rate": 5.805176470588235e-06, "loss": 0.0, "step": 11080 }, { "epoch": 0.8239839512593803, "grad_norm": 8.100639048436165e-10, "learning_rate": 5.8031950464396285e-06, "loss": 0.0001, "step": 11090 }, { "epoch": 0.8247269485102905, "grad_norm": 0.00012064159091096371, "learning_rate": 5.801213622291022e-06, "loss": 0.0, "step": 11100 }, { "epoch": 0.8254699457612007, "grad_norm": 0.03790895268321037, "learning_rate": 5.799232198142414e-06, "loss": 0.0, "step": 11110 }, { "epoch": 0.8262129430121109, "grad_norm": 1.0634417337614804e-13, "learning_rate": 5.797250773993807e-06, "loss": 0.0, "step": 11120 }, { "epoch": 0.826955940263021, "grad_norm": 3.944758282159455e-05, "learning_rate": 5.795269349845201e-06, "loss": 0.0, "step": 11130 }, { "epoch": 0.8276989375139312, "grad_norm": 0.0015122968470677733, "learning_rate": 5.793287925696594e-06, "loss": 0.0001, "step": 11140 }, { "epoch": 0.8284419347648414, "grad_norm": 6.412102084141225e-05, "learning_rate": 5.791306501547988e-06, "loss": 0.0, "step": 11150 }, { "epoch": 0.8291849320157515, "grad_norm": 0.005599144846200943, "learning_rate": 5.78932507739938e-06, "loss": 0.0001, "step": 11160 }, { "epoch": 0.8299279292666617, "grad_norm": 7.969197133791961e-13, "learning_rate": 5.787343653250774e-06, "loss": 0.0, "step": 11170 }, { "epoch": 0.8306709265175719, "grad_norm": 1.185658717872684e-07, "learning_rate": 5.785362229102167e-06, "loss": 0.0, "step": 11180 }, { "epoch": 0.8314139237684821, "grad_norm": 0.16165073215961456, "learning_rate": 5.78338080495356e-06, "loss": 0.0111, "step": 11190 }, { "epoch": 0.8321569210193922, "grad_norm": 1.626756319711084e-13, "learning_rate": 5.781399380804953e-06, "loss": 0.0, "step": 11200 }, { "epoch": 0.8328999182703024, "grad_norm": 5.1102052367468787e-08, "learning_rate": 5.779417956656346e-06, "loss": 0.0002, "step": 11210 }, { "epoch": 0.8336429155212126, "grad_norm": 1.024515427161532e-06, "learning_rate": 5.777436532507739e-06, "loss": 0.0, "step": 11220 }, { "epoch": 0.8343859127721227, "grad_norm": 9.701993258204311e-05, "learning_rate": 5.775455108359133e-06, "loss": 0.0, "step": 11230 }, { "epoch": 0.8351289100230329, "grad_norm": 6.0247717925676625e-09, "learning_rate": 5.773473684210526e-06, "loss": 0.0, "step": 11240 }, { "epoch": 0.8358719072739431, "grad_norm": 0.03216814249753952, "learning_rate": 5.77149226006192e-06, "loss": 0.0018, "step": 11250 }, { "epoch": 0.8366149045248532, "grad_norm": 5.426255142992886e-07, "learning_rate": 5.769510835913312e-06, "loss": 0.0, "step": 11260 }, { "epoch": 0.8373579017757634, "grad_norm": 0.04645568132400513, "learning_rate": 5.767529411764705e-06, "loss": 0.0, "step": 11270 }, { "epoch": 0.8381008990266736, "grad_norm": 4.673921666364933e-11, "learning_rate": 5.7655479876160985e-06, "loss": 0.0, "step": 11280 }, { "epoch": 0.8388438962775838, "grad_norm": 1.20306035027329e-16, "learning_rate": 5.763566563467492e-06, "loss": 0.0, "step": 11290 }, { "epoch": 0.8395868935284939, "grad_norm": 0.00020505643624346703, "learning_rate": 5.7615851393188855e-06, "loss": 0.0, "step": 11300 }, { "epoch": 0.8403298907794041, "grad_norm": 4.8851634543156663e-11, "learning_rate": 5.759603715170279e-06, "loss": 0.0, "step": 11310 }, { "epoch": 0.8410728880303143, "grad_norm": 4.030108357255813e-06, "learning_rate": 5.7576222910216716e-06, "loss": 0.0, "step": 11320 }, { "epoch": 0.8418158852812244, "grad_norm": 8.612317469669506e-05, "learning_rate": 5.755640866873065e-06, "loss": 0.0, "step": 11330 }, { "epoch": 0.8425588825321346, "grad_norm": 1.26528092891931e-07, "learning_rate": 5.753659442724458e-06, "loss": 0.0, "step": 11340 }, { "epoch": 0.8433018797830448, "grad_norm": 0.04567428305745125, "learning_rate": 5.751678018575851e-06, "loss": 0.0, "step": 11350 }, { "epoch": 0.8440448770339549, "grad_norm": 0.00027625239454209805, "learning_rate": 5.749696594427245e-06, "loss": 0.0001, "step": 11360 }, { "epoch": 0.8447878742848651, "grad_norm": 327.62353515625, "learning_rate": 5.747715170278637e-06, "loss": 0.1414, "step": 11370 }, { "epoch": 0.8455308715357753, "grad_norm": 3.656084809899873e-11, "learning_rate": 5.745733746130031e-06, "loss": 0.0, "step": 11380 }, { "epoch": 0.8462738687866855, "grad_norm": 1.1380757972714829e-15, "learning_rate": 5.743752321981424e-06, "loss": 0.0, "step": 11390 }, { "epoch": 0.8470168660375956, "grad_norm": 2.3084213707225132e-10, "learning_rate": 5.741770897832818e-06, "loss": 0.0, "step": 11400 }, { "epoch": 0.8477598632885058, "grad_norm": 4.270603315426058e-10, "learning_rate": 5.73978947368421e-06, "loss": 0.0, "step": 11410 }, { "epoch": 0.848502860539416, "grad_norm": 2.2320163499032174e-14, "learning_rate": 5.737808049535603e-06, "loss": 0.0, "step": 11420 }, { "epoch": 0.8492458577903261, "grad_norm": 0.00042008221498690546, "learning_rate": 5.735826625386996e-06, "loss": 0.0, "step": 11430 }, { "epoch": 0.8499888550412363, "grad_norm": 6.38435221844702e-06, "learning_rate": 5.73384520123839e-06, "loss": 0.0001, "step": 11440 }, { "epoch": 0.8507318522921465, "grad_norm": 0.3845556676387787, "learning_rate": 5.731863777089783e-06, "loss": 0.0002, "step": 11450 }, { "epoch": 0.8514748495430567, "grad_norm": 6.48345157969743e-05, "learning_rate": 5.729882352941177e-06, "loss": 0.0002, "step": 11460 }, { "epoch": 0.8522178467939668, "grad_norm": 2.7513389344591133e-09, "learning_rate": 5.727900928792569e-06, "loss": 0.0005, "step": 11470 }, { "epoch": 0.852960844044877, "grad_norm": 6.917881734125331e-08, "learning_rate": 5.725919504643963e-06, "loss": 0.0, "step": 11480 }, { "epoch": 0.8537038412957872, "grad_norm": 1.241514553385059e-07, "learning_rate": 5.7239380804953555e-06, "loss": 0.0063, "step": 11490 }, { "epoch": 0.8544468385466973, "grad_norm": 0.0002859732776414603, "learning_rate": 5.721956656346749e-06, "loss": 0.0, "step": 11500 }, { "epoch": 0.8551898357976075, "grad_norm": 0.0013987168204039335, "learning_rate": 5.719975232198142e-06, "loss": 0.0, "step": 11510 }, { "epoch": 0.8559328330485177, "grad_norm": 1.6114221068619372e-08, "learning_rate": 5.717993808049535e-06, "loss": 0.0, "step": 11520 }, { "epoch": 0.8566758302994278, "grad_norm": 8.924436710466832e-12, "learning_rate": 5.7160123839009285e-06, "loss": 0.0, "step": 11530 }, { "epoch": 0.857418827550338, "grad_norm": 2.1956377167953178e-05, "learning_rate": 5.714030959752322e-06, "loss": 0.0, "step": 11540 }, { "epoch": 0.8581618248012483, "grad_norm": 0.008093055337667465, "learning_rate": 5.7120495356037154e-06, "loss": 0.1766, "step": 11550 }, { "epoch": 0.8589048220521585, "grad_norm": 1.0435254049756537e-10, "learning_rate": 5.710068111455108e-06, "loss": 0.0, "step": 11560 }, { "epoch": 0.8596478193030685, "grad_norm": 6.694943266438713e-08, "learning_rate": 5.708086687306501e-06, "loss": 0.0002, "step": 11570 }, { "epoch": 0.8603908165539788, "grad_norm": 0.04803592711687088, "learning_rate": 5.706105263157894e-06, "loss": 0.0, "step": 11580 }, { "epoch": 0.861133813804889, "grad_norm": 3.116029461125436e-07, "learning_rate": 5.704123839009288e-06, "loss": 0.0001, "step": 11590 }, { "epoch": 0.861876811055799, "grad_norm": 0.03385472297668457, "learning_rate": 5.702142414860681e-06, "loss": 0.0005, "step": 11600 }, { "epoch": 0.8626198083067093, "grad_norm": 1.6419374540554976e-14, "learning_rate": 5.7001609907120746e-06, "loss": 0.0, "step": 11610 }, { "epoch": 0.8633628055576195, "grad_norm": 0.00022258164244703948, "learning_rate": 5.698179566563467e-06, "loss": 0.0, "step": 11620 }, { "epoch": 0.8641058028085296, "grad_norm": 0.0001447295944672078, "learning_rate": 5.696198142414861e-06, "loss": 0.0, "step": 11630 }, { "epoch": 0.8648488000594398, "grad_norm": 5.570419559647322e-12, "learning_rate": 5.694216718266253e-06, "loss": 0.0, "step": 11640 }, { "epoch": 0.86559179731035, "grad_norm": 0.959900438785553, "learning_rate": 5.692235294117647e-06, "loss": 0.0002, "step": 11650 }, { "epoch": 0.8663347945612602, "grad_norm": 0.00056387425865978, "learning_rate": 5.69025386996904e-06, "loss": 0.0, "step": 11660 }, { "epoch": 0.8670777918121703, "grad_norm": 1.7676902643870562e-05, "learning_rate": 5.688272445820433e-06, "loss": 0.0, "step": 11670 }, { "epoch": 0.8678207890630805, "grad_norm": 1.2965365385753103e-05, "learning_rate": 5.686291021671826e-06, "loss": 0.0, "step": 11680 }, { "epoch": 0.8685637863139907, "grad_norm": 1.7545049558975734e-05, "learning_rate": 5.68430959752322e-06, "loss": 0.0, "step": 11690 }, { "epoch": 0.8693067835649008, "grad_norm": 8.313563012052327e-05, "learning_rate": 5.682328173374613e-06, "loss": 0.0009, "step": 11700 }, { "epoch": 0.870049780815811, "grad_norm": 3.8564474964114526e-13, "learning_rate": 5.680346749226006e-06, "loss": 0.0156, "step": 11710 }, { "epoch": 0.8707927780667212, "grad_norm": 3.037268925254466e-07, "learning_rate": 5.6783653250773985e-06, "loss": 0.0, "step": 11720 }, { "epoch": 0.8715357753176314, "grad_norm": 3.0822184271528386e-07, "learning_rate": 5.676383900928792e-06, "loss": 0.0486, "step": 11730 }, { "epoch": 0.8722787725685415, "grad_norm": 2.0005399559153147e-09, "learning_rate": 5.6744024767801854e-06, "loss": 0.0001, "step": 11740 }, { "epoch": 0.8730217698194517, "grad_norm": 7.24806534235789e-11, "learning_rate": 5.672421052631579e-06, "loss": 0.0, "step": 11750 }, { "epoch": 0.8737647670703619, "grad_norm": 3.510741407808382e-06, "learning_rate": 5.670439628482972e-06, "loss": 0.0001, "step": 11760 }, { "epoch": 0.874507764321272, "grad_norm": 7.405080715283674e-11, "learning_rate": 5.668458204334365e-06, "loss": 0.0, "step": 11770 }, { "epoch": 0.8752507615721822, "grad_norm": 0.0029110496398061514, "learning_rate": 5.6664767801857585e-06, "loss": 0.0001, "step": 11780 }, { "epoch": 0.8759937588230924, "grad_norm": 5.802525936815647e-15, "learning_rate": 5.664495356037151e-06, "loss": 0.0193, "step": 11790 }, { "epoch": 0.8767367560740025, "grad_norm": 1.9839896392426226e-10, "learning_rate": 5.6625139318885446e-06, "loss": 0.0015, "step": 11800 }, { "epoch": 0.8774797533249127, "grad_norm": 0.002000786131247878, "learning_rate": 5.660532507739938e-06, "loss": 0.0, "step": 11810 }, { "epoch": 0.8782227505758229, "grad_norm": 3.4525521186878905e-05, "learning_rate": 5.658551083591331e-06, "loss": 0.0, "step": 11820 }, { "epoch": 0.8789657478267331, "grad_norm": 4.349551545601571e-06, "learning_rate": 5.656569659442724e-06, "loss": 0.0, "step": 11830 }, { "epoch": 0.8797087450776432, "grad_norm": 8.154100328283675e-07, "learning_rate": 5.654588235294118e-06, "loss": 0.0092, "step": 11840 }, { "epoch": 0.8804517423285534, "grad_norm": 4.792653361847254e-13, "learning_rate": 5.652606811145511e-06, "loss": 0.0003, "step": 11850 }, { "epoch": 0.8811947395794636, "grad_norm": 0.01988409459590912, "learning_rate": 5.650625386996904e-06, "loss": 0.0, "step": 11860 }, { "epoch": 0.8819377368303737, "grad_norm": 1.1272351912339218e-05, "learning_rate": 5.648643962848296e-06, "loss": 0.0, "step": 11870 }, { "epoch": 0.8826807340812839, "grad_norm": 1.444346793277873e-07, "learning_rate": 5.64666253869969e-06, "loss": 0.0, "step": 11880 }, { "epoch": 0.8834237313321941, "grad_norm": 6.836680910282666e-08, "learning_rate": 5.644681114551083e-06, "loss": 0.0, "step": 11890 }, { "epoch": 0.8841667285831042, "grad_norm": 0.0006425076280720532, "learning_rate": 5.642699690402477e-06, "loss": 0.0, "step": 11900 }, { "epoch": 0.8849097258340144, "grad_norm": 7.639387455959934e-10, "learning_rate": 5.64071826625387e-06, "loss": 0.0, "step": 11910 }, { "epoch": 0.8856527230849246, "grad_norm": 0.0021283773239701986, "learning_rate": 5.638736842105263e-06, "loss": 0.0, "step": 11920 }, { "epoch": 0.8863957203358348, "grad_norm": 3.914873616395198e-07, "learning_rate": 5.6367554179566555e-06, "loss": 0.0002, "step": 11930 }, { "epoch": 0.8871387175867449, "grad_norm": 0.0005401484086178243, "learning_rate": 5.634773993808049e-06, "loss": 0.0, "step": 11940 }, { "epoch": 0.8878817148376551, "grad_norm": 0.0002997290575876832, "learning_rate": 5.632792569659442e-06, "loss": 0.0004, "step": 11950 }, { "epoch": 0.8886247120885653, "grad_norm": 1.0902587860295299e-10, "learning_rate": 5.630811145510836e-06, "loss": 0.0, "step": 11960 }, { "epoch": 0.8893677093394754, "grad_norm": 0.0002431497268844396, "learning_rate": 5.6288297213622285e-06, "loss": 0.0014, "step": 11970 }, { "epoch": 0.8901107065903856, "grad_norm": 9.602125317087484e-09, "learning_rate": 5.626848297213622e-06, "loss": 0.0, "step": 11980 }, { "epoch": 0.8908537038412958, "grad_norm": 1.7446057098968737e-10, "learning_rate": 5.6248668730650154e-06, "loss": 0.0, "step": 11990 }, { "epoch": 0.891596701092206, "grad_norm": 0.00014409823052119464, "learning_rate": 5.622885448916409e-06, "loss": 0.0001, "step": 12000 }, { "epoch": 0.8923396983431161, "grad_norm": 1.6398293155361898e-05, "learning_rate": 5.6209040247678015e-06, "loss": 0.0003, "step": 12010 }, { "epoch": 0.8930826955940263, "grad_norm": 142.40423583984375, "learning_rate": 5.618922600619194e-06, "loss": 0.0237, "step": 12020 }, { "epoch": 0.8938256928449365, "grad_norm": 9.659529496275354e-06, "learning_rate": 5.616941176470588e-06, "loss": 0.0194, "step": 12030 }, { "epoch": 0.8945686900958466, "grad_norm": 8.2748165958213e-12, "learning_rate": 5.614959752321981e-06, "loss": 0.0, "step": 12040 }, { "epoch": 0.8953116873467568, "grad_norm": 2.2750465955567734e-09, "learning_rate": 5.6129783281733746e-06, "loss": 0.0003, "step": 12050 }, { "epoch": 0.896054684597667, "grad_norm": 3.424536387797161e-08, "learning_rate": 5.610996904024768e-06, "loss": 0.0, "step": 12060 }, { "epoch": 0.8967976818485771, "grad_norm": 1.0793207554917958e-13, "learning_rate": 5.6090154798761615e-06, "loss": 0.0072, "step": 12070 }, { "epoch": 0.8975406790994873, "grad_norm": 6.529089091600326e-07, "learning_rate": 5.607034055727553e-06, "loss": 0.0, "step": 12080 }, { "epoch": 0.8982836763503975, "grad_norm": 7.768941577523947e-05, "learning_rate": 5.605052631578947e-06, "loss": 0.0, "step": 12090 }, { "epoch": 0.8990266736013077, "grad_norm": 0.0001442983775632456, "learning_rate": 5.60307120743034e-06, "loss": 0.0, "step": 12100 }, { "epoch": 0.8997696708522178, "grad_norm": 1.4699292449904533e-08, "learning_rate": 5.601089783281734e-06, "loss": 0.0, "step": 12110 }, { "epoch": 0.900512668103128, "grad_norm": 1.3796525699660833e-08, "learning_rate": 5.599108359133127e-06, "loss": 0.005, "step": 12120 }, { "epoch": 0.9012556653540382, "grad_norm": 1.8813398755535005e-11, "learning_rate": 5.59712693498452e-06, "loss": 0.0, "step": 12130 }, { "epoch": 0.9019986626049483, "grad_norm": 5.9292267276522814e-12, "learning_rate": 5.595145510835913e-06, "loss": 0.001, "step": 12140 }, { "epoch": 0.9027416598558585, "grad_norm": 6.119568046747403e-11, "learning_rate": 5.593164086687307e-06, "loss": 0.0, "step": 12150 }, { "epoch": 0.9034846571067687, "grad_norm": 7.268849003594369e-05, "learning_rate": 5.591182662538699e-06, "loss": 0.0009, "step": 12160 }, { "epoch": 0.9042276543576788, "grad_norm": 4.9557559445023713e-11, "learning_rate": 5.589201238390093e-06, "loss": 0.0, "step": 12170 }, { "epoch": 0.904970651608589, "grad_norm": 1.3493608719272743e-07, "learning_rate": 5.5872198142414854e-06, "loss": 0.0287, "step": 12180 }, { "epoch": 0.9057136488594992, "grad_norm": 2.1324193255622959e-07, "learning_rate": 5.585238390092879e-06, "loss": 0.0, "step": 12190 }, { "epoch": 0.9064566461104094, "grad_norm": 0.01952764391899109, "learning_rate": 5.583256965944272e-06, "loss": 0.0, "step": 12200 }, { "epoch": 0.9071996433613195, "grad_norm": 0.00020433991448953748, "learning_rate": 5.581275541795666e-06, "loss": 0.0001, "step": 12210 }, { "epoch": 0.9079426406122297, "grad_norm": 1.2454627267288743e-06, "learning_rate": 5.579294117647059e-06, "loss": 0.0, "step": 12220 }, { "epoch": 0.9086856378631399, "grad_norm": 3.2977433284120394e-11, "learning_rate": 5.577312693498451e-06, "loss": 0.0194, "step": 12230 }, { "epoch": 0.90942863511405, "grad_norm": 0.129503533244133, "learning_rate": 5.5753312693498446e-06, "loss": 0.0, "step": 12240 }, { "epoch": 0.9101716323649602, "grad_norm": 1.1046694226024556e-06, "learning_rate": 5.573349845201238e-06, "loss": 0.0003, "step": 12250 }, { "epoch": 0.9109146296158704, "grad_norm": 1.897476077079773, "learning_rate": 5.5713684210526315e-06, "loss": 0.0003, "step": 12260 }, { "epoch": 0.9116576268667806, "grad_norm": 0.09681554138660431, "learning_rate": 5.569386996904025e-06, "loss": 0.0, "step": 12270 }, { "epoch": 0.9124006241176907, "grad_norm": 5.7642043915784313e-11, "learning_rate": 5.567405572755418e-06, "loss": 0.0, "step": 12280 }, { "epoch": 0.9131436213686009, "grad_norm": 1.0768674146843793e-10, "learning_rate": 5.565424148606811e-06, "loss": 0.0, "step": 12290 }, { "epoch": 0.9138866186195111, "grad_norm": 0.0012163738720119, "learning_rate": 5.5634427244582045e-06, "loss": 0.0, "step": 12300 }, { "epoch": 0.9146296158704212, "grad_norm": 1.5533205960632096e-13, "learning_rate": 5.561461300309597e-06, "loss": 0.0412, "step": 12310 }, { "epoch": 0.9153726131213314, "grad_norm": 0.0020083878189325333, "learning_rate": 5.559479876160991e-06, "loss": 0.0, "step": 12320 }, { "epoch": 0.9161156103722417, "grad_norm": 0.0002501030685380101, "learning_rate": 5.557498452012383e-06, "loss": 0.0001, "step": 12330 }, { "epoch": 0.9168586076231517, "grad_norm": 1.2668735507759266e-05, "learning_rate": 5.555517027863777e-06, "loss": 0.0, "step": 12340 }, { "epoch": 0.917601604874062, "grad_norm": 1.0050942364614457e-05, "learning_rate": 5.55353560371517e-06, "loss": 0.0, "step": 12350 }, { "epoch": 0.9183446021249722, "grad_norm": 1.3533935749876491e-08, "learning_rate": 5.551554179566564e-06, "loss": 0.0, "step": 12360 }, { "epoch": 0.9190875993758824, "grad_norm": 0.0033469386398792267, "learning_rate": 5.549572755417957e-06, "loss": 0.0, "step": 12370 }, { "epoch": 0.9198305966267925, "grad_norm": 0.03890065848827362, "learning_rate": 5.547591331269349e-06, "loss": 0.0, "step": 12380 }, { "epoch": 0.9205735938777027, "grad_norm": 1.2853020052716602e-05, "learning_rate": 5.545609907120742e-06, "loss": 0.0, "step": 12390 }, { "epoch": 0.9213165911286129, "grad_norm": 8.424040970567148e-06, "learning_rate": 5.543628482972136e-06, "loss": 0.0, "step": 12400 }, { "epoch": 0.922059588379523, "grad_norm": 0.009078891016542912, "learning_rate": 5.541647058823529e-06, "loss": 0.0007, "step": 12410 }, { "epoch": 0.9228025856304332, "grad_norm": 1.4690392013960718e-08, "learning_rate": 5.539665634674923e-06, "loss": 0.0, "step": 12420 }, { "epoch": 0.9235455828813434, "grad_norm": 1.398217932546686e-06, "learning_rate": 5.537684210526315e-06, "loss": 0.0001, "step": 12430 }, { "epoch": 0.9242885801322535, "grad_norm": 2.240076781845346e-07, "learning_rate": 5.535702786377709e-06, "loss": 0.0, "step": 12440 }, { "epoch": 0.9250315773831637, "grad_norm": 6.147161772662724e-13, "learning_rate": 5.533721362229102e-06, "loss": 0.0, "step": 12450 }, { "epoch": 0.9257745746340739, "grad_norm": 2.5370676670632974e-09, "learning_rate": 5.531739938080495e-06, "loss": 0.0412, "step": 12460 }, { "epoch": 0.9265175718849841, "grad_norm": 5.919265270233154, "learning_rate": 5.5297585139318884e-06, "loss": 0.0004, "step": 12470 }, { "epoch": 0.9272605691358942, "grad_norm": 1.633419927093982e-10, "learning_rate": 5.527777089783281e-06, "loss": 0.0001, "step": 12480 }, { "epoch": 0.9280035663868044, "grad_norm": 0.13146740198135376, "learning_rate": 5.5257956656346745e-06, "loss": 0.0, "step": 12490 }, { "epoch": 0.9287465636377146, "grad_norm": 0.00012182805221527815, "learning_rate": 5.523814241486068e-06, "loss": 0.0, "step": 12500 }, { "epoch": 0.9294895608886247, "grad_norm": 3.9989106880966574e-05, "learning_rate": 5.5218328173374615e-06, "loss": 0.0, "step": 12510 }, { "epoch": 0.9302325581395349, "grad_norm": 0.002134963171556592, "learning_rate": 5.519851393188855e-06, "loss": 0.0, "step": 12520 }, { "epoch": 0.9309755553904451, "grad_norm": 8.067933876532152e-09, "learning_rate": 5.517869969040247e-06, "loss": 0.0, "step": 12530 }, { "epoch": 0.9317185526413553, "grad_norm": 6.582395961629572e-09, "learning_rate": 5.51588854489164e-06, "loss": 0.0, "step": 12540 }, { "epoch": 0.9324615498922654, "grad_norm": 2.2079259309748522e-08, "learning_rate": 5.513907120743034e-06, "loss": 0.0, "step": 12550 }, { "epoch": 0.9332045471431756, "grad_norm": 3.574210438728187e-08, "learning_rate": 5.511925696594427e-06, "loss": 0.0, "step": 12560 }, { "epoch": 0.9339475443940858, "grad_norm": 9.312388637994218e-10, "learning_rate": 5.509944272445821e-06, "loss": 0.0, "step": 12570 }, { "epoch": 0.9346905416449959, "grad_norm": 1.745539748299052e-06, "learning_rate": 5.507962848297213e-06, "loss": 0.0, "step": 12580 }, { "epoch": 0.9354335388959061, "grad_norm": 5.0815437191431556e-08, "learning_rate": 5.505981424148607e-06, "loss": 0.0003, "step": 12590 }, { "epoch": 0.9361765361468163, "grad_norm": 0.004067856818437576, "learning_rate": 5.503999999999999e-06, "loss": 0.0, "step": 12600 }, { "epoch": 0.9369195333977264, "grad_norm": 0.0012685784604400396, "learning_rate": 5.502018575851393e-06, "loss": 0.0001, "step": 12610 }, { "epoch": 0.9376625306486366, "grad_norm": 6.341381464380902e-08, "learning_rate": 5.500037151702786e-06, "loss": 0.0001, "step": 12620 }, { "epoch": 0.9384055278995468, "grad_norm": 3.1053107250045286e-06, "learning_rate": 5.498055727554179e-06, "loss": 0.0, "step": 12630 }, { "epoch": 0.939148525150457, "grad_norm": 3.5339638770892634e-07, "learning_rate": 5.496074303405572e-06, "loss": 0.0, "step": 12640 }, { "epoch": 0.9398915224013671, "grad_norm": 0.0001280097058042884, "learning_rate": 5.494092879256966e-06, "loss": 0.0, "step": 12650 }, { "epoch": 0.9406345196522773, "grad_norm": 1.3700240850448608, "learning_rate": 5.492111455108359e-06, "loss": 0.0002, "step": 12660 }, { "epoch": 0.9413775169031875, "grad_norm": 3.720205299373447e-08, "learning_rate": 5.490130030959753e-06, "loss": 0.1117, "step": 12670 }, { "epoch": 0.9421205141540976, "grad_norm": 0.00017579378618393093, "learning_rate": 5.4881486068111445e-06, "loss": 0.0, "step": 12680 }, { "epoch": 0.9428635114050078, "grad_norm": 0.004884654656052589, "learning_rate": 5.486167182662538e-06, "loss": 0.3875, "step": 12690 }, { "epoch": 0.943606508655918, "grad_norm": 0.0044697122648358345, "learning_rate": 5.4841857585139315e-06, "loss": 0.0006, "step": 12700 }, { "epoch": 0.9443495059068281, "grad_norm": 1.609314203262329, "learning_rate": 5.482204334365325e-06, "loss": 0.0002, "step": 12710 }, { "epoch": 0.9450925031577383, "grad_norm": 0.002076967153698206, "learning_rate": 5.480222910216718e-06, "loss": 0.0, "step": 12720 }, { "epoch": 0.9458355004086485, "grad_norm": 0.00034295208752155304, "learning_rate": 5.478241486068111e-06, "loss": 0.0021, "step": 12730 }, { "epoch": 0.9465784976595587, "grad_norm": 15.533943176269531, "learning_rate": 5.4762600619195045e-06, "loss": 0.0025, "step": 12740 }, { "epoch": 0.9473214949104688, "grad_norm": 2.8009684085845947, "learning_rate": 5.474278637770897e-06, "loss": 0.0006, "step": 12750 }, { "epoch": 0.948064492161379, "grad_norm": 0.0001058711131918244, "learning_rate": 5.472297213622291e-06, "loss": 0.0, "step": 12760 }, { "epoch": 0.9488074894122892, "grad_norm": 5.509461775687896e-09, "learning_rate": 5.470315789473684e-06, "loss": 0.0, "step": 12770 }, { "epoch": 0.9495504866631993, "grad_norm": 5.704950400975406e-10, "learning_rate": 5.468334365325077e-06, "loss": 0.0, "step": 12780 }, { "epoch": 0.9502934839141095, "grad_norm": 0.0019190621096640825, "learning_rate": 5.46635294117647e-06, "loss": 0.0126, "step": 12790 }, { "epoch": 0.9510364811650197, "grad_norm": 5.176408012630418e-05, "learning_rate": 5.464371517027864e-06, "loss": 0.0004, "step": 12800 }, { "epoch": 0.9517794784159299, "grad_norm": 1.1224059015546572e-08, "learning_rate": 5.462390092879257e-06, "loss": 0.0, "step": 12810 }, { "epoch": 0.95252247566684, "grad_norm": 0.0019681649282574654, "learning_rate": 5.4604086687306506e-06, "loss": 0.0, "step": 12820 }, { "epoch": 0.9532654729177502, "grad_norm": 7.417687747672194e-15, "learning_rate": 5.458427244582042e-06, "loss": 0.0009, "step": 12830 }, { "epoch": 0.9540084701686604, "grad_norm": 1.1600228617680841e-06, "learning_rate": 5.456445820433436e-06, "loss": 0.0, "step": 12840 }, { "epoch": 0.9547514674195705, "grad_norm": 1.0628299351722603e-09, "learning_rate": 5.454464396284829e-06, "loss": 0.0003, "step": 12850 }, { "epoch": 0.9554944646704807, "grad_norm": 1.531931985709889e-07, "learning_rate": 5.452482972136223e-06, "loss": 0.0, "step": 12860 }, { "epoch": 0.9562374619213909, "grad_norm": 1.8115921263106571e-10, "learning_rate": 5.450501547987616e-06, "loss": 0.0001, "step": 12870 }, { "epoch": 0.956980459172301, "grad_norm": 2.801902496685216e-07, "learning_rate": 5.44852012383901e-06, "loss": 0.0, "step": 12880 }, { "epoch": 0.9577234564232112, "grad_norm": 0.04919161647558212, "learning_rate": 5.446538699690402e-06, "loss": 0.0, "step": 12890 }, { "epoch": 0.9584664536741214, "grad_norm": 2.6947461719828425e-06, "learning_rate": 5.444557275541795e-06, "loss": 0.0193, "step": 12900 }, { "epoch": 0.9592094509250316, "grad_norm": 3.1660570698477386e-07, "learning_rate": 5.442575851393188e-06, "loss": 0.0, "step": 12910 }, { "epoch": 0.9599524481759417, "grad_norm": 9.557778867019806e-06, "learning_rate": 5.440594427244582e-06, "loss": 0.0004, "step": 12920 }, { "epoch": 0.9606954454268519, "grad_norm": 4.294448685548957e-10, "learning_rate": 5.4386130030959745e-06, "loss": 0.0, "step": 12930 }, { "epoch": 0.9614384426777621, "grad_norm": 3.063538045466885e-08, "learning_rate": 5.436631578947368e-06, "loss": 0.0, "step": 12940 }, { "epoch": 0.9621814399286722, "grad_norm": 1.2780335678641563e-09, "learning_rate": 5.4346501547987614e-06, "loss": 0.0, "step": 12950 }, { "epoch": 0.9629244371795824, "grad_norm": 6.216329534458609e-12, "learning_rate": 5.432668730650155e-06, "loss": 0.0113, "step": 12960 }, { "epoch": 0.9636674344304926, "grad_norm": 0.576353132724762, "learning_rate": 5.430687306501548e-06, "loss": 0.0239, "step": 12970 }, { "epoch": 0.9644104316814027, "grad_norm": 0.004668753128498793, "learning_rate": 5.42870588235294e-06, "loss": 0.0, "step": 12980 }, { "epoch": 0.9651534289323129, "grad_norm": 0.00024834927171468735, "learning_rate": 5.426724458204334e-06, "loss": 0.0, "step": 12990 }, { "epoch": 0.9658964261832231, "grad_norm": 0.00216829776763916, "learning_rate": 5.424743034055727e-06, "loss": 0.0051, "step": 13000 }, { "epoch": 0.9666394234341333, "grad_norm": 4.386068758321926e-05, "learning_rate": 5.4227616099071206e-06, "loss": 0.0, "step": 13010 }, { "epoch": 0.9673824206850434, "grad_norm": 0.00026333637651987374, "learning_rate": 5.420780185758514e-06, "loss": 0.0, "step": 13020 }, { "epoch": 0.9681254179359536, "grad_norm": 3.273822102300983e-08, "learning_rate": 5.4187987616099075e-06, "loss": 0.1587, "step": 13030 }, { "epoch": 0.9688684151868638, "grad_norm": 0.19884347915649414, "learning_rate": 5.4168173374613e-06, "loss": 0.0287, "step": 13040 }, { "epoch": 0.9696114124377739, "grad_norm": 1.6368703370517323e-07, "learning_rate": 5.414835913312693e-06, "loss": 0.0001, "step": 13050 }, { "epoch": 0.9703544096886841, "grad_norm": 3.419578433749848e-06, "learning_rate": 5.412854489164086e-06, "loss": 0.0, "step": 13060 }, { "epoch": 0.9710974069395943, "grad_norm": 0.177012100815773, "learning_rate": 5.41087306501548e-06, "loss": 0.0, "step": 13070 }, { "epoch": 0.9718404041905045, "grad_norm": 0.13552260398864746, "learning_rate": 5.408891640866873e-06, "loss": 0.0, "step": 13080 }, { "epoch": 0.9725834014414146, "grad_norm": 1.5327520272423976e-14, "learning_rate": 5.406910216718266e-06, "loss": 0.0006, "step": 13090 }, { "epoch": 0.9733263986923248, "grad_norm": 8.364349014300387e-06, "learning_rate": 5.404928792569659e-06, "loss": 0.0, "step": 13100 }, { "epoch": 0.974069395943235, "grad_norm": 2.168237533961559e-10, "learning_rate": 5.402947368421053e-06, "loss": 0.0, "step": 13110 }, { "epoch": 0.9748123931941451, "grad_norm": 0.023274226114153862, "learning_rate": 5.400965944272446e-06, "loss": 0.0, "step": 13120 }, { "epoch": 0.9755553904450553, "grad_norm": 0.01341934222728014, "learning_rate": 5.398984520123839e-06, "loss": 0.0, "step": 13130 }, { "epoch": 0.9762983876959656, "grad_norm": 24.02332878112793, "learning_rate": 5.3970030959752314e-06, "loss": 0.0027, "step": 13140 }, { "epoch": 0.9770413849468756, "grad_norm": 1.6098878896642077e-09, "learning_rate": 5.395021671826625e-06, "loss": 0.0001, "step": 13150 }, { "epoch": 0.9777843821977859, "grad_norm": 5.869712822459405e-07, "learning_rate": 5.393040247678018e-06, "loss": 0.0, "step": 13160 }, { "epoch": 0.978527379448696, "grad_norm": 1.3009074205242235e-14, "learning_rate": 5.391058823529412e-06, "loss": 0.0002, "step": 13170 }, { "epoch": 0.9792703766996063, "grad_norm": 3.359878064657096e-07, "learning_rate": 5.389077399380805e-06, "loss": 0.0754, "step": 13180 }, { "epoch": 0.9800133739505164, "grad_norm": 4.2013853089883924e-05, "learning_rate": 5.387095975232198e-06, "loss": 0.0, "step": 13190 }, { "epoch": 0.9807563712014266, "grad_norm": 7.83574526336088e-14, "learning_rate": 5.3851145510835906e-06, "loss": 0.0, "step": 13200 }, { "epoch": 0.9814993684523368, "grad_norm": 1.5744856796118256e-07, "learning_rate": 5.383133126934984e-06, "loss": 0.0005, "step": 13210 }, { "epoch": 0.9822423657032469, "grad_norm": 3.219616337446496e-05, "learning_rate": 5.3811517027863775e-06, "loss": 0.0, "step": 13220 }, { "epoch": 0.9829853629541571, "grad_norm": 4.937175617669709e-05, "learning_rate": 5.379170278637771e-06, "loss": 0.0, "step": 13230 }, { "epoch": 0.9837283602050673, "grad_norm": 2.4777457863223162e-09, "learning_rate": 5.377188854489164e-06, "loss": 0.0001, "step": 13240 }, { "epoch": 0.9844713574559774, "grad_norm": 3.497034484212236e-08, "learning_rate": 5.375207430340557e-06, "loss": 0.0, "step": 13250 }, { "epoch": 0.9852143547068876, "grad_norm": 0.00011610241926973686, "learning_rate": 5.3732260061919505e-06, "loss": 0.0001, "step": 13260 }, { "epoch": 0.9859573519577978, "grad_norm": 1.4077724586059048e-07, "learning_rate": 5.371244582043344e-06, "loss": 0.0, "step": 13270 }, { "epoch": 0.986700349208708, "grad_norm": 6.575947918463498e-05, "learning_rate": 5.369263157894737e-06, "loss": 0.0, "step": 13280 }, { "epoch": 0.9874433464596181, "grad_norm": 11.201228141784668, "learning_rate": 5.367281733746129e-06, "loss": 0.0015, "step": 13290 }, { "epoch": 0.9881863437105283, "grad_norm": 1.6089423127141345e-08, "learning_rate": 5.365300309597523e-06, "loss": 0.0, "step": 13300 }, { "epoch": 0.9889293409614385, "grad_norm": 7.099618202360825e-13, "learning_rate": 5.363318885448916e-06, "loss": 0.001, "step": 13310 }, { "epoch": 0.9896723382123486, "grad_norm": 0.0003913654654752463, "learning_rate": 5.36133746130031e-06, "loss": 0.0, "step": 13320 }, { "epoch": 0.9904153354632588, "grad_norm": 9.814119948714506e-06, "learning_rate": 5.359356037151703e-06, "loss": 0.0852, "step": 13330 }, { "epoch": 0.991158332714169, "grad_norm": 4.3183436559957045e-07, "learning_rate": 5.357374613003096e-06, "loss": 0.0, "step": 13340 }, { "epoch": 0.9919013299650792, "grad_norm": 0.007433414459228516, "learning_rate": 5.355393188854488e-06, "loss": 0.0001, "step": 13350 }, { "epoch": 0.9926443272159893, "grad_norm": 1.3307047908028835e-08, "learning_rate": 5.353411764705882e-06, "loss": 0.0, "step": 13360 }, { "epoch": 0.9933873244668995, "grad_norm": 0.0008435609634034336, "learning_rate": 5.351430340557275e-06, "loss": 0.0, "step": 13370 }, { "epoch": 0.9941303217178097, "grad_norm": 4.780190465680789e-07, "learning_rate": 5.349448916408669e-06, "loss": 0.0, "step": 13380 }, { "epoch": 0.9948733189687198, "grad_norm": 6.181208433986285e-09, "learning_rate": 5.347467492260061e-06, "loss": 0.0, "step": 13390 }, { "epoch": 0.99561631621963, "grad_norm": 1.051253195605284e-09, "learning_rate": 5.345486068111455e-06, "loss": 0.0, "step": 13400 }, { "epoch": 0.9963593134705402, "grad_norm": 0.02094096876680851, "learning_rate": 5.343504643962848e-06, "loss": 0.0, "step": 13410 }, { "epoch": 0.9971023107214503, "grad_norm": 0.05897127836942673, "learning_rate": 5.341523219814241e-06, "loss": 0.0, "step": 13420 }, { "epoch": 0.9978453079723605, "grad_norm": 1.64443099492928e-05, "learning_rate": 5.3395417956656345e-06, "loss": 0.0, "step": 13430 }, { "epoch": 0.9985883052232707, "grad_norm": 8.126213243975222e-12, "learning_rate": 5.337560371517027e-06, "loss": 0.0, "step": 13440 }, { "epoch": 0.9993313024741809, "grad_norm": 3.785869921557605e-05, "learning_rate": 5.3355789473684205e-06, "loss": 0.0, "step": 13450 } ], "logging_steps": 10, "max_steps": 40377, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 13459, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }