{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.3047177107501935, "eval_steps": 500, "global_step": 1490, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 0.000199739323151795, "loss": 1.5905, "step": 10 }, { "epoch": 0.03, "learning_rate": 0.00019682229406025635, "loss": 1.3988, "step": 20 }, { "epoch": 0.05, "learning_rate": 0.00019075754196709572, "loss": 1.4513, "step": 30 }, { "epoch": 0.06, "learning_rate": 0.00018174223385588917, "loss": 1.3878, "step": 40 }, { "epoch": 0.08, "learning_rate": 0.00016949152542372882, "loss": 1.2662, "step": 50 }, { "epoch": 0.09, "learning_rate": 0.0001999998602293167, "loss": 1.0731, "step": 60 }, { "epoch": 0.11, "learning_rate": 0.0001999830882200816, "loss": 1.0733, "step": 70 }, { "epoch": 0.12, "learning_rate": 0.0001999383674462943, "loss": 1.0432, "step": 80 }, { "epoch": 0.14, "learning_rate": 0.00019986571040897272, "loss": 1.0459, "step": 90 }, { "epoch": 0.15, "learning_rate": 0.00019976513741829603, "loss": 1.0564, "step": 100 }, { "epoch": 0.17, "learning_rate": 0.00019963667658792704, "loss": 1.006, "step": 110 }, { "epoch": 0.19, "learning_rate": 0.00019948036382715371, "loss": 1.0124, "step": 120 }, { "epoch": 0.2, "learning_rate": 0.0001992962428308511, "loss": 1.0495, "step": 130 }, { "epoch": 0.22, "learning_rate": 0.00019908436506726714, "loss": 1.0061, "step": 140 }, { "epoch": 0.23, "learning_rate": 0.00019884478976363548, "loss": 1.0166, "step": 150 }, { "epoch": 0.25, "learning_rate": 0.00019857758388961943, "loss": 0.986, "step": 160 }, { "epoch": 0.26, "learning_rate": 0.0001982828221385916, "loss": 1.0052, "step": 170 }, { "epoch": 0.28, "learning_rate": 0.00019796058690675435, "loss": 0.9986, "step": 180 }, { "epoch": 0.29, "learning_rate": 0.0001976109682701075, "loss": 0.989, "step": 190 }, { "epoch": 0.31, "learning_rate": 0.00019723406395926856, "loss": 1.0184, "step": 200 }, { "epoch": 0.32, "learning_rate": 0.00019682997933215385, "loss": 0.9896, "step": 210 }, { "epoch": 0.34, "learning_rate": 0.00019639882734452722, "loss": 1.0046, "step": 220 }, { "epoch": 0.36, "learning_rate": 0.000195940728518425, "loss": 0.9922, "step": 230 }, { "epoch": 0.37, "learning_rate": 0.00019545581090846584, "loss": 0.9665, "step": 240 }, { "epoch": 0.39, "learning_rate": 0.00019494421006605492, "loss": 1.0012, "step": 250 }, { "epoch": 0.4, "learning_rate": 0.00019440606900149275, "loss": 0.9937, "step": 260 }, { "epoch": 0.42, "learning_rate": 0.0001938415381439987, "loss": 0.984, "step": 270 }, { "epoch": 0.43, "learning_rate": 0.00019325077529966077, "loss": 0.9651, "step": 280 }, { "epoch": 0.45, "learning_rate": 0.00019263394560732326, "loss": 0.9621, "step": 290 }, { "epoch": 0.46, "learning_rate": 0.00019199122149242485, "loss": 0.9761, "step": 300 }, { "epoch": 0.48, "learning_rate": 0.00019132278261879945, "loss": 0.9792, "step": 310 }, { "epoch": 0.49, "learning_rate": 0.00019062881583845398, "loss": 0.9918, "step": 320 }, { "epoch": 0.51, "learning_rate": 0.00018990951513933683, "loss": 0.969, "step": 330 }, { "epoch": 0.53, "learning_rate": 0.00018916508159111126, "loss": 0.9513, "step": 340 }, { "epoch": 0.54, "learning_rate": 0.0001883957232889495, "loss": 0.9987, "step": 350 }, { "epoch": 0.56, "learning_rate": 0.00018760165529536286, "loss": 0.9687, "step": 360 }, { "epoch": 0.57, "learning_rate": 0.00018678309958008432, "loss": 0.9734, "step": 370 }, { "epoch": 0.59, "learning_rate": 0.00018594028495802015, "loss": 0.9705, "step": 380 }, { "epoch": 0.6, "learning_rate": 0.00018507344702528816, "loss": 0.949, "step": 390 }, { "epoch": 0.62, "learning_rate": 0.0001841828280933604, "loss": 0.9709, "step": 400 }, { "epoch": 0.63, "learning_rate": 0.00018326867712132865, "loss": 0.9484, "step": 410 }, { "epoch": 0.65, "learning_rate": 0.00018233124964631156, "loss": 0.9677, "step": 420 }, { "epoch": 0.67, "learning_rate": 0.00018137080771202325, "loss": 0.9597, "step": 430 }, { "epoch": 0.68, "learning_rate": 0.00018038761979552285, "loss": 0.9289, "step": 440 }, { "epoch": 0.7, "learning_rate": 0.00017938196073216586, "loss": 0.9749, "step": 450 }, { "epoch": 0.71, "learning_rate": 0.00017835411163877782, "loss": 0.9445, "step": 460 }, { "epoch": 0.73, "learning_rate": 0.00017730435983507277, "loss": 0.9609, "step": 470 }, { "epoch": 0.74, "learning_rate": 0.00017623299876333697, "loss": 0.9652, "step": 480 }, { "epoch": 0.76, "learning_rate": 0.0001751403279064018, "loss": 0.9415, "step": 490 }, { "epoch": 0.77, "learning_rate": 0.0001740266527039276, "loss": 0.9769, "step": 500 }, { "epoch": 0.79, "learning_rate": 0.00017289228446702292, "loss": 0.9477, "step": 510 }, { "epoch": 0.8, "learning_rate": 0.00017173754029122188, "loss": 0.9622, "step": 520 }, { "epoch": 0.82, "learning_rate": 0.00017056274296784518, "loss": 0.9631, "step": 530 }, { "epoch": 0.84, "learning_rate": 0.00016936822089376837, "loss": 0.9482, "step": 540 }, { "epoch": 0.85, "learning_rate": 0.00016815430797962374, "loss": 0.9666, "step": 550 }, { "epoch": 0.87, "learning_rate": 0.00016692134355646046, "loss": 0.9331, "step": 560 }, { "epoch": 0.88, "learning_rate": 0.00016566967228088972, "loss": 0.9395, "step": 570 }, { "epoch": 0.9, "learning_rate": 0.0001643996440387412, "loss": 0.9432, "step": 580 }, { "epoch": 0.91, "learning_rate": 0.0001631116138472578, "loss": 0.9244, "step": 590 }, { "epoch": 0.93, "learning_rate": 0.00016180594175585586, "loss": 0.9617, "step": 600 }, { "epoch": 0.94, "learning_rate": 0.00016048299274547883, "loss": 0.9213, "step": 610 }, { "epoch": 0.96, "learning_rate": 0.00015914313662657224, "loss": 0.9497, "step": 620 }, { "epoch": 0.97, "learning_rate": 0.00015778674793570896, "loss": 0.9377, "step": 630 }, { "epoch": 0.99, "learning_rate": 0.00015641420583089295, "loss": 0.9205, "step": 640 }, { "epoch": 1.01, "learning_rate": 0.00015502589398557146, "loss": 0.9664, "step": 650 }, { "epoch": 1.02, "learning_rate": 0.0001536222004813849, "loss": 0.86, "step": 660 }, { "epoch": 1.04, "learning_rate": 0.0001522035176996845, "loss": 0.8729, "step": 670 }, { "epoch": 1.05, "learning_rate": 0.00015077024221184793, "loss": 0.8386, "step": 680 }, { "epoch": 1.07, "learning_rate": 0.00014932277466842377, "loss": 0.836, "step": 690 }, { "epoch": 1.08, "learning_rate": 0.0001478615196871358, "loss": 0.8874, "step": 700 }, { "epoch": 1.1, "learning_rate": 0.00014638688573977805, "loss": 0.8546, "step": 710 }, { "epoch": 1.11, "learning_rate": 0.00014489928503803285, "loss": 0.8682, "step": 720 }, { "epoch": 1.13, "learning_rate": 0.00014339913341824314, "loss": 0.8407, "step": 730 }, { "epoch": 1.14, "learning_rate": 0.0001418868502251717, "loss": 0.8476, "step": 740 }, { "epoch": 1.16, "learning_rate": 0.00014036285819477967, "loss": 0.8962, "step": 750 }, { "epoch": 1.18, "learning_rate": 0.0001388275833360572, "loss": 0.8799, "step": 760 }, { "epoch": 1.19, "learning_rate": 0.00013728145481193882, "loss": 0.8585, "step": 770 }, { "epoch": 1.21, "learning_rate": 0.00013572490481933758, "loss": 0.8541, "step": 780 }, { "epoch": 1.22, "learning_rate": 0.000134158368468331, "loss": 0.8239, "step": 790 }, { "epoch": 1.24, "learning_rate": 0.0001325822836605323, "loss": 0.8758, "step": 800 }, { "epoch": 1.25, "learning_rate": 0.00013099709096668193, "loss": 0.8682, "step": 810 }, { "epoch": 1.27, "learning_rate": 0.00012940323350349258, "loss": 0.8588, "step": 820 }, { "epoch": 1.28, "learning_rate": 0.0001278011568097824, "loss": 0.8455, "step": 830 }, { "epoch": 1.3, "learning_rate": 0.00012619130872193162, "loss": 0.8485, "step": 840 }, { "epoch": 1.31, "learning_rate": 0.00012457413924869644, "loss": 0.8995, "step": 850 }, { "epoch": 1.33, "learning_rate": 0.0001229501004454159, "loss": 0.8736, "step": 860 }, { "epoch": 1.35, "learning_rate": 0.00012131964628764678, "loss": 0.8638, "step": 870 }, { "epoch": 1.36, "learning_rate": 0.00011968323254426135, "loss": 0.8412, "step": 880 }, { "epoch": 1.38, "learning_rate": 0.00011804131665004423, "loss": 0.815, "step": 890 }, { "epoch": 1.39, "learning_rate": 0.00011639435757782336, "loss": 0.8969, "step": 900 }, { "epoch": 1.41, "learning_rate": 0.0001147428157101709, "loss": 0.8737, "step": 910 }, { "epoch": 1.42, "learning_rate": 0.00011308715271071049, "loss": 0.8604, "step": 920 }, { "epoch": 1.44, "learning_rate": 0.00011142783139506601, "loss": 0.8425, "step": 930 }, { "epoch": 1.45, "learning_rate": 0.00010976531560148841, "loss": 0.8262, "step": 940 }, { "epoch": 1.47, "learning_rate": 0.00010810007006119685, "loss": 0.9044, "step": 950 }, { "epoch": 1.48, "learning_rate": 0.00010643256026846992, "loss": 0.8575, "step": 960 }, { "epoch": 1.5, "learning_rate": 0.00010476325235052389, "loss": 0.8602, "step": 970 }, { "epoch": 1.52, "learning_rate": 0.00010309261293721384, "loss": 0.8507, "step": 980 }, { "epoch": 1.53, "learning_rate": 0.00010142110903059424, "loss": 0.8279, "step": 990 }, { "epoch": 1.55, "learning_rate": 9.974920787437567e-05, "loss": 0.8845, "step": 1000 }, { "epoch": 1.56, "learning_rate": 9.807737682331383e-05, "loss": 0.8515, "step": 1010 }, { "epoch": 1.58, "learning_rate": 9.640608321256761e-05, "loss": 0.8467, "step": 1020 }, { "epoch": 1.59, "learning_rate": 9.47357942270625e-05, "loss": 0.8406, "step": 1030 }, { "epoch": 1.61, "learning_rate": 9.306697677089621e-05, "loss": 0.8435, "step": 1040 }, { "epoch": 1.62, "learning_rate": 9.140009733682262e-05, "loss": 0.8836, "step": 1050 }, { "epoch": 1.64, "learning_rate": 8.973562187585071e-05, "loss": 0.866, "step": 1060 }, { "epoch": 1.66, "learning_rate": 8.80740156669951e-05, "loss": 0.8585, "step": 1070 }, { "epoch": 1.67, "learning_rate": 8.641574318721413e-05, "loss": 0.847, "step": 1080 }, { "epoch": 1.69, "learning_rate": 8.476126798157258e-05, "loss": 0.8221, "step": 1090 }, { "epoch": 1.7, "learning_rate": 8.311105253366448e-05, "loss": 0.8873, "step": 1100 }, { "epoch": 1.72, "learning_rate": 8.146555813633274e-05, "loss": 0.8415, "step": 1110 }, { "epoch": 1.73, "learning_rate": 7.982524476272188e-05, "loss": 0.8533, "step": 1120 }, { "epoch": 1.75, "learning_rate": 7.819057093769931e-05, "loss": 0.8281, "step": 1130 }, { "epoch": 1.76, "learning_rate": 7.656199360968167e-05, "loss": 0.8319, "step": 1140 }, { "epoch": 1.78, "learning_rate": 7.493996802290186e-05, "loss": 0.8593, "step": 1150 }, { "epoch": 1.79, "learning_rate": 7.332494759015226e-05, "loss": 0.8361, "step": 1160 }, { "epoch": 1.81, "learning_rate": 7.171738376604012e-05, "loss": 0.8536, "step": 1170 }, { "epoch": 1.83, "learning_rate": 7.011772592079013e-05, "loss": 0.8478, "step": 1180 }, { "epoch": 1.84, "learning_rate": 6.85264212146299e-05, "loss": 0.8276, "step": 1190 }, { "epoch": 1.86, "learning_rate": 6.694391447279287e-05, "loss": 0.8934, "step": 1200 }, { "epoch": 1.87, "learning_rate": 6.537064806117435e-05, "loss": 0.8359, "step": 1210 }, { "epoch": 1.89, "learning_rate": 6.380706176267455e-05, "loss": 0.8477, "step": 1220 }, { "epoch": 1.9, "learning_rate": 6.225359265426414e-05, "loss": 0.8293, "step": 1230 }, { "epoch": 1.92, "learning_rate": 6.071067498480583e-05, "loss": 0.828, "step": 1240 }, { "epoch": 1.93, "learning_rate": 5.9178740053666706e-05, "loss": 0.8773, "step": 1250 }, { "epoch": 1.95, "learning_rate": 5.765821609015495e-05, "loss": 0.8376, "step": 1260 }, { "epoch": 1.96, "learning_rate": 5.61495281338148e-05, "loss": 0.8384, "step": 1270 }, { "epoch": 1.98, "learning_rate": 5.465309791561328e-05, "loss": 0.8403, "step": 1280 }, { "epoch": 2.0, "learning_rate": 5.316934374005142e-05, "loss": 0.8464, "step": 1290 }, { "epoch": 2.01, "learning_rate": 5.1698680368233864e-05, "loss": 0.8334, "step": 1300 }, { "epoch": 2.03, "learning_rate": 5.02415189019283e-05, "loss": 0.7771, "step": 1310 }, { "epoch": 2.04, "learning_rate": 4.879826666864852e-05, "loss": 0.7607, "step": 1320 }, { "epoch": 2.06, "learning_rate": 4.736932710779173e-05, "loss": 0.7367, "step": 1330 }, { "epoch": 2.07, "learning_rate": 4.595509965786368e-05, "loss": 0.763, "step": 1340 }, { "epoch": 2.09, "learning_rate": 4.4555979644821145e-05, "loss": 0.7878, "step": 1350 }, { "epoch": 2.1, "learning_rate": 4.3172358171564916e-05, "loss": 0.763, "step": 1360 }, { "epoch": 2.12, "learning_rate": 4.180462200861256e-05, "loss": 0.7487, "step": 1370 }, { "epoch": 2.13, "learning_rate": 4.045315348598283e-05, "loss": 0.738, "step": 1380 }, { "epoch": 2.15, "learning_rate": 3.9118330386320814e-05, "loss": 0.7439, "step": 1390 }, { "epoch": 2.17, "learning_rate": 3.780052583929475e-05, "loss": 0.7903, "step": 1400 }, { "epoch": 2.18, "learning_rate": 3.650010821729303e-05, "loss": 0.7653, "step": 1410 }, { "epoch": 2.2, "learning_rate": 3.521744103245145e-05, "loss": 0.7447, "step": 1420 }, { "epoch": 2.21, "learning_rate": 3.395288283503867e-05, "loss": 0.7186, "step": 1430 }, { "epoch": 2.23, "learning_rate": 3.2706787113229075e-05, "loss": 0.7259, "step": 1440 }, { "epoch": 2.24, "learning_rate": 3.147950219429036e-05, "loss": 0.8041, "step": 1450 }, { "epoch": 2.26, "learning_rate": 3.027137114721401e-05, "loss": 0.7545, "step": 1460 }, { "epoch": 2.27, "learning_rate": 2.9082731686815412e-05, "loss": 0.7363, "step": 1470 }, { "epoch": 2.29, "learning_rate": 2.7913916079330905e-05, "loss": 0.7364, "step": 1480 }, { "epoch": 2.3, "learning_rate": 2.676525104953759e-05, "loss": 0.7509, "step": 1490 } ], "logging_steps": 10, "max_steps": 1938, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "total_flos": 4.7276257552131686e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }