{ "best_metric": 2.8960378170013428, "best_model_checkpoint": "outputs-6_7/checkpoint-64000", "epoch": 2.7480548923964756, "eval_steps": 4000, "global_step": 64000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021469178846847466, "grad_norm": 0.39146578311920166, "learning_rate": 0.00029989693820586593, "loss": 2.4186, "step": 500 }, { "epoch": 0.04293835769369493, "grad_norm": 0.5122425556182861, "learning_rate": 0.0002997681109631983, "loss": 2.3709, "step": 1000 }, { "epoch": 0.0644075365405424, "grad_norm": 0.4298762083053589, "learning_rate": 0.0002996392837205307, "loss": 2.3735, "step": 1500 }, { "epoch": 0.08587671538738986, "grad_norm": 0.39066824316978455, "learning_rate": 0.00029951045647786317, "loss": 2.3524, "step": 2000 }, { "epoch": 0.10734589423423734, "grad_norm": 0.39771586656570435, "learning_rate": 0.00029938162923519556, "loss": 2.347, "step": 2500 }, { "epoch": 0.1288150730810848, "grad_norm": 0.47568196058273315, "learning_rate": 0.000299252801992528, "loss": 2.3336, "step": 3000 }, { "epoch": 0.15028425192793227, "grad_norm": 0.44162309169769287, "learning_rate": 0.0002991239747498604, "loss": 2.334, "step": 3500 }, { "epoch": 0.17175343077477973, "grad_norm": 0.4031461179256439, "learning_rate": 0.0002989951475071928, "loss": 2.3266, "step": 4000 }, { "epoch": 0.17175343077477973, "eval_loss": 3.0375378131866455, "eval_runtime": 174.8485, "eval_samples_per_second": 14.298, "eval_steps_per_second": 3.575, "step": 4000 }, { "epoch": 0.19322260962162718, "grad_norm": 0.384034126996994, "learning_rate": 0.00029886632026452525, "loss": 2.3344, "step": 4500 }, { "epoch": 0.21469178846847467, "grad_norm": 0.44177886843681335, "learning_rate": 0.00029873749302185764, "loss": 2.3157, "step": 5000 }, { "epoch": 0.23616096731532213, "grad_norm": 0.4425281286239624, "learning_rate": 0.0002986086657791901, "loss": 2.3232, "step": 5500 }, { "epoch": 0.2576301461621696, "grad_norm": 0.4302816390991211, "learning_rate": 0.0002984798385365225, "loss": 2.3141, "step": 6000 }, { "epoch": 0.2790993250090171, "grad_norm": 0.5806054472923279, "learning_rate": 0.00029835101129385493, "loss": 2.2975, "step": 6500 }, { "epoch": 0.30056850385586453, "grad_norm": 0.5654121041297913, "learning_rate": 0.00029822218405118733, "loss": 2.3141, "step": 7000 }, { "epoch": 0.322037682702712, "grad_norm": 0.5454065203666687, "learning_rate": 0.0002980933568085197, "loss": 2.3097, "step": 7500 }, { "epoch": 0.34350686154955945, "grad_norm": 0.43060022592544556, "learning_rate": 0.00029796452956585217, "loss": 2.308, "step": 8000 }, { "epoch": 0.34350686154955945, "eval_loss": 3.0060064792633057, "eval_runtime": 171.3325, "eval_samples_per_second": 14.592, "eval_steps_per_second": 3.648, "step": 8000 }, { "epoch": 0.3649760403964069, "grad_norm": 0.490461140871048, "learning_rate": 0.00029783570232318456, "loss": 2.2985, "step": 8500 }, { "epoch": 0.38644521924325437, "grad_norm": 0.5096587538719177, "learning_rate": 0.000297706875080517, "loss": 2.3041, "step": 9000 }, { "epoch": 0.4079143980901018, "grad_norm": 0.4906415343284607, "learning_rate": 0.0002975780478378494, "loss": 2.2903, "step": 9500 }, { "epoch": 0.42938357693694934, "grad_norm": 0.5885447263717651, "learning_rate": 0.00029744922059518186, "loss": 2.3069, "step": 10000 }, { "epoch": 0.4508527557837968, "grad_norm": 0.5200388431549072, "learning_rate": 0.00029732039335251425, "loss": 2.3025, "step": 10500 }, { "epoch": 0.47232193463064426, "grad_norm": 0.6331049799919128, "learning_rate": 0.00029719156610984664, "loss": 2.2957, "step": 11000 }, { "epoch": 0.4937911134774917, "grad_norm": 0.5442560315132141, "learning_rate": 0.0002970627388671791, "loss": 2.2878, "step": 11500 }, { "epoch": 0.5152602923243392, "grad_norm": 0.5305426120758057, "learning_rate": 0.0002969339116245115, "loss": 2.2903, "step": 12000 }, { "epoch": 0.5152602923243392, "eval_loss": 2.973823070526123, "eval_runtime": 170.1319, "eval_samples_per_second": 14.694, "eval_steps_per_second": 3.674, "step": 12000 }, { "epoch": 0.5367294711711866, "grad_norm": 0.5756106972694397, "learning_rate": 0.0002968050843818439, "loss": 2.2883, "step": 12500 }, { "epoch": 0.5581986500180341, "grad_norm": 0.5812390446662903, "learning_rate": 0.00029667625713917633, "loss": 2.2807, "step": 13000 }, { "epoch": 0.5796678288648816, "grad_norm": 0.4355560541152954, "learning_rate": 0.0002965474298965088, "loss": 2.2885, "step": 13500 }, { "epoch": 0.6011370077117291, "grad_norm": 0.41715824604034424, "learning_rate": 0.00029641860265384117, "loss": 2.2834, "step": 14000 }, { "epoch": 0.6226061865585765, "grad_norm": 0.4623817801475525, "learning_rate": 0.00029628977541117357, "loss": 2.2748, "step": 14500 }, { "epoch": 0.644075365405424, "grad_norm": 0.5191289186477661, "learning_rate": 0.000296160948168506, "loss": 2.2811, "step": 15000 }, { "epoch": 0.6655445442522714, "grad_norm": 0.6877865791320801, "learning_rate": 0.0002960321209258384, "loss": 2.2783, "step": 15500 }, { "epoch": 0.6870137230991189, "grad_norm": 0.49987566471099854, "learning_rate": 0.0002959032936831708, "loss": 2.2719, "step": 16000 }, { "epoch": 0.6870137230991189, "eval_loss": 2.964353561401367, "eval_runtime": 171.8492, "eval_samples_per_second": 14.548, "eval_steps_per_second": 3.637, "step": 16000 }, { "epoch": 0.7084829019459664, "grad_norm": 0.5470739006996155, "learning_rate": 0.00029577446644050325, "loss": 2.2832, "step": 16500 }, { "epoch": 0.7299520807928138, "grad_norm": 0.6002724766731262, "learning_rate": 0.0002956456391978357, "loss": 2.2838, "step": 17000 }, { "epoch": 0.7514212596396613, "grad_norm": 0.6674920320510864, "learning_rate": 0.0002955168119551681, "loss": 2.2686, "step": 17500 }, { "epoch": 0.7728904384865087, "grad_norm": 0.5728652477264404, "learning_rate": 0.0002953879847125005, "loss": 2.2725, "step": 18000 }, { "epoch": 0.7943596173333562, "grad_norm": 0.5590266585350037, "learning_rate": 0.00029525915746983294, "loss": 2.2794, "step": 18500 }, { "epoch": 0.8158287961802037, "grad_norm": 0.7446316480636597, "learning_rate": 0.00029513033022716533, "loss": 2.2676, "step": 19000 }, { "epoch": 0.8372979750270512, "grad_norm": 0.4322523772716522, "learning_rate": 0.0002950015029844977, "loss": 2.2832, "step": 19500 }, { "epoch": 0.8587671538738987, "grad_norm": 0.6566835045814514, "learning_rate": 0.00029487267574183017, "loss": 2.2636, "step": 20000 }, { "epoch": 0.8587671538738987, "eval_loss": 2.954716920852661, "eval_runtime": 171.5581, "eval_samples_per_second": 14.572, "eval_steps_per_second": 3.643, "step": 20000 }, { "epoch": 0.8802363327207461, "grad_norm": 0.5313192009925842, "learning_rate": 0.0002947438484991626, "loss": 2.2819, "step": 20500 }, { "epoch": 0.9017055115675936, "grad_norm": 0.689608633518219, "learning_rate": 0.000294615021256495, "loss": 2.2728, "step": 21000 }, { "epoch": 0.923174690414441, "grad_norm": 0.7024255394935608, "learning_rate": 0.0002944861940138274, "loss": 2.2746, "step": 21500 }, { "epoch": 0.9446438692612885, "grad_norm": 0.6012333035469055, "learning_rate": 0.00029435736677115986, "loss": 2.2658, "step": 22000 }, { "epoch": 0.9661130481081359, "grad_norm": 0.6304742693901062, "learning_rate": 0.0002942285395284923, "loss": 2.2718, "step": 22500 }, { "epoch": 0.9875822269549834, "grad_norm": 0.541362464427948, "learning_rate": 0.0002940997122858247, "loss": 2.272, "step": 23000 }, { "epoch": 1.009051405801831, "grad_norm": 0.5888085961341858, "learning_rate": 0.0002939708850431571, "loss": 2.2485, "step": 23500 }, { "epoch": 1.0305205846486785, "grad_norm": 0.5453173518180847, "learning_rate": 0.00029384205780048954, "loss": 2.2395, "step": 24000 }, { "epoch": 1.0305205846486785, "eval_loss": 2.940072774887085, "eval_runtime": 171.7472, "eval_samples_per_second": 14.556, "eval_steps_per_second": 3.639, "step": 24000 }, { "epoch": 1.0519897634955258, "grad_norm": 0.7155711054801941, "learning_rate": 0.00029371323055782194, "loss": 2.2476, "step": 24500 }, { "epoch": 1.0734589423423733, "grad_norm": 0.7307182550430298, "learning_rate": 0.00029358440331515433, "loss": 2.2479, "step": 25000 }, { "epoch": 1.0949281211892208, "grad_norm": 0.6849473714828491, "learning_rate": 0.0002934555760724868, "loss": 2.2407, "step": 25500 }, { "epoch": 1.1163973000360683, "grad_norm": 0.7161998152732849, "learning_rate": 0.00029332674882981923, "loss": 2.247, "step": 26000 }, { "epoch": 1.1378664788829158, "grad_norm": 0.723235011100769, "learning_rate": 0.0002931979215871516, "loss": 2.2382, "step": 26500 }, { "epoch": 1.159335657729763, "grad_norm": 0.4874274432659149, "learning_rate": 0.000293069094344484, "loss": 2.2483, "step": 27000 }, { "epoch": 1.1808048365766106, "grad_norm": 0.5381557941436768, "learning_rate": 0.00029294026710181646, "loss": 2.2423, "step": 27500 }, { "epoch": 1.2022740154234581, "grad_norm": 0.7897226214408875, "learning_rate": 0.00029281143985914886, "loss": 2.2538, "step": 28000 }, { "epoch": 1.2022740154234581, "eval_loss": 2.926734447479248, "eval_runtime": 172.6333, "eval_samples_per_second": 14.482, "eval_steps_per_second": 3.62, "step": 28000 }, { "epoch": 1.2237431942703056, "grad_norm": 0.5494747161865234, "learning_rate": 0.00029268261261648125, "loss": 2.2441, "step": 28500 }, { "epoch": 1.245212373117153, "grad_norm": 0.5955171585083008, "learning_rate": 0.0002925537853738137, "loss": 2.245, "step": 29000 }, { "epoch": 1.2666815519640005, "grad_norm": 0.7213128805160522, "learning_rate": 0.0002924249581311461, "loss": 2.2488, "step": 29500 }, { "epoch": 1.288150730810848, "grad_norm": 0.7488630414009094, "learning_rate": 0.00029229613088847854, "loss": 2.2412, "step": 30000 }, { "epoch": 1.3096199096576955, "grad_norm": 0.5948154330253601, "learning_rate": 0.00029216730364581094, "loss": 2.2378, "step": 30500 }, { "epoch": 1.3310890885045428, "grad_norm": 0.7915855050086975, "learning_rate": 0.0002920384764031434, "loss": 2.2464, "step": 31000 }, { "epoch": 1.3525582673513903, "grad_norm": 0.6043704152107239, "learning_rate": 0.0002919096491604758, "loss": 2.2421, "step": 31500 }, { "epoch": 1.3740274461982378, "grad_norm": 0.5474274158477783, "learning_rate": 0.0002917808219178082, "loss": 2.2507, "step": 32000 }, { "epoch": 1.3740274461982378, "eval_loss": 2.9269840717315674, "eval_runtime": 174.9446, "eval_samples_per_second": 14.29, "eval_steps_per_second": 3.573, "step": 32000 }, { "epoch": 1.3954966250450853, "grad_norm": 0.5420586466789246, "learning_rate": 0.0002916519946751406, "loss": 2.2405, "step": 32500 }, { "epoch": 1.4169658038919328, "grad_norm": 0.4751032888889313, "learning_rate": 0.000291523167432473, "loss": 2.2497, "step": 33000 }, { "epoch": 1.4384349827387801, "grad_norm": 0.5793635249137878, "learning_rate": 0.00029139434018980547, "loss": 2.2448, "step": 33500 }, { "epoch": 1.4599041615856276, "grad_norm": 0.6635434031486511, "learning_rate": 0.00029126551294713786, "loss": 2.2434, "step": 34000 }, { "epoch": 1.4813733404324751, "grad_norm": 0.5708619356155396, "learning_rate": 0.0002911366857044703, "loss": 2.2343, "step": 34500 }, { "epoch": 1.5028425192793224, "grad_norm": 0.5989744067192078, "learning_rate": 0.0002910078584618027, "loss": 2.2388, "step": 35000 }, { "epoch": 1.5243116981261702, "grad_norm": 0.746486246585846, "learning_rate": 0.0002908790312191351, "loss": 2.2484, "step": 35500 }, { "epoch": 1.5457808769730175, "grad_norm": 0.6059302687644958, "learning_rate": 0.00029075020397646755, "loss": 2.2409, "step": 36000 }, { "epoch": 1.5457808769730175, "eval_loss": 2.918299913406372, "eval_runtime": 171.3628, "eval_samples_per_second": 14.589, "eval_steps_per_second": 3.647, "step": 36000 }, { "epoch": 1.567250055819865, "grad_norm": 0.5767127871513367, "learning_rate": 0.00029062137673379994, "loss": 2.2459, "step": 36500 }, { "epoch": 1.5887192346667125, "grad_norm": 0.6815518736839294, "learning_rate": 0.0002904925494911324, "loss": 2.2565, "step": 37000 }, { "epoch": 1.6101884135135598, "grad_norm": 0.6565374732017517, "learning_rate": 0.0002903637222484648, "loss": 2.2388, "step": 37500 }, { "epoch": 1.6316575923604075, "grad_norm": 0.6622541546821594, "learning_rate": 0.0002902348950057972, "loss": 2.261, "step": 38000 }, { "epoch": 1.6531267712072548, "grad_norm": 0.8162985444068909, "learning_rate": 0.0002901060677631296, "loss": 2.2495, "step": 38500 }, { "epoch": 1.6745959500541023, "grad_norm": 0.5659546852111816, "learning_rate": 0.000289977240520462, "loss": 2.2385, "step": 39000 }, { "epoch": 1.6960651289009498, "grad_norm": 0.5625469088554382, "learning_rate": 0.00028984841327779447, "loss": 2.2372, "step": 39500 }, { "epoch": 1.7175343077477971, "grad_norm": 0.5423092842102051, "learning_rate": 0.00028971958603512686, "loss": 2.2424, "step": 40000 }, { "epoch": 1.7175343077477971, "eval_loss": 2.92061710357666, "eval_runtime": 172.1102, "eval_samples_per_second": 14.526, "eval_steps_per_second": 3.631, "step": 40000 }, { "epoch": 1.7390034865946449, "grad_norm": 0.7644880414009094, "learning_rate": 0.0002895907587924593, "loss": 2.2368, "step": 40500 }, { "epoch": 1.7604726654414922, "grad_norm": 0.8192068934440613, "learning_rate": 0.0002894619315497917, "loss": 2.2357, "step": 41000 }, { "epoch": 1.7819418442883397, "grad_norm": 0.6234991550445557, "learning_rate": 0.0002893331043071241, "loss": 2.2418, "step": 41500 }, { "epoch": 1.8034110231351872, "grad_norm": 0.5751623511314392, "learning_rate": 0.00028920427706445655, "loss": 2.2413, "step": 42000 }, { "epoch": 1.8248802019820345, "grad_norm": 0.8999291062355042, "learning_rate": 0.00028907544982178894, "loss": 2.2356, "step": 42500 }, { "epoch": 1.846349380828882, "grad_norm": 0.7696816325187683, "learning_rate": 0.00028894662257912133, "loss": 2.2427, "step": 43000 }, { "epoch": 1.8678185596757295, "grad_norm": 0.6660240292549133, "learning_rate": 0.0002888177953364538, "loss": 2.2507, "step": 43500 }, { "epoch": 1.889287738522577, "grad_norm": 0.6106180548667908, "learning_rate": 0.00028868896809378623, "loss": 2.2428, "step": 44000 }, { "epoch": 1.889287738522577, "eval_loss": 2.9116756916046143, "eval_runtime": 172.218, "eval_samples_per_second": 14.516, "eval_steps_per_second": 3.629, "step": 44000 }, { "epoch": 1.9107569173694245, "grad_norm": 0.6366661190986633, "learning_rate": 0.0002885601408511186, "loss": 2.2408, "step": 44500 }, { "epoch": 1.9322260962162718, "grad_norm": 0.7893187403678894, "learning_rate": 0.000288431313608451, "loss": 2.2369, "step": 45000 }, { "epoch": 1.9536952750631194, "grad_norm": 0.633651077747345, "learning_rate": 0.00028830248636578347, "loss": 2.2431, "step": 45500 }, { "epoch": 1.9751644539099669, "grad_norm": 0.7481298446655273, "learning_rate": 0.00028817365912311586, "loss": 2.2371, "step": 46000 }, { "epoch": 1.9966336327568142, "grad_norm": 0.596591055393219, "learning_rate": 0.00028804483188044826, "loss": 2.2358, "step": 46500 }, { "epoch": 2.018102811603662, "grad_norm": 0.7450771927833557, "learning_rate": 0.0002879160046377807, "loss": 2.2238, "step": 47000 }, { "epoch": 2.039571990450509, "grad_norm": 0.6886998414993286, "learning_rate": 0.00028778717739511315, "loss": 2.218, "step": 47500 }, { "epoch": 2.061041169297357, "grad_norm": 0.5555692911148071, "learning_rate": 0.00028765835015244555, "loss": 2.2151, "step": 48000 }, { "epoch": 2.061041169297357, "eval_loss": 2.909609317779541, "eval_runtime": 175.1711, "eval_samples_per_second": 14.272, "eval_steps_per_second": 3.568, "step": 48000 }, { "epoch": 2.082510348144204, "grad_norm": 0.7904226183891296, "learning_rate": 0.00028752952290977794, "loss": 2.2271, "step": 48500 }, { "epoch": 2.1039795269910515, "grad_norm": 0.8151206374168396, "learning_rate": 0.0002874006956671104, "loss": 2.2104, "step": 49000 }, { "epoch": 2.1254487058378992, "grad_norm": 0.6942662596702576, "learning_rate": 0.0002872718684244428, "loss": 2.2197, "step": 49500 }, { "epoch": 2.1469178846847465, "grad_norm": 0.6846303939819336, "learning_rate": 0.0002871430411817752, "loss": 2.2286, "step": 50000 }, { "epoch": 2.1683870635315943, "grad_norm": 0.8961315155029297, "learning_rate": 0.0002870142139391076, "loss": 2.2329, "step": 50500 }, { "epoch": 2.1898562423784416, "grad_norm": 0.5635807514190674, "learning_rate": 0.0002868853866964401, "loss": 2.2216, "step": 51000 }, { "epoch": 2.211325421225289, "grad_norm": 0.6454870700836182, "learning_rate": 0.00028675655945377247, "loss": 2.2119, "step": 51500 }, { "epoch": 2.2327946000721366, "grad_norm": 0.5770113468170166, "learning_rate": 0.00028662773221110486, "loss": 2.2216, "step": 52000 }, { "epoch": 2.2327946000721366, "eval_loss": 2.9131178855895996, "eval_runtime": 170.3262, "eval_samples_per_second": 14.678, "eval_steps_per_second": 3.669, "step": 52000 }, { "epoch": 2.254263778918984, "grad_norm": 0.6589009165763855, "learning_rate": 0.0002864989049684373, "loss": 2.2237, "step": 52500 }, { "epoch": 2.2757329577658316, "grad_norm": 0.6595714688301086, "learning_rate": 0.0002863700777257697, "loss": 2.2331, "step": 53000 }, { "epoch": 2.297202136612679, "grad_norm": 0.5528385639190674, "learning_rate": 0.0002862412504831021, "loss": 2.2316, "step": 53500 }, { "epoch": 2.318671315459526, "grad_norm": 0.6706179976463318, "learning_rate": 0.00028611242324043455, "loss": 2.2312, "step": 54000 }, { "epoch": 2.340140494306374, "grad_norm": 0.6599323153495789, "learning_rate": 0.000285983595997767, "loss": 2.2228, "step": 54500 }, { "epoch": 2.3616096731532212, "grad_norm": 0.7218915820121765, "learning_rate": 0.0002858547687550994, "loss": 2.2269, "step": 55000 }, { "epoch": 2.3830788520000685, "grad_norm": 0.6501777768135071, "learning_rate": 0.0002857259415124318, "loss": 2.225, "step": 55500 }, { "epoch": 2.4045480308469163, "grad_norm": 0.6774255037307739, "learning_rate": 0.00028559711426976423, "loss": 2.2193, "step": 56000 }, { "epoch": 2.4045480308469163, "eval_loss": 2.90551495552063, "eval_runtime": 173.4554, "eval_samples_per_second": 14.413, "eval_steps_per_second": 3.603, "step": 56000 }, { "epoch": 2.4260172096937636, "grad_norm": 0.667073130607605, "learning_rate": 0.00028546828702709663, "loss": 2.2228, "step": 56500 }, { "epoch": 2.4474863885406113, "grad_norm": 0.776077389717102, "learning_rate": 0.000285339459784429, "loss": 2.226, "step": 57000 }, { "epoch": 2.4689555673874586, "grad_norm": 0.7873576879501343, "learning_rate": 0.00028521063254176147, "loss": 2.2219, "step": 57500 }, { "epoch": 2.490424746234306, "grad_norm": 0.7621210813522339, "learning_rate": 0.0002850818052990939, "loss": 2.2226, "step": 58000 }, { "epoch": 2.5118939250811536, "grad_norm": 0.774750828742981, "learning_rate": 0.0002849529780564263, "loss": 2.2328, "step": 58500 }, { "epoch": 2.533363103928001, "grad_norm": 0.707665205001831, "learning_rate": 0.0002848241508137587, "loss": 2.2342, "step": 59000 }, { "epoch": 2.554832282774848, "grad_norm": 0.7524703741073608, "learning_rate": 0.00028469532357109116, "loss": 2.2296, "step": 59500 }, { "epoch": 2.576301461621696, "grad_norm": 0.6186488270759583, "learning_rate": 0.00028456649632842355, "loss": 2.2282, "step": 60000 }, { "epoch": 2.576301461621696, "eval_loss": 2.907226085662842, "eval_runtime": 176.429, "eval_samples_per_second": 14.17, "eval_steps_per_second": 3.543, "step": 60000 }, { "epoch": 2.5977706404685432, "grad_norm": 0.6811486482620239, "learning_rate": 0.00028443766908575594, "loss": 2.2357, "step": 60500 }, { "epoch": 2.619239819315391, "grad_norm": 0.7401767373085022, "learning_rate": 0.0002843088418430884, "loss": 2.2264, "step": 61000 }, { "epoch": 2.6407089981622383, "grad_norm": 0.6240813136100769, "learning_rate": 0.00028418001460042084, "loss": 2.2402, "step": 61500 }, { "epoch": 2.6621781770090855, "grad_norm": 0.6217384338378906, "learning_rate": 0.00028405118735775324, "loss": 2.2294, "step": 62000 }, { "epoch": 2.6836473558559333, "grad_norm": 0.5563312768936157, "learning_rate": 0.00028392236011508563, "loss": 2.2344, "step": 62500 }, { "epoch": 2.7051165347027806, "grad_norm": 0.7275550961494446, "learning_rate": 0.0002837935328724181, "loss": 2.2233, "step": 63000 }, { "epoch": 2.7265857135496283, "grad_norm": 0.6657426953315735, "learning_rate": 0.00028366470562975047, "loss": 2.2269, "step": 63500 }, { "epoch": 2.7480548923964756, "grad_norm": 0.5833483934402466, "learning_rate": 0.0002835358783870829, "loss": 2.2234, "step": 64000 }, { "epoch": 2.7480548923964756, "eval_loss": 2.8960378170013428, "eval_runtime": 165.7901, "eval_samples_per_second": 15.079, "eval_steps_per_second": 3.77, "step": 64000 } ], "logging_steps": 500, "max_steps": 1164450, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 8000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.740828282336215e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }