{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2578875171467764, "eval_steps": 500, "global_step": 94, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0027434842249657062, "grad_norm": 0.3216973543167114, "learning_rate": 2.0000000000000003e-06, "loss": 2.039, "step": 1 }, { "epoch": 0.0054869684499314125, "grad_norm": 0.3218615651130676, "learning_rate": 4.000000000000001e-06, "loss": 2.063, "step": 2 }, { "epoch": 0.00823045267489712, "grad_norm": 0.3071180582046509, "learning_rate": 6e-06, "loss": 2.0298, "step": 3 }, { "epoch": 0.010973936899862825, "grad_norm": 0.30708086490631104, "learning_rate": 8.000000000000001e-06, "loss": 1.9489, "step": 4 }, { "epoch": 0.013717421124828532, "grad_norm": 0.31064078211784363, "learning_rate": 1e-05, "loss": 2.0401, "step": 5 }, { "epoch": 0.01646090534979424, "grad_norm": 0.35925883054733276, "learning_rate": 1.2e-05, "loss": 2.0963, "step": 6 }, { "epoch": 0.019204389574759947, "grad_norm": 0.3267571032047272, "learning_rate": 1.4000000000000001e-05, "loss": 2.0252, "step": 7 }, { "epoch": 0.02194787379972565, "grad_norm": 0.30728623270988464, "learning_rate": 1.6000000000000003e-05, "loss": 2.0108, "step": 8 }, { "epoch": 0.024691358024691357, "grad_norm": 0.2928607761859894, "learning_rate": 1.8e-05, "loss": 1.9542, "step": 9 }, { "epoch": 0.027434842249657063, "grad_norm": 0.30577352643013, "learning_rate": 2e-05, "loss": 2.017, "step": 10 }, { "epoch": 0.03017832647462277, "grad_norm": 0.3024803102016449, "learning_rate": 2.2000000000000003e-05, "loss": 1.982, "step": 11 }, { "epoch": 0.03292181069958848, "grad_norm": 0.28839072585105896, "learning_rate": 2.4e-05, "loss": 1.9217, "step": 12 }, { "epoch": 0.03566529492455418, "grad_norm": 0.2843893766403198, "learning_rate": 2.6000000000000002e-05, "loss": 1.9608, "step": 13 }, { "epoch": 0.038408779149519894, "grad_norm": 0.2703002095222473, "learning_rate": 2.8000000000000003e-05, "loss": 1.9696, "step": 14 }, { "epoch": 0.0411522633744856, "grad_norm": 0.24636265635490417, "learning_rate": 3e-05, "loss": 1.8818, "step": 15 }, { "epoch": 0.0438957475994513, "grad_norm": 0.2405432015657425, "learning_rate": 3.2000000000000005e-05, "loss": 1.975, "step": 16 }, { "epoch": 0.04663923182441701, "grad_norm": 0.24582137167453766, "learning_rate": 3.4000000000000007e-05, "loss": 1.923, "step": 17 }, { "epoch": 0.04938271604938271, "grad_norm": 0.2504767179489136, "learning_rate": 3.6e-05, "loss": 1.9781, "step": 18 }, { "epoch": 0.05212620027434842, "grad_norm": 0.2394665777683258, "learning_rate": 3.8e-05, "loss": 1.8959, "step": 19 }, { "epoch": 0.05486968449931413, "grad_norm": 0.24969030916690826, "learning_rate": 4e-05, "loss": 1.855, "step": 20 }, { "epoch": 0.05761316872427984, "grad_norm": 0.2694351077079773, "learning_rate": 4.2e-05, "loss": 1.9602, "step": 21 }, { "epoch": 0.06035665294924554, "grad_norm": 0.25622957944869995, "learning_rate": 4.4000000000000006e-05, "loss": 1.8208, "step": 22 }, { "epoch": 0.06310013717421124, "grad_norm": 0.24535588920116425, "learning_rate": 4.600000000000001e-05, "loss": 1.7967, "step": 23 }, { "epoch": 0.06584362139917696, "grad_norm": 0.2737885117530823, "learning_rate": 4.8e-05, "loss": 1.7841, "step": 24 }, { "epoch": 0.06858710562414266, "grad_norm": 0.2646300196647644, "learning_rate": 5e-05, "loss": 1.7744, "step": 25 }, { "epoch": 0.07133058984910837, "grad_norm": 0.2676407992839813, "learning_rate": 5.2000000000000004e-05, "loss": 1.7359, "step": 26 
}, { "epoch": 0.07407407407407407, "grad_norm": 0.2649776041507721, "learning_rate": 5.4000000000000005e-05, "loss": 1.7205, "step": 27 }, { "epoch": 0.07681755829903979, "grad_norm": 0.296818345785141, "learning_rate": 5.6000000000000006e-05, "loss": 1.696, "step": 28 }, { "epoch": 0.07956104252400549, "grad_norm": 0.31905728578567505, "learning_rate": 5.8e-05, "loss": 1.7261, "step": 29 }, { "epoch": 0.0823045267489712, "grad_norm": 0.4174517095088959, "learning_rate": 6e-05, "loss": 1.6451, "step": 30 }, { "epoch": 0.0850480109739369, "grad_norm": 0.4545894265174866, "learning_rate": 6.2e-05, "loss": 1.5867, "step": 31 }, { "epoch": 0.0877914951989026, "grad_norm": 0.45722702145576477, "learning_rate": 6.400000000000001e-05, "loss": 1.5184, "step": 32 }, { "epoch": 0.09053497942386832, "grad_norm": 0.4953472316265106, "learning_rate": 6.6e-05, "loss": 1.4793, "step": 33 }, { "epoch": 0.09327846364883402, "grad_norm": 0.5516601800918579, "learning_rate": 6.800000000000001e-05, "loss": 1.4967, "step": 34 }, { "epoch": 0.09602194787379972, "grad_norm": 0.5295405983924866, "learning_rate": 7e-05, "loss": 1.4445, "step": 35 }, { "epoch": 0.09876543209876543, "grad_norm": 0.3918333351612091, "learning_rate": 7.2e-05, "loss": 1.3956, "step": 36 }, { "epoch": 0.10150891632373114, "grad_norm": 0.4032560884952545, "learning_rate": 7.4e-05, "loss": 1.3773, "step": 37 }, { "epoch": 0.10425240054869685, "grad_norm": 0.30622419714927673, "learning_rate": 7.6e-05, "loss": 1.2721, "step": 38 }, { "epoch": 0.10699588477366255, "grad_norm": 0.2740858495235443, "learning_rate": 7.800000000000001e-05, "loss": 1.3173, "step": 39 }, { "epoch": 0.10973936899862825, "grad_norm": 0.19876384735107422, "learning_rate": 8e-05, "loss": 1.388, "step": 40 }, { "epoch": 0.11248285322359397, "grad_norm": 0.20184342563152313, "learning_rate": 8.2e-05, "loss": 1.2856, "step": 41 }, { "epoch": 0.11522633744855967, "grad_norm": 0.1708114743232727, "learning_rate": 8.4e-05, "loss": 1.3209, "step": 42 }, { "epoch": 0.11796982167352538, "grad_norm": 0.1288367211818695, "learning_rate": 8.6e-05, "loss": 1.2782, "step": 43 }, { "epoch": 0.12071330589849108, "grad_norm": 0.12842676043510437, "learning_rate": 8.800000000000001e-05, "loss": 1.3378, "step": 44 }, { "epoch": 0.12345679012345678, "grad_norm": 0.12385573983192444, "learning_rate": 9e-05, "loss": 1.2032, "step": 45 }, { "epoch": 0.1262002743484225, "grad_norm": 0.12317045032978058, "learning_rate": 9.200000000000001e-05, "loss": 1.3391, "step": 46 }, { "epoch": 0.1289437585733882, "grad_norm": 0.126034677028656, "learning_rate": 9.4e-05, "loss": 1.2907, "step": 47 }, { "epoch": 0.13168724279835392, "grad_norm": 0.1402864307165146, "learning_rate": 9.6e-05, "loss": 1.2058, "step": 48 }, { "epoch": 0.13443072702331962, "grad_norm": 0.13603994250297546, "learning_rate": 9.8e-05, "loss": 1.2651, "step": 49 }, { "epoch": 0.13717421124828533, "grad_norm": 0.13384652137756348, "learning_rate": 0.0001, "loss": 1.3055, "step": 50 }, { "epoch": 0.13991769547325103, "grad_norm": 0.12770739197731018, "learning_rate": 0.00010200000000000001, "loss": 1.2601, "step": 51 }, { "epoch": 0.14266117969821673, "grad_norm": 0.13551487028598785, "learning_rate": 0.00010400000000000001, "loss": 1.2371, "step": 52 }, { "epoch": 0.14540466392318244, "grad_norm": 0.14969402551651, "learning_rate": 0.00010600000000000002, "loss": 1.1847, "step": 53 }, { "epoch": 0.14814814814814814, "grad_norm": 0.15096262097358704, "learning_rate": 0.00010800000000000001, "loss": 1.2237, "step": 54 }, { "epoch": 
0.15089163237311384, "grad_norm": 0.167573019862175, "learning_rate": 0.00011000000000000002, "loss": 1.268, "step": 55 }, { "epoch": 0.15363511659807957, "grad_norm": 0.16625213623046875, "learning_rate": 0.00011200000000000001, "loss": 1.23, "step": 56 }, { "epoch": 0.15637860082304528, "grad_norm": 0.17408426105976105, "learning_rate": 0.00011399999999999999, "loss": 1.2036, "step": 57 }, { "epoch": 0.15912208504801098, "grad_norm": 0.19839078187942505, "learning_rate": 0.000116, "loss": 1.1494, "step": 58 }, { "epoch": 0.16186556927297668, "grad_norm": 0.1805109679698944, "learning_rate": 0.000118, "loss": 1.2177, "step": 59 }, { "epoch": 0.1646090534979424, "grad_norm": 0.19576989114284515, "learning_rate": 0.00012, "loss": 1.2494, "step": 60 }, { "epoch": 0.1673525377229081, "grad_norm": 0.21191267669200897, "learning_rate": 0.000122, "loss": 1.2526, "step": 61 }, { "epoch": 0.1700960219478738, "grad_norm": 0.20439420640468597, "learning_rate": 0.000124, "loss": 1.193, "step": 62 }, { "epoch": 0.1728395061728395, "grad_norm": 0.1174619197845459, "learning_rate": 0.000126, "loss": 1.1726, "step": 63 }, { "epoch": 0.1755829903978052, "grad_norm": 0.21233001351356506, "learning_rate": 0.00012800000000000002, "loss": 1.1495, "step": 64 }, { "epoch": 0.17832647462277093, "grad_norm": 0.20875731110572815, "learning_rate": 0.00013000000000000002, "loss": 1.1956, "step": 65 }, { "epoch": 0.18106995884773663, "grad_norm": 0.17120110988616943, "learning_rate": 0.000132, "loss": 1.1818, "step": 66 }, { "epoch": 0.18381344307270234, "grad_norm": 0.1390346735715866, "learning_rate": 0.000134, "loss": 1.1771, "step": 67 }, { "epoch": 0.18655692729766804, "grad_norm": 0.09387281537055969, "learning_rate": 0.00013600000000000003, "loss": 1.1983, "step": 68 }, { "epoch": 0.18930041152263374, "grad_norm": 0.07457795739173889, "learning_rate": 0.000138, "loss": 1.1385, "step": 69 }, { "epoch": 0.19204389574759945, "grad_norm": 0.08271007239818573, "learning_rate": 0.00014, "loss": 1.127, "step": 70 }, { "epoch": 0.19478737997256515, "grad_norm": 0.07676747441291809, "learning_rate": 0.000142, "loss": 1.1444, "step": 71 }, { "epoch": 0.19753086419753085, "grad_norm": 0.0707523301243782, "learning_rate": 0.000144, "loss": 1.1589, "step": 72 }, { "epoch": 0.20027434842249658, "grad_norm": 0.0701480582356453, "learning_rate": 0.000146, "loss": 1.1631, "step": 73 }, { "epoch": 0.2030178326474623, "grad_norm": 0.07315018028020859, "learning_rate": 0.000148, "loss": 1.1915, "step": 74 }, { "epoch": 0.205761316872428, "grad_norm": 0.0682872086763382, "learning_rate": 0.00015000000000000001, "loss": 1.2017, "step": 75 }, { "epoch": 0.2085048010973937, "grad_norm": 0.07075867056846619, "learning_rate": 0.000152, "loss": 1.1562, "step": 76 }, { "epoch": 0.2112482853223594, "grad_norm": 0.06364033371210098, "learning_rate": 0.000154, "loss": 1.1936, "step": 77 }, { "epoch": 0.2139917695473251, "grad_norm": 0.06413716077804565, "learning_rate": 0.00015600000000000002, "loss": 1.1463, "step": 78 }, { "epoch": 0.2167352537722908, "grad_norm": 0.06316008418798447, "learning_rate": 0.00015800000000000002, "loss": 1.1975, "step": 79 }, { "epoch": 0.2194787379972565, "grad_norm": 0.06660479307174683, "learning_rate": 0.00016, "loss": 1.1684, "step": 80 }, { "epoch": 0.2222222222222222, "grad_norm": 0.06477335095405579, "learning_rate": 0.000162, "loss": 1.1851, "step": 81 }, { "epoch": 0.22496570644718794, "grad_norm": 0.0677405372262001, "learning_rate": 0.000164, "loss": 1.093, "step": 82 }, { "epoch": 
0.22770919067215364, "grad_norm": 0.06988447159528732, "learning_rate": 0.000166, "loss": 1.2542, "step": 83 }, { "epoch": 0.23045267489711935, "grad_norm": 0.06364695727825165, "learning_rate": 0.000168, "loss": 1.0451, "step": 84 }, { "epoch": 0.23319615912208505, "grad_norm": 0.06621105968952179, "learning_rate": 0.00017, "loss": 1.1022, "step": 85 }, { "epoch": 0.23593964334705075, "grad_norm": 0.06702058762311935, "learning_rate": 0.000172, "loss": 1.1803, "step": 86 }, { "epoch": 0.23868312757201646, "grad_norm": 0.06412065774202347, "learning_rate": 0.000174, "loss": 1.1867, "step": 87 }, { "epoch": 0.24142661179698216, "grad_norm": 0.06154424324631691, "learning_rate": 0.00017600000000000002, "loss": 1.2207, "step": 88 }, { "epoch": 0.24417009602194786, "grad_norm": 0.06396840512752533, "learning_rate": 0.00017800000000000002, "loss": 1.1466, "step": 89 }, { "epoch": 0.24691358024691357, "grad_norm": 0.06146432086825371, "learning_rate": 0.00018, "loss": 1.1748, "step": 90 }, { "epoch": 0.2496570644718793, "grad_norm": 0.06494217365980148, "learning_rate": 0.000182, "loss": 1.1481, "step": 91 }, { "epoch": 0.252400548696845, "grad_norm": 0.06837109476327896, "learning_rate": 0.00018400000000000003, "loss": 1.2352, "step": 92 }, { "epoch": 0.2551440329218107, "grad_norm": 0.06343371421098709, "learning_rate": 0.00018600000000000002, "loss": 1.1362, "step": 93 }, { "epoch": 0.2578875171467764, "grad_norm": 0.06184321269392967, "learning_rate": 0.000188, "loss": 1.1256, "step": 94 } ], "logging_steps": 1, "max_steps": 364, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.6971271019115315e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }