|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.2578875171467764, |
|
"eval_steps": 500, |
|
"global_step": 94, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0027434842249657062, |
|
"grad_norm": 0.3216973543167114, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.039, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0054869684499314125, |
|
"grad_norm": 0.3218615651130676, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.063, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00823045267489712, |
|
"grad_norm": 0.3071180582046509, |
|
"learning_rate": 6e-06, |
|
"loss": 2.0298, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.010973936899862825, |
|
"grad_norm": 0.30708086490631104, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.9489, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.013717421124828532, |
|
"grad_norm": 0.31064078211784363, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0401, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01646090534979424, |
|
"grad_norm": 0.35925883054733276, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.0963, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.019204389574759947, |
|
"grad_norm": 0.3267571032047272, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 2.0252, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02194787379972565, |
|
"grad_norm": 0.30728623270988464, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 2.0108, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.024691358024691357, |
|
"grad_norm": 0.2928607761859894, |
|
"learning_rate": 1.8e-05, |
|
"loss": 1.9542, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.027434842249657063, |
|
"grad_norm": 0.30577352643013, |
|
"learning_rate": 2e-05, |
|
"loss": 2.017, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03017832647462277, |
|
"grad_norm": 0.3024803102016449, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 1.982, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03292181069958848, |
|
"grad_norm": 0.28839072585105896, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.9217, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03566529492455418, |
|
"grad_norm": 0.2843893766403198, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 1.9608, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.038408779149519894, |
|
"grad_norm": 0.2703002095222473, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 1.9696, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0411522633744856, |
|
"grad_norm": 0.24636265635490417, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8818, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0438957475994513, |
|
"grad_norm": 0.2405432015657425, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 1.975, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04663923182441701, |
|
"grad_norm": 0.24582137167453766, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 1.923, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04938271604938271, |
|
"grad_norm": 0.2504767179489136, |
|
"learning_rate": 3.6e-05, |
|
"loss": 1.9781, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.05212620027434842, |
|
"grad_norm": 0.2394665777683258, |
|
"learning_rate": 3.8e-05, |
|
"loss": 1.8959, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05486968449931413, |
|
"grad_norm": 0.24969030916690826, |
|
"learning_rate": 4e-05, |
|
"loss": 1.855, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05761316872427984, |
|
"grad_norm": 0.2694351077079773, |
|
"learning_rate": 4.2e-05, |
|
"loss": 1.9602, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.06035665294924554, |
|
"grad_norm": 0.25622957944869995, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 1.8208, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.06310013717421124, |
|
"grad_norm": 0.24535588920116425, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 1.7967, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06584362139917696, |
|
"grad_norm": 0.2737885117530823, |
|
"learning_rate": 4.8e-05, |
|
"loss": 1.7841, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06858710562414266, |
|
"grad_norm": 0.2646300196647644, |
|
"learning_rate": 5e-05, |
|
"loss": 1.7744, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.07133058984910837, |
|
"grad_norm": 0.2676407992839813, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 1.7359, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07407407407407407, |
|
"grad_norm": 0.2649776041507721, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 1.7205, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07681755829903979, |
|
"grad_norm": 0.296818345785141, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 1.696, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07956104252400549, |
|
"grad_norm": 0.31905728578567505, |
|
"learning_rate": 5.8e-05, |
|
"loss": 1.7261, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0823045267489712, |
|
"grad_norm": 0.4174517095088959, |
|
"learning_rate": 6e-05, |
|
"loss": 1.6451, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0850480109739369, |
|
"grad_norm": 0.4545894265174866, |
|
"learning_rate": 6.2e-05, |
|
"loss": 1.5867, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0877914951989026, |
|
"grad_norm": 0.45722702145576477, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 1.5184, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.09053497942386832, |
|
"grad_norm": 0.4953472316265106, |
|
"learning_rate": 6.6e-05, |
|
"loss": 1.4793, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.09327846364883402, |
|
"grad_norm": 0.5516601800918579, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 1.4967, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.09602194787379972, |
|
"grad_norm": 0.5295405983924866, |
|
"learning_rate": 7e-05, |
|
"loss": 1.4445, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09876543209876543, |
|
"grad_norm": 0.3918333351612091, |
|
"learning_rate": 7.2e-05, |
|
"loss": 1.3956, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.10150891632373114, |
|
"grad_norm": 0.4032560884952545, |
|
"learning_rate": 7.4e-05, |
|
"loss": 1.3773, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.10425240054869685, |
|
"grad_norm": 0.30622419714927673, |
|
"learning_rate": 7.6e-05, |
|
"loss": 1.2721, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10699588477366255, |
|
"grad_norm": 0.2740858495235443, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 1.3173, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10973936899862825, |
|
"grad_norm": 0.19876384735107422, |
|
"learning_rate": 8e-05, |
|
"loss": 1.388, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11248285322359397, |
|
"grad_norm": 0.20184342563152313, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.2856, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.11522633744855967, |
|
"grad_norm": 0.1708114743232727, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.3209, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.11796982167352538, |
|
"grad_norm": 0.1288367211818695, |
|
"learning_rate": 8.6e-05, |
|
"loss": 1.2782, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.12071330589849108, |
|
"grad_norm": 0.12842676043510437, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 1.3378, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.12345679012345678, |
|
"grad_norm": 0.12385573983192444, |
|
"learning_rate": 9e-05, |
|
"loss": 1.2032, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1262002743484225, |
|
"grad_norm": 0.12317045032978058, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 1.3391, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.1289437585733882, |
|
"grad_norm": 0.126034677028656, |
|
"learning_rate": 9.4e-05, |
|
"loss": 1.2907, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.13168724279835392, |
|
"grad_norm": 0.1402864307165146, |
|
"learning_rate": 9.6e-05, |
|
"loss": 1.2058, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.13443072702331962, |
|
"grad_norm": 0.13603994250297546, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.2651, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.13717421124828533, |
|
"grad_norm": 0.13384652137756348, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3055, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13991769547325103, |
|
"grad_norm": 0.12770739197731018, |
|
"learning_rate": 0.00010200000000000001, |
|
"loss": 1.2601, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.14266117969821673, |
|
"grad_norm": 0.13551487028598785, |
|
"learning_rate": 0.00010400000000000001, |
|
"loss": 1.2371, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.14540466392318244, |
|
"grad_norm": 0.14969402551651, |
|
"learning_rate": 0.00010600000000000002, |
|
"loss": 1.1847, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.15096262097358704, |
|
"learning_rate": 0.00010800000000000001, |
|
"loss": 1.2237, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.15089163237311384, |
|
"grad_norm": 0.167573019862175, |
|
"learning_rate": 0.00011000000000000002, |
|
"loss": 1.268, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.15363511659807957, |
|
"grad_norm": 0.16625213623046875, |
|
"learning_rate": 0.00011200000000000001, |
|
"loss": 1.23, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.15637860082304528, |
|
"grad_norm": 0.17408426105976105, |
|
"learning_rate": 0.00011399999999999999, |
|
"loss": 1.2036, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.15912208504801098, |
|
"grad_norm": 0.19839078187942505, |
|
"learning_rate": 0.000116, |
|
"loss": 1.1494, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.16186556927297668, |
|
"grad_norm": 0.1805109679698944, |
|
"learning_rate": 0.000118, |
|
"loss": 1.2177, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.1646090534979424, |
|
"grad_norm": 0.19576989114284515, |
|
"learning_rate": 0.00012, |
|
"loss": 1.2494, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1673525377229081, |
|
"grad_norm": 0.21191267669200897, |
|
"learning_rate": 0.000122, |
|
"loss": 1.2526, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.1700960219478738, |
|
"grad_norm": 0.20439420640468597, |
|
"learning_rate": 0.000124, |
|
"loss": 1.193, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1728395061728395, |
|
"grad_norm": 0.1174619197845459, |
|
"learning_rate": 0.000126, |
|
"loss": 1.1726, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.1755829903978052, |
|
"grad_norm": 0.21233001351356506, |
|
"learning_rate": 0.00012800000000000002, |
|
"loss": 1.1495, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.17832647462277093, |
|
"grad_norm": 0.20875731110572815, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 1.1956, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.18106995884773663, |
|
"grad_norm": 0.17120110988616943, |
|
"learning_rate": 0.000132, |
|
"loss": 1.1818, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.18381344307270234, |
|
"grad_norm": 0.1390346735715866, |
|
"learning_rate": 0.000134, |
|
"loss": 1.1771, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.18655692729766804, |
|
"grad_norm": 0.09387281537055969, |
|
"learning_rate": 0.00013600000000000003, |
|
"loss": 1.1983, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.18930041152263374, |
|
"grad_norm": 0.07457795739173889, |
|
"learning_rate": 0.000138, |
|
"loss": 1.1385, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.19204389574759945, |
|
"grad_norm": 0.08271007239818573, |
|
"learning_rate": 0.00014, |
|
"loss": 1.127, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.19478737997256515, |
|
"grad_norm": 0.07676747441291809, |
|
"learning_rate": 0.000142, |
|
"loss": 1.1444, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.19753086419753085, |
|
"grad_norm": 0.0707523301243782, |
|
"learning_rate": 0.000144, |
|
"loss": 1.1589, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.20027434842249658, |
|
"grad_norm": 0.0701480582356453, |
|
"learning_rate": 0.000146, |
|
"loss": 1.1631, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.2030178326474623, |
|
"grad_norm": 0.07315018028020859, |
|
"learning_rate": 0.000148, |
|
"loss": 1.1915, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.205761316872428, |
|
"grad_norm": 0.0682872086763382, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.2017, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2085048010973937, |
|
"grad_norm": 0.07075867056846619, |
|
"learning_rate": 0.000152, |
|
"loss": 1.1562, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.2112482853223594, |
|
"grad_norm": 0.06364033371210098, |
|
"learning_rate": 0.000154, |
|
"loss": 1.1936, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.2139917695473251, |
|
"grad_norm": 0.06413716077804565, |
|
"learning_rate": 0.00015600000000000002, |
|
"loss": 1.1463, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.2167352537722908, |
|
"grad_norm": 0.06316008418798447, |
|
"learning_rate": 0.00015800000000000002, |
|
"loss": 1.1975, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.2194787379972565, |
|
"grad_norm": 0.06660479307174683, |
|
"learning_rate": 0.00016, |
|
"loss": 1.1684, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 0.06477335095405579, |
|
"learning_rate": 0.000162, |
|
"loss": 1.1851, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.22496570644718794, |
|
"grad_norm": 0.0677405372262001, |
|
"learning_rate": 0.000164, |
|
"loss": 1.093, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.22770919067215364, |
|
"grad_norm": 0.06988447159528732, |
|
"learning_rate": 0.000166, |
|
"loss": 1.2542, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.23045267489711935, |
|
"grad_norm": 0.06364695727825165, |
|
"learning_rate": 0.000168, |
|
"loss": 1.0451, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.23319615912208505, |
|
"grad_norm": 0.06621105968952179, |
|
"learning_rate": 0.00017, |
|
"loss": 1.1022, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.23593964334705075, |
|
"grad_norm": 0.06702058762311935, |
|
"learning_rate": 0.000172, |
|
"loss": 1.1803, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.23868312757201646, |
|
"grad_norm": 0.06412065774202347, |
|
"learning_rate": 0.000174, |
|
"loss": 1.1867, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.24142661179698216, |
|
"grad_norm": 0.06154424324631691, |
|
"learning_rate": 0.00017600000000000002, |
|
"loss": 1.2207, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.24417009602194786, |
|
"grad_norm": 0.06396840512752533, |
|
"learning_rate": 0.00017800000000000002, |
|
"loss": 1.1466, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.24691358024691357, |
|
"grad_norm": 0.06146432086825371, |
|
"learning_rate": 0.00018, |
|
"loss": 1.1748, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2496570644718793, |
|
"grad_norm": 0.06494217365980148, |
|
"learning_rate": 0.000182, |
|
"loss": 1.1481, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.252400548696845, |
|
"grad_norm": 0.06837109476327896, |
|
"learning_rate": 0.00018400000000000003, |
|
"loss": 1.2352, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.2551440329218107, |
|
"grad_norm": 0.06343371421098709, |
|
"learning_rate": 0.00018600000000000002, |
|
"loss": 1.1362, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.2578875171467764, |
|
"grad_norm": 0.06184321269392967, |
|
"learning_rate": 0.000188, |
|
"loss": 1.1256, |
|
"step": 94 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 364, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.6971271019115315e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|