{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 24435, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.061387354205033766, "grad_norm": 4.773414134979248, "learning_rate": 2.9386126457949665e-05, "loss": 0.7901, "step": 500 }, { "epoch": 0.12277470841006753, "grad_norm": 5.613208293914795, "learning_rate": 2.8772252915899326e-05, "loss": 0.686, "step": 1000 }, { "epoch": 0.1841620626151013, "grad_norm": 6.7994384765625, "learning_rate": 2.815837937384899e-05, "loss": 0.6822, "step": 1500 }, { "epoch": 0.24554941682013506, "grad_norm": 4.575761318206787, "learning_rate": 2.7544505831798647e-05, "loss": 0.6375, "step": 2000 }, { "epoch": 0.3069367710251688, "grad_norm": 5.669924736022949, "learning_rate": 2.693063228974831e-05, "loss": 0.6488, "step": 2500 }, { "epoch": 0.3683241252302026, "grad_norm": 5.164685249328613, "learning_rate": 2.6316758747697975e-05, "loss": 0.6232, "step": 3000 }, { "epoch": 0.42971147943523635, "grad_norm": 3.6371734142303467, "learning_rate": 2.5702885205647636e-05, "loss": 0.6478, "step": 3500 }, { "epoch": 0.4910988336402701, "grad_norm": 13.574466705322266, "learning_rate": 2.50890116635973e-05, "loss": 0.6271, "step": 4000 }, { "epoch": 0.5524861878453039, "grad_norm": 3.65866756439209, "learning_rate": 2.4475138121546964e-05, "loss": 0.6212, "step": 4500 }, { "epoch": 0.6138735420503376, "grad_norm": 7.22385835647583, "learning_rate": 2.3861264579496625e-05, "loss": 0.6126, "step": 5000 }, { "epoch": 0.6752608962553714, "grad_norm": 4.810811996459961, "learning_rate": 2.3247391037446286e-05, "loss": 0.6095, "step": 5500 }, { "epoch": 0.7366482504604052, "grad_norm": 4.843252182006836, "learning_rate": 2.2633517495395946e-05, "loss": 0.5984, "step": 6000 }, { "epoch": 0.7980356046654389, "grad_norm": 3.590651750564575, "learning_rate": 2.201964395334561e-05, "loss": 0.6056, "step": 6500 }, { "epoch": 0.8594229588704727, "grad_norm": 6.5436177253723145, "learning_rate": 2.1405770411295274e-05, "loss": 0.6149, "step": 7000 }, { "epoch": 0.9208103130755064, "grad_norm": 3.6206412315368652, "learning_rate": 2.0791896869244935e-05, "loss": 0.6191, "step": 7500 }, { "epoch": 0.9821976672805403, "grad_norm": 3.8409266471862793, "learning_rate": 2.01780233271946e-05, "loss": 0.5964, "step": 8000 }, { "epoch": 1.043585021485574, "grad_norm": 3.4344983100891113, "learning_rate": 1.9564149785144263e-05, "loss": 0.5774, "step": 8500 }, { "epoch": 1.1049723756906078, "grad_norm": 6.731381893157959, "learning_rate": 1.8950276243093924e-05, "loss": 0.5958, "step": 9000 }, { "epoch": 1.1663597298956414, "grad_norm": 3.6475465297698975, "learning_rate": 1.8336402701043585e-05, "loss": 0.5723, "step": 9500 }, { "epoch": 1.2277470841006752, "grad_norm": 5.497737884521484, "learning_rate": 1.7722529158993245e-05, "loss": 0.5592, "step": 10000 }, { "epoch": 1.289134438305709, "grad_norm": 3.4102916717529297, "learning_rate": 1.710865561694291e-05, "loss": 0.563, "step": 10500 }, { "epoch": 1.350521792510743, "grad_norm": 2.2644035816192627, "learning_rate": 1.6494782074892574e-05, "loss": 0.5677, "step": 11000 }, { "epoch": 1.4119091467157765, "grad_norm": 4.101683139801025, "learning_rate": 1.5880908532842234e-05, "loss": 0.5714, "step": 11500 }, { "epoch": 1.4732965009208103, "grad_norm": 3.3737809658050537, "learning_rate": 1.52670349907919e-05, "loss": 0.5716, "step": 12000 }, { "epoch": 1.5346838551258442, "grad_norm": 4.371752738952637, "learning_rate": 1.465316144874156e-05, "loss": 0.5706, "step": 12500 }, { "epoch": 1.5960712093308778, "grad_norm": 2.752619504928589, "learning_rate": 1.4039287906691222e-05, "loss": 0.576, "step": 13000 }, { "epoch": 1.6574585635359116, "grad_norm": 5.145851135253906, "learning_rate": 1.3425414364640886e-05, "loss": 0.5532, "step": 13500 }, { "epoch": 1.7188459177409454, "grad_norm": 3.320183515548706, "learning_rate": 1.2811540822590546e-05, "loss": 0.5496, "step": 14000 }, { "epoch": 1.780233271945979, "grad_norm": 8.330361366271973, "learning_rate": 1.2197667280540209e-05, "loss": 0.5785, "step": 14500 }, { "epoch": 1.8416206261510129, "grad_norm": 5.743785381317139, "learning_rate": 1.1583793738489871e-05, "loss": 0.5484, "step": 15000 }, { "epoch": 1.9030079803560467, "grad_norm": 3.6862549781799316, "learning_rate": 1.0969920196439534e-05, "loss": 0.5614, "step": 15500 }, { "epoch": 1.9643953345610803, "grad_norm": 3.0343778133392334, "learning_rate": 1.0356046654389196e-05, "loss": 0.5636, "step": 16000 }, { "epoch": 2.0257826887661143, "grad_norm": 3.191669225692749, "learning_rate": 9.742173112338858e-06, "loss": 0.5583, "step": 16500 }, { "epoch": 2.087170042971148, "grad_norm": 2.4944469928741455, "learning_rate": 9.128299570288521e-06, "loss": 0.5592, "step": 17000 }, { "epoch": 2.1485573971761815, "grad_norm": 5.961979389190674, "learning_rate": 8.514426028238183e-06, "loss": 0.543, "step": 17500 }, { "epoch": 2.2099447513812156, "grad_norm": 3.5605664253234863, "learning_rate": 7.900552486187846e-06, "loss": 0.5553, "step": 18000 }, { "epoch": 2.271332105586249, "grad_norm": 4.210664749145508, "learning_rate": 7.286678944137508e-06, "loss": 0.5449, "step": 18500 }, { "epoch": 2.332719459791283, "grad_norm": 3.2335548400878906, "learning_rate": 6.6728054020871705e-06, "loss": 0.5365, "step": 19000 }, { "epoch": 2.394106813996317, "grad_norm": 2.6599347591400146, "learning_rate": 6.058931860036833e-06, "loss": 0.5399, "step": 19500 }, { "epoch": 2.4554941682013505, "grad_norm": 4.371065616607666, "learning_rate": 5.445058317986495e-06, "loss": 0.5443, "step": 20000 }, { "epoch": 2.5168815224063845, "grad_norm": 4.86035680770874, "learning_rate": 4.831184775936157e-06, "loss": 0.5334, "step": 20500 }, { "epoch": 2.578268876611418, "grad_norm": 7.235117435455322, "learning_rate": 4.21731123388582e-06, "loss": 0.5513, "step": 21000 }, { "epoch": 2.6396562308164517, "grad_norm": 7.687514305114746, "learning_rate": 3.603437691835482e-06, "loss": 0.5367, "step": 21500 }, { "epoch": 2.701043585021486, "grad_norm": 3.5359065532684326, "learning_rate": 2.9895641497851445e-06, "loss": 0.5545, "step": 22000 }, { "epoch": 2.7624309392265194, "grad_norm": 3.8917315006256104, "learning_rate": 2.375690607734807e-06, "loss": 0.5401, "step": 22500 }, { "epoch": 2.823818293431553, "grad_norm": 3.8702080249786377, "learning_rate": 1.761817065684469e-06, "loss": 0.5424, "step": 23000 }, { "epoch": 2.885205647636587, "grad_norm": 5.246346473693848, "learning_rate": 1.1479435236341315e-06, "loss": 0.5495, "step": 23500 }, { "epoch": 2.9465930018416207, "grad_norm": 4.763291358947754, "learning_rate": 5.340699815837937e-07, "loss": 0.538, "step": 24000 }, { "epoch": 3.0, "step": 24435, "total_flos": 2.645641233904435e+16, "train_loss": 0.5831284902567558, "train_runtime": 2740.1657, "train_samples_per_second": 142.676, "train_steps_per_second": 8.917 } ], "logging_steps": 500, "max_steps": 24435, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.645641233904435e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }