diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9384d67714558de0c023db86877d807337db3c8b --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,4392 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9987085665088248, + "eval_steps": 500, + "global_step": 290, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0034438226431338786, + "grad_norm": 52.674065050138765, + "learning_rate": 2.7586206896551723e-08, + "logits/chosen": -5.615417003631592, + "logits/rejected": -5.667238712310791, + "logps/chosen": -0.4943414330482483, + "logps/rejected": -0.6143913865089417, + "loss": 5.1079, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.943414688110352, + "rewards/margins": 1.200499415397644, + "rewards/rejected": -6.143914222717285, + "step": 1 + }, + { + "epoch": 0.006887645286267757, + "grad_norm": 60.60351785955272, + "learning_rate": 5.517241379310345e-08, + "logits/chosen": -5.4836554527282715, + "logits/rejected": -5.671990871429443, + "logps/chosen": -0.5493791103363037, + "logps/rejected": -0.6132436990737915, + "loss": 5.0887, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.493791103363037, + "rewards/margins": 0.638646125793457, + "rewards/rejected": -6.132437705993652, + "step": 2 + }, + { + "epoch": 0.010331467929401636, + "grad_norm": 59.59005871499247, + "learning_rate": 8.275862068965517e-08, + "logits/chosen": -5.246090412139893, + "logits/rejected": -5.381677627563477, + "logps/chosen": -0.45825690031051636, + "logps/rejected": -0.531363308429718, + "loss": 5.0369, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.582569122314453, + "rewards/margins": 0.7310642004013062, + "rewards/rejected": -5.313632965087891, + "step": 3 + }, + { + "epoch": 0.013775290572535515, + "grad_norm": 58.937392298416974, + "learning_rate": 1.103448275862069e-07, + "logits/chosen": -5.279723167419434, + "logits/rejected": -5.392016410827637, + "logps/chosen": -0.5628327131271362, + "logps/rejected": -0.5180907249450684, + "loss": 5.3846, + "rewards/accuracies": 0.3125, + "rewards/chosen": -5.628327369689941, + "rewards/margins": -0.4474201798439026, + "rewards/rejected": -5.180907249450684, + "step": 4 + }, + { + "epoch": 0.017219113215669393, + "grad_norm": 51.34542849508367, + "learning_rate": 1.3793103448275863e-07, + "logits/chosen": -5.257406234741211, + "logits/rejected": -5.315362453460693, + "logps/chosen": -0.525193989276886, + "logps/rejected": -0.572905421257019, + "loss": 5.1008, + "rewards/accuracies": 0.5625, + "rewards/chosen": -5.25193977355957, + "rewards/margins": 0.47711431980133057, + "rewards/rejected": -5.729053974151611, + "step": 5 + }, + { + "epoch": 0.020662935858803272, + "grad_norm": 62.72154095060763, + "learning_rate": 1.6551724137931034e-07, + "logits/chosen": -5.621905326843262, + "logits/rejected": -5.849234104156494, + "logps/chosen": -0.5514707565307617, + "logps/rejected": -0.5989887118339539, + "loss": 5.0616, + "rewards/accuracies": 0.4375, + "rewards/chosen": -5.514707565307617, + "rewards/margins": 0.47517985105514526, + "rewards/rejected": -5.989887237548828, + "step": 6 + }, + { + "epoch": 0.02410675850193715, + "grad_norm": 58.76872623763212, + "learning_rate": 1.9310344827586208e-07, + "logits/chosen": -5.585313320159912, + "logits/rejected": -5.405150413513184, + "logps/chosen": -0.6232742667198181, + "logps/rejected": -0.5102998614311218, + "loss": 5.2864, + "rewards/accuracies": 0.6875, + "rewards/chosen": -6.232743263244629, + "rewards/margins": -1.129744529724121, + "rewards/rejected": -5.102999210357666, + "step": 7 + }, + { + "epoch": 0.02755058114507103, + "grad_norm": 49.99034574764161, + "learning_rate": 2.206896551724138e-07, + "logits/chosen": -5.209949016571045, + "logits/rejected": -5.195341110229492, + "logps/chosen": -0.47157737612724304, + "logps/rejected": -0.47744542360305786, + "loss": 5.1666, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.715773582458496, + "rewards/margins": 0.05868096649646759, + "rewards/rejected": -4.7744550704956055, + "step": 8 + }, + { + "epoch": 0.030994403788204908, + "grad_norm": 58.66744740284535, + "learning_rate": 2.482758620689655e-07, + "logits/chosen": -5.36690616607666, + "logits/rejected": -5.485147953033447, + "logps/chosen": -0.5631701946258545, + "logps/rejected": -0.5826945304870605, + "loss": 5.1211, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.631701946258545, + "rewards/margins": 0.1952425241470337, + "rewards/rejected": -5.826944828033447, + "step": 9 + }, + { + "epoch": 0.034438226431338786, + "grad_norm": 46.491425410700735, + "learning_rate": 2.7586206896551726e-07, + "logits/chosen": -5.605222225189209, + "logits/rejected": -5.74063777923584, + "logps/chosen": -0.501873791217804, + "logps/rejected": -0.5232819318771362, + "loss": 5.1411, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.018738269805908, + "rewards/margins": 0.21408089995384216, + "rewards/rejected": -5.232819080352783, + "step": 10 + }, + { + "epoch": 0.037882049074472665, + "grad_norm": 83.27704023131166, + "learning_rate": 3.034482758620689e-07, + "logits/chosen": -5.455163955688477, + "logits/rejected": -5.336368560791016, + "logps/chosen": -0.7997897267341614, + "logps/rejected": -0.5844107270240784, + "loss": 5.6532, + "rewards/accuracies": 0.5, + "rewards/chosen": -7.997897624969482, + "rewards/margins": -2.1537904739379883, + "rewards/rejected": -5.844107151031494, + "step": 11 + }, + { + "epoch": 0.041325871717606544, + "grad_norm": 48.93901907364537, + "learning_rate": 3.310344827586207e-07, + "logits/chosen": -5.099009990692139, + "logits/rejected": -5.078139781951904, + "logps/chosen": -0.4147089719772339, + "logps/rejected": -0.4047776460647583, + "loss": 5.1796, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.147089958190918, + "rewards/margins": -0.09931322932243347, + "rewards/rejected": -4.047776699066162, + "step": 12 + }, + { + "epoch": 0.04476969436074042, + "grad_norm": 50.65745487462828, + "learning_rate": 3.586206896551724e-07, + "logits/chosen": -4.989038467407227, + "logits/rejected": -4.911070346832275, + "logps/chosen": -0.5246780514717102, + "logps/rejected": -0.47285518050193787, + "loss": 5.0677, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.2467803955078125, + "rewards/margins": -0.5182281732559204, + "rewards/rejected": -4.728552341461182, + "step": 13 + }, + { + "epoch": 0.0482135170038743, + "grad_norm": 57.66601162283009, + "learning_rate": 3.8620689655172415e-07, + "logits/chosen": -5.70846700668335, + "logits/rejected": -5.75710916519165, + "logps/chosen": -0.44634702801704407, + "logps/rejected": -0.4534481465816498, + "loss": 5.1503, + "rewards/accuracies": 0.4375, + "rewards/chosen": -4.463469982147217, + "rewards/margins": 0.07101157307624817, + "rewards/rejected": -4.534482002258301, + "step": 14 + }, + { + "epoch": 0.05165733964700818, + "grad_norm": 64.39971082160424, + "learning_rate": 4.1379310344827586e-07, + "logits/chosen": -5.157317161560059, + "logits/rejected": -5.051291465759277, + "logps/chosen": -0.5117197632789612, + "logps/rejected": -0.6016834378242493, + "loss": 5.1127, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.117197036743164, + "rewards/margins": 0.8996371626853943, + "rewards/rejected": -6.016834735870361, + "step": 15 + }, + { + "epoch": 0.05510116229014206, + "grad_norm": 43.28657458222468, + "learning_rate": 4.413793103448276e-07, + "logits/chosen": -5.220727920532227, + "logits/rejected": -5.298764228820801, + "logps/chosen": -0.4690421521663666, + "logps/rejected": -0.4100668132305145, + "loss": 4.9565, + "rewards/accuracies": 0.375, + "rewards/chosen": -4.6904215812683105, + "rewards/margins": -0.5897536873817444, + "rewards/rejected": -4.100667953491211, + "step": 16 + }, + { + "epoch": 0.05854498493327594, + "grad_norm": 55.940864947672644, + "learning_rate": 4.6896551724137923e-07, + "logits/chosen": -5.025401592254639, + "logits/rejected": -5.0607781410217285, + "logps/chosen": -0.6796688437461853, + "logps/rejected": -0.6925962567329407, + "loss": 5.0153, + "rewards/accuracies": 0.5, + "rewards/chosen": -6.796688079833984, + "rewards/margins": 0.12927353382110596, + "rewards/rejected": -6.925961971282959, + "step": 17 + }, + { + "epoch": 0.061988807576409816, + "grad_norm": 60.73894736133069, + "learning_rate": 4.96551724137931e-07, + "logits/chosen": -5.200845718383789, + "logits/rejected": -5.258395195007324, + "logps/chosen": -0.38808485865592957, + "logps/rejected": -0.38210412859916687, + "loss": 5.0964, + "rewards/accuracies": 0.375, + "rewards/chosen": -3.8808484077453613, + "rewards/margins": -0.059806786477565765, + "rewards/rejected": -3.8210418224334717, + "step": 18 + }, + { + "epoch": 0.0654326302195437, + "grad_norm": 50.35855568678846, + "learning_rate": 5.241379310344828e-07, + "logits/chosen": -5.284404277801514, + "logits/rejected": -5.353991508483887, + "logps/chosen": -0.4545897841453552, + "logps/rejected": -0.4399701952934265, + "loss": 5.0991, + "rewards/accuracies": 0.3125, + "rewards/chosen": -4.545897483825684, + "rewards/margins": -0.1461959034204483, + "rewards/rejected": -4.399702548980713, + "step": 19 + }, + { + "epoch": 0.06887645286267757, + "grad_norm": 103.40309567567355, + "learning_rate": 5.517241379310345e-07, + "logits/chosen": -5.317500114440918, + "logits/rejected": -5.262624740600586, + "logps/chosen": -0.4840647280216217, + "logps/rejected": -0.6067878007888794, + "loss": 5.1123, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.8406476974487305, + "rewards/margins": 1.2272305488586426, + "rewards/rejected": -6.067877769470215, + "step": 20 + }, + { + "epoch": 0.07232027550581145, + "grad_norm": 46.89431893563974, + "learning_rate": 5.793103448275862e-07, + "logits/chosen": -5.2541890144348145, + "logits/rejected": -5.130758285522461, + "logps/chosen": -0.4697263538837433, + "logps/rejected": -0.5909743905067444, + "loss": 4.937, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.697263240814209, + "rewards/margins": 1.2124805450439453, + "rewards/rejected": -5.9097442626953125, + "step": 21 + }, + { + "epoch": 0.07576409814894533, + "grad_norm": 67.56806825458494, + "learning_rate": 6.068965517241378e-07, + "logits/chosen": -4.811267375946045, + "logits/rejected": -4.888763904571533, + "logps/chosen": -0.4097307324409485, + "logps/rejected": -0.40606409311294556, + "loss": 5.1578, + "rewards/accuracies": 0.375, + "rewards/chosen": -4.097307205200195, + "rewards/margins": -0.03666616231203079, + "rewards/rejected": -4.060640811920166, + "step": 22 + }, + { + "epoch": 0.07920792079207921, + "grad_norm": 61.29631937996971, + "learning_rate": 6.344827586206897e-07, + "logits/chosen": -4.587668418884277, + "logits/rejected": -4.847754001617432, + "logps/chosen": -0.5758047103881836, + "logps/rejected": -0.5185554027557373, + "loss": 4.8982, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.758047580718994, + "rewards/margins": -0.5724934339523315, + "rewards/rejected": -5.185554027557373, + "step": 23 + }, + { + "epoch": 0.08265174343521309, + "grad_norm": 93.2152404636039, + "learning_rate": 6.620689655172414e-07, + "logits/chosen": -4.800999641418457, + "logits/rejected": -5.004166603088379, + "logps/chosen": -0.3709278702735901, + "logps/rejected": -0.4135555624961853, + "loss": 4.8412, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.7092788219451904, + "rewards/margins": 0.42627638578414917, + "rewards/rejected": -4.135554790496826, + "step": 24 + }, + { + "epoch": 0.08609556607834697, + "grad_norm": 64.42035341096593, + "learning_rate": 6.89655172413793e-07, + "logits/chosen": -4.344979286193848, + "logits/rejected": -4.520305633544922, + "logps/chosen": -0.46768859028816223, + "logps/rejected": -0.4844363033771515, + "loss": 4.9329, + "rewards/accuracies": 0.4375, + "rewards/chosen": -4.676885604858398, + "rewards/margins": 0.16747775673866272, + "rewards/rejected": -4.844363212585449, + "step": 25 + }, + { + "epoch": 0.08953938872148084, + "grad_norm": 56.68263958973131, + "learning_rate": 7.172413793103448e-07, + "logits/chosen": -4.4868011474609375, + "logits/rejected": -4.451172828674316, + "logps/chosen": -0.3706013262271881, + "logps/rejected": -0.44255292415618896, + "loss": 4.8804, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.7060132026672363, + "rewards/margins": 0.7195163369178772, + "rewards/rejected": -4.425529479980469, + "step": 26 + }, + { + "epoch": 0.09298321136461472, + "grad_norm": 69.68637190790966, + "learning_rate": 7.448275862068965e-07, + "logits/chosen": -4.358417510986328, + "logits/rejected": -4.435417652130127, + "logps/chosen": -0.3981940746307373, + "logps/rejected": -0.4577309489250183, + "loss": 4.8387, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.981940984725952, + "rewards/margins": 0.5953686237335205, + "rewards/rejected": -4.5773091316223145, + "step": 27 + }, + { + "epoch": 0.0964270340077486, + "grad_norm": 58.3412204024468, + "learning_rate": 7.724137931034483e-07, + "logits/chosen": -4.649521350860596, + "logits/rejected": -4.8989362716674805, + "logps/chosen": -0.33310776948928833, + "logps/rejected": -0.37322184443473816, + "loss": 4.6174, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.331077814102173, + "rewards/margins": 0.4011409282684326, + "rewards/rejected": -3.7322187423706055, + "step": 28 + }, + { + "epoch": 0.09987085665088248, + "grad_norm": 56.195572626431165, + "learning_rate": 8e-07, + "logits/chosen": -4.673098087310791, + "logits/rejected": -5.0018768310546875, + "logps/chosen": -0.4161800742149353, + "logps/rejected": -0.3885071277618408, + "loss": 4.65, + "rewards/accuracies": 0.4375, + "rewards/chosen": -4.161801338195801, + "rewards/margins": -0.27672961354255676, + "rewards/rejected": -3.885071277618408, + "step": 29 + }, + { + "epoch": 0.10331467929401636, + "grad_norm": 67.80702382842614, + "learning_rate": 7.999710236630706e-07, + "logits/chosen": -4.643288612365723, + "logits/rejected": -4.589477062225342, + "logps/chosen": -0.4303164482116699, + "logps/rejected": -0.506043016910553, + "loss": 4.8244, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.303164482116699, + "rewards/margins": 0.7572658658027649, + "rewards/rejected": -5.06043004989624, + "step": 30 + }, + { + "epoch": 0.10675850193715024, + "grad_norm": 50.50628925100566, + "learning_rate": 7.998840988504232e-07, + "logits/chosen": -4.767556190490723, + "logits/rejected": -4.7690935134887695, + "logps/chosen": -0.403850257396698, + "logps/rejected": -0.44447407126426697, + "loss": 4.7897, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.0385026931762695, + "rewards/margins": 0.4062381088733673, + "rewards/rejected": -4.4447407722473145, + "step": 31 + }, + { + "epoch": 0.11020232458028412, + "grad_norm": 60.05314418873607, + "learning_rate": 7.997392381558708e-07, + "logits/chosen": -3.7635271549224854, + "logits/rejected": -3.760200262069702, + "logps/chosen": -0.5402004718780518, + "logps/rejected": -0.5654389262199402, + "loss": 4.7483, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.402004718780518, + "rewards/margins": 0.25238436460494995, + "rewards/rejected": -5.654389381408691, + "step": 32 + }, + { + "epoch": 0.113646147223418, + "grad_norm": 50.090026374394135, + "learning_rate": 7.99536462567075e-07, + "logits/chosen": -5.203555583953857, + "logits/rejected": -5.3314290046691895, + "logps/chosen": -0.4754854440689087, + "logps/rejected": -0.4819332957267761, + "loss": 4.8249, + "rewards/accuracies": 0.375, + "rewards/chosen": -4.754855155944824, + "rewards/margins": 0.06447845697402954, + "rewards/rejected": -4.819333076477051, + "step": 33 + }, + { + "epoch": 0.11708996986655187, + "grad_norm": 44.62210441192165, + "learning_rate": 7.992758014625048e-07, + "logits/chosen": -4.730749607086182, + "logits/rejected": -4.70894718170166, + "logps/chosen": -0.3653126657009125, + "logps/rejected": -0.47924527525901794, + "loss": 4.681, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.6531267166137695, + "rewards/margins": 1.1393256187438965, + "rewards/rejected": -4.792451858520508, + "step": 34 + }, + { + "epoch": 0.12053379250968575, + "grad_norm": 51.36875532303172, + "learning_rate": 7.989572926071799e-07, + "logits/chosen": -4.721662521362305, + "logits/rejected": -4.724156856536865, + "logps/chosen": -0.4223301410675049, + "logps/rejected": -0.4952259063720703, + "loss": 4.5665, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.223300933837891, + "rewards/margins": 0.7289580702781677, + "rewards/rejected": -4.952259063720703, + "step": 35 + }, + { + "epoch": 0.12397761515281963, + "grad_norm": 57.9863349338661, + "learning_rate": 7.985809821472e-07, + "logits/chosen": -4.691116809844971, + "logits/rejected": -4.813366413116455, + "logps/chosen": -0.4277626872062683, + "logps/rejected": -0.4881032705307007, + "loss": 4.6191, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.277626991271973, + "rewards/margins": 0.6034059524536133, + "rewards/rejected": -4.881032943725586, + "step": 36 + }, + { + "epoch": 0.1274214377959535, + "grad_norm": 63.01073545994464, + "learning_rate": 7.981469246030587e-07, + "logits/chosen": -4.308718204498291, + "logits/rejected": -4.413212776184082, + "logps/chosen": -0.4789758026599884, + "logps/rejected": -0.58740234375, + "loss": 4.7889, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.789758205413818, + "rewards/margins": 1.0842654705047607, + "rewards/rejected": -5.8740234375, + "step": 37 + }, + { + "epoch": 0.1308652604390874, + "grad_norm": 83.60559787534415, + "learning_rate": 7.976551828617438e-07, + "logits/chosen": -4.922616481781006, + "logits/rejected": -4.967951774597168, + "logps/chosen": -0.4330342710018158, + "logps/rejected": -0.4283411204814911, + "loss": 4.7034, + "rewards/accuracies": 0.4375, + "rewards/chosen": -4.330342769622803, + "rewards/margins": -0.0469314381480217, + "rewards/rejected": -4.283411026000977, + "step": 38 + }, + { + "epoch": 0.13430908308222125, + "grad_norm": 57.08486563674846, + "learning_rate": 7.971058281676275e-07, + "logits/chosen": -5.094006061553955, + "logits/rejected": -5.191053867340088, + "logps/chosen": -0.4875888228416443, + "logps/rejected": -0.6387084722518921, + "loss": 4.6644, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.875887870788574, + "rewards/margins": 1.5111969709396362, + "rewards/rejected": -6.3870849609375, + "step": 39 + }, + { + "epoch": 0.13775290572535515, + "grad_norm": 60.77964329495937, + "learning_rate": 7.964989401121432e-07, + "logits/chosen": -4.993417739868164, + "logits/rejected": -4.969631195068359, + "logps/chosen": -0.33883655071258545, + "logps/rejected": -0.3545013964176178, + "loss": 4.5686, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.3883657455444336, + "rewards/margins": 0.1566484272480011, + "rewards/rejected": -3.5450141429901123, + "step": 40 + }, + { + "epoch": 0.141196728368489, + "grad_norm": 40.340190041907185, + "learning_rate": 7.958346066222549e-07, + "logits/chosen": -4.525943756103516, + "logits/rejected": -4.55746603012085, + "logps/chosen": -0.45404478907585144, + "logps/rejected": -0.4469107389450073, + "loss": 4.5937, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.54044771194458, + "rewards/margins": -0.07134075462818146, + "rewards/rejected": -4.469107151031494, + "step": 41 + }, + { + "epoch": 0.1446405510116229, + "grad_norm": 57.514362166929665, + "learning_rate": 7.951129239477177e-07, + "logits/chosen": -5.132482528686523, + "logits/rejected": -5.1113786697387695, + "logps/chosen": -0.401109516620636, + "logps/rejected": -0.44438689947128296, + "loss": 4.6176, + "rewards/accuracies": 0.4375, + "rewards/chosen": -4.01109504699707, + "rewards/margins": 0.43277424573898315, + "rewards/rejected": -4.443869113922119, + "step": 42 + }, + { + "epoch": 0.14808437365475677, + "grad_norm": 60.34883135001456, + "learning_rate": 7.943339966471333e-07, + "logits/chosen": -4.517858982086182, + "logits/rejected": -4.421618461608887, + "logps/chosen": -0.6845810413360596, + "logps/rejected": -0.6314383745193481, + "loss": 4.6897, + "rewards/accuracies": 0.5, + "rewards/chosen": -6.8458099365234375, + "rewards/margins": -0.531426191329956, + "rewards/rejected": -6.3143839836120605, + "step": 43 + }, + { + "epoch": 0.15152819629789066, + "grad_norm": 65.95554033423765, + "learning_rate": 7.93497937572801e-07, + "logits/chosen": -5.128730297088623, + "logits/rejected": -5.0609660148620605, + "logps/chosen": -0.5347275137901306, + "logps/rejected": -0.5926434397697449, + "loss": 4.7481, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.3472747802734375, + "rewards/margins": 0.5791594386100769, + "rewards/rejected": -5.926434516906738, + "step": 44 + }, + { + "epoch": 0.15497201894102453, + "grad_norm": 65.36255184426133, + "learning_rate": 7.926048678543684e-07, + "logits/chosen": -4.324880599975586, + "logits/rejected": -4.221179485321045, + "logps/chosen": -0.5375354290008545, + "logps/rejected": -0.7417870163917542, + "loss": 4.4532, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.375354766845703, + "rewards/margins": 2.042515277862549, + "rewards/rejected": -7.417870044708252, + "step": 45 + }, + { + "epoch": 0.15841584158415842, + "grad_norm": 56.19140828662666, + "learning_rate": 7.916549168812805e-07, + "logits/chosen": -4.412731647491455, + "logits/rejected": -4.406851768493652, + "logps/chosen": -0.43062710762023926, + "logps/rejected": -0.5201414227485657, + "loss": 4.4945, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.306270599365234, + "rewards/margins": 0.8951433897018433, + "rewards/rejected": -5.201414585113525, + "step": 46 + }, + { + "epoch": 0.16185966422729228, + "grad_norm": 61.47712790834795, + "learning_rate": 7.906482222840346e-07, + "logits/chosen": -3.994800329208374, + "logits/rejected": -3.9059207439422607, + "logps/chosen": -0.4685822129249573, + "logps/rejected": -0.6291300058364868, + "loss": 4.5688, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.685822010040283, + "rewards/margins": 1.6054778099060059, + "rewards/rejected": -6.291299819946289, + "step": 47 + }, + { + "epoch": 0.16530348687042618, + "grad_norm": 71.97293938631553, + "learning_rate": 7.8958492991424e-07, + "logits/chosen": -4.644060134887695, + "logits/rejected": -4.552207946777344, + "logps/chosen": -0.49269717931747437, + "logps/rejected": -0.48811206221580505, + "loss": 4.3686, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.926971435546875, + "rewards/margins": -0.04585088789463043, + "rewards/rejected": -4.881120204925537, + "step": 48 + }, + { + "epoch": 0.16874730951356004, + "grad_norm": 50.91272883112091, + "learning_rate": 7.884651938234865e-07, + "logits/chosen": -4.6048712730407715, + "logits/rejected": -4.637516975402832, + "logps/chosen": -0.454245924949646, + "logps/rejected": -0.5149778723716736, + "loss": 4.4144, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.542459011077881, + "rewards/margins": 0.6073201894760132, + "rewards/rejected": -5.149779796600342, + "step": 49 + }, + { + "epoch": 0.17219113215669393, + "grad_norm": 70.84729258897266, + "learning_rate": 7.872891762410253e-07, + "logits/chosen": -4.788956642150879, + "logits/rejected": -4.830476760864258, + "logps/chosen": -0.5271515846252441, + "logps/rejected": -0.5558156967163086, + "loss": 4.398, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.271515846252441, + "rewards/margins": 0.2866411805152893, + "rewards/rejected": -5.558156967163086, + "step": 50 + }, + { + "epoch": 0.1756349547998278, + "grad_norm": 72.43110951103814, + "learning_rate": 7.860570475502648e-07, + "logits/chosen": -4.508288860321045, + "logits/rejected": -4.559998035430908, + "logps/chosen": -0.4371810257434845, + "logps/rejected": -0.5790078639984131, + "loss": 4.4228, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.371809959411621, + "rewards/margins": 1.4182684421539307, + "rewards/rejected": -5.790079116821289, + "step": 51 + }, + { + "epoch": 0.1790787774429617, + "grad_norm": 55.410292700087545, + "learning_rate": 7.847689862640855e-07, + "logits/chosen": -4.518070697784424, + "logits/rejected": -4.57796049118042, + "logps/chosen": -0.4647026062011719, + "logps/rejected": -0.5196883082389832, + "loss": 4.7694, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.647026062011719, + "rewards/margins": 0.5498570203781128, + "rewards/rejected": -5.196883201599121, + "step": 52 + }, + { + "epoch": 0.18252260008609555, + "grad_norm": 69.37009413960385, + "learning_rate": 7.834251789989765e-07, + "logits/chosen": -4.978256702423096, + "logits/rejected": -4.886575698852539, + "logps/chosen": -0.5333456993103027, + "logps/rejected": -0.7749611139297485, + "loss": 4.6385, + "rewards/accuracies": 0.5625, + "rewards/chosen": -5.333456516265869, + "rewards/margins": 2.4161548614501953, + "rewards/rejected": -7.7496113777160645, + "step": 53 + }, + { + "epoch": 0.18596642272922945, + "grad_norm": 72.61257423821304, + "learning_rate": 7.820258204479982e-07, + "logits/chosen": -4.223357677459717, + "logits/rejected": -4.151899337768555, + "logps/chosen": -0.5688156485557556, + "logps/rejected": -0.6057307124137878, + "loss": 4.811, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.688156604766846, + "rewards/margins": 0.3691507577896118, + "rewards/rejected": -6.05730676651001, + "step": 54 + }, + { + "epoch": 0.1894102453723633, + "grad_norm": 56.657706476039074, + "learning_rate": 7.805711133525747e-07, + "logits/chosen": -4.470883846282959, + "logits/rejected": -4.288090705871582, + "logps/chosen": -0.6821640729904175, + "logps/rejected": -0.6564118266105652, + "loss": 4.6001, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.8216400146484375, + "rewards/margins": -0.25752171874046326, + "rewards/rejected": -6.564118385314941, + "step": 55 + }, + { + "epoch": 0.1928540680154972, + "grad_norm": 78.23314078065798, + "learning_rate": 7.790612684731209e-07, + "logits/chosen": -4.282840728759766, + "logits/rejected": -4.223234176635742, + "logps/chosen": -0.6843351721763611, + "logps/rejected": -0.8569565415382385, + "loss": 4.456, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.843351364135742, + "rewards/margins": 1.7262136936187744, + "rewards/rejected": -8.569564819335938, + "step": 56 + }, + { + "epoch": 0.19629789065863107, + "grad_norm": 64.08169300905095, + "learning_rate": 7.774965045585064e-07, + "logits/chosen": -5.029541015625, + "logits/rejected": -5.061357498168945, + "logps/chosen": -0.5916852951049805, + "logps/rejected": -0.6251527667045593, + "loss": 4.3484, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.916852951049805, + "rewards/margins": 0.334674596786499, + "rewards/rejected": -6.251528263092041, + "step": 57 + }, + { + "epoch": 0.19974171330176496, + "grad_norm": 62.02026429799981, + "learning_rate": 7.758770483143634e-07, + "logits/chosen": -3.820904016494751, + "logits/rejected": -3.8994665145874023, + "logps/chosen": -0.6398332118988037, + "logps/rejected": -0.662560760974884, + "loss": 4.3431, + "rewards/accuracies": 0.5625, + "rewards/chosen": -6.398331642150879, + "rewards/margins": 0.22727595269680023, + "rewards/rejected": -6.625607967376709, + "step": 58 + }, + { + "epoch": 0.20318553594489883, + "grad_norm": 57.111582977153155, + "learning_rate": 7.742031343702404e-07, + "logits/chosen": -4.509333610534668, + "logits/rejected": -4.401131629943848, + "logps/chosen": -0.5554917454719543, + "logps/rejected": -0.6513252854347229, + "loss": 4.1657, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.554917335510254, + "rewards/margins": 0.9583351016044617, + "rewards/rejected": -6.5132527351379395, + "step": 59 + }, + { + "epoch": 0.20662935858803272, + "grad_norm": 68.28922293268536, + "learning_rate": 7.724750052456098e-07, + "logits/chosen": -4.062650680541992, + "logits/rejected": -3.9956672191619873, + "logps/chosen": -0.5649631023406982, + "logps/rejected": -0.7722354531288147, + "loss": 4.3439, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.649630546569824, + "rewards/margins": 2.072723627090454, + "rewards/rejected": -7.722353935241699, + "step": 60 + }, + { + "epoch": 0.21007318123116658, + "grad_norm": 65.73305092854736, + "learning_rate": 7.706929113147304e-07, + "logits/chosen": -4.709454536437988, + "logits/rejected": -4.698660850524902, + "logps/chosen": -0.6076084971427917, + "logps/rejected": -0.6684498190879822, + "loss": 4.2227, + "rewards/accuracies": 0.6875, + "rewards/chosen": -6.076085090637207, + "rewards/margins": 0.6084132790565491, + "rewards/rejected": -6.684497833251953, + "step": 61 + }, + { + "epoch": 0.21351700387430048, + "grad_norm": 67.46265604716064, + "learning_rate": 7.688571107703732e-07, + "logits/chosen": -3.963956832885742, + "logits/rejected": -3.938755512237549, + "logps/chosen": -0.5723408460617065, + "logps/rejected": -0.5089117288589478, + "loss": 4.4227, + "rewards/accuracies": 0.4375, + "rewards/chosen": -5.7234086990356445, + "rewards/margins": -0.6342912316322327, + "rewards/rejected": -5.089117527008057, + "step": 62 + }, + { + "epoch": 0.21696082651743434, + "grad_norm": 60.086542404476894, + "learning_rate": 7.669678695864137e-07, + "logits/chosen": -4.414982795715332, + "logits/rejected": -4.424773693084717, + "logps/chosen": -0.7808203101158142, + "logps/rejected": -0.9502580761909485, + "loss": 4.1876, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.80820369720459, + "rewards/margins": 1.6943775415420532, + "rewards/rejected": -9.502581596374512, + "step": 63 + }, + { + "epoch": 0.22040464916056823, + "grad_norm": 62.22132599200314, + "learning_rate": 7.650254614792972e-07, + "logits/chosen": -5.100131511688232, + "logits/rejected": -4.888442039489746, + "logps/chosen": -0.7664632797241211, + "logps/rejected": -0.7097909450531006, + "loss": 4.0675, + "rewards/accuracies": 0.5625, + "rewards/chosen": -7.664633274078369, + "rewards/margins": -0.5667227506637573, + "rewards/rejected": -7.097909927368164, + "step": 64 + }, + { + "epoch": 0.2238484718037021, + "grad_norm": 64.6980516756494, + "learning_rate": 7.630301678683828e-07, + "logits/chosen": -4.501206398010254, + "logits/rejected": -4.3760600090026855, + "logps/chosen": -0.582870602607727, + "logps/rejected": -0.7515184283256531, + "loss": 3.8879, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.82870626449585, + "rewards/margins": 1.686477780342102, + "rewards/rejected": -7.51518440246582, + "step": 65 + }, + { + "epoch": 0.227292294446836, + "grad_norm": 58.49975168550397, + "learning_rate": 7.6098227783517e-07, + "logits/chosen": -4.590901851654053, + "logits/rejected": -4.614831447601318, + "logps/chosen": -0.6885466575622559, + "logps/rejected": -0.6569955945014954, + "loss": 4.2328, + "rewards/accuracies": 0.4375, + "rewards/chosen": -6.885467052459717, + "rewards/margins": -0.31551113724708557, + "rewards/rejected": -6.569955825805664, + "step": 66 + }, + { + "epoch": 0.23073611708996986, + "grad_norm": 71.25690755989906, + "learning_rate": 7.588820880814168e-07, + "logits/chosen": -4.404972553253174, + "logits/rejected": -4.322005271911621, + "logps/chosen": -0.7880414128303528, + "logps/rejected": -0.8913244605064392, + "loss": 4.4685, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.8804144859313965, + "rewards/margins": 1.032829999923706, + "rewards/rejected": -8.913244247436523, + "step": 67 + }, + { + "epoch": 0.23417993973310375, + "grad_norm": 74.45643519661347, + "learning_rate": 7.567299028861528e-07, + "logits/chosen": -5.07747220993042, + "logits/rejected": -4.910668849945068, + "logps/chosen": -0.8106582760810852, + "logps/rejected": -0.8454375863075256, + "loss": 4.2067, + "rewards/accuracies": 0.8125, + "rewards/chosen": -8.106582641601562, + "rewards/margins": 0.3477928638458252, + "rewards/rejected": -8.454376220703125, + "step": 68 + }, + { + "epoch": 0.2376237623762376, + "grad_norm": 61.17870132728996, + "learning_rate": 7.54526034061595e-07, + "logits/chosen": -4.368528842926025, + "logits/rejected": -4.182857513427734, + "logps/chosen": -0.7671667337417603, + "logps/rejected": -0.8781678676605225, + "loss": 4.0971, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.67166805267334, + "rewards/margins": 1.110011100769043, + "rewards/rejected": -8.781679153442383, + "step": 69 + }, + { + "epoch": 0.2410675850193715, + "grad_norm": 80.03873573932812, + "learning_rate": 7.522708009079711e-07, + "logits/chosen": -3.757272720336914, + "logits/rejected": -3.6177382469177246, + "logps/chosen": -0.7591115832328796, + "logps/rejected": -1.0031275749206543, + "loss": 4.4004, + "rewards/accuracies": 0.6875, + "rewards/chosen": -7.591116905212402, + "rewards/margins": 2.4401588439941406, + "rewards/rejected": -10.031274795532227, + "step": 70 + }, + { + "epoch": 0.24451140766250537, + "grad_norm": 54.78863193631618, + "learning_rate": 7.499645301672599e-07, + "logits/chosen": -4.391002655029297, + "logits/rejected": -4.642823696136475, + "logps/chosen": -0.8277568817138672, + "logps/rejected": -0.8874188661575317, + "loss": 4.0832, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.277568817138672, + "rewards/margins": 0.5966211557388306, + "rewards/rejected": -8.874189376831055, + "step": 71 + }, + { + "epoch": 0.24795523030563926, + "grad_norm": 68.33693738907787, + "learning_rate": 7.476075559758513e-07, + "logits/chosen": -4.277254581451416, + "logits/rejected": -4.10576057434082, + "logps/chosen": -0.6340219378471375, + "logps/rejected": -0.8107688426971436, + "loss": 4.3541, + "rewards/accuracies": 0.6875, + "rewards/chosen": -6.340219974517822, + "rewards/margins": 1.767467975616455, + "rewards/rejected": -8.107687950134277, + "step": 72 + }, + { + "epoch": 0.2513990529487731, + "grad_norm": 56.379646600357916, + "learning_rate": 7.452002198161371e-07, + "logits/chosen": -4.682867050170898, + "logits/rejected": -4.608969211578369, + "logps/chosen": -0.7252380847930908, + "logps/rejected": -0.8175498247146606, + "loss": 3.8474, + "rewards/accuracies": 0.5625, + "rewards/chosen": -7.25238037109375, + "rewards/margins": 0.9231181144714355, + "rewards/rejected": -8.175498962402344, + "step": 73 + }, + { + "epoch": 0.254842875591907, + "grad_norm": 100.90328367426899, + "learning_rate": 7.427428704670356e-07, + "logits/chosen": -4.861872673034668, + "logits/rejected": -4.656722545623779, + "logps/chosen": -0.7617427706718445, + "logps/rejected": -0.9613173007965088, + "loss": 4.4928, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.617427825927734, + "rewards/margins": 1.9957445859909058, + "rewards/rejected": -9.61317253112793, + "step": 74 + }, + { + "epoch": 0.2582866982350409, + "grad_norm": 65.6279612427127, + "learning_rate": 7.402358639534602e-07, + "logits/chosen": -5.1001877784729, + "logits/rejected": -5.059464454650879, + "logps/chosen": -0.6768380403518677, + "logps/rejected": -0.8699934482574463, + "loss": 4.1233, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.768380641937256, + "rewards/margins": 1.931553840637207, + "rewards/rejected": -8.699934005737305, + "step": 75 + }, + { + "epoch": 0.2617305208781748, + "grad_norm": 69.40821628799613, + "learning_rate": 7.376795634947379e-07, + "logits/chosen": -4.4171576499938965, + "logits/rejected": -4.2465434074401855, + "logps/chosen": -0.7788955569267273, + "logps/rejected": -0.8167555332183838, + "loss": 4.309, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.788956165313721, + "rewards/margins": 0.37859874963760376, + "rewards/rejected": -8.16755485534668, + "step": 76 + }, + { + "epoch": 0.26517434352130864, + "grad_norm": 69.16350786587172, + "learning_rate": 7.350743394519858e-07, + "logits/chosen": -4.930624485015869, + "logits/rejected": -4.70862340927124, + "logps/chosen": -0.8845140933990479, + "logps/rejected": -0.9442533850669861, + "loss": 4.1944, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.845142364501953, + "rewards/margins": 0.5973912477493286, + "rewards/rejected": -9.442534446716309, + "step": 77 + }, + { + "epoch": 0.2686181661644425, + "grad_norm": 67.06499369198696, + "learning_rate": 7.324205692744521e-07, + "logits/chosen": -5.08651065826416, + "logits/rejected": -5.048566818237305, + "logps/chosen": -0.672334611415863, + "logps/rejected": -0.7581319808959961, + "loss": 4.2669, + "rewards/accuracies": 0.4375, + "rewards/chosen": -6.723345756530762, + "rewards/margins": 0.8579738140106201, + "rewards/rejected": -7.581319808959961, + "step": 78 + }, + { + "epoch": 0.2720619888075764, + "grad_norm": 85.80640569218886, + "learning_rate": 7.297186374448307e-07, + "logits/chosen": -5.137825012207031, + "logits/rejected": -5.172469139099121, + "logps/chosen": -0.9155557155609131, + "logps/rejected": -1.0751527547836304, + "loss": 4.1234, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.155557632446289, + "rewards/margins": 1.5959699153900146, + "rewards/rejected": -10.751527786254883, + "step": 79 + }, + { + "epoch": 0.2755058114507103, + "grad_norm": 63.74795493043287, + "learning_rate": 7.269689354235567e-07, + "logits/chosen": -5.289166450500488, + "logits/rejected": -4.827259540557861, + "logps/chosen": -0.7461143136024475, + "logps/rejected": -1.00174081325531, + "loss": 3.6397, + "rewards/accuracies": 0.8125, + "rewards/chosen": -7.461143493652344, + "rewards/margins": 2.5562655925750732, + "rewards/rejected": -10.01740837097168, + "step": 80 + }, + { + "epoch": 0.27894963409384416, + "grad_norm": 63.4036066323074, + "learning_rate": 7.241718615920916e-07, + "logits/chosen": -5.0095415115356445, + "logits/rejected": -4.8333353996276855, + "logps/chosen": -0.8599931597709656, + "logps/rejected": -1.064732551574707, + "loss": 3.9659, + "rewards/accuracies": 0.8125, + "rewards/chosen": -8.599931716918945, + "rewards/margins": 2.0473945140838623, + "rewards/rejected": -10.647326469421387, + "step": 81 + }, + { + "epoch": 0.282393456736978, + "grad_norm": 62.58097683888751, + "learning_rate": 7.213278211952038e-07, + "logits/chosen": -4.466184139251709, + "logits/rejected": -4.107361793518066, + "logps/chosen": -0.7377562522888184, + "logps/rejected": -0.9077808856964111, + "loss": 3.8192, + "rewards/accuracies": 0.8125, + "rewards/chosen": -7.377562046051025, + "rewards/margins": 1.7002463340759277, + "rewards/rejected": -9.07780933380127, + "step": 82 + }, + { + "epoch": 0.28583727938011194, + "grad_norm": 71.46678574585364, + "learning_rate": 7.184372262822574e-07, + "logits/chosen": -4.615472793579102, + "logits/rejected": -4.519737243652344, + "logps/chosen": -0.8602911233901978, + "logps/rejected": -0.9075096845626831, + "loss": 4.0224, + "rewards/accuracies": 0.5625, + "rewards/chosen": -8.602910995483398, + "rewards/margins": 0.47218504548072815, + "rewards/rejected": -9.07509708404541, + "step": 83 + }, + { + "epoch": 0.2892811020232458, + "grad_norm": 76.77633086094144, + "learning_rate": 7.155004956475131e-07, + "logits/chosen": -5.291561126708984, + "logits/rejected": -4.816816329956055, + "logps/chosen": -0.7795137166976929, + "logps/rejected": -0.9014157056808472, + "loss": 3.9316, + "rewards/accuracies": 0.8125, + "rewards/chosen": -7.79513692855835, + "rewards/margins": 1.2190203666687012, + "rewards/rejected": -9.01415729522705, + "step": 84 + }, + { + "epoch": 0.29272492466637967, + "grad_norm": 64.07924704882033, + "learning_rate": 7.125180547694526e-07, + "logits/chosen": -5.0156683921813965, + "logits/rejected": -4.72418737411499, + "logps/chosen": -0.8232897520065308, + "logps/rejected": -1.23880136013031, + "loss": 3.6287, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.23289680480957, + "rewards/margins": 4.155117034912109, + "rewards/rejected": -12.388014793395996, + "step": 85 + }, + { + "epoch": 0.29616874730951354, + "grad_norm": 60.650149825273516, + "learning_rate": 7.094903357491345e-07, + "logits/chosen": -4.864440441131592, + "logits/rejected": -4.457652568817139, + "logps/chosen": -0.8692309260368347, + "logps/rejected": -1.0946143865585327, + "loss": 3.9978, + "rewards/accuracies": 0.8125, + "rewards/chosen": -8.69230842590332, + "rewards/margins": 2.2538340091705322, + "rewards/rejected": -10.94614315032959, + "step": 86 + }, + { + "epoch": 0.29961256995264746, + "grad_norm": 57.554296714602465, + "learning_rate": 7.064177772475911e-07, + "logits/chosen": -5.011836528778076, + "logits/rejected": -5.0129570960998535, + "logps/chosen": -0.9953622221946716, + "logps/rejected": -1.1730215549468994, + "loss": 3.945, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.953622817993164, + "rewards/margins": 1.7765934467315674, + "rewards/rejected": -11.730216026306152, + "step": 87 + }, + { + "epoch": 0.3030563925957813, + "grad_norm": 76.32625861331243, + "learning_rate": 7.033008244222745e-07, + "logits/chosen": -5.204478740692139, + "logits/rejected": -4.811039924621582, + "logps/chosen": -1.0616154670715332, + "logps/rejected": -1.0835515260696411, + "loss": 3.803, + "rewards/accuracies": 0.6875, + "rewards/chosen": -10.616154670715332, + "rewards/margins": 0.2193598747253418, + "rewards/rejected": -10.835514068603516, + "step": 88 + }, + { + "epoch": 0.3065002152389152, + "grad_norm": 74.9886645079608, + "learning_rate": 7.001399288625609e-07, + "logits/chosen": -5.231860637664795, + "logits/rejected": -4.674942970275879, + "logps/chosen": -0.9697386026382446, + "logps/rejected": -1.13163423538208, + "loss": 3.7676, + "rewards/accuracies": 0.6875, + "rewards/chosen": -9.697385787963867, + "rewards/margins": 1.6189574003219604, + "rewards/rejected": -11.316343307495117, + "step": 89 + }, + { + "epoch": 0.30994403788204905, + "grad_norm": 103.40467284986448, + "learning_rate": 6.969355485243239e-07, + "logits/chosen": -5.283835411071777, + "logits/rejected": -5.210239410400391, + "logps/chosen": -0.9744136929512024, + "logps/rejected": -1.04610276222229, + "loss": 4.0352, + "rewards/accuracies": 0.4375, + "rewards/chosen": -9.744136810302734, + "rewards/margins": 0.7168899774551392, + "rewards/rejected": -10.461027145385742, + "step": 90 + }, + { + "epoch": 0.31338786052518297, + "grad_norm": 75.93291524225512, + "learning_rate": 6.936881476635852e-07, + "logits/chosen": -6.081892013549805, + "logits/rejected": -5.807435989379883, + "logps/chosen": -1.129875898361206, + "logps/rejected": -1.387671709060669, + "loss": 4.1486, + "rewards/accuracies": 0.6875, + "rewards/chosen": -11.298759460449219, + "rewards/margins": 2.5779573917388916, + "rewards/rejected": -13.876716613769531, + "step": 91 + }, + { + "epoch": 0.31683168316831684, + "grad_norm": 81.2673779492731, + "learning_rate": 6.903981967692524e-07, + "logits/chosen": -5.27292013168335, + "logits/rejected": -4.817174911499023, + "logps/chosen": -0.9680742621421814, + "logps/rejected": -1.3815770149230957, + "loss": 3.5383, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.680743217468262, + "rewards/margins": 4.135027885437012, + "rewards/rejected": -13.815771102905273, + "step": 92 + }, + { + "epoch": 0.3202755058114507, + "grad_norm": 74.43824451243964, + "learning_rate": 6.870661724949532e-07, + "logits/chosen": -5.829610824584961, + "logits/rejected": -5.776001453399658, + "logps/chosen": -0.9559181928634644, + "logps/rejected": -1.1338019371032715, + "loss": 3.772, + "rewards/accuracies": 0.6875, + "rewards/chosen": -9.559182167053223, + "rewards/margins": 1.778836965560913, + "rewards/rejected": -11.338018417358398, + "step": 93 + }, + { + "epoch": 0.32371932845458457, + "grad_norm": 80.85750752674423, + "learning_rate": 6.836925575899777e-07, + "logits/chosen": -5.458807468414307, + "logits/rejected": -5.102845668792725, + "logps/chosen": -1.3218635320663452, + "logps/rejected": -1.4797770977020264, + "loss": 3.6527, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.218635559082031, + "rewards/margins": 1.5791367292404175, + "rewards/rejected": -14.797771453857422, + "step": 94 + }, + { + "epoch": 0.3271631510977185, + "grad_norm": 91.67035563126446, + "learning_rate": 6.802778408293369e-07, + "logits/chosen": -6.600034713745117, + "logits/rejected": -5.972718238830566, + "logps/chosen": -1.1001228094100952, + "logps/rejected": -1.3483161926269531, + "loss": 3.2161, + "rewards/accuracies": 0.8125, + "rewards/chosen": -11.001227378845215, + "rewards/margins": 2.4819343090057373, + "rewards/rejected": -13.483161926269531, + "step": 95 + }, + { + "epoch": 0.33060697374085235, + "grad_norm": 82.93501550536419, + "learning_rate": 6.768225169429477e-07, + "logits/chosen": -5.710722923278809, + "logits/rejected": -5.354726314544678, + "logps/chosen": -1.1976306438446045, + "logps/rejected": -1.6553398370742798, + "loss": 3.474, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.976305961608887, + "rewards/margins": 4.577092170715332, + "rewards/rejected": -16.55339813232422, + "step": 96 + }, + { + "epoch": 0.3340507963839862, + "grad_norm": 83.33902780022994, + "learning_rate": 6.733270865439557e-07, + "logits/chosen": -6.448300361633301, + "logits/rejected": -6.226202487945557, + "logps/chosen": -1.595934510231018, + "logps/rejected": -1.6069419384002686, + "loss": 3.8929, + "rewards/accuracies": 0.5625, + "rewards/chosen": -15.959344863891602, + "rewards/margins": 0.11007285118103027, + "rewards/rejected": -16.069419860839844, + "step": 97 + }, + { + "epoch": 0.3374946190271201, + "grad_norm": 117.67802338372276, + "learning_rate": 6.697920560562055e-07, + "logits/chosen": -6.556612968444824, + "logits/rejected": -6.181111812591553, + "logps/chosen": -1.4487080574035645, + "logps/rejected": -1.8549811840057373, + "loss": 3.6244, + "rewards/accuracies": 0.6875, + "rewards/chosen": -14.487081527709961, + "rewards/margins": 4.062728404998779, + "rewards/rejected": -18.549808502197266, + "step": 98 + }, + { + "epoch": 0.340938441670254, + "grad_norm": 93.0037276638684, + "learning_rate": 6.662179376408698e-07, + "logits/chosen": -7.180575370788574, + "logits/rejected": -6.442221641540527, + "logps/chosen": -1.184888243675232, + "logps/rejected": -1.4313077926635742, + "loss": 2.8886, + "rewards/accuracies": 0.5625, + "rewards/chosen": -11.848883628845215, + "rewards/margins": 2.4641964435577393, + "rewards/rejected": -14.313077926635742, + "step": 99 + }, + { + "epoch": 0.34438226431338786, + "grad_norm": 103.70431976296841, + "learning_rate": 6.626052491222453e-07, + "logits/chosen": -7.366156101226807, + "logits/rejected": -6.646521091461182, + "logps/chosen": -1.50858736038208, + "logps/rejected": -1.6617248058319092, + "loss": 3.8069, + "rewards/accuracies": 0.6875, + "rewards/chosen": -15.0858736038208, + "rewards/margins": 1.5313715934753418, + "rewards/rejected": -16.617244720458984, + "step": 100 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 97.56464703755402, + "learning_rate": 6.589545139127311e-07, + "logits/chosen": -6.810091972351074, + "logits/rejected": -6.6775031089782715, + "logps/chosen": -1.1999897956848145, + "logps/rejected": -1.4419368505477905, + "loss": 3.0443, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.999898910522461, + "rewards/margins": 2.4194700717926025, + "rewards/rejected": -14.419368743896484, + "step": 101 + }, + { + "epoch": 0.3512699095996556, + "grad_norm": 109.40818246650933, + "learning_rate": 6.552662609369942e-07, + "logits/chosen": -9.70158576965332, + "logits/rejected": -9.41241455078125, + "logps/chosen": -1.6020368337631226, + "logps/rejected": -1.8466899394989014, + "loss": 3.8531, + "rewards/accuracies": 0.8125, + "rewards/chosen": -16.020368576049805, + "rewards/margins": 2.446530818939209, + "rewards/rejected": -18.466899871826172, + "step": 102 + }, + { + "epoch": 0.3547137322427895, + "grad_norm": 120.68703068267112, + "learning_rate": 6.515410245553393e-07, + "logits/chosen": -9.626636505126953, + "logits/rejected": -8.800621032714844, + "logps/chosen": -1.5177563428878784, + "logps/rejected": -2.0786399841308594, + "loss": 3.3957, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.177563667297363, + "rewards/margins": 5.608834266662598, + "rewards/rejected": -20.78639793395996, + "step": 103 + }, + { + "epoch": 0.3581575548859234, + "grad_norm": 115.8693954281069, + "learning_rate": 6.477793444862892e-07, + "logits/chosen": -8.715924263000488, + "logits/rejected": -8.530646324157715, + "logps/chosen": -1.4800488948822021, + "logps/rejected": -1.7589524984359741, + "loss": 3.1903, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.800487518310547, + "rewards/margins": 2.7890357971191406, + "rewards/rejected": -17.589523315429688, + "step": 104 + }, + { + "epoch": 0.36160137752905724, + "grad_norm": 152.94221871551687, + "learning_rate": 6.439817657283891e-07, + "logits/chosen": -9.968289375305176, + "logits/rejected": -9.650674819946289, + "logps/chosen": -1.2602545022964478, + "logps/rejected": -1.6873594522476196, + "loss": 3.4884, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.602544784545898, + "rewards/margins": 4.271048545837402, + "rewards/rejected": -16.873594284057617, + "step": 105 + }, + { + "epoch": 0.3650452001721911, + "grad_norm": 105.58797890414657, + "learning_rate": 6.401488384812473e-07, + "logits/chosen": -9.584343910217285, + "logits/rejected": -9.589265823364258, + "logps/chosen": -1.5995938777923584, + "logps/rejected": -1.774956226348877, + "loss": 3.6783, + "rewards/accuracies": 0.75, + "rewards/chosen": -15.995938301086426, + "rewards/margins": 1.7536234855651855, + "rewards/rejected": -17.749563217163086, + "step": 106 + }, + { + "epoch": 0.36848902281532503, + "grad_norm": 136.93293579796645, + "learning_rate": 6.362811180658203e-07, + "logits/chosen": -10.062201499938965, + "logits/rejected": -9.910536766052246, + "logps/chosen": -1.5935949087142944, + "logps/rejected": -1.9505418539047241, + "loss": 3.243, + "rewards/accuracies": 0.75, + "rewards/chosen": -15.93595027923584, + "rewards/margins": 3.5694689750671387, + "rewards/rejected": -19.505420684814453, + "step": 107 + }, + { + "epoch": 0.3719328454584589, + "grad_norm": 116.29861634629408, + "learning_rate": 6.323791648439579e-07, + "logits/chosen": -9.214845657348633, + "logits/rejected": -8.844350814819336, + "logps/chosen": -1.5258371829986572, + "logps/rejected": -1.9718682765960693, + "loss": 3.2289, + "rewards/accuracies": 0.8125, + "rewards/chosen": -15.258371353149414, + "rewards/margins": 4.460310935974121, + "rewards/rejected": -19.71868324279785, + "step": 108 + }, + { + "epoch": 0.37537666810159276, + "grad_norm": 155.48156040186888, + "learning_rate": 6.284435441372161e-07, + "logits/chosen": -11.504440307617188, + "logits/rejected": -10.832094192504883, + "logps/chosen": -1.9326715469360352, + "logps/rejected": -2.6018829345703125, + "loss": 3.0293, + "rewards/accuracies": 0.75, + "rewards/chosen": -19.32671546936035, + "rewards/margins": 6.692113876342773, + "rewards/rejected": -26.018831253051758, + "step": 109 + }, + { + "epoch": 0.3788204907447266, + "grad_norm": 136.87645547171363, + "learning_rate": 6.244748261449529e-07, + "logits/chosen": -11.773118019104004, + "logits/rejected": -11.45300006866455, + "logps/chosen": -1.7673592567443848, + "logps/rejected": -2.0853307247161865, + "loss": 2.9492, + "rewards/accuracies": 0.875, + "rewards/chosen": -17.67359161376953, + "rewards/margins": 3.179716110229492, + "rewards/rejected": -20.853307723999023, + "step": 110 + }, + { + "epoch": 0.38226431338786054, + "grad_norm": 118.1199403678312, + "learning_rate": 6.204735858617171e-07, + "logits/chosen": -11.518077850341797, + "logits/rejected": -10.808693885803223, + "logps/chosen": -1.9174708127975464, + "logps/rejected": -2.1425564289093018, + "loss": 3.0723, + "rewards/accuracies": 0.8125, + "rewards/chosen": -19.174705505371094, + "rewards/margins": 2.250857353210449, + "rewards/rejected": -21.425565719604492, + "step": 111 + }, + { + "epoch": 0.3857081360309944, + "grad_norm": 119.80139407969132, + "learning_rate": 6.164404029939416e-07, + "logits/chosen": -11.800997734069824, + "logits/rejected": -11.651061058044434, + "logps/chosen": -1.7692383527755737, + "logps/rejected": -2.131488800048828, + "loss": 3.1498, + "rewards/accuracies": 0.8125, + "rewards/chosen": -17.692384719848633, + "rewards/margins": 3.6225037574768066, + "rewards/rejected": -21.31488800048828, + "step": 112 + }, + { + "epoch": 0.3891519586741283, + "grad_norm": 163.89733948610942, + "learning_rate": 6.123758618759547e-07, + "logits/chosen": -11.592788696289062, + "logits/rejected": -11.94422435760498, + "logps/chosen": -2.020418405532837, + "logps/rejected": -2.4360499382019043, + "loss": 3.0169, + "rewards/accuracies": 0.75, + "rewards/chosen": -20.204185485839844, + "rewards/margins": 4.156314849853516, + "rewards/rejected": -24.36050033569336, + "step": 113 + }, + { + "epoch": 0.39259578131726214, + "grad_norm": 143.8629447599346, + "learning_rate": 6.082805513853209e-07, + "logits/chosen": -12.300226211547852, + "logits/rejected": -11.13952350616455, + "logps/chosen": -1.6513843536376953, + "logps/rejected": -2.2150726318359375, + "loss": 2.7393, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.513843536376953, + "rewards/margins": 5.636881351470947, + "rewards/rejected": -22.150726318359375, + "step": 114 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 173.63128250872595, + "learning_rate": 6.041550648575234e-07, + "logits/chosen": -11.796028137207031, + "logits/rejected": -11.548486709594727, + "logps/chosen": -2.350860595703125, + "logps/rejected": -2.701869010925293, + "loss": 3.1346, + "rewards/accuracies": 0.625, + "rewards/chosen": -23.50860595703125, + "rewards/margins": 3.510082721710205, + "rewards/rejected": -27.018688201904297, + "step": 115 + }, + { + "epoch": 0.3994834266035299, + "grad_norm": 138.3119351547436, + "learning_rate": 6e-07, + "logits/chosen": -12.655649185180664, + "logits/rejected": -12.318216323852539, + "logps/chosen": -1.7546292543411255, + "logps/rejected": -2.545380115509033, + "loss": 3.5043, + "rewards/accuracies": 0.8125, + "rewards/chosen": -17.54629135131836, + "rewards/margins": 7.907507419586182, + "rewards/rejected": -25.453800201416016, + "step": 116 + }, + { + "epoch": 0.4029272492466638, + "grad_norm": 145.98056778857588, + "learning_rate": 5.958159588055472e-07, + "logits/chosen": -13.69811725616455, + "logits/rejected": -13.62729549407959, + "logps/chosen": -1.6991206407546997, + "logps/rejected": -2.0454368591308594, + "loss": 3.1443, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.9912052154541, + "rewards/margins": 3.4631614685058594, + "rewards/rejected": -20.454364776611328, + "step": 117 + }, + { + "epoch": 0.40637107188979765, + "grad_norm": 169.50788721509392, + "learning_rate": 5.916035474651021e-07, + "logits/chosen": -13.291184425354004, + "logits/rejected": -13.162979125976562, + "logps/chosen": -1.7755848169326782, + "logps/rejected": -2.445830821990967, + "loss": 3.0084, + "rewards/accuracies": 0.875, + "rewards/chosen": -17.755847930908203, + "rewards/margins": 6.702462196350098, + "rewards/rejected": -24.458311080932617, + "step": 118 + }, + { + "epoch": 0.4098148945329316, + "grad_norm": 119.7129501335904, + "learning_rate": 5.87363376279916e-07, + "logits/chosen": -12.720142364501953, + "logits/rejected": -12.485799789428711, + "logps/chosen": -1.9099136590957642, + "logps/rejected": -2.9828622341156006, + "loss": 2.6101, + "rewards/accuracies": 0.8125, + "rewards/chosen": -19.099138259887695, + "rewards/margins": 10.729487419128418, + "rewards/rejected": -29.828622817993164, + "step": 119 + }, + { + "epoch": 0.41325871717606544, + "grad_norm": 175.87119643186315, + "learning_rate": 5.830960595731334e-07, + "logits/chosen": -11.896202087402344, + "logits/rejected": -12.151188850402832, + "logps/chosen": -1.8780018091201782, + "logps/rejected": -2.5524849891662598, + "loss": 2.5416, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.780019760131836, + "rewards/margins": 6.744830131530762, + "rewards/rejected": -25.52484893798828, + "step": 120 + }, + { + "epoch": 0.4167025398191993, + "grad_norm": 182.67604950430695, + "learning_rate": 5.788022156007876e-07, + "logits/chosen": -13.834617614746094, + "logits/rejected": -13.90433406829834, + "logps/chosen": -2.329464912414551, + "logps/rejected": -2.853301763534546, + "loss": 3.5812, + "rewards/accuracies": 0.75, + "rewards/chosen": -23.294649124145508, + "rewards/margins": 5.238368034362793, + "rewards/rejected": -28.533016204833984, + "step": 121 + }, + { + "epoch": 0.42014636246233317, + "grad_norm": 162.94046075177278, + "learning_rate": 5.744824664622269e-07, + "logits/chosen": -13.56065559387207, + "logits/rejected": -13.065262794494629, + "logps/chosen": -2.3431150913238525, + "logps/rejected": -2.9342470169067383, + "loss": 2.9716, + "rewards/accuracies": 0.8125, + "rewards/chosen": -23.431150436401367, + "rewards/margins": 5.911318778991699, + "rewards/rejected": -29.342470169067383, + "step": 122 + }, + { + "epoch": 0.4235901851054671, + "grad_norm": 140.20048424042622, + "learning_rate": 5.70137438009984e-07, + "logits/chosen": -14.668344497680664, + "logits/rejected": -13.541962623596191, + "logps/chosen": -2.4308154582977295, + "logps/rejected": -2.9639768600463867, + "loss": 3.1536, + "rewards/accuracies": 0.6875, + "rewards/chosen": -24.308155059814453, + "rewards/margins": 5.331615447998047, + "rewards/rejected": -29.639768600463867, + "step": 123 + }, + { + "epoch": 0.42703400774860095, + "grad_norm": 271.24268423314203, + "learning_rate": 5.657677597591007e-07, + "logits/chosen": -14.41106128692627, + "logits/rejected": -14.61843204498291, + "logps/chosen": -2.38899564743042, + "logps/rejected": -2.7550418376922607, + "loss": 3.5358, + "rewards/accuracies": 0.625, + "rewards/chosen": -23.889955520629883, + "rewards/margins": 3.660465717315674, + "rewards/rejected": -27.550418853759766, + "step": 124 + }, + { + "epoch": 0.4304778303917348, + "grad_norm": 165.17472401407244, + "learning_rate": 5.613740647959235e-07, + "logits/chosen": -12.676807403564453, + "logits/rejected": -12.471404075622559, + "logps/chosen": -1.8505709171295166, + "logps/rejected": -2.3470511436462402, + "loss": 2.8249, + "rewards/accuracies": 0.875, + "rewards/chosen": -18.505708694458008, + "rewards/margins": 4.964802265167236, + "rewards/rejected": -23.47051239013672, + "step": 125 + }, + { + "epoch": 0.4339216530348687, + "grad_norm": 143.69505743500713, + "learning_rate": 5.569569896863801e-07, + "logits/chosen": -13.985774993896484, + "logits/rejected": -13.654581069946289, + "logps/chosen": -1.782606601715088, + "logps/rejected": -2.051421642303467, + "loss": 3.256, + "rewards/accuracies": 0.8125, + "rewards/chosen": -17.826065063476562, + "rewards/margins": 2.688152551651001, + "rewards/rejected": -20.514219284057617, + "step": 126 + }, + { + "epoch": 0.4373654756780026, + "grad_norm": 198.98779014050083, + "learning_rate": 5.52517174383754e-07, + "logits/chosen": -13.829938888549805, + "logits/rejected": -13.86230182647705, + "logps/chosen": -2.4392776489257812, + "logps/rejected": -3.190002679824829, + "loss": 2.8292, + "rewards/accuracies": 0.8125, + "rewards/chosen": -24.39277458190918, + "rewards/margins": 7.50724983215332, + "rewards/rejected": -31.900026321411133, + "step": 127 + }, + { + "epoch": 0.44080929832113647, + "grad_norm": 189.97902844787896, + "learning_rate": 5.480552621359659e-07, + "logits/chosen": -14.226242065429688, + "logits/rejected": -14.341365814208984, + "logps/chosen": -2.10856294631958, + "logps/rejected": -2.4918906688690186, + "loss": 3.2012, + "rewards/accuracies": 0.625, + "rewards/chosen": -21.085630416870117, + "rewards/margins": 3.8332767486572266, + "rewards/rejected": -24.918907165527344, + "step": 128 + }, + { + "epoch": 0.44425312096427033, + "grad_norm": 136.19908022128857, + "learning_rate": 5.435718993923784e-07, + "logits/chosen": -13.451090812683105, + "logits/rejected": -12.785599708557129, + "logps/chosen": -1.6525287628173828, + "logps/rejected": -2.3770031929016113, + "loss": 2.4949, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.52528953552246, + "rewards/margins": 7.244744300842285, + "rewards/rejected": -23.77003288269043, + "step": 129 + }, + { + "epoch": 0.4476969436074042, + "grad_norm": 135.82081198678648, + "learning_rate": 5.39067735710139e-07, + "logits/chosen": -14.511775016784668, + "logits/rejected": -13.7813138961792, + "logps/chosen": -2.127079963684082, + "logps/rejected": -2.742191791534424, + "loss": 2.8888, + "rewards/accuracies": 0.5625, + "rewards/chosen": -21.27079963684082, + "rewards/margins": 6.151117324829102, + "rewards/rejected": -27.42191505432129, + "step": 130 + }, + { + "epoch": 0.4511407662505381, + "grad_norm": 183.19436612857393, + "learning_rate": 5.3454342366007e-07, + "logits/chosen": -14.230147361755371, + "logits/rejected": -13.983600616455078, + "logps/chosen": -2.1134510040283203, + "logps/rejected": -2.4505257606506348, + "loss": 3.9961, + "rewards/accuracies": 0.75, + "rewards/chosen": -21.134510040283203, + "rewards/margins": 3.3707499504089355, + "rewards/rejected": -24.505260467529297, + "step": 131 + }, + { + "epoch": 0.454584588893672, + "grad_norm": 207.54586277796528, + "learning_rate": 5.299996187321231e-07, + "logits/chosen": -15.675312042236328, + "logits/rejected": -15.437052726745605, + "logps/chosen": -1.9110677242279053, + "logps/rejected": -2.109048366546631, + "loss": 3.5684, + "rewards/accuracies": 0.5625, + "rewards/chosen": -19.110675811767578, + "rewards/margins": 1.9798049926757812, + "rewards/rejected": -21.090482711791992, + "step": 132 + }, + { + "epoch": 0.45802841153680585, + "grad_norm": 126.18602958529125, + "learning_rate": 5.254369792404108e-07, + "logits/chosen": -14.874656677246094, + "logits/rejected": -14.413407325744629, + "logps/chosen": -2.353257417678833, + "logps/rejected": -3.3936214447021484, + "loss": 2.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.532573699951172, + "rewards/margins": 10.403639793395996, + "rewards/rejected": -33.936214447021484, + "step": 133 + }, + { + "epoch": 0.4614722341799397, + "grad_norm": 143.02615992144584, + "learning_rate": 5.20856166227829e-07, + "logits/chosen": -15.980953216552734, + "logits/rejected": -15.589332580566406, + "logps/chosen": -2.683424472808838, + "logps/rejected": -3.0904102325439453, + "loss": 2.7952, + "rewards/accuracies": 0.75, + "rewards/chosen": -26.834243774414062, + "rewards/margins": 4.069858551025391, + "rewards/rejected": -30.904102325439453, + "step": 134 + }, + { + "epoch": 0.46491605682307363, + "grad_norm": 160.67163719835557, + "learning_rate": 5.162578433702844e-07, + "logits/chosen": -15.208805084228516, + "logits/rejected": -15.462963104248047, + "logps/chosen": -1.8320481777191162, + "logps/rejected": -2.23215651512146, + "loss": 2.9135, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.320480346679688, + "rewards/margins": 4.001082420349121, + "rewards/rejected": -22.321565628051758, + "step": 135 + }, + { + "epoch": 0.4683598794662075, + "grad_norm": 145.99038790062517, + "learning_rate": 5.116426768805387e-07, + "logits/chosen": -14.624232292175293, + "logits/rejected": -14.728387832641602, + "logps/chosen": -2.1977291107177734, + "logps/rejected": -2.507412910461426, + "loss": 3.0741, + "rewards/accuracies": 0.8125, + "rewards/chosen": -21.977291107177734, + "rewards/margins": 3.096836566925049, + "rewards/rejected": -25.07413101196289, + "step": 136 + }, + { + "epoch": 0.47180370210934136, + "grad_norm": 148.12763051461735, + "learning_rate": 5.070113354116884e-07, + "logits/chosen": -15.4700927734375, + "logits/rejected": -15.196715354919434, + "logps/chosen": -1.5759204626083374, + "logps/rejected": -2.449571132659912, + "loss": 2.46, + "rewards/accuracies": 0.9375, + "rewards/chosen": -15.759203910827637, + "rewards/margins": 8.736505508422852, + "rewards/rejected": -24.495710372924805, + "step": 137 + }, + { + "epoch": 0.4752475247524752, + "grad_norm": 170.6076112934297, + "learning_rate": 5.023644899602871e-07, + "logits/chosen": -15.85372257232666, + "logits/rejected": -15.770110130310059, + "logps/chosen": -2.3490283489227295, + "logps/rejected": -2.6556615829467773, + "loss": 2.5731, + "rewards/accuracies": 0.625, + "rewards/chosen": -23.490280151367188, + "rewards/margins": 3.066333293914795, + "rewards/rejected": -26.556615829467773, + "step": 138 + }, + { + "epoch": 0.47869134739560915, + "grad_norm": 160.9701483874417, + "learning_rate": 4.977028137691324e-07, + "logits/chosen": -14.690975189208984, + "logits/rejected": -14.133597373962402, + "logps/chosen": -2.249864101409912, + "logps/rejected": -2.9582080841064453, + "loss": 2.6498, + "rewards/accuracies": 0.9375, + "rewards/chosen": -22.498640060424805, + "rewards/margins": 7.083439826965332, + "rewards/rejected": -29.582080841064453, + "step": 139 + }, + { + "epoch": 0.482135170038743, + "grad_norm": 140.63079154740615, + "learning_rate": 4.930269822297241e-07, + "logits/chosen": -15.633464813232422, + "logits/rejected": -15.003985404968262, + "logps/chosen": -1.8792824745178223, + "logps/rejected": -2.4007444381713867, + "loss": 3.0883, + "rewards/accuracies": 0.8125, + "rewards/chosen": -18.79282569885254, + "rewards/margins": 5.214618682861328, + "rewards/rejected": -24.0074462890625, + "step": 140 + }, + { + "epoch": 0.4855789926818769, + "grad_norm": 164.90245104113825, + "learning_rate": 4.883376727844129e-07, + "logits/chosen": -17.06644058227539, + "logits/rejected": -16.71479034423828, + "logps/chosen": -1.984204888343811, + "logps/rejected": -2.5203452110290527, + "loss": 3.2745, + "rewards/accuracies": 0.6875, + "rewards/chosen": -19.842050552368164, + "rewards/margins": 5.361400127410889, + "rewards/rejected": -25.203449249267578, + "step": 141 + }, + { + "epoch": 0.48902281532501074, + "grad_norm": 158.4727771377469, + "learning_rate": 4.836355648282509e-07, + "logits/chosen": -15.427898406982422, + "logits/rejected": -15.480656623840332, + "logps/chosen": -1.6626648902893066, + "logps/rejected": -2.4167871475219727, + "loss": 2.6222, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.62664794921875, + "rewards/margins": 7.54122257232666, + "rewards/rejected": -24.16787338256836, + "step": 142 + }, + { + "epoch": 0.49246663796814466, + "grad_norm": 164.75096025896175, + "learning_rate": 4.7892133961056e-07, + "logits/chosen": -17.340797424316406, + "logits/rejected": -16.83738899230957, + "logps/chosen": -2.716911554336548, + "logps/rejected": -3.8089609146118164, + "loss": 3.2104, + "rewards/accuracies": 0.9375, + "rewards/chosen": -27.16911506652832, + "rewards/margins": 10.920495986938477, + "rewards/rejected": -38.08961486816406, + "step": 143 + }, + { + "epoch": 0.4959104606112785, + "grad_norm": 165.51865689779507, + "learning_rate": 4.7419568013623185e-07, + "logits/chosen": -17.844758987426758, + "logits/rejected": -17.37071418762207, + "logps/chosen": -2.3916749954223633, + "logps/rejected": -2.984015703201294, + "loss": 3.5064, + "rewards/accuracies": 0.75, + "rewards/chosen": -23.916751861572266, + "rewards/margins": 5.923404693603516, + "rewards/rejected": -29.84015655517578, + "step": 144 + }, + { + "epoch": 0.4993542832544124, + "grad_norm": 170.81988641182852, + "learning_rate": 4.694592710667722e-07, + "logits/chosen": -16.56422996520996, + "logits/rejected": -16.64710235595703, + "logps/chosen": -1.9375088214874268, + "logps/rejected": -2.7873284816741943, + "loss": 2.599, + "rewards/accuracies": 0.875, + "rewards/chosen": -19.375089645385742, + "rewards/margins": 8.498197555541992, + "rewards/rejected": -27.873287200927734, + "step": 145 + }, + { + "epoch": 0.5027981058975463, + "grad_norm": 152.83376641387204, + "learning_rate": 4.6471279862110594e-07, + "logits/chosen": -16.366130828857422, + "logits/rejected": -16.218612670898438, + "logps/chosen": -2.085256814956665, + "logps/rejected": -2.532078742980957, + "loss": 2.5765, + "rewards/accuracies": 0.75, + "rewards/chosen": -20.852569580078125, + "rewards/margins": 4.468219757080078, + "rewards/rejected": -25.320789337158203, + "step": 146 + }, + { + "epoch": 0.5062419285406802, + "grad_norm": 167.03745018894716, + "learning_rate": 4.5995695047615724e-07, + "logits/chosen": -16.575876235961914, + "logits/rejected": -16.29088020324707, + "logps/chosen": -1.7308956384658813, + "logps/rejected": -2.0768227577209473, + "loss": 3.2235, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.308956146240234, + "rewards/margins": 3.459270477294922, + "rewards/rejected": -20.768226623535156, + "step": 147 + }, + { + "epoch": 0.509685751183814, + "grad_norm": 178.30073567921698, + "learning_rate": 4.5519241566721724e-07, + "logits/chosen": -15.774458885192871, + "logits/rejected": -15.673702239990234, + "logps/chosen": -2.2769925594329834, + "logps/rejected": -2.522907257080078, + "loss": 3.6175, + "rewards/accuracies": 0.75, + "rewards/chosen": -22.769929885864258, + "rewards/margins": 2.45914363861084, + "rewards/rejected": -25.229076385498047, + "step": 148 + }, + { + "epoch": 0.5131295738269479, + "grad_norm": 137.31263512738002, + "learning_rate": 4.5041988448811574e-07, + "logits/chosen": -15.081258773803711, + "logits/rejected": -15.273090362548828, + "logps/chosen": -1.8598787784576416, + "logps/rejected": -2.1195287704467773, + "loss": 2.687, + "rewards/accuracies": 0.6875, + "rewards/chosen": -18.598787307739258, + "rewards/margins": 2.596500873565674, + "rewards/rejected": -21.195289611816406, + "step": 149 + }, + { + "epoch": 0.5165733964700818, + "grad_norm": 125.93526672957817, + "learning_rate": 4.456400483912099e-07, + "logits/chosen": -16.464996337890625, + "logits/rejected": -16.59218978881836, + "logps/chosen": -2.1750199794769287, + "logps/rejected": -2.6353604793548584, + "loss": 2.8572, + "rewards/accuracies": 0.875, + "rewards/chosen": -21.750200271606445, + "rewards/margins": 4.603403568267822, + "rewards/rejected": -26.35360336303711, + "step": 150 + }, + { + "epoch": 0.5200172191132156, + "grad_norm": 142.20133174481876, + "learning_rate": 4.4085359988720583e-07, + "logits/chosen": -15.427270889282227, + "logits/rejected": -15.429370880126953, + "logps/chosen": -1.9908243417739868, + "logps/rejected": -2.447384834289551, + "loss": 2.1271, + "rewards/accuracies": 0.875, + "rewards/chosen": -19.908245086669922, + "rewards/margins": 4.565605163574219, + "rewards/rejected": -24.473848342895508, + "step": 151 + }, + { + "epoch": 0.5234610417563496, + "grad_norm": 149.43131783531857, + "learning_rate": 4.3606123244482615e-07, + "logits/chosen": -16.817100524902344, + "logits/rejected": -16.41914176940918, + "logps/chosen": -2.2302825450897217, + "logps/rejected": -3.1961381435394287, + "loss": 2.7684, + "rewards/accuracies": 0.8125, + "rewards/chosen": -22.302825927734375, + "rewards/margins": 9.658554077148438, + "rewards/rejected": -31.961380004882812, + "step": 152 + }, + { + "epoch": 0.5269048643994835, + "grad_norm": 175.08007654534825, + "learning_rate": 4.3126364039033934e-07, + "logits/chosen": -16.285236358642578, + "logits/rejected": -16.360095977783203, + "logps/chosen": -1.9334690570831299, + "logps/rejected": -2.5686261653900146, + "loss": 2.9525, + "rewards/accuracies": 0.6875, + "rewards/chosen": -19.33469009399414, + "rewards/margins": 6.3515706062316895, + "rewards/rejected": -25.686262130737305, + "step": 153 + }, + { + "epoch": 0.5303486870426173, + "grad_norm": 147.77300608746174, + "learning_rate": 4.2646151880696466e-07, + "logits/chosen": -15.203396797180176, + "logits/rejected": -15.251398086547852, + "logps/chosen": -2.082613945007324, + "logps/rejected": -2.380194664001465, + "loss": 3.1857, + "rewards/accuracies": 0.8125, + "rewards/chosen": -20.826141357421875, + "rewards/margins": 2.9758081436157227, + "rewards/rejected": -23.80194854736328, + "step": 154 + }, + { + "epoch": 0.5337925096857512, + "grad_norm": 191.413155487678, + "learning_rate": 4.21655563434167e-07, + "logits/chosen": -16.2647762298584, + "logits/rejected": -16.06024742126465, + "logps/chosen": -1.8659639358520508, + "logps/rejected": -2.6368792057037354, + "loss": 2.8876, + "rewards/accuracies": 0.875, + "rewards/chosen": -18.659639358520508, + "rewards/margins": 7.7091522216796875, + "rewards/rejected": -26.368793487548828, + "step": 155 + }, + { + "epoch": 0.537236332328885, + "grad_norm": 124.88412207047179, + "learning_rate": 4.16846470566857e-07, + "logits/chosen": -16.624813079833984, + "logits/rejected": -16.500511169433594, + "logps/chosen": -1.741891622543335, + "logps/rejected": -2.405266284942627, + "loss": 1.9418, + "rewards/accuracies": 0.8125, + "rewards/chosen": -17.418916702270508, + "rewards/margins": 6.633745193481445, + "rewards/rejected": -24.052661895751953, + "step": 156 + }, + { + "epoch": 0.5406801549720189, + "grad_norm": 141.77331157520445, + "learning_rate": 4.120349369545109e-07, + "logits/chosen": -15.149438858032227, + "logits/rejected": -15.287237167358398, + "logps/chosen": -2.173785448074341, + "logps/rejected": -3.180941104888916, + "loss": 2.7363, + "rewards/accuracies": 0.625, + "rewards/chosen": -21.73785400390625, + "rewards/margins": 10.071558952331543, + "rewards/rejected": -31.809410095214844, + "step": 157 + }, + { + "epoch": 0.5441239776151529, + "grad_norm": 155.44419089941894, + "learning_rate": 4.0722165970022414e-07, + "logits/chosen": -16.01889419555664, + "logits/rejected": -16.09550666809082, + "logps/chosen": -2.3958230018615723, + "logps/rejected": -2.5509209632873535, + "loss": 3.3508, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.95823097229004, + "rewards/margins": 1.5509822368621826, + "rewards/rejected": -25.509214401245117, + "step": 158 + }, + { + "epoch": 0.5475678002582867, + "grad_norm": 129.29743296707403, + "learning_rate": 4.024073361597142e-07, + "logits/chosen": -17.30500030517578, + "logits/rejected": -16.847618103027344, + "logps/chosen": -2.4113364219665527, + "logps/rejected": -3.3326172828674316, + "loss": 2.5569, + "rewards/accuracies": 0.8125, + "rewards/chosen": -24.113361358642578, + "rewards/margins": 9.212811470031738, + "rewards/rejected": -33.326175689697266, + "step": 159 + }, + { + "epoch": 0.5510116229014206, + "grad_norm": 161.70130038708717, + "learning_rate": 3.9759266384028583e-07, + "logits/chosen": -15.621679306030273, + "logits/rejected": -15.098061561584473, + "logps/chosen": -2.271921157836914, + "logps/rejected": -2.7090096473693848, + "loss": 2.7771, + "rewards/accuracies": 0.625, + "rewards/chosen": -22.71921157836914, + "rewards/margins": 4.370884895324707, + "rewards/rejected": -27.090097427368164, + "step": 160 + }, + { + "epoch": 0.5544554455445545, + "grad_norm": 157.66640514865557, + "learning_rate": 3.927783402997757e-07, + "logits/chosen": -15.658122062683105, + "logits/rejected": -15.553414344787598, + "logps/chosen": -2.2297635078430176, + "logps/rejected": -2.9377989768981934, + "loss": 2.6828, + "rewards/accuracies": 0.8125, + "rewards/chosen": -22.29763412475586, + "rewards/margins": 7.080355644226074, + "rewards/rejected": -29.37799072265625, + "step": 161 + }, + { + "epoch": 0.5578992681876883, + "grad_norm": 135.88851128774647, + "learning_rate": 3.879650630454892e-07, + "logits/chosen": -16.659839630126953, + "logits/rejected": -16.298494338989258, + "logps/chosen": -2.3507156372070312, + "logps/rejected": -2.968364715576172, + "loss": 2.8013, + "rewards/accuracies": 0.875, + "rewards/chosen": -23.50715446472168, + "rewards/margins": 6.176491737365723, + "rewards/rejected": -29.683645248413086, + "step": 162 + }, + { + "epoch": 0.5613430908308222, + "grad_norm": 113.40296909004411, + "learning_rate": 3.83153529433143e-07, + "logits/chosen": -14.723609924316406, + "logits/rejected": -14.707221984863281, + "logps/chosen": -2.070491313934326, + "logps/rejected": -2.8369667530059814, + "loss": 2.4633, + "rewards/accuracies": 0.8125, + "rewards/chosen": -20.704910278320312, + "rewards/margins": 7.664756774902344, + "rewards/rejected": -28.369670867919922, + "step": 163 + }, + { + "epoch": 0.564786913473956, + "grad_norm": 158.36069390792653, + "learning_rate": 3.78344436565833e-07, + "logits/chosen": -16.133251190185547, + "logits/rejected": -15.6480712890625, + "logps/chosen": -2.447453498840332, + "logps/rejected": -3.0984373092651367, + "loss": 2.7807, + "rewards/accuracies": 0.75, + "rewards/chosen": -24.47453498840332, + "rewards/margins": 6.50984001159668, + "rewards/rejected": -30.984373092651367, + "step": 164 + }, + { + "epoch": 0.56823073611709, + "grad_norm": 144.30140846289407, + "learning_rate": 3.7353848119303536e-07, + "logits/chosen": -14.615021705627441, + "logits/rejected": -14.4873685836792, + "logps/chosen": -2.1710031032562256, + "logps/rejected": -3.150233030319214, + "loss": 2.3923, + "rewards/accuracies": 0.9375, + "rewards/chosen": -21.710033416748047, + "rewards/margins": 9.79229736328125, + "rewards/rejected": -31.502328872680664, + "step": 165 + }, + { + "epoch": 0.5716745587602239, + "grad_norm": 156.39504517806822, + "learning_rate": 3.687363596096607e-07, + "logits/chosen": -13.180891036987305, + "logits/rejected": -13.603325843811035, + "logps/chosen": -2.1853692531585693, + "logps/rejected": -2.7947943210601807, + "loss": 2.5098, + "rewards/accuracies": 0.8125, + "rewards/chosen": -21.85369300842285, + "rewards/margins": 6.09425163269043, + "rewards/rejected": -27.94794464111328, + "step": 166 + }, + { + "epoch": 0.5751183814033577, + "grad_norm": 189.2555316232527, + "learning_rate": 3.639387675551739e-07, + "logits/chosen": -16.535764694213867, + "logits/rejected": -16.17355728149414, + "logps/chosen": -2.0116500854492188, + "logps/rejected": -2.5565733909606934, + "loss": 2.8736, + "rewards/accuracies": 0.625, + "rewards/chosen": -20.11650276184082, + "rewards/margins": 5.449231147766113, + "rewards/rejected": -25.56573486328125, + "step": 167 + }, + { + "epoch": 0.5785622040464916, + "grad_norm": 168.96863729883424, + "learning_rate": 3.5914640011279424e-07, + "logits/chosen": -17.7143497467041, + "logits/rejected": -17.613689422607422, + "logps/chosen": -2.375505208969116, + "logps/rejected": -3.2843146324157715, + "loss": 1.8044, + "rewards/accuracies": 0.9375, + "rewards/chosen": -23.75505256652832, + "rewards/margins": 9.088095664978027, + "rewards/rejected": -32.84314727783203, + "step": 168 + }, + { + "epoch": 0.5820060266896255, + "grad_norm": 154.96333991867238, + "learning_rate": 3.543599516087901e-07, + "logits/chosen": -16.464033126831055, + "logits/rejected": -16.16067886352539, + "logps/chosen": -2.4366371631622314, + "logps/rejected": -2.9191558361053467, + "loss": 2.8393, + "rewards/accuracies": 0.75, + "rewards/chosen": -24.366371154785156, + "rewards/margins": 4.825188636779785, + "rewards/rejected": -29.191558837890625, + "step": 169 + }, + { + "epoch": 0.5854498493327593, + "grad_norm": 133.1585711778442, + "learning_rate": 3.495801155118843e-07, + "logits/chosen": -17.312694549560547, + "logits/rejected": -17.00650978088379, + "logps/chosen": -2.205974817276001, + "logps/rejected": -2.8471574783325195, + "loss": 2.1623, + "rewards/accuracies": 1.0, + "rewards/chosen": -22.059749603271484, + "rewards/margins": 6.411825656890869, + "rewards/rejected": -28.471574783325195, + "step": 170 + }, + { + "epoch": 0.5888936719758933, + "grad_norm": 161.54792608348458, + "learning_rate": 3.448075843327827e-07, + "logits/chosen": -16.923572540283203, + "logits/rejected": -16.892681121826172, + "logps/chosen": -2.1601150035858154, + "logps/rejected": -2.8166677951812744, + "loss": 2.3029, + "rewards/accuracies": 0.75, + "rewards/chosen": -21.601150512695312, + "rewards/margins": 6.56552791595459, + "rewards/rejected": -28.16668128967285, + "step": 171 + }, + { + "epoch": 0.5923374946190271, + "grad_norm": 147.52184027924474, + "learning_rate": 3.4004304952384283e-07, + "logits/chosen": -17.819061279296875, + "logits/rejected": -17.36820411682129, + "logps/chosen": -2.7700634002685547, + "logps/rejected": -3.871890068054199, + "loss": 2.3046, + "rewards/accuracies": 0.9375, + "rewards/chosen": -27.700634002685547, + "rewards/margins": 11.018264770507812, + "rewards/rejected": -38.71889877319336, + "step": 172 + }, + { + "epoch": 0.595781317262161, + "grad_norm": 133.0174741810263, + "learning_rate": 3.352872013788941e-07, + "logits/chosen": -15.862306594848633, + "logits/rejected": -15.657508850097656, + "logps/chosen": -1.8171809911727905, + "logps/rejected": -2.790759325027466, + "loss": 1.9991, + "rewards/accuracies": 0.875, + "rewards/chosen": -18.171810150146484, + "rewards/margins": 9.735782623291016, + "rewards/rejected": -27.907590866088867, + "step": 173 + }, + { + "epoch": 0.5992251399052949, + "grad_norm": 140.03974045337986, + "learning_rate": 3.3054072893322785e-07, + "logits/chosen": -18.625810623168945, + "logits/rejected": -18.38481330871582, + "logps/chosen": -2.596587896347046, + "logps/rejected": -2.94968843460083, + "loss": 2.901, + "rewards/accuracies": 0.6875, + "rewards/chosen": -25.965877532958984, + "rewards/margins": 3.5310049057006836, + "rewards/rejected": -29.496883392333984, + "step": 174 + }, + { + "epoch": 0.6026689625484287, + "grad_norm": 183.98457603492201, + "learning_rate": 3.258043198637682e-07, + "logits/chosen": -14.411111831665039, + "logits/rejected": -14.562616348266602, + "logps/chosen": -2.5059261322021484, + "logps/rejected": -3.6475634574890137, + "loss": 2.2702, + "rewards/accuracies": 0.875, + "rewards/chosen": -25.059261322021484, + "rewards/margins": 11.41637134552002, + "rewards/rejected": -36.47563171386719, + "step": 175 + }, + { + "epoch": 0.6061127851915626, + "grad_norm": 213.58061636651394, + "learning_rate": 3.2107866038944004e-07, + "logits/chosen": -18.129159927368164, + "logits/rejected": -18.01100730895996, + "logps/chosen": -3.2717809677124023, + "logps/rejected": -3.870783805847168, + "loss": 2.6386, + "rewards/accuracies": 0.875, + "rewards/chosen": -32.71781539916992, + "rewards/margins": 5.990023612976074, + "rewards/rejected": -38.70783615112305, + "step": 176 + }, + { + "epoch": 0.6095566078346966, + "grad_norm": 179.1745395557081, + "learning_rate": 3.163644351717492e-07, + "logits/chosen": -17.755756378173828, + "logits/rejected": -17.911243438720703, + "logps/chosen": -2.3970510959625244, + "logps/rejected": -3.1361923217773438, + "loss": 2.4945, + "rewards/accuracies": 0.75, + "rewards/chosen": -23.970510482788086, + "rewards/margins": 7.391412734985352, + "rewards/rejected": -31.36192512512207, + "step": 177 + }, + { + "epoch": 0.6130004304778304, + "grad_norm": 167.61011422532053, + "learning_rate": 3.1166232721558714e-07, + "logits/chosen": -17.961496353149414, + "logits/rejected": -18.155773162841797, + "logps/chosen": -2.3098530769348145, + "logps/rejected": -3.018498182296753, + "loss": 2.8513, + "rewards/accuracies": 0.75, + "rewards/chosen": -23.098527908325195, + "rewards/margins": 7.086452484130859, + "rewards/rejected": -30.184980392456055, + "step": 178 + }, + { + "epoch": 0.6164442531209643, + "grad_norm": 166.57075495491398, + "learning_rate": 3.069730177702759e-07, + "logits/chosen": -16.301759719848633, + "logits/rejected": -16.399494171142578, + "logps/chosen": -1.7774579524993896, + "logps/rejected": -3.1513190269470215, + "loss": 2.6985, + "rewards/accuracies": 0.875, + "rewards/chosen": -17.774578094482422, + "rewards/margins": 13.73861312866211, + "rewards/rejected": -31.51319122314453, + "step": 179 + }, + { + "epoch": 0.6198880757640981, + "grad_norm": 176.06415844976797, + "learning_rate": 3.022971862308676e-07, + "logits/chosen": -18.093582153320312, + "logits/rejected": -18.556367874145508, + "logps/chosen": -2.9991700649261475, + "logps/rejected": -3.220672845840454, + "loss": 3.4972, + "rewards/accuracies": 0.6875, + "rewards/chosen": -29.991701126098633, + "rewards/margins": 2.215024709701538, + "rewards/rejected": -32.20672607421875, + "step": 180 + }, + { + "epoch": 0.623331898407232, + "grad_norm": 165.59558701094664, + "learning_rate": 2.9763551003971285e-07, + "logits/chosen": -17.380640029907227, + "logits/rejected": -17.27768325805664, + "logps/chosen": -2.5434718132019043, + "logps/rejected": -3.3419389724731445, + "loss": 2.149, + "rewards/accuracies": 0.6875, + "rewards/chosen": -25.434715270996094, + "rewards/margins": 7.984673976898193, + "rewards/rejected": -33.41939163208008, + "step": 181 + }, + { + "epoch": 0.6267757210503659, + "grad_norm": 202.61030946129074, + "learning_rate": 2.929886645883117e-07, + "logits/chosen": -18.67276954650879, + "logits/rejected": -18.613061904907227, + "logps/chosen": -2.4041404724121094, + "logps/rejected": -3.169337749481201, + "loss": 3.206, + "rewards/accuracies": 0.875, + "rewards/chosen": -24.041404724121094, + "rewards/margins": 7.651971340179443, + "rewards/rejected": -31.693378448486328, + "step": 182 + }, + { + "epoch": 0.6302195436934998, + "grad_norm": 187.65675738898932, + "learning_rate": 2.883573231194613e-07, + "logits/chosen": -17.532236099243164, + "logits/rejected": -17.939956665039062, + "logps/chosen": -2.346766233444214, + "logps/rejected": -3.403451681137085, + "loss": 2.2531, + "rewards/accuracies": 0.875, + "rewards/chosen": -23.46766471862793, + "rewards/margins": 10.566852569580078, + "rewards/rejected": -34.034515380859375, + "step": 183 + }, + { + "epoch": 0.6336633663366337, + "grad_norm": 195.90592847716476, + "learning_rate": 2.837421566297156e-07, + "logits/chosen": -17.615713119506836, + "logits/rejected": -17.56720542907715, + "logps/chosen": -2.333320140838623, + "logps/rejected": -2.91764760017395, + "loss": 3.0132, + "rewards/accuracies": 0.625, + "rewards/chosen": -23.333202362060547, + "rewards/margins": 5.8432722091674805, + "rewards/rejected": -29.176475524902344, + "step": 184 + }, + { + "epoch": 0.6371071889797676, + "grad_norm": 183.34560091532057, + "learning_rate": 2.7914383377217083e-07, + "logits/chosen": -18.328622817993164, + "logits/rejected": -18.28988265991211, + "logps/chosen": -2.4505515098571777, + "logps/rejected": -3.1104087829589844, + "loss": 2.7728, + "rewards/accuracies": 0.6875, + "rewards/chosen": -24.50551414489746, + "rewards/margins": 6.598570823669434, + "rewards/rejected": -31.104084014892578, + "step": 185 + }, + { + "epoch": 0.6405510116229014, + "grad_norm": 143.6928758781207, + "learning_rate": 2.745630207595893e-07, + "logits/chosen": -17.590606689453125, + "logits/rejected": -17.842931747436523, + "logps/chosen": -2.3018007278442383, + "logps/rejected": -3.050802707672119, + "loss": 2.3707, + "rewards/accuracies": 0.875, + "rewards/chosen": -23.018009185791016, + "rewards/margins": 7.490016937255859, + "rewards/rejected": -30.508028030395508, + "step": 186 + }, + { + "epoch": 0.6439948342660353, + "grad_norm": 184.0549446666674, + "learning_rate": 2.70000381267877e-07, + "logits/chosen": -17.26044273376465, + "logits/rejected": -16.797651290893555, + "logps/chosen": -2.5893783569335938, + "logps/rejected": -3.3172554969787598, + "loss": 2.5419, + "rewards/accuracies": 0.8125, + "rewards/chosen": -25.893783569335938, + "rewards/margins": 7.278769016265869, + "rewards/rejected": -33.17255401611328, + "step": 187 + }, + { + "epoch": 0.6474386569091691, + "grad_norm": 162.70714561559674, + "learning_rate": 2.654565763399299e-07, + "logits/chosen": -17.615966796875, + "logits/rejected": -17.202425003051758, + "logps/chosen": -2.06203556060791, + "logps/rejected": -2.5958409309387207, + "loss": 2.3888, + "rewards/accuracies": 0.8125, + "rewards/chosen": -20.62035369873047, + "rewards/margins": 5.338054180145264, + "rewards/rejected": -25.958410263061523, + "step": 188 + }, + { + "epoch": 0.650882479552303, + "grad_norm": 211.4855934809745, + "learning_rate": 2.6093226428986103e-07, + "logits/chosen": -17.890888214111328, + "logits/rejected": -18.181949615478516, + "logps/chosen": -2.5792300701141357, + "logps/rejected": -2.9869372844696045, + "loss": 3.2394, + "rewards/accuracies": 0.625, + "rewards/chosen": -25.792301177978516, + "rewards/margins": 4.077073574066162, + "rewards/rejected": -29.869373321533203, + "step": 189 + }, + { + "epoch": 0.654326302195437, + "grad_norm": 241.71964487552518, + "learning_rate": 2.564281006076217e-07, + "logits/chosen": -17.281450271606445, + "logits/rejected": -17.177658081054688, + "logps/chosen": -2.0290184020996094, + "logps/rejected": -2.531289577484131, + "loss": 2.9749, + "rewards/accuracies": 0.625, + "rewards/chosen": -20.29018211364746, + "rewards/margins": 5.022716999053955, + "rewards/rejected": -25.31290054321289, + "step": 190 + }, + { + "epoch": 0.6577701248385708, + "grad_norm": 140.2272151723766, + "learning_rate": 2.519447378640342e-07, + "logits/chosen": -18.04941749572754, + "logits/rejected": -17.695268630981445, + "logps/chosen": -2.407273292541504, + "logps/rejected": -3.1553428173065186, + "loss": 2.5175, + "rewards/accuracies": 0.8125, + "rewards/chosen": -24.07273292541504, + "rewards/margins": 7.480693340301514, + "rewards/rejected": -31.553424835205078, + "step": 191 + }, + { + "epoch": 0.6612139474817047, + "grad_norm": 122.61240060144814, + "learning_rate": 2.4748282561624587e-07, + "logits/chosen": -18.940771102905273, + "logits/rejected": -19.209871292114258, + "logps/chosen": -2.9477639198303223, + "logps/rejected": -3.3782527446746826, + "loss": 2.231, + "rewards/accuracies": 0.8125, + "rewards/chosen": -29.47764015197754, + "rewards/margins": 4.304885387420654, + "rewards/rejected": -33.78252410888672, + "step": 192 + }, + { + "epoch": 0.6646577701248386, + "grad_norm": 154.6658255017087, + "learning_rate": 2.4304301031361993e-07, + "logits/chosen": -17.39430046081543, + "logits/rejected": -17.464357376098633, + "logps/chosen": -1.6274338960647583, + "logps/rejected": -2.39015531539917, + "loss": 2.0168, + "rewards/accuracies": 0.8125, + "rewards/chosen": -16.274337768554688, + "rewards/margins": 7.627217769622803, + "rewards/rejected": -23.901554107666016, + "step": 193 + }, + { + "epoch": 0.6681015927679724, + "grad_norm": 183.06073957842517, + "learning_rate": 2.386259352040766e-07, + "logits/chosen": -17.165082931518555, + "logits/rejected": -17.03466796875, + "logps/chosen": -2.3020198345184326, + "logps/rejected": -3.141963481903076, + "loss": 2.5949, + "rewards/accuracies": 0.9375, + "rewards/chosen": -23.020198822021484, + "rewards/margins": 8.399435997009277, + "rewards/rejected": -31.419633865356445, + "step": 194 + }, + { + "epoch": 0.6715454154111064, + "grad_norm": 195.7328576465893, + "learning_rate": 2.3423224024089924e-07, + "logits/chosen": -16.67756462097168, + "logits/rejected": -16.06248664855957, + "logps/chosen": -1.9835630655288696, + "logps/rejected": -2.165980815887451, + "loss": 2.8892, + "rewards/accuracies": 0.6875, + "rewards/chosen": -19.83563232421875, + "rewards/margins": 1.8241767883300781, + "rewards/rejected": -21.659809112548828, + "step": 195 + }, + { + "epoch": 0.6749892380542402, + "grad_norm": 185.40100121088884, + "learning_rate": 2.2986256199001607e-07, + "logits/chosen": -16.720062255859375, + "logits/rejected": -17.210546493530273, + "logps/chosen": -2.1794826984405518, + "logps/rejected": -3.011854887008667, + "loss": 3.2752, + "rewards/accuracies": 0.9375, + "rewards/chosen": -21.79482650756836, + "rewards/margins": 8.323722839355469, + "rewards/rejected": -30.118549346923828, + "step": 196 + }, + { + "epoch": 0.6784330606973741, + "grad_norm": 169.01081696151877, + "learning_rate": 2.2551753353777298e-07, + "logits/chosen": -16.483049392700195, + "logits/rejected": -15.803775787353516, + "logps/chosen": -1.923852562904358, + "logps/rejected": -2.4508516788482666, + "loss": 2.4423, + "rewards/accuracies": 0.8125, + "rewards/chosen": -19.238523483276367, + "rewards/margins": 5.269989967346191, + "rewards/rejected": -24.508514404296875, + "step": 197 + }, + { + "epoch": 0.681876883340508, + "grad_norm": 184.28493562958423, + "learning_rate": 2.2119778439921243e-07, + "logits/chosen": -17.207120895385742, + "logits/rejected": -17.228199005126953, + "logps/chosen": -2.1767778396606445, + "logps/rejected": -2.8498196601867676, + "loss": 2.6463, + "rewards/accuracies": 0.875, + "rewards/chosen": -21.767776489257812, + "rewards/margins": 6.730417251586914, + "rewards/rejected": -28.49819564819336, + "step": 198 + }, + { + "epoch": 0.6853207059836418, + "grad_norm": 185.34805729694065, + "learning_rate": 2.169039404268666e-07, + "logits/chosen": -15.101792335510254, + "logits/rejected": -15.015120506286621, + "logps/chosen": -1.9498858451843262, + "logps/rejected": -2.9745125770568848, + "loss": 2.4206, + "rewards/accuracies": 0.875, + "rewards/chosen": -19.498859405517578, + "rewards/margins": 10.246267318725586, + "rewards/rejected": -29.745128631591797, + "step": 199 + }, + { + "epoch": 0.6887645286267757, + "grad_norm": 171.9250040897193, + "learning_rate": 2.1263662372008397e-07, + "logits/chosen": -17.20707893371582, + "logits/rejected": -17.392990112304688, + "logps/chosen": -2.209491491317749, + "logps/rejected": -3.219839334487915, + "loss": 2.3084, + "rewards/accuracies": 0.875, + "rewards/chosen": -22.094913482666016, + "rewards/margins": 10.103475570678711, + "rewards/rejected": -32.19839096069336, + "step": 200 + }, + { + "epoch": 0.6922083512699096, + "grad_norm": 134.1599441897618, + "learning_rate": 2.0839645253489785e-07, + "logits/chosen": -17.15797233581543, + "logits/rejected": -17.089859008789062, + "logps/chosen": -2.460911512374878, + "logps/rejected": -3.360152244567871, + "loss": 2.7262, + "rewards/accuracies": 0.875, + "rewards/chosen": -24.60911750793457, + "rewards/margins": 8.992408752441406, + "rewards/rejected": -33.601524353027344, + "step": 201 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 225.89869134193762, + "learning_rate": 2.0418404119445257e-07, + "logits/chosen": -18.76146697998047, + "logits/rejected": -18.73739242553711, + "logps/chosen": -2.1693127155303955, + "logps/rejected": -2.5993404388427734, + "loss": 2.7532, + "rewards/accuracies": 0.75, + "rewards/chosen": -21.693126678466797, + "rewards/margins": 4.3002777099609375, + "rewards/rejected": -25.993404388427734, + "step": 202 + }, + { + "epoch": 0.6990959965561774, + "grad_norm": 168.18364451173605, + "learning_rate": 2.0000000000000007e-07, + "logits/chosen": -16.125919342041016, + "logits/rejected": -16.450841903686523, + "logps/chosen": -2.2542104721069336, + "logps/rejected": -2.954272985458374, + "loss": 2.3868, + "rewards/accuracies": 0.8125, + "rewards/chosen": -22.542104721069336, + "rewards/margins": 7.000626087188721, + "rewards/rejected": -29.542734146118164, + "step": 203 + }, + { + "epoch": 0.7025398191993112, + "grad_norm": 169.8912048470285, + "learning_rate": 1.9584493514247673e-07, + "logits/chosen": -15.31773567199707, + "logits/rejected": -15.480231285095215, + "logps/chosen": -2.3769216537475586, + "logps/rejected": -3.0659618377685547, + "loss": 2.829, + "rewards/accuracies": 0.75, + "rewards/chosen": -23.769216537475586, + "rewards/margins": 6.890398979187012, + "rewards/rejected": -30.65961456298828, + "step": 204 + }, + { + "epoch": 0.7059836418424451, + "grad_norm": 164.25344840367887, + "learning_rate": 1.91719448614679e-07, + "logits/chosen": -18.408157348632812, + "logits/rejected": -18.19486427307129, + "logps/chosen": -2.329998016357422, + "logps/rejected": -2.998572587966919, + "loss": 2.6702, + "rewards/accuracies": 0.75, + "rewards/chosen": -23.29998016357422, + "rewards/margins": 6.685744285583496, + "rewards/rejected": -29.9857234954834, + "step": 205 + }, + { + "epoch": 0.709427464485579, + "grad_norm": 160.2297179052625, + "learning_rate": 1.8762413812404537e-07, + "logits/chosen": -15.564806938171387, + "logits/rejected": -15.336395263671875, + "logps/chosen": -2.414278507232666, + "logps/rejected": -3.099297523498535, + "loss": 2.3344, + "rewards/accuracies": 0.875, + "rewards/chosen": -24.142784118652344, + "rewards/margins": 6.850188255310059, + "rewards/rejected": -30.992971420288086, + "step": 206 + }, + { + "epoch": 0.7128712871287128, + "grad_norm": 159.59891745599342, + "learning_rate": 1.8355959700605835e-07, + "logits/chosen": -16.28492546081543, + "logits/rejected": -15.95881462097168, + "logps/chosen": -2.8605947494506836, + "logps/rejected": -3.867330551147461, + "loss": 2.8207, + "rewards/accuracies": 0.9375, + "rewards/chosen": -28.60594367980957, + "rewards/margins": 10.067360877990723, + "rewards/rejected": -38.67330551147461, + "step": 207 + }, + { + "epoch": 0.7163151097718468, + "grad_norm": 138.3874486270341, + "learning_rate": 1.7952641413828285e-07, + "logits/chosen": -14.021824836730957, + "logits/rejected": -14.172553062438965, + "logps/chosen": -1.9960724115371704, + "logps/rejected": -2.703728199005127, + "loss": 2.7886, + "rewards/accuracies": 0.8125, + "rewards/chosen": -19.960723876953125, + "rewards/margins": 7.076559066772461, + "rewards/rejected": -27.037281036376953, + "step": 208 + }, + { + "epoch": 0.7197589324149807, + "grad_norm": 139.88886492945034, + "learning_rate": 1.755251738550471e-07, + "logits/chosen": -17.956972122192383, + "logits/rejected": -17.393238067626953, + "logps/chosen": -2.526200294494629, + "logps/rejected": -3.3326597213745117, + "loss": 2.5612, + "rewards/accuracies": 0.8125, + "rewards/chosen": -25.262001037597656, + "rewards/margins": 8.064594268798828, + "rewards/rejected": -33.32659149169922, + "step": 209 + }, + { + "epoch": 0.7232027550581145, + "grad_norm": 121.87663059806626, + "learning_rate": 1.7155645586278396e-07, + "logits/chosen": -16.801706314086914, + "logits/rejected": -17.29227638244629, + "logps/chosen": -2.4827024936676025, + "logps/rejected": -3.251868724822998, + "loss": 2.3086, + "rewards/accuracies": 1.0, + "rewards/chosen": -24.827022552490234, + "rewards/margins": 7.691664218902588, + "rewards/rejected": -32.5186882019043, + "step": 210 + }, + { + "epoch": 0.7266465777012484, + "grad_norm": 183.5598825145353, + "learning_rate": 1.6762083515604205e-07, + "logits/chosen": -16.376623153686523, + "logits/rejected": -16.853988647460938, + "logps/chosen": -2.1823861598968506, + "logps/rejected": -2.469951629638672, + "loss": 3.0611, + "rewards/accuracies": 0.625, + "rewards/chosen": -21.82386016845703, + "rewards/margins": 2.8756532669067383, + "rewards/rejected": -24.69951629638672, + "step": 211 + }, + { + "epoch": 0.7300904003443822, + "grad_norm": 140.6766155274216, + "learning_rate": 1.6371888193417962e-07, + "logits/chosen": -16.161523818969727, + "logits/rejected": -15.664430618286133, + "logps/chosen": -2.5164270401000977, + "logps/rejected": -3.5702481269836426, + "loss": 1.5852, + "rewards/accuracies": 0.8125, + "rewards/chosen": -25.16427230834961, + "rewards/margins": 10.538207054138184, + "rewards/rejected": -35.702476501464844, + "step": 212 + }, + { + "epoch": 0.7335342229875161, + "grad_norm": 145.77478214483807, + "learning_rate": 1.598511615187527e-07, + "logits/chosen": -16.284515380859375, + "logits/rejected": -15.916769027709961, + "logps/chosen": -1.8510792255401611, + "logps/rejected": -2.855922222137451, + "loss": 2.4349, + "rewards/accuracies": 0.8125, + "rewards/chosen": -18.51078987121582, + "rewards/margins": 10.048429489135742, + "rewards/rejected": -28.559221267700195, + "step": 213 + }, + { + "epoch": 0.7369780456306501, + "grad_norm": 165.3902758912234, + "learning_rate": 1.560182342716109e-07, + "logits/chosen": -16.97911834716797, + "logits/rejected": -17.0489444732666, + "logps/chosen": -2.855278253555298, + "logps/rejected": -3.1526296138763428, + "loss": 2.8624, + "rewards/accuracies": 0.75, + "rewards/chosen": -28.552780151367188, + "rewards/margins": 2.9735140800476074, + "rewards/rejected": -31.526294708251953, + "step": 214 + }, + { + "epoch": 0.7404218682737839, + "grad_norm": 169.1752973988815, + "learning_rate": 1.5222065551371078e-07, + "logits/chosen": -16.086669921875, + "logits/rejected": -15.614689826965332, + "logps/chosen": -2.2419023513793945, + "logps/rejected": -2.8294296264648438, + "loss": 2.1771, + "rewards/accuracies": 0.875, + "rewards/chosen": -22.41902732849121, + "rewards/margins": 5.875270843505859, + "rewards/rejected": -28.294296264648438, + "step": 215 + }, + { + "epoch": 0.7438656909169178, + "grad_norm": 175.5809330835783, + "learning_rate": 1.4845897544466062e-07, + "logits/chosen": -15.881332397460938, + "logits/rejected": -16.038084030151367, + "logps/chosen": -1.938501238822937, + "logps/rejected": -2.6783177852630615, + "loss": 2.5021, + "rewards/accuracies": 0.875, + "rewards/chosen": -19.385011672973633, + "rewards/margins": 7.398165702819824, + "rewards/rejected": -26.783178329467773, + "step": 216 + }, + { + "epoch": 0.7473095135600517, + "grad_norm": 138.7906401630708, + "learning_rate": 1.4473373906300576e-07, + "logits/chosen": -14.417771339416504, + "logits/rejected": -14.370369911193848, + "logps/chosen": -1.7395800352096558, + "logps/rejected": -2.279273271560669, + "loss": 2.329, + "rewards/accuracies": 0.875, + "rewards/chosen": -17.39579963684082, + "rewards/margins": 5.396934986114502, + "rewards/rejected": -22.79273223876953, + "step": 217 + }, + { + "epoch": 0.7507533362031855, + "grad_norm": 177.5157013301922, + "learning_rate": 1.4104548608726895e-07, + "logits/chosen": -17.694894790649414, + "logits/rejected": -17.320873260498047, + "logps/chosen": -2.5375585556030273, + "logps/rejected": -3.9845151901245117, + "loss": 2.2671, + "rewards/accuracies": 0.9375, + "rewards/chosen": -25.375581741333008, + "rewards/margins": 14.46957015991211, + "rewards/rejected": -39.84515380859375, + "step": 218 + }, + { + "epoch": 0.7541971588463194, + "grad_norm": 134.9327740603631, + "learning_rate": 1.3739475087775466e-07, + "logits/chosen": -15.686095237731934, + "logits/rejected": -15.889823913574219, + "logps/chosen": -2.468545913696289, + "logps/rejected": -2.932441234588623, + "loss": 2.5443, + "rewards/accuracies": 0.75, + "rewards/chosen": -24.685457229614258, + "rewards/margins": 4.638955116271973, + "rewards/rejected": -29.324413299560547, + "step": 219 + }, + { + "epoch": 0.7576409814894532, + "grad_norm": 135.47400869262708, + "learning_rate": 1.3378206235913028e-07, + "logits/chosen": -15.72805404663086, + "logits/rejected": -15.899078369140625, + "logps/chosen": -1.9772777557373047, + "logps/rejected": -2.4077036380767822, + "loss": 2.3468, + "rewards/accuracies": 0.6875, + "rewards/chosen": -19.77277946472168, + "rewards/margins": 4.304256916046143, + "rewards/rejected": -24.07703399658203, + "step": 220 + }, + { + "epoch": 0.7610848041325872, + "grad_norm": 162.24671967985978, + "learning_rate": 1.3020794394379447e-07, + "logits/chosen": -16.417884826660156, + "logits/rejected": -15.766003608703613, + "logps/chosen": -3.0476391315460205, + "logps/rejected": -4.174098491668701, + "loss": 2.7984, + "rewards/accuracies": 0.9375, + "rewards/chosen": -30.476388931274414, + "rewards/margins": 11.264592170715332, + "rewards/rejected": -41.74098205566406, + "step": 221 + }, + { + "epoch": 0.7645286267757211, + "grad_norm": 153.11579053938232, + "learning_rate": 1.2667291345604433e-07, + "logits/chosen": -16.348756790161133, + "logits/rejected": -16.798192977905273, + "logps/chosen": -2.2396018505096436, + "logps/rejected": -2.7667808532714844, + "loss": 2.4575, + "rewards/accuracies": 0.6875, + "rewards/chosen": -22.39601707458496, + "rewards/margins": 5.271792411804199, + "rewards/rejected": -27.66781234741211, + "step": 222 + }, + { + "epoch": 0.7679724494188549, + "grad_norm": 125.46348058673289, + "learning_rate": 1.2317748305705217e-07, + "logits/chosen": -17.994367599487305, + "logits/rejected": -18.30419921875, + "logps/chosen": -2.42231822013855, + "logps/rejected": -2.860621690750122, + "loss": 2.578, + "rewards/accuracies": 0.75, + "rewards/chosen": -24.223180770874023, + "rewards/margins": 4.383035659790039, + "rewards/rejected": -28.606216430664062, + "step": 223 + }, + { + "epoch": 0.7714162720619888, + "grad_norm": 130.3454994188522, + "learning_rate": 1.1972215917066307e-07, + "logits/chosen": -17.247783660888672, + "logits/rejected": -16.855878829956055, + "logps/chosen": -2.419665813446045, + "logps/rejected": -3.496854782104492, + "loss": 2.3658, + "rewards/accuracies": 0.875, + "rewards/chosen": -24.196657180786133, + "rewards/margins": 10.771889686584473, + "rewards/rejected": -34.968544006347656, + "step": 224 + }, + { + "epoch": 0.7748600947051227, + "grad_norm": 172.91319963893733, + "learning_rate": 1.1630744241002223e-07, + "logits/chosen": -17.969970703125, + "logits/rejected": -18.13035011291504, + "logps/chosen": -2.1760947704315186, + "logps/rejected": -2.7468371391296387, + "loss": 2.4968, + "rewards/accuracies": 0.8125, + "rewards/chosen": -21.760944366455078, + "rewards/margins": 5.707423210144043, + "rewards/rejected": -27.46837043762207, + "step": 225 + }, + { + "epoch": 0.7783039173482565, + "grad_norm": 113.76103569407015, + "learning_rate": 1.1293382750504688e-07, + "logits/chosen": -18.275222778320312, + "logits/rejected": -17.712696075439453, + "logps/chosen": -2.3341901302337646, + "logps/rejected": -3.101623773574829, + "loss": 2.1953, + "rewards/accuracies": 0.875, + "rewards/chosen": -23.341899871826172, + "rewards/margins": 7.674335479736328, + "rewards/rejected": -31.016237258911133, + "step": 226 + }, + { + "epoch": 0.7817477399913905, + "grad_norm": 132.91068429588344, + "learning_rate": 1.0960180323074774e-07, + "logits/chosen": -18.44281578063965, + "logits/rejected": -18.48580551147461, + "logps/chosen": -2.326096296310425, + "logps/rejected": -3.35933518409729, + "loss": 1.9983, + "rewards/accuracies": 0.875, + "rewards/chosen": -23.260963439941406, + "rewards/margins": 10.332388877868652, + "rewards/rejected": -33.593353271484375, + "step": 227 + }, + { + "epoch": 0.7851915626345243, + "grad_norm": 148.59038179673675, + "learning_rate": 1.0631185233641474e-07, + "logits/chosen": -18.724830627441406, + "logits/rejected": -18.60055923461914, + "logps/chosen": -2.2051844596862793, + "logps/rejected": -3.1807661056518555, + "loss": 2.2086, + "rewards/accuracies": 0.8125, + "rewards/chosen": -22.05184555053711, + "rewards/margins": 9.755819320678711, + "rewards/rejected": -31.80766487121582, + "step": 228 + }, + { + "epoch": 0.7886353852776582, + "grad_norm": 168.3363506588075, + "learning_rate": 1.0306445147567604e-07, + "logits/chosen": -16.910625457763672, + "logits/rejected": -17.089317321777344, + "logps/chosen": -2.1940903663635254, + "logps/rejected": -2.7529542446136475, + "loss": 3.2585, + "rewards/accuracies": 0.75, + "rewards/chosen": -21.940902709960938, + "rewards/margins": 5.588636875152588, + "rewards/rejected": -27.529541015625, + "step": 229 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 140.0112579178903, + "learning_rate": 9.986007113743906e-08, + "logits/chosen": -17.158119201660156, + "logits/rejected": -17.688983917236328, + "logps/chosen": -2.0375654697418213, + "logps/rejected": -2.7925596237182617, + "loss": 2.1608, + "rewards/accuracies": 0.8125, + "rewards/chosen": -20.375656127929688, + "rewards/margins": 7.549938201904297, + "rewards/rejected": -27.925594329833984, + "step": 230 + }, + { + "epoch": 0.7955230305639259, + "grad_norm": 147.83687034828287, + "learning_rate": 9.669917557772542e-08, + "logits/chosen": -17.468488693237305, + "logits/rejected": -17.5322208404541, + "logps/chosen": -2.3444223403930664, + "logps/rejected": -2.8957479000091553, + "loss": 2.1804, + "rewards/accuracies": 0.8125, + "rewards/chosen": -23.44422149658203, + "rewards/margins": 5.513256549835205, + "rewards/rejected": -28.95747947692871, + "step": 231 + }, + { + "epoch": 0.7989668532070598, + "grad_norm": 182.36869206020535, + "learning_rate": 9.358222275240884e-08, + "logits/chosen": -17.69605255126953, + "logits/rejected": -17.339937210083008, + "logps/chosen": -2.6390442848205566, + "logps/rejected": -3.474766254425049, + "loss": 2.8717, + "rewards/accuracies": 0.8125, + "rewards/chosen": -26.39044189453125, + "rewards/margins": 8.357217788696289, + "rewards/rejected": -34.74766159057617, + "step": 232 + }, + { + "epoch": 0.8024106758501938, + "grad_norm": 253.44665519838688, + "learning_rate": 9.050966425086546e-08, + "logits/chosen": -17.90776252746582, + "logits/rejected": -18.13881492614746, + "logps/chosen": -2.3884530067443848, + "logps/rejected": -3.7100043296813965, + "loss": 3.4066, + "rewards/accuracies": 0.875, + "rewards/chosen": -23.884531021118164, + "rewards/margins": 13.215514183044434, + "rewards/rejected": -37.10004425048828, + "step": 233 + }, + { + "epoch": 0.8058544984933276, + "grad_norm": 133.61246579427012, + "learning_rate": 8.748194523054748e-08, + "logits/chosen": -17.281970977783203, + "logits/rejected": -17.455997467041016, + "logps/chosen": -2.43415904045105, + "logps/rejected": -3.0494344234466553, + "loss": 1.9898, + "rewards/accuracies": 0.8125, + "rewards/chosen": -24.341590881347656, + "rewards/margins": 6.152754783630371, + "rewards/rejected": -30.49434471130371, + "step": 234 + }, + { + "epoch": 0.8092983211364615, + "grad_norm": 147.12423003196992, + "learning_rate": 8.449950435248676e-08, + "logits/chosen": -17.537288665771484, + "logits/rejected": -17.6503963470459, + "logps/chosen": -2.411252737045288, + "logps/rejected": -2.5560195446014404, + "loss": 2.959, + "rewards/accuracies": 0.625, + "rewards/chosen": -24.11252784729004, + "rewards/margins": 1.447667121887207, + "rewards/rejected": -25.56019401550293, + "step": 235 + }, + { + "epoch": 0.8127421437795953, + "grad_norm": 173.4351632725469, + "learning_rate": 8.15627737177425e-08, + "logits/chosen": -15.300152778625488, + "logits/rejected": -14.813128471374512, + "logps/chosen": -2.2649199962615967, + "logps/rejected": -3.06396222114563, + "loss": 3.1114, + "rewards/accuracies": 0.8125, + "rewards/chosen": -22.649198532104492, + "rewards/margins": 7.99042272567749, + "rewards/rejected": -30.63962173461914, + "step": 236 + }, + { + "epoch": 0.8161859664227292, + "grad_norm": 148.7657685969793, + "learning_rate": 7.867217880479629e-08, + "logits/chosen": -15.767210006713867, + "logits/rejected": -15.650933265686035, + "logps/chosen": -2.087116003036499, + "logps/rejected": -3.3104443550109863, + "loss": 2.4406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -20.87116241455078, + "rewards/margins": 12.23327922821045, + "rewards/rejected": -33.10444259643555, + "step": 237 + }, + { + "epoch": 0.8196297890658631, + "grad_norm": 151.07749718687427, + "learning_rate": 7.582813840790847e-08, + "logits/chosen": -15.618885040283203, + "logits/rejected": -15.997750282287598, + "logps/chosen": -2.0786385536193848, + "logps/rejected": -2.7961952686309814, + "loss": 2.7965, + "rewards/accuracies": 0.6875, + "rewards/chosen": -20.786386489868164, + "rewards/margins": 7.175565719604492, + "rewards/rejected": -27.96195411682129, + "step": 238 + }, + { + "epoch": 0.823073611708997, + "grad_norm": 144.2174329571498, + "learning_rate": 7.303106457644328e-08, + "logits/chosen": -16.712963104248047, + "logits/rejected": -16.825284957885742, + "logps/chosen": -2.238386392593384, + "logps/rejected": -3.5630311965942383, + "loss": 2.8129, + "rewards/accuracies": 1.0, + "rewards/chosen": -22.383865356445312, + "rewards/margins": 13.246443748474121, + "rewards/rejected": -35.63031005859375, + "step": 239 + }, + { + "epoch": 0.8265174343521309, + "grad_norm": 169.73215342488882, + "learning_rate": 7.028136255516938e-08, + "logits/chosen": -18.084627151489258, + "logits/rejected": -18.091758728027344, + "logps/chosen": -3.0889220237731934, + "logps/rejected": -3.6106934547424316, + "loss": 2.5796, + "rewards/accuracies": 0.875, + "rewards/chosen": -30.88922119140625, + "rewards/margins": 5.217716693878174, + "rewards/rejected": -36.106937408447266, + "step": 240 + }, + { + "epoch": 0.8299612569952648, + "grad_norm": 149.39660539357982, + "learning_rate": 6.75794307255479e-08, + "logits/chosen": -17.41098976135254, + "logits/rejected": -17.147449493408203, + "logps/chosen": -2.440295457839966, + "logps/rejected": -3.6117165088653564, + "loss": 2.1983, + "rewards/accuracies": 0.9375, + "rewards/chosen": -24.4029541015625, + "rewards/margins": 11.714213371276855, + "rewards/rejected": -36.11716842651367, + "step": 241 + }, + { + "epoch": 0.8334050796383986, + "grad_norm": 154.41052222889476, + "learning_rate": 6.492566054801414e-08, + "logits/chosen": -17.25594139099121, + "logits/rejected": -17.346635818481445, + "logps/chosen": -2.8043160438537598, + "logps/rejected": -3.589071750640869, + "loss": 2.6675, + "rewards/accuracies": 0.9375, + "rewards/chosen": -28.04315948486328, + "rewards/margins": 7.847556114196777, + "rewards/rejected": -35.890716552734375, + "step": 242 + }, + { + "epoch": 0.8368489022815325, + "grad_norm": 144.1864772937289, + "learning_rate": 6.232043650526195e-08, + "logits/chosen": -19.001102447509766, + "logits/rejected": -19.054353713989258, + "logps/chosen": -2.5245139598846436, + "logps/rejected": -2.9145162105560303, + "loss": 2.6573, + "rewards/accuracies": 0.75, + "rewards/chosen": -25.245140075683594, + "rewards/margins": 3.9000210762023926, + "rewards/rejected": -29.145160675048828, + "step": 243 + }, + { + "epoch": 0.8402927249246663, + "grad_norm": 159.15922893834966, + "learning_rate": 5.976413604653978e-08, + "logits/chosen": -16.62332534790039, + "logits/rejected": -17.053430557250977, + "logps/chosen": -2.617964267730713, + "logps/rejected": -3.058418035507202, + "loss": 2.3253, + "rewards/accuracies": 0.875, + "rewards/chosen": -26.179641723632812, + "rewards/margins": 4.404535293579102, + "rewards/rejected": -30.58417510986328, + "step": 244 + }, + { + "epoch": 0.8437365475678003, + "grad_norm": 140.0984307995632, + "learning_rate": 5.725712953296438e-08, + "logits/chosen": -15.821372985839844, + "logits/rejected": -15.742339134216309, + "logps/chosen": -1.8620578050613403, + "logps/rejected": -2.808138847351074, + "loss": 2.4966, + "rewards/accuracies": 0.9375, + "rewards/chosen": -18.62057876586914, + "rewards/margins": 9.460811614990234, + "rewards/rejected": -28.081388473510742, + "step": 245 + }, + { + "epoch": 0.8471803702109342, + "grad_norm": 147.6831357797836, + "learning_rate": 5.479978018386275e-08, + "logits/chosen": -17.423532485961914, + "logits/rejected": -17.499662399291992, + "logps/chosen": -2.267676591873169, + "logps/rejected": -2.954185962677002, + "loss": 2.4517, + "rewards/accuracies": 0.8125, + "rewards/chosen": -22.676767349243164, + "rewards/margins": 6.8650922775268555, + "rewards/rejected": -29.541858673095703, + "step": 246 + }, + { + "epoch": 0.850624192854068, + "grad_norm": 146.93865641824823, + "learning_rate": 5.23924440241486e-08, + "logits/chosen": -17.467391967773438, + "logits/rejected": -17.293798446655273, + "logps/chosen": -2.765316963195801, + "logps/rejected": -3.1737022399902344, + "loss": 3.1905, + "rewards/accuracies": 0.6875, + "rewards/chosen": -27.65317153930664, + "rewards/margins": 4.083853244781494, + "rewards/rejected": -31.737022399902344, + "step": 247 + }, + { + "epoch": 0.8540680154972019, + "grad_norm": 128.25480467364565, + "learning_rate": 5.003546983274014e-08, + "logits/chosen": -17.457887649536133, + "logits/rejected": -17.364229202270508, + "logps/chosen": -2.2277069091796875, + "logps/rejected": -3.1269993782043457, + "loss": 2.1543, + "rewards/accuracies": 0.8125, + "rewards/chosen": -22.277069091796875, + "rewards/margins": 8.992926597595215, + "rewards/rejected": -31.26999282836914, + "step": 248 + }, + { + "epoch": 0.8575118381403358, + "grad_norm": 126.61876835603796, + "learning_rate": 4.77291990920289e-08, + "logits/chosen": -15.818652153015137, + "logits/rejected": -15.359832763671875, + "logps/chosen": -1.8357523679733276, + "logps/rejected": -2.8696041107177734, + "loss": 2.5736, + "rewards/accuracies": 0.875, + "rewards/chosen": -18.357524871826172, + "rewards/margins": 10.33851432800293, + "rewards/rejected": -28.69603729248047, + "step": 249 + }, + { + "epoch": 0.8609556607834696, + "grad_norm": 147.3951096860917, + "learning_rate": 4.5473965938405e-08, + "logits/chosen": -17.267982482910156, + "logits/rejected": -17.239002227783203, + "logps/chosen": -2.3899097442626953, + "logps/rejected": -3.6304116249084473, + "loss": 2.1633, + "rewards/accuracies": 0.9375, + "rewards/chosen": -23.899097442626953, + "rewards/margins": 12.405017852783203, + "rewards/rejected": -36.304115295410156, + "step": 250 + }, + { + "epoch": 0.8643994834266036, + "grad_norm": 131.8214642518433, + "learning_rate": 4.32700971138471e-08, + "logits/chosen": -16.42135238647461, + "logits/rejected": -17.10132598876953, + "logps/chosen": -2.0020933151245117, + "logps/rejected": -2.6545495986938477, + "loss": 2.1802, + "rewards/accuracies": 0.75, + "rewards/chosen": -20.02093505859375, + "rewards/margins": 6.524560928344727, + "rewards/rejected": -26.54549789428711, + "step": 251 + }, + { + "epoch": 0.8678433060697374, + "grad_norm": 129.82185059004897, + "learning_rate": 4.11179119185832e-08, + "logits/chosen": -15.84195327758789, + "logits/rejected": -15.13234806060791, + "logps/chosen": -2.1187713146209717, + "logps/rejected": -3.074852228164673, + "loss": 2.0957, + "rewards/accuracies": 0.8125, + "rewards/chosen": -21.187713623046875, + "rewards/margins": 9.560807228088379, + "rewards/rejected": -30.748519897460938, + "step": 252 + }, + { + "epoch": 0.8712871287128713, + "grad_norm": 147.62415325752085, + "learning_rate": 3.9017722164830014e-08, + "logits/chosen": -15.684465408325195, + "logits/rejected": -15.583179473876953, + "logps/chosen": -2.2432422637939453, + "logps/rejected": -3.128464698791504, + "loss": 2.2196, + "rewards/accuracies": 0.8125, + "rewards/chosen": -22.432418823242188, + "rewards/margins": 8.852226257324219, + "rewards/rejected": -31.284645080566406, + "step": 253 + }, + { + "epoch": 0.8747309513560052, + "grad_norm": 129.59541911753672, + "learning_rate": 3.696983213161724e-08, + "logits/chosen": -16.10622215270996, + "logits/rejected": -15.97018814086914, + "logps/chosen": -2.268676519393921, + "logps/rejected": -3.3363349437713623, + "loss": 1.8693, + "rewards/accuracies": 0.9375, + "rewards/chosen": -22.686765670776367, + "rewards/margins": 10.676584243774414, + "rewards/rejected": -33.363346099853516, + "step": 254 + }, + { + "epoch": 0.878174773999139, + "grad_norm": 137.48943851483648, + "learning_rate": 3.4974538520702756e-08, + "logits/chosen": -14.73385238647461, + "logits/rejected": -14.685803413391113, + "logps/chosen": -2.011627197265625, + "logps/rejected": -2.8314733505249023, + "loss": 2.1826, + "rewards/accuracies": 0.875, + "rewards/chosen": -20.11627197265625, + "rewards/margins": 8.198460578918457, + "rewards/rejected": -28.314733505249023, + "step": 255 + }, + { + "epoch": 0.8816185966422729, + "grad_norm": 159.72968498568136, + "learning_rate": 3.303213041358628e-08, + "logits/chosen": -16.42586898803711, + "logits/rejected": -16.5064697265625, + "logps/chosen": -2.3388772010803223, + "logps/rejected": -3.104121208190918, + "loss": 2.3098, + "rewards/accuracies": 0.625, + "rewards/chosen": -23.388771057128906, + "rewards/margins": 7.652439117431641, + "rewards/rejected": -31.041210174560547, + "step": 256 + }, + { + "epoch": 0.8850624192854069, + "grad_norm": 139.996512233341, + "learning_rate": 3.114288922962673e-08, + "logits/chosen": -15.805761337280273, + "logits/rejected": -15.97050666809082, + "logps/chosen": -2.2719080448150635, + "logps/rejected": -2.8697736263275146, + "loss": 1.9082, + "rewards/accuracies": 0.75, + "rewards/chosen": -22.719078063964844, + "rewards/margins": 5.9786577224731445, + "rewards/rejected": -28.697734832763672, + "step": 257 + }, + { + "epoch": 0.8885062419285407, + "grad_norm": 142.0757665384917, + "learning_rate": 2.9307088685269544e-08, + "logits/chosen": -16.687719345092773, + "logits/rejected": -16.818946838378906, + "logps/chosen": -2.10107421875, + "logps/rejected": -2.945895195007324, + "loss": 2.0949, + "rewards/accuracies": 0.8125, + "rewards/chosen": -21.010740280151367, + "rewards/margins": 8.448209762573242, + "rewards/rejected": -29.45895004272461, + "step": 258 + }, + { + "epoch": 0.8919500645716746, + "grad_norm": 132.30438095070775, + "learning_rate": 2.7524994754390206e-08, + "logits/chosen": -18.812986373901367, + "logits/rejected": -18.67325210571289, + "logps/chosen": -2.727328300476074, + "logps/rejected": -3.1050448417663574, + "loss": 2.0166, + "rewards/accuracies": 0.8125, + "rewards/chosen": -27.27328109741211, + "rewards/margins": 3.7771661281585693, + "rewards/rejected": -31.050447463989258, + "step": 259 + }, + { + "epoch": 0.8953938872148084, + "grad_norm": 146.74697878065496, + "learning_rate": 2.5796865629759622e-08, + "logits/chosen": -16.785795211791992, + "logits/rejected": -16.06755256652832, + "logps/chosen": -2.190410852432251, + "logps/rejected": -3.289196014404297, + "loss": 2.7578, + "rewards/accuracies": 0.6875, + "rewards/chosen": -21.90410614013672, + "rewards/margins": 10.987853050231934, + "rewards/rejected": -32.89196014404297, + "step": 260 + }, + { + "epoch": 0.8988377098579423, + "grad_norm": 206.78249442348837, + "learning_rate": 2.4122951685636674e-08, + "logits/chosen": -16.686670303344727, + "logits/rejected": -16.18621826171875, + "logps/chosen": -2.611176013946533, + "logps/rejected": -3.921743869781494, + "loss": 3.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": -26.11176109313965, + "rewards/margins": 13.10567855834961, + "rewards/rejected": -39.217437744140625, + "step": 261 + }, + { + "epoch": 0.9022815325010762, + "grad_norm": 1949.594124426928, + "learning_rate": 2.2503495441493503e-08, + "logits/chosen": -16.57187271118164, + "logits/rejected": -16.915918350219727, + "logps/chosen": -1.6403616666793823, + "logps/rejected": -2.541311740875244, + "loss": 2.0577, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.40361785888672, + "rewards/margins": 9.009501457214355, + "rewards/rejected": -25.41312026977539, + "step": 262 + }, + { + "epoch": 0.90572535514421, + "grad_norm": 144.45425989954398, + "learning_rate": 2.093873152687906e-08, + "logits/chosen": -16.985397338867188, + "logits/rejected": -16.533916473388672, + "logps/chosen": -2.0635147094726562, + "logps/rejected": -3.0182905197143555, + "loss": 2.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.635149002075195, + "rewards/margins": 9.547759056091309, + "rewards/rejected": -30.18290901184082, + "step": 263 + }, + { + "epoch": 0.909169177787344, + "grad_norm": 161.70566070216898, + "learning_rate": 1.9428886647425214e-08, + "logits/chosen": -18.097816467285156, + "logits/rejected": -17.79208755493164, + "logps/chosen": -2.152082681655884, + "logps/rejected": -2.9253194332122803, + "loss": 2.632, + "rewards/accuracies": 0.75, + "rewards/chosen": -21.52082633972168, + "rewards/margins": 7.732368469238281, + "rewards/rejected": -29.253196716308594, + "step": 264 + }, + { + "epoch": 0.9126130004304779, + "grad_norm": 129.93371206599926, + "learning_rate": 1.7974179552001866e-08, + "logits/chosen": -17.505081176757812, + "logits/rejected": -17.78946304321289, + "logps/chosen": -2.234717845916748, + "logps/rejected": -2.539454936981201, + "loss": 2.4342, + "rewards/accuracies": 0.6875, + "rewards/chosen": -22.347179412841797, + "rewards/margins": 3.0473670959472656, + "rewards/rejected": -25.394546508789062, + "step": 265 + }, + { + "epoch": 0.9160568230736117, + "grad_norm": 151.94990436818313, + "learning_rate": 1.6574821001023474e-08, + "logits/chosen": -18.333892822265625, + "logits/rejected": -18.197607040405273, + "logps/chosen": -1.9889158010482788, + "logps/rejected": -2.735637903213501, + "loss": 2.2676, + "rewards/accuracies": 0.9375, + "rewards/chosen": -19.889158248901367, + "rewards/margins": 7.467221260070801, + "rewards/rejected": -27.356380462646484, + "step": 266 + }, + { + "epoch": 0.9195006457167456, + "grad_norm": 157.25320445860555, + "learning_rate": 1.5231013735914444e-08, + "logits/chosen": -16.340839385986328, + "logits/rejected": -16.46707534790039, + "logps/chosen": -2.3339834213256836, + "logps/rejected": -2.750584840774536, + "loss": 2.4662, + "rewards/accuracies": 0.5625, + "rewards/chosen": -23.339832305908203, + "rewards/margins": 4.166016101837158, + "rewards/rejected": -27.50585174560547, + "step": 267 + }, + { + "epoch": 0.9229444683598794, + "grad_norm": 153.71273684620283, + "learning_rate": 1.3942952449735201e-08, + "logits/chosen": -17.994409561157227, + "logits/rejected": -17.976512908935547, + "logps/chosen": -2.3722710609436035, + "logps/rejected": -3.688662528991699, + "loss": 2.6692, + "rewards/accuracies": 0.875, + "rewards/chosen": -23.722707748413086, + "rewards/margins": 13.163920402526855, + "rewards/rejected": -36.886627197265625, + "step": 268 + }, + { + "epoch": 0.9263882910030133, + "grad_norm": 159.9085920533978, + "learning_rate": 1.2710823758974676e-08, + "logits/chosen": -17.915321350097656, + "logits/rejected": -17.735774993896484, + "logps/chosen": -2.341430425643921, + "logps/rejected": -3.303116798400879, + "loss": 2.5158, + "rewards/accuracies": 0.8125, + "rewards/chosen": -23.414304733276367, + "rewards/margins": 9.616860389709473, + "rewards/rejected": -33.031166076660156, + "step": 269 + }, + { + "epoch": 0.9298321136461473, + "grad_norm": 173.1432244469156, + "learning_rate": 1.1534806176513434e-08, + "logits/chosen": -17.020999908447266, + "logits/rejected": -16.834182739257812, + "logps/chosen": -3.0662012100219727, + "logps/rejected": -4.367783546447754, + "loss": 2.8725, + "rewards/accuracies": 0.75, + "rewards/chosen": -30.66200828552246, + "rewards/margins": 13.01583194732666, + "rewards/rejected": -43.67784118652344, + "step": 270 + }, + { + "epoch": 0.9332759362892811, + "grad_norm": 175.82982939834184, + "learning_rate": 1.0415070085759925e-08, + "logits/chosen": -18.364770889282227, + "logits/rejected": -17.562824249267578, + "logps/chosen": -1.9744817018508911, + "logps/rejected": -2.404961347579956, + "loss": 2.7178, + "rewards/accuracies": 0.9375, + "rewards/chosen": -19.744815826416016, + "rewards/margins": 4.304797172546387, + "rewards/rejected": -24.049612045288086, + "step": 271 + }, + { + "epoch": 0.936719758932415, + "grad_norm": 138.65454205055815, + "learning_rate": 9.351777715965337e-09, + "logits/chosen": -18.179439544677734, + "logits/rejected": -18.3562068939209, + "logps/chosen": -2.5122263431549072, + "logps/rejected": -2.9324097633361816, + "loss": 2.0112, + "rewards/accuracies": 0.625, + "rewards/chosen": -25.122264862060547, + "rewards/margins": 4.2018327713012695, + "rewards/rejected": -29.3240966796875, + "step": 272 + }, + { + "epoch": 0.9401635815755489, + "grad_norm": 166.37792305846952, + "learning_rate": 8.345083118719509e-09, + "logits/chosen": -17.025907516479492, + "logits/rejected": -16.50684928894043, + "logps/chosen": -2.3154404163360596, + "logps/rejected": -3.7841498851776123, + "loss": 2.4035, + "rewards/accuracies": 0.875, + "rewards/chosen": -23.154401779174805, + "rewards/margins": 14.68709659576416, + "rewards/rejected": -37.84150314331055, + "step": 273 + }, + { + "epoch": 0.9436074042186827, + "grad_norm": 156.2441873918835, + "learning_rate": 7.395132145631544e-09, + "logits/chosen": -16.066808700561523, + "logits/rejected": -15.90978717803955, + "logps/chosen": -1.8371270895004272, + "logps/rejected": -2.678525924682617, + "loss": 2.0809, + "rewards/accuracies": 0.875, + "rewards/chosen": -18.37127113342285, + "rewards/margins": 8.413987159729004, + "rewards/rejected": -26.785259246826172, + "step": 274 + }, + { + "epoch": 0.9470512268618166, + "grad_norm": 153.24700496665506, + "learning_rate": 6.502062427198929e-09, + "logits/chosen": -17.553876876831055, + "logits/rejected": -17.5835018157959, + "logps/chosen": -2.4539918899536133, + "logps/rejected": -3.212017774581909, + "loss": 2.8158, + "rewards/accuracies": 0.875, + "rewards/chosen": -24.539915084838867, + "rewards/margins": 7.580263137817383, + "rewards/rejected": -32.12017822265625, + "step": 275 + }, + { + "epoch": 0.9504950495049505, + "grad_norm": 143.41309705270993, + "learning_rate": 5.666003352866733e-09, + "logits/chosen": -16.96306610107422, + "logits/rejected": -17.010374069213867, + "logps/chosen": -2.3547351360321045, + "logps/rejected": -3.560419797897339, + "loss": 2.1713, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.547353744506836, + "rewards/margins": 12.056845664978027, + "rewards/rejected": -35.60419845581055, + "step": 276 + }, + { + "epoch": 0.9539388721480844, + "grad_norm": 127.28128452614312, + "learning_rate": 4.887076052282291e-09, + "logits/chosen": -16.996347427368164, + "logits/rejected": -17.045413970947266, + "logps/chosen": -2.568338394165039, + "logps/rejected": -3.7449233531951904, + "loss": 2.2934, + "rewards/accuracies": 0.75, + "rewards/chosen": -25.68338394165039, + "rewards/margins": 11.765849113464355, + "rewards/rejected": -37.44923400878906, + "step": 277 + }, + { + "epoch": 0.9573826947912183, + "grad_norm": 157.32940289604164, + "learning_rate": 4.165393377745108e-09, + "logits/chosen": -15.926298141479492, + "logits/rejected": -15.70862102508545, + "logps/chosen": -2.419119358062744, + "logps/rejected": -3.12341046333313, + "loss": 2.6577, + "rewards/accuracies": 0.8125, + "rewards/chosen": -24.191190719604492, + "rewards/margins": 7.042915344238281, + "rewards/rejected": -31.234106063842773, + "step": 278 + }, + { + "epoch": 0.9608265174343521, + "grad_norm": 166.13315912995816, + "learning_rate": 3.5010598878567387e-09, + "logits/chosen": -17.470443725585938, + "logits/rejected": -17.031620025634766, + "logps/chosen": -2.174730062484741, + "logps/rejected": -2.8888866901397705, + "loss": 2.6471, + "rewards/accuracies": 0.875, + "rewards/chosen": -21.747299194335938, + "rewards/margins": 7.1415696144104, + "rewards/rejected": -28.888866424560547, + "step": 279 + }, + { + "epoch": 0.964270340077486, + "grad_norm": 147.30485634603158, + "learning_rate": 2.8941718323724605e-09, + "logits/chosen": -18.49142837524414, + "logits/rejected": -18.490829467773438, + "logps/chosen": -2.174673080444336, + "logps/rejected": -2.49884295463562, + "loss": 2.4879, + "rewards/accuracies": 0.625, + "rewards/chosen": -21.746734619140625, + "rewards/margins": 3.241696357727051, + "rewards/rejected": -24.98843002319336, + "step": 280 + }, + { + "epoch": 0.9677141627206199, + "grad_norm": 164.2306340847377, + "learning_rate": 2.344817138256161e-09, + "logits/chosen": -16.280027389526367, + "logits/rejected": -17.136314392089844, + "logps/chosen": -2.1512930393218994, + "logps/rejected": -3.025418519973755, + "loss": 2.543, + "rewards/accuracies": 0.875, + "rewards/chosen": -21.512928009033203, + "rewards/margins": 8.741255760192871, + "rewards/rejected": -30.254182815551758, + "step": 281 + }, + { + "epoch": 0.9711579853637538, + "grad_norm": 150.11832572590862, + "learning_rate": 1.8530753969413282e-09, + "logits/chosen": -18.478666305541992, + "logits/rejected": -18.377622604370117, + "logps/chosen": -1.8686823844909668, + "logps/rejected": -2.416599750518799, + "loss": 2.3333, + "rewards/accuracies": 0.8125, + "rewards/chosen": -18.686824798583984, + "rewards/margins": 5.479173183441162, + "rewards/rejected": -24.165996551513672, + "step": 282 + }, + { + "epoch": 0.9746018080068877, + "grad_norm": 148.19843248369733, + "learning_rate": 1.4190178527999198e-09, + "logits/chosen": -18.65854263305664, + "logits/rejected": -18.510950088500977, + "logps/chosen": -2.684150457382202, + "logps/rejected": -3.3109562397003174, + "loss": 2.5508, + "rewards/accuracies": 0.75, + "rewards/chosen": -26.84150505065918, + "rewards/margins": 6.268057823181152, + "rewards/rejected": -33.109561920166016, + "step": 283 + }, + { + "epoch": 0.9780456306500215, + "grad_norm": 145.3408171333848, + "learning_rate": 1.0427073928200857e-09, + "logits/chosen": -16.74791717529297, + "logits/rejected": -17.11324119567871, + "logps/chosen": -1.9323251247406006, + "logps/rejected": -2.5761024951934814, + "loss": 2.6734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -19.32324981689453, + "rewards/margins": 6.437774658203125, + "rewards/rejected": -25.761024475097656, + "step": 284 + }, + { + "epoch": 0.9814894532931554, + "grad_norm": 157.29447155335754, + "learning_rate": 7.241985374952797e-10, + "logits/chosen": -15.43560791015625, + "logits/rejected": -16.016206741333008, + "logps/chosen": -2.186183452606201, + "logps/rejected": -2.557718515396118, + "loss": 2.6831, + "rewards/accuracies": 0.75, + "rewards/chosen": -21.86183738708496, + "rewards/margins": 3.7153472900390625, + "rewards/rejected": -25.57718276977539, + "step": 285 + }, + { + "epoch": 0.9849332759362893, + "grad_norm": 169.85671811117746, + "learning_rate": 4.6353743292497637e-10, + "logits/chosen": -17.268774032592773, + "logits/rejected": -17.37803077697754, + "logps/chosen": -2.315028429031372, + "logps/rejected": -2.8562231063842773, + "loss": 3.1469, + "rewards/accuracies": 0.75, + "rewards/chosen": -23.150283813476562, + "rewards/margins": 5.411944389343262, + "rewards/rejected": -28.56222915649414, + "step": 286 + }, + { + "epoch": 0.9883770985794231, + "grad_norm": 134.22079815084547, + "learning_rate": 2.607618441292203e-10, + "logits/chosen": -17.780881881713867, + "logits/rejected": -17.691925048828125, + "logps/chosen": -2.1940627098083496, + "logps/rejected": -2.694575786590576, + "loss": 2.1033, + "rewards/accuracies": 0.75, + "rewards/chosen": -21.940628051757812, + "rewards/margins": 5.005130290985107, + "rewards/rejected": -26.945756912231445, + "step": 287 + }, + { + "epoch": 0.991820921222557, + "grad_norm": 159.97180537715076, + "learning_rate": 1.1590114957682473e-10, + "logits/chosen": -18.85245704650879, + "logits/rejected": -18.812564849853516, + "logps/chosen": -2.036210536956787, + "logps/rejected": -2.694965362548828, + "loss": 2.2158, + "rewards/accuracies": 0.8125, + "rewards/chosen": -20.362106323242188, + "rewards/margins": 6.58754825592041, + "rewards/rejected": -26.94965362548828, + "step": 288 + }, + { + "epoch": 0.995264743865691, + "grad_norm": 170.03207418013815, + "learning_rate": 2.8976336929353863e-11, + "logits/chosen": -17.584726333618164, + "logits/rejected": -17.696880340576172, + "logps/chosen": -2.0683584213256836, + "logps/rejected": -2.683166980743408, + "loss": 2.4093, + "rewards/accuracies": 0.75, + "rewards/chosen": -20.683582305908203, + "rewards/margins": 6.1480865478515625, + "rewards/rejected": -26.8316707611084, + "step": 289 + }, + { + "epoch": 0.9987085665088248, + "grad_norm": 122.7613085104077, + "learning_rate": 0.0, + "logits/chosen": -18.123817443847656, + "logits/rejected": -18.210723876953125, + "logps/chosen": -2.4845879077911377, + "logps/rejected": -3.084573268890381, + "loss": 2.7978, + "rewards/accuracies": 0.8125, + "rewards/chosen": -24.84588050842285, + "rewards/margins": 5.999850273132324, + "rewards/rejected": -30.845731735229492, + "step": 290 + }, + { + "epoch": 0.9987085665088248, + "step": 290, + "total_flos": 0.0, + "train_loss": 3.2915380025732106, + "train_runtime": 46073.3913, + "train_samples_per_second": 0.807, + "train_steps_per_second": 0.006 + } + ], + "logging_steps": 1, + "max_steps": 290, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} \ No newline at end of file