{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 2290, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004366812227074236, "grad_norm": 1.8752956704333947, "learning_rate": 8.733624454148472e-07, "loss": 3.7085, "step": 1 }, { "epoch": 0.021834061135371178, "grad_norm": 1.8028899921940393, "learning_rate": 4.3668122270742355e-06, "loss": 3.6291, "step": 5 }, { "epoch": 0.043668122270742356, "grad_norm": 1.75783914939411, "learning_rate": 8.733624454148471e-06, "loss": 3.547, "step": 10 }, { "epoch": 0.06550218340611354, "grad_norm": 2.1823195400139026, "learning_rate": 1.3100436681222708e-05, "loss": 3.6303, "step": 15 }, { "epoch": 0.08733624454148471, "grad_norm": 2.7703299276693256, "learning_rate": 1.7467248908296942e-05, "loss": 3.5805, "step": 20 }, { "epoch": 0.1091703056768559, "grad_norm": 3.0501154969163244, "learning_rate": 2.183406113537118e-05, "loss": 3.277, "step": 25 }, { "epoch": 0.13100436681222707, "grad_norm": 2.0575132099752795, "learning_rate": 2.6200873362445416e-05, "loss": 2.8289, "step": 30 }, { "epoch": 0.15283842794759825, "grad_norm": 1.2985876224099822, "learning_rate": 3.056768558951965e-05, "loss": 2.6017, "step": 35 }, { "epoch": 0.17467248908296942, "grad_norm": 0.7330786544465717, "learning_rate": 3.4934497816593884e-05, "loss": 2.4112, "step": 40 }, { "epoch": 0.1965065502183406, "grad_norm": 0.7933262369374595, "learning_rate": 3.930131004366812e-05, "loss": 2.2821, "step": 45 }, { "epoch": 0.2183406113537118, "grad_norm": 0.7790118551391272, "learning_rate": 4.366812227074236e-05, "loss": 2.0793, "step": 50 }, { "epoch": 0.24017467248908297, "grad_norm": 0.7082020251604142, "learning_rate": 4.8034934497816594e-05, "loss": 2.0505, "step": 55 }, { "epoch": 0.26200873362445415, "grad_norm": 0.5696342576311191, "learning_rate": 5.240174672489083e-05, "loss": 1.8173, "step": 60 }, { "epoch": 0.2838427947598253, "grad_norm": 0.5023621326635465, "learning_rate": 5.6768558951965065e-05, "loss": 1.8912, "step": 65 }, { "epoch": 0.3056768558951965, "grad_norm": 0.4787415611645182, "learning_rate": 6.11353711790393e-05, "loss": 1.7972, "step": 70 }, { "epoch": 0.32751091703056767, "grad_norm": 0.48422783401495123, "learning_rate": 6.550218340611354e-05, "loss": 1.7183, "step": 75 }, { "epoch": 0.34934497816593885, "grad_norm": 0.3910949318663936, "learning_rate": 6.986899563318777e-05, "loss": 1.6366, "step": 80 }, { "epoch": 0.37117903930131, "grad_norm": 0.3902520624627953, "learning_rate": 7.423580786026201e-05, "loss": 1.7019, "step": 85 }, { "epoch": 0.3930131004366812, "grad_norm": 0.3853385163155022, "learning_rate": 7.860262008733625e-05, "loss": 1.7606, "step": 90 }, { "epoch": 0.4148471615720524, "grad_norm": 0.3911758987224519, "learning_rate": 8.296943231441049e-05, "loss": 1.6395, "step": 95 }, { "epoch": 0.4366812227074236, "grad_norm": 0.42385759736776996, "learning_rate": 8.733624454148472e-05, "loss": 1.6328, "step": 100 }, { "epoch": 0.4585152838427948, "grad_norm": 0.3975926298253002, "learning_rate": 9.170305676855896e-05, "loss": 1.6775, "step": 105 }, { "epoch": 0.48034934497816595, "grad_norm": 0.4355288111027398, "learning_rate": 9.606986899563319e-05, "loss": 1.6008, "step": 110 }, { "epoch": 0.5021834061135371, "grad_norm": 0.44774118787630435, "learning_rate": 0.00010043668122270742, "loss": 1.6046, "step": 115 }, { "epoch": 0.5240174672489083, "grad_norm": 0.45365568335085893, "learning_rate": 0.00010480349344978167, "loss": 1.6348, "step": 120 }, { "epoch": 0.5458515283842795, "grad_norm": 0.3985940875605887, "learning_rate": 0.00010917030567685591, "loss": 1.616, "step": 125 }, { "epoch": 0.5676855895196506, "grad_norm": 0.4103041259632963, "learning_rate": 0.00011353711790393013, "loss": 1.6063, "step": 130 }, { "epoch": 0.5895196506550219, "grad_norm": 0.45594857125060284, "learning_rate": 0.00011790393013100438, "loss": 1.5782, "step": 135 }, { "epoch": 0.611353711790393, "grad_norm": 0.406753522288533, "learning_rate": 0.0001222707423580786, "loss": 1.5361, "step": 140 }, { "epoch": 0.6331877729257642, "grad_norm": 0.45489448779672886, "learning_rate": 0.00012663755458515284, "loss": 1.6416, "step": 145 }, { "epoch": 0.6550218340611353, "grad_norm": 0.4232268425851412, "learning_rate": 0.00013100436681222707, "loss": 1.5449, "step": 150 }, { "epoch": 0.6768558951965066, "grad_norm": 0.4008200720858846, "learning_rate": 0.00013537117903930133, "loss": 1.6322, "step": 155 }, { "epoch": 0.6986899563318777, "grad_norm": 0.41073046435729793, "learning_rate": 0.00013973799126637554, "loss": 1.6144, "step": 160 }, { "epoch": 0.7205240174672489, "grad_norm": 0.43150859357010535, "learning_rate": 0.0001441048034934498, "loss": 1.6816, "step": 165 }, { "epoch": 0.74235807860262, "grad_norm": 0.4209643739546475, "learning_rate": 0.00014847161572052403, "loss": 1.6049, "step": 170 }, { "epoch": 0.7641921397379913, "grad_norm": 0.4646259107508816, "learning_rate": 0.00015283842794759826, "loss": 1.6193, "step": 175 }, { "epoch": 0.7860262008733624, "grad_norm": 0.42132133209440126, "learning_rate": 0.0001572052401746725, "loss": 1.5542, "step": 180 }, { "epoch": 0.8078602620087336, "grad_norm": 0.4068655673248684, "learning_rate": 0.00016157205240174672, "loss": 1.5172, "step": 185 }, { "epoch": 0.8296943231441049, "grad_norm": 0.45022442420363395, "learning_rate": 0.00016593886462882098, "loss": 1.65, "step": 190 }, { "epoch": 0.851528384279476, "grad_norm": 0.4218769408186785, "learning_rate": 0.00017030567685589521, "loss": 1.7073, "step": 195 }, { "epoch": 0.8733624454148472, "grad_norm": 0.44363856749896563, "learning_rate": 0.00017467248908296945, "loss": 1.6647, "step": 200 }, { "epoch": 0.8951965065502183, "grad_norm": 0.39452894148369905, "learning_rate": 0.00017903930131004368, "loss": 1.4932, "step": 205 }, { "epoch": 0.9170305676855895, "grad_norm": 0.407234590774645, "learning_rate": 0.0001834061135371179, "loss": 1.5987, "step": 210 }, { "epoch": 0.9388646288209607, "grad_norm": 0.4299787387863718, "learning_rate": 0.00018777292576419214, "loss": 1.606, "step": 215 }, { "epoch": 0.9606986899563319, "grad_norm": 0.4459993359246055, "learning_rate": 0.00019213973799126638, "loss": 1.6248, "step": 220 }, { "epoch": 0.982532751091703, "grad_norm": 0.42477289910814814, "learning_rate": 0.0001965065502183406, "loss": 1.5145, "step": 225 }, { "epoch": 1.0043668122270741, "grad_norm": 0.4524366274873438, "learning_rate": 0.00019999988382473225, "loss": 1.6031, "step": 230 }, { "epoch": 1.0262008733624455, "grad_norm": 0.4084729303647835, "learning_rate": 0.00019999581771870396, "loss": 1.5467, "step": 235 }, { "epoch": 1.0480349344978166, "grad_norm": 0.5172909354442642, "learning_rate": 0.0001999859431192192, "loss": 1.4636, "step": 240 }, { "epoch": 1.0698689956331877, "grad_norm": 0.4686179138565006, "learning_rate": 0.00019997026059986742, "loss": 1.5244, "step": 245 }, { "epoch": 1.091703056768559, "grad_norm": 0.47437945549478044, "learning_rate": 0.00019994877107160482, "loss": 1.4414, "step": 250 }, { "epoch": 1.1135371179039302, "grad_norm": 0.45613574001819357, "learning_rate": 0.00019992147578270142, "loss": 1.4545, "step": 255 }, { "epoch": 1.1353711790393013, "grad_norm": 0.4440788329210085, "learning_rate": 0.00019988837631866864, "loss": 1.4727, "step": 260 }, { "epoch": 1.1572052401746724, "grad_norm": 0.43804218551099794, "learning_rate": 0.00019984947460216707, "loss": 1.5721, "step": 265 }, { "epoch": 1.1790393013100438, "grad_norm": 0.4712591099454086, "learning_rate": 0.0001998047728928949, "loss": 1.45, "step": 270 }, { "epoch": 1.2008733624454149, "grad_norm": 0.46682322093255346, "learning_rate": 0.00019975427378745659, "loss": 1.5364, "step": 275 }, { "epoch": 1.222707423580786, "grad_norm": 0.44776219185560584, "learning_rate": 0.00019969798021921201, "loss": 1.4799, "step": 280 }, { "epoch": 1.244541484716157, "grad_norm": 0.43694406427786026, "learning_rate": 0.0001996358954581062, "loss": 1.3916, "step": 285 }, { "epoch": 1.2663755458515285, "grad_norm": 0.4369949578213358, "learning_rate": 0.00019956802311047925, "loss": 1.5629, "step": 290 }, { "epoch": 1.2882096069868996, "grad_norm": 0.4507447731242532, "learning_rate": 0.00019949436711885686, "loss": 1.5553, "step": 295 }, { "epoch": 1.3100436681222707, "grad_norm": 0.4238763083224714, "learning_rate": 0.00019941493176172154, "loss": 1.555, "step": 300 }, { "epoch": 1.3318777292576418, "grad_norm": 0.44296845533811896, "learning_rate": 0.0001993297216532637, "loss": 1.5952, "step": 305 }, { "epoch": 1.3537117903930131, "grad_norm": 0.44395910006889616, "learning_rate": 0.00019923874174311394, "loss": 1.4769, "step": 310 }, { "epoch": 1.3755458515283843, "grad_norm": 0.44391981727033414, "learning_rate": 0.00019914199731605546, "loss": 1.5458, "step": 315 }, { "epoch": 1.3973799126637554, "grad_norm": 0.46314929089631696, "learning_rate": 0.00019903949399171692, "loss": 1.5994, "step": 320 }, { "epoch": 1.4192139737991267, "grad_norm": 0.4902612617938975, "learning_rate": 0.0001989312377242463, "loss": 1.5253, "step": 325 }, { "epoch": 1.4410480349344978, "grad_norm": 0.4264196408790441, "learning_rate": 0.0001988172348019648, "loss": 1.5378, "step": 330 }, { "epoch": 1.462882096069869, "grad_norm": 0.4125915654790707, "learning_rate": 0.00019869749184700156, "loss": 1.4231, "step": 335 }, { "epoch": 1.48471615720524, "grad_norm": 0.4121352169920894, "learning_rate": 0.00019857201581490933, "loss": 1.4937, "step": 340 }, { "epoch": 1.5065502183406112, "grad_norm": 0.46111501316021164, "learning_rate": 0.00019844081399425997, "loss": 1.6366, "step": 345 }, { "epoch": 1.5283842794759825, "grad_norm": 0.41020501591518393, "learning_rate": 0.0001983038940062214, "loss": 1.5345, "step": 350 }, { "epoch": 1.5502183406113537, "grad_norm": 0.42368595569558, "learning_rate": 0.00019816126380411476, "loss": 1.5478, "step": 355 }, { "epoch": 1.572052401746725, "grad_norm": 0.41418005567795424, "learning_rate": 0.0001980129316729526, "loss": 1.5202, "step": 360 }, { "epoch": 1.5938864628820961, "grad_norm": 0.4293992491929074, "learning_rate": 0.0001978589062289573, "loss": 1.4605, "step": 365 }, { "epoch": 1.6157205240174672, "grad_norm": 0.4361713850313702, "learning_rate": 0.00019769919641906097, "loss": 1.5154, "step": 370 }, { "epoch": 1.6375545851528384, "grad_norm": 0.3915518977683478, "learning_rate": 0.0001975338115203854, "loss": 1.3845, "step": 375 }, { "epoch": 1.6593886462882095, "grad_norm": 0.40952561447487074, "learning_rate": 0.0001973627611397034, "loss": 1.5663, "step": 380 }, { "epoch": 1.6812227074235808, "grad_norm": 0.41401676412381255, "learning_rate": 0.00019718605521288073, "loss": 1.5892, "step": 385 }, { "epoch": 1.703056768558952, "grad_norm": 0.40948687994041144, "learning_rate": 0.00019700370400429885, "loss": 1.5853, "step": 390 }, { "epoch": 1.7248908296943233, "grad_norm": 0.39912451760472517, "learning_rate": 0.00019681571810625873, "loss": 1.5086, "step": 395 }, { "epoch": 1.7467248908296944, "grad_norm": 0.4274906140041681, "learning_rate": 0.00019662210843836574, "loss": 1.5361, "step": 400 }, { "epoch": 1.7685589519650655, "grad_norm": 0.45496243462203195, "learning_rate": 0.00019642288624689501, "loss": 1.5281, "step": 405 }, { "epoch": 1.7903930131004366, "grad_norm": 0.40754588577831136, "learning_rate": 0.00019621806310413857, "loss": 1.4146, "step": 410 }, { "epoch": 1.8122270742358078, "grad_norm": 0.46020764826270205, "learning_rate": 0.00019600765090773282, "loss": 1.509, "step": 415 }, { "epoch": 1.8340611353711789, "grad_norm": 0.41433203928546175, "learning_rate": 0.0001957916618799676, "loss": 1.4521, "step": 420 }, { "epoch": 1.8558951965065502, "grad_norm": 0.4076208140451916, "learning_rate": 0.00019557010856707617, "loss": 1.5177, "step": 425 }, { "epoch": 1.8777292576419216, "grad_norm": 0.3861676767334766, "learning_rate": 0.00019534300383850642, "loss": 1.5334, "step": 430 }, { "epoch": 1.8995633187772927, "grad_norm": 0.38833060515579254, "learning_rate": 0.00019511036088617342, "loss": 1.5405, "step": 435 }, { "epoch": 1.9213973799126638, "grad_norm": 0.42695215319286783, "learning_rate": 0.000194872193223693, "loss": 1.5458, "step": 440 }, { "epoch": 1.943231441048035, "grad_norm": 0.4089286925168508, "learning_rate": 0.0001946285146855968, "loss": 1.5466, "step": 445 }, { "epoch": 1.965065502183406, "grad_norm": 0.42155330336215135, "learning_rate": 0.00019437933942652885, "loss": 1.566, "step": 450 }, { "epoch": 1.9868995633187772, "grad_norm": 0.3751384149186164, "learning_rate": 0.000194124681920423, "loss": 1.4511, "step": 455 }, { "epoch": 2.0087336244541483, "grad_norm": 0.4161005311145859, "learning_rate": 0.00019386455695966253, "loss": 1.4751, "step": 460 }, { "epoch": 2.03056768558952, "grad_norm": 0.4391122093349958, "learning_rate": 0.0001935989796542207, "loss": 1.4673, "step": 465 }, { "epoch": 2.052401746724891, "grad_norm": 0.4822422647436377, "learning_rate": 0.00019332796543078314, "loss": 1.4212, "step": 470 }, { "epoch": 2.074235807860262, "grad_norm": 0.47440157836581254, "learning_rate": 0.00019305153003185165, "loss": 1.4117, "step": 475 }, { "epoch": 2.096069868995633, "grad_norm": 0.5171235330183517, "learning_rate": 0.00019276968951482986, "loss": 1.377, "step": 480 }, { "epoch": 2.1179039301310043, "grad_norm": 0.5213453701595562, "learning_rate": 0.00019248246025109045, "loss": 1.3892, "step": 485 }, { "epoch": 2.1397379912663754, "grad_norm": 0.5031391500788147, "learning_rate": 0.0001921898589250242, "loss": 1.3859, "step": 490 }, { "epoch": 2.1615720524017465, "grad_norm": 0.48354639648956227, "learning_rate": 0.00019189190253307082, "loss": 1.3916, "step": 495 }, { "epoch": 2.183406113537118, "grad_norm": 0.5173445212952421, "learning_rate": 0.00019158860838273172, "loss": 1.3977, "step": 500 }, { "epoch": 2.2052401746724892, "grad_norm": 0.513094289634623, "learning_rate": 0.00019127999409156453, "loss": 1.3707, "step": 505 }, { "epoch": 2.2270742358078603, "grad_norm": 0.5024827997518101, "learning_rate": 0.00019096607758615998, "loss": 1.3482, "step": 510 }, { "epoch": 2.2489082969432315, "grad_norm": 0.5798563945761437, "learning_rate": 0.0001906468771011003, "loss": 1.4178, "step": 515 }, { "epoch": 2.2707423580786026, "grad_norm": 0.5150755731275006, "learning_rate": 0.00019032241117790028, "loss": 1.4191, "step": 520 }, { "epoch": 2.2925764192139737, "grad_norm": 0.5437625450565541, "learning_rate": 0.00018999269866393006, "loss": 1.3817, "step": 525 }, { "epoch": 2.314410480349345, "grad_norm": 0.4924027196915137, "learning_rate": 0.00018965775871132044, "loss": 1.3745, "step": 530 }, { "epoch": 2.3362445414847164, "grad_norm": 0.5287496980456949, "learning_rate": 0.00018931761077585035, "loss": 1.3749, "step": 535 }, { "epoch": 2.3580786026200875, "grad_norm": 0.5123509887446156, "learning_rate": 0.00018897227461581672, "loss": 1.4476, "step": 540 }, { "epoch": 2.3799126637554586, "grad_norm": 0.5054089069691902, "learning_rate": 0.00018862177029088675, "loss": 1.4103, "step": 545 }, { "epoch": 2.4017467248908297, "grad_norm": 0.5339659464397467, "learning_rate": 0.00018826611816093273, "loss": 1.421, "step": 550 }, { "epoch": 2.423580786026201, "grad_norm": 0.5367901761635062, "learning_rate": 0.00018790533888484937, "loss": 1.4725, "step": 555 }, { "epoch": 2.445414847161572, "grad_norm": 0.5453912898301467, "learning_rate": 0.00018753945341935376, "loss": 1.4671, "step": 560 }, { "epoch": 2.467248908296943, "grad_norm": 0.5114254543022997, "learning_rate": 0.0001871684830177681, "loss": 1.5483, "step": 565 }, { "epoch": 2.489082969432314, "grad_norm": 0.5183339258600979, "learning_rate": 0.00018679244922878516, "loss": 1.4277, "step": 570 }, { "epoch": 2.5109170305676853, "grad_norm": 0.5202383438539178, "learning_rate": 0.00018641137389521645, "loss": 1.4767, "step": 575 }, { "epoch": 2.532751091703057, "grad_norm": 0.5293048970285786, "learning_rate": 0.0001860252791527236, "loss": 1.4691, "step": 580 }, { "epoch": 2.554585152838428, "grad_norm": 0.5307661790603934, "learning_rate": 0.0001856341874285324, "loss": 1.484, "step": 585 }, { "epoch": 2.576419213973799, "grad_norm": 0.5298444162459437, "learning_rate": 0.0001852381214401302, "loss": 1.3704, "step": 590 }, { "epoch": 2.5982532751091703, "grad_norm": 0.5045622495753448, "learning_rate": 0.00018483710419394615, "loss": 1.4273, "step": 595 }, { "epoch": 2.6200873362445414, "grad_norm": 0.516020284616684, "learning_rate": 0.00018443115898401504, "loss": 1.5253, "step": 600 }, { "epoch": 2.641921397379913, "grad_norm": 0.5212259321852013, "learning_rate": 0.000184020309390624, "loss": 1.4966, "step": 605 }, { "epoch": 2.6637554585152836, "grad_norm": 0.571739861550278, "learning_rate": 0.00018360457927894287, "loss": 1.489, "step": 610 }, { "epoch": 2.685589519650655, "grad_norm": 0.5165260257002361, "learning_rate": 0.00018318399279763797, "loss": 1.419, "step": 615 }, { "epoch": 2.7074235807860263, "grad_norm": 0.5022734014528262, "learning_rate": 0.00018275857437746932, "loss": 1.5218, "step": 620 }, { "epoch": 2.7292576419213974, "grad_norm": 0.5065667081884927, "learning_rate": 0.00018232834872987147, "loss": 1.3765, "step": 625 }, { "epoch": 2.7510917030567685, "grad_norm": 0.5208990486632071, "learning_rate": 0.00018189334084551826, "loss": 1.4514, "step": 630 }, { "epoch": 2.7729257641921397, "grad_norm": 0.4881264484974513, "learning_rate": 0.00018145357599287095, "loss": 1.4477, "step": 635 }, { "epoch": 2.7947598253275108, "grad_norm": 0.513600593429205, "learning_rate": 0.00018100907971671054, "loss": 1.4449, "step": 640 }, { "epoch": 2.816593886462882, "grad_norm": 0.6046158817500052, "learning_rate": 0.00018055987783665404, "loss": 1.3161, "step": 645 }, { "epoch": 2.8384279475982535, "grad_norm": 0.5208894858087225, "learning_rate": 0.00018010599644565457, "loss": 1.4693, "step": 650 }, { "epoch": 2.8602620087336246, "grad_norm": 0.5746066357275653, "learning_rate": 0.0001796474619084856, "loss": 1.4347, "step": 655 }, { "epoch": 2.8820960698689957, "grad_norm": 0.5569260278390025, "learning_rate": 0.00017918430086020975, "loss": 1.4628, "step": 660 }, { "epoch": 2.903930131004367, "grad_norm": 0.5015533731915597, "learning_rate": 0.0001787165402046313, "loss": 1.4082, "step": 665 }, { "epoch": 2.925764192139738, "grad_norm": 0.4930090755602876, "learning_rate": 0.0001782442071127338, "loss": 1.4412, "step": 670 }, { "epoch": 2.947598253275109, "grad_norm": 0.5351113450288878, "learning_rate": 0.0001777673290211014, "loss": 1.3765, "step": 675 }, { "epoch": 2.96943231441048, "grad_norm": 0.5333783396073188, "learning_rate": 0.00017728593363032532, "loss": 1.4074, "step": 680 }, { "epoch": 2.9912663755458517, "grad_norm": 0.5148622195538735, "learning_rate": 0.0001768000489033949, "loss": 1.355, "step": 685 }, { "epoch": 3.013100436681223, "grad_norm": 0.5136549354887358, "learning_rate": 0.00017630970306407311, "loss": 1.33, "step": 690 }, { "epoch": 3.034934497816594, "grad_norm": 0.587330448131839, "learning_rate": 0.00017581492459525712, "loss": 1.267, "step": 695 }, { "epoch": 3.056768558951965, "grad_norm": 0.5781502863776671, "learning_rate": 0.00017531574223732396, "loss": 1.3391, "step": 700 }, { "epoch": 3.078602620087336, "grad_norm": 0.5801593817600947, "learning_rate": 0.0001748121849864609, "loss": 1.3398, "step": 705 }, { "epoch": 3.1004366812227073, "grad_norm": 0.6358481598657785, "learning_rate": 0.00017430428209298126, "loss": 1.3191, "step": 710 }, { "epoch": 3.1222707423580784, "grad_norm": 0.635414065898168, "learning_rate": 0.00017379206305962526, "loss": 1.3233, "step": 715 }, { "epoch": 3.14410480349345, "grad_norm": 0.6721891418870005, "learning_rate": 0.0001732755576398463, "loss": 1.2795, "step": 720 }, { "epoch": 3.165938864628821, "grad_norm": 0.6262467411308055, "learning_rate": 0.00017275479583608261, "loss": 1.3117, "step": 725 }, { "epoch": 3.1877729257641922, "grad_norm": 0.7147271334112754, "learning_rate": 0.00017222980789801477, "loss": 1.3604, "step": 730 }, { "epoch": 3.2096069868995634, "grad_norm": 0.677887647709634, "learning_rate": 0.00017170062432080805, "loss": 1.3356, "step": 735 }, { "epoch": 3.2314410480349345, "grad_norm": 0.6529188195262589, "learning_rate": 0.00017116727584334159, "loss": 1.3092, "step": 740 }, { "epoch": 3.2532751091703056, "grad_norm": 0.6545432757758792, "learning_rate": 0.00017062979344642244, "loss": 1.3272, "step": 745 }, { "epoch": 3.2751091703056767, "grad_norm": 0.6417623946150004, "learning_rate": 0.00017008820835098627, "loss": 1.3712, "step": 750 }, { "epoch": 3.2969432314410483, "grad_norm": 0.6419119938295037, "learning_rate": 0.00016954255201628358, "loss": 1.372, "step": 755 }, { "epoch": 3.3187772925764194, "grad_norm": 0.6643469655488602, "learning_rate": 0.00016899285613805246, "loss": 1.3883, "step": 760 }, { "epoch": 3.3406113537117905, "grad_norm": 0.6617592247751748, "learning_rate": 0.00016843915264667746, "loss": 1.3131, "step": 765 }, { "epoch": 3.3624454148471616, "grad_norm": 0.6721660291620549, "learning_rate": 0.00016788147370533482, "loss": 1.3677, "step": 770 }, { "epoch": 3.3842794759825328, "grad_norm": 0.6696065641348963, "learning_rate": 0.00016731985170812414, "loss": 1.3612, "step": 775 }, { "epoch": 3.406113537117904, "grad_norm": 0.6442936479861974, "learning_rate": 0.00016675431927818678, "loss": 1.3288, "step": 780 }, { "epoch": 3.427947598253275, "grad_norm": 0.6665292623524364, "learning_rate": 0.00016618490926581086, "loss": 1.3302, "step": 785 }, { "epoch": 3.449781659388646, "grad_norm": 0.6832138001277586, "learning_rate": 0.00016561165474652292, "loss": 1.296, "step": 790 }, { "epoch": 3.4716157205240172, "grad_norm": 0.6676829472258946, "learning_rate": 0.0001650345890191669, "loss": 1.258, "step": 795 }, { "epoch": 3.493449781659389, "grad_norm": 0.6462889175077688, "learning_rate": 0.00016445374560396974, "loss": 1.3108, "step": 800 }, { "epoch": 3.51528384279476, "grad_norm": 0.6538188867916314, "learning_rate": 0.00016386915824059427, "loss": 1.2225, "step": 805 }, { "epoch": 3.537117903930131, "grad_norm": 0.6573367032320154, "learning_rate": 0.0001632808608861794, "loss": 1.2692, "step": 810 }, { "epoch": 3.558951965065502, "grad_norm": 0.6707468426806011, "learning_rate": 0.0001626888877133677, "loss": 1.2621, "step": 815 }, { "epoch": 3.5807860262008733, "grad_norm": 0.6607806582929415, "learning_rate": 0.00016209327310832028, "loss": 1.3217, "step": 820 }, { "epoch": 3.6026200873362444, "grad_norm": 0.6695542325826566, "learning_rate": 0.00016149405166871947, "loss": 1.2445, "step": 825 }, { "epoch": 3.6244541484716155, "grad_norm": 0.6854945885270477, "learning_rate": 0.00016089125820175913, "loss": 1.2334, "step": 830 }, { "epoch": 3.646288209606987, "grad_norm": 0.6998882406491346, "learning_rate": 0.00016028492772212277, "loss": 1.3228, "step": 835 }, { "epoch": 3.668122270742358, "grad_norm": 0.6481594617699246, "learning_rate": 0.00015967509544994959, "loss": 1.3119, "step": 840 }, { "epoch": 3.6899563318777293, "grad_norm": 0.6775238829866298, "learning_rate": 0.00015906179680878876, "loss": 1.2587, "step": 845 }, { "epoch": 3.7117903930131004, "grad_norm": 0.667250254807951, "learning_rate": 0.00015844506742354164, "loss": 1.335, "step": 850 }, { "epoch": 3.7336244541484715, "grad_norm": 0.6850864766225458, "learning_rate": 0.00015782494311839248, "loss": 1.3585, "step": 855 }, { "epoch": 3.7554585152838427, "grad_norm": 0.7100136757699026, "learning_rate": 0.00015720145991472746, "loss": 1.3494, "step": 860 }, { "epoch": 3.777292576419214, "grad_norm": 0.6455753266089419, "learning_rate": 0.00015657465402904239, "loss": 1.32, "step": 865 }, { "epoch": 3.7991266375545854, "grad_norm": 0.6774008615272976, "learning_rate": 0.00015594456187083887, "loss": 1.3053, "step": 870 }, { "epoch": 3.8209606986899565, "grad_norm": 0.6412117270718072, "learning_rate": 0.0001553112200405094, "loss": 1.3468, "step": 875 }, { "epoch": 3.8427947598253276, "grad_norm": 0.6299987093163749, "learning_rate": 0.00015467466532721136, "loss": 1.2464, "step": 880 }, { "epoch": 3.8646288209606987, "grad_norm": 0.6648608762625429, "learning_rate": 0.00015403493470673006, "loss": 1.4054, "step": 885 }, { "epoch": 3.88646288209607, "grad_norm": 0.6973437539106749, "learning_rate": 0.00015339206533933087, "loss": 1.3005, "step": 890 }, { "epoch": 3.908296943231441, "grad_norm": 0.6609891540439728, "learning_rate": 0.00015274609456760073, "loss": 1.3751, "step": 895 }, { "epoch": 3.930131004366812, "grad_norm": 0.6243806427317503, "learning_rate": 0.0001520970599142789, "loss": 1.309, "step": 900 }, { "epoch": 3.9519650655021836, "grad_norm": 0.6701604629698161, "learning_rate": 0.00015144499908007757, "loss": 1.3302, "step": 905 }, { "epoch": 3.9737991266375547, "grad_norm": 0.6417002641455068, "learning_rate": 0.00015078994994149167, "loss": 1.3244, "step": 910 }, { "epoch": 3.995633187772926, "grad_norm": 0.6227627176738441, "learning_rate": 0.00015013195054859894, "loss": 1.3739, "step": 915 }, { "epoch": 4.0174672489082965, "grad_norm": 0.69565512329475, "learning_rate": 0.00014947103912284958, "loss": 1.1587, "step": 920 }, { "epoch": 4.039301310043668, "grad_norm": 0.9168936710349829, "learning_rate": 0.0001488072540548461, "loss": 1.183, "step": 925 }, { "epoch": 4.06113537117904, "grad_norm": 0.768969897857725, "learning_rate": 0.00014814063390211334, "loss": 1.1114, "step": 930 }, { "epoch": 4.08296943231441, "grad_norm": 0.8549092972669946, "learning_rate": 0.00014747121738685874, "loss": 1.2111, "step": 935 }, { "epoch": 4.104803493449782, "grad_norm": 0.8316566189643829, "learning_rate": 0.00014679904339372302, "loss": 1.1581, "step": 940 }, { "epoch": 4.126637554585153, "grad_norm": 0.8226115993114171, "learning_rate": 0.00014612415096752155, "loss": 1.1881, "step": 945 }, { "epoch": 4.148471615720524, "grad_norm": 0.8505226801184157, "learning_rate": 0.0001454465793109763, "loss": 1.135, "step": 950 }, { "epoch": 4.170305676855895, "grad_norm": 0.8021211984624651, "learning_rate": 0.00014476636778243878, "loss": 1.1768, "step": 955 }, { "epoch": 4.192139737991266, "grad_norm": 0.8578093150750806, "learning_rate": 0.00014408355589360348, "loss": 1.0631, "step": 960 }, { "epoch": 4.213973799126638, "grad_norm": 0.8812497659618362, "learning_rate": 0.00014339818330721314, "loss": 1.1288, "step": 965 }, { "epoch": 4.235807860262009, "grad_norm": 0.7816446218878502, "learning_rate": 0.0001427102898347546, "loss": 1.1777, "step": 970 }, { "epoch": 4.25764192139738, "grad_norm": 0.8163412581216741, "learning_rate": 0.0001420199154341464, "loss": 1.1469, "step": 975 }, { "epoch": 4.279475982532751, "grad_norm": 0.8410801009958802, "learning_rate": 0.0001413271002074176, "loss": 1.1547, "step": 980 }, { "epoch": 4.301310043668122, "grad_norm": 0.8957226442397316, "learning_rate": 0.00014063188439837832, "loss": 1.1054, "step": 985 }, { "epoch": 4.323144104803493, "grad_norm": 0.8533966074014762, "learning_rate": 0.0001399343083902824, "loss": 1.1468, "step": 990 }, { "epoch": 4.344978165938865, "grad_norm": 0.7969709395883895, "learning_rate": 0.00013923441270348124, "loss": 1.1661, "step": 995 }, { "epoch": 4.366812227074236, "grad_norm": 0.830675985424985, "learning_rate": 0.00013853223799307031, "loss": 1.1714, "step": 1000 }, { "epoch": 4.388646288209607, "grad_norm": 0.879665119495671, "learning_rate": 0.00013782782504652763, "loss": 1.2237, "step": 1005 }, { "epoch": 4.4104803493449785, "grad_norm": 0.8513132295064585, "learning_rate": 0.0001371212147813443, "loss": 1.2524, "step": 1010 }, { "epoch": 4.432314410480349, "grad_norm": 0.8345543594033096, "learning_rate": 0.00013641244824264803, "loss": 1.2055, "step": 1015 }, { "epoch": 4.454148471615721, "grad_norm": 0.8449282094486232, "learning_rate": 0.00013570156660081868, "loss": 1.1459, "step": 1020 }, { "epoch": 4.475982532751091, "grad_norm": 0.8491089050635324, "learning_rate": 0.00013498861114909685, "loss": 1.165, "step": 1025 }, { "epoch": 4.497816593886463, "grad_norm": 0.8675954453498238, "learning_rate": 0.00013427362330118543, "loss": 1.1048, "step": 1030 }, { "epoch": 4.5196506550218345, "grad_norm": 0.9120386243780424, "learning_rate": 0.0001335566445888437, "loss": 1.2427, "step": 1035 }, { "epoch": 4.541484716157205, "grad_norm": 0.8105081633081175, "learning_rate": 0.00013283771665947505, "loss": 1.278, "step": 1040 }, { "epoch": 4.563318777292577, "grad_norm": 0.8869239311496004, "learning_rate": 0.00013211688127370784, "loss": 1.1099, "step": 1045 }, { "epoch": 4.585152838427947, "grad_norm": 0.8873085989909458, "learning_rate": 0.00013139418030296937, "loss": 1.1783, "step": 1050 }, { "epoch": 4.606986899563319, "grad_norm": 0.80023844006387, "learning_rate": 0.00013066965572705401, "loss": 1.1504, "step": 1055 }, { "epoch": 4.62882096069869, "grad_norm": 0.8438162486126547, "learning_rate": 0.00012994334963168443, "loss": 1.2292, "step": 1060 }, { "epoch": 4.650655021834061, "grad_norm": 0.8687319952846376, "learning_rate": 0.00012921530420606714, "loss": 1.2132, "step": 1065 }, { "epoch": 4.672489082969433, "grad_norm": 0.8481724475183398, "learning_rate": 0.00012848556174044183, "loss": 1.2114, "step": 1070 }, { "epoch": 4.6943231441048034, "grad_norm": 0.8170589588250686, "learning_rate": 0.00012775416462362457, "loss": 1.2152, "step": 1075 }, { "epoch": 4.716157205240175, "grad_norm": 0.8800579649975868, "learning_rate": 0.00012702115534054593, "loss": 1.1693, "step": 1080 }, { "epoch": 4.737991266375546, "grad_norm": 0.8601610801550544, "learning_rate": 0.0001262865764697829, "loss": 1.1846, "step": 1085 }, { "epoch": 4.759825327510917, "grad_norm": 0.8386680065719362, "learning_rate": 0.00012555047068108568, "loss": 1.249, "step": 1090 }, { "epoch": 4.781659388646288, "grad_norm": 0.8517305343726155, "learning_rate": 0.00012481288073289912, "loss": 1.1364, "step": 1095 }, { "epoch": 4.8034934497816595, "grad_norm": 0.8088860139786535, "learning_rate": 0.00012407384946987898, "loss": 1.1527, "step": 1100 }, { "epoch": 4.825327510917031, "grad_norm": 0.8583326581924249, "learning_rate": 0.00012333341982040323, "loss": 1.1515, "step": 1105 }, { "epoch": 4.847161572052402, "grad_norm": 0.9360072468671379, "learning_rate": 0.00012259163479407832, "loss": 1.0865, "step": 1110 }, { "epoch": 4.868995633187773, "grad_norm": 0.8650317997007926, "learning_rate": 0.00012184853747924112, "loss": 1.131, "step": 1115 }, { "epoch": 4.890829694323144, "grad_norm": 0.8102946053666945, "learning_rate": 0.00012110417104045575, "loss": 1.111, "step": 1120 }, { "epoch": 4.9126637554585155, "grad_norm": 0.9358255576727259, "learning_rate": 0.00012035857871600649, "loss": 1.2429, "step": 1125 }, { "epoch": 4.934497816593886, "grad_norm": 0.8877109383416729, "learning_rate": 0.00011961180381538599, "loss": 1.1798, "step": 1130 }, { "epoch": 4.956331877729258, "grad_norm": 0.8523579756065384, "learning_rate": 0.0001188638897167797, "loss": 1.1524, "step": 1135 }, { "epoch": 4.978165938864628, "grad_norm": 0.8309844177132272, "learning_rate": 0.00011811487986454612, "loss": 1.2469, "step": 1140 }, { "epoch": 5.0, "grad_norm": 0.8482507314409842, "learning_rate": 0.00011736481776669306, "loss": 1.1823, "step": 1145 }, { "epoch": 5.021834061135372, "grad_norm": 1.0322243914151594, "learning_rate": 0.00011661374699235057, "loss": 1.0325, "step": 1150 }, { "epoch": 5.043668122270742, "grad_norm": 1.0324089865002601, "learning_rate": 0.00011586171116924014, "loss": 1.0234, "step": 1155 }, { "epoch": 5.065502183406114, "grad_norm": 0.9762094077290032, "learning_rate": 0.00011510875398114027, "loss": 1.0794, "step": 1160 }, { "epoch": 5.0873362445414845, "grad_norm": 1.1952296969892835, "learning_rate": 0.00011435491916534919, "loss": 1.0145, "step": 1165 }, { "epoch": 5.109170305676856, "grad_norm": 1.0895050768888697, "learning_rate": 0.0001136002505101442, "loss": 1.0151, "step": 1170 }, { "epoch": 5.131004366812227, "grad_norm": 1.0235853080588493, "learning_rate": 0.00011284479185223812, "loss": 1.0388, "step": 1175 }, { "epoch": 5.152838427947598, "grad_norm": 1.0490311233025102, "learning_rate": 0.00011208858707423299, "loss": 1.0072, "step": 1180 }, { "epoch": 5.17467248908297, "grad_norm": 1.1856197953264118, "learning_rate": 0.00011133168010207091, "loss": 1.0504, "step": 1185 }, { "epoch": 5.1965065502183405, "grad_norm": 1.0065811557379292, "learning_rate": 0.00011057411490248266, "loss": 0.9977, "step": 1190 }, { "epoch": 5.218340611353712, "grad_norm": 1.1043632872539175, "learning_rate": 0.00010981593548043374, "loss": 0.9932, "step": 1195 }, { "epoch": 5.240174672489083, "grad_norm": 1.0753914347833422, "learning_rate": 0.00010905718587656811, "loss": 1.092, "step": 1200 }, { "epoch": 5.262008733624454, "grad_norm": 1.0266820326577377, "learning_rate": 0.0001082979101646502, "loss": 1.0655, "step": 1205 }, { "epoch": 5.283842794759825, "grad_norm": 0.9941194317158725, "learning_rate": 0.00010753815244900458, "loss": 0.9828, "step": 1210 }, { "epoch": 5.3056768558951966, "grad_norm": 1.084324048580049, "learning_rate": 0.00010677795686195422, "loss": 1.0229, "step": 1215 }, { "epoch": 5.327510917030567, "grad_norm": 1.051439528201926, "learning_rate": 0.00010601736756125685, "loss": 1.0168, "step": 1220 }, { "epoch": 5.349344978165939, "grad_norm": 1.1580814102197374, "learning_rate": 0.00010525642872753996, "loss": 0.935, "step": 1225 }, { "epoch": 5.37117903930131, "grad_norm": 1.0710336894680983, "learning_rate": 0.00010449518456173456, "loss": 1.067, "step": 1230 }, { "epoch": 5.393013100436681, "grad_norm": 1.0463850478020345, "learning_rate": 0.00010373367928250749, "loss": 1.0489, "step": 1235 }, { "epoch": 5.414847161572053, "grad_norm": 1.1057562523337745, "learning_rate": 0.00010297195712369311, "loss": 0.954, "step": 1240 }, { "epoch": 5.436681222707423, "grad_norm": 1.0647508212261871, "learning_rate": 0.0001022100623317237, "loss": 0.9094, "step": 1245 }, { "epoch": 5.458515283842795, "grad_norm": 1.0854376270937827, "learning_rate": 0.00010144803916305925, "loss": 0.9996, "step": 1250 }, { "epoch": 5.4803493449781655, "grad_norm": 1.0768231071114585, "learning_rate": 0.00010068593188161697, "loss": 1.0098, "step": 1255 }, { "epoch": 5.502183406113537, "grad_norm": 1.0629470347878223, "learning_rate": 9.992378475619981e-05, "loss": 1.0252, "step": 1260 }, { "epoch": 5.524017467248909, "grad_norm": 1.051250411684801, "learning_rate": 9.916164205792527e-05, "loss": 0.9879, "step": 1265 }, { "epoch": 5.545851528384279, "grad_norm": 1.0100574406024927, "learning_rate": 9.839954805765364e-05, "loss": 1.0638, "step": 1270 }, { "epoch": 5.567685589519651, "grad_norm": 1.0416962539798005, "learning_rate": 9.763754702341646e-05, "loss": 0.9556, "step": 1275 }, { "epoch": 5.5895196506550215, "grad_norm": 1.041656249717949, "learning_rate": 9.687568321784509e-05, "loss": 1.0295, "step": 1280 }, { "epoch": 5.611353711790393, "grad_norm": 1.057095267929098, "learning_rate": 9.611400089559975e-05, "loss": 1.0233, "step": 1285 }, { "epoch": 5.633187772925764, "grad_norm": 1.0153521926215252, "learning_rate": 9.535254430079864e-05, "loss": 0.9867, "step": 1290 }, { "epoch": 5.655021834061135, "grad_norm": 1.1345238181227135, "learning_rate": 9.459135766444815e-05, "loss": 1.0027, "step": 1295 }, { "epoch": 5.676855895196507, "grad_norm": 1.1187108189925221, "learning_rate": 9.383048520187344e-05, "loss": 0.9987, "step": 1300 }, { "epoch": 5.698689956331878, "grad_norm": 1.0636989170846824, "learning_rate": 9.306997111015014e-05, "loss": 1.0486, "step": 1305 }, { "epoch": 5.720524017467249, "grad_norm": 1.0706396029944643, "learning_rate": 9.23098595655371e-05, "loss": 0.9931, "step": 1310 }, { "epoch": 5.74235807860262, "grad_norm": 1.0475360658888055, "learning_rate": 9.155019472091022e-05, "loss": 0.9749, "step": 1315 }, { "epoch": 5.764192139737991, "grad_norm": 1.0214139834738702, "learning_rate": 9.079102070319786e-05, "loss": 1.0693, "step": 1320 }, { "epoch": 5.786026200873362, "grad_norm": 1.0530973134949948, "learning_rate": 9.003238161081743e-05, "loss": 1.0228, "step": 1325 }, { "epoch": 5.807860262008734, "grad_norm": 1.103677474707846, "learning_rate": 8.9274321511114e-05, "loss": 0.9761, "step": 1330 }, { "epoch": 5.829694323144105, "grad_norm": 1.0644633976825475, "learning_rate": 8.851688443780043e-05, "loss": 1.0239, "step": 1335 }, { "epoch": 5.851528384279476, "grad_norm": 1.0555551246349646, "learning_rate": 8.776011438839977e-05, "loss": 1.0473, "step": 1340 }, { "epoch": 5.873362445414847, "grad_norm": 1.122056836899885, "learning_rate": 8.70040553216892e-05, "loss": 0.9723, "step": 1345 }, { "epoch": 5.895196506550218, "grad_norm": 0.9963873601564418, "learning_rate": 8.624875115514697e-05, "loss": 1.0268, "step": 1350 }, { "epoch": 5.91703056768559, "grad_norm": 1.0336940380478288, "learning_rate": 8.549424576240102e-05, "loss": 0.9574, "step": 1355 }, { "epoch": 5.93886462882096, "grad_norm": 1.0760501661341102, "learning_rate": 8.474058297068071e-05, "loss": 1.0979, "step": 1360 }, { "epoch": 5.960698689956332, "grad_norm": 1.0890942343064978, "learning_rate": 8.398780655827096e-05, "loss": 0.9427, "step": 1365 }, { "epoch": 5.9825327510917035, "grad_norm": 1.1415589599070428, "learning_rate": 8.323596025196911e-05, "loss": 1.0041, "step": 1370 }, { "epoch": 6.004366812227074, "grad_norm": 1.1509480354124522, "learning_rate": 8.248508772454529e-05, "loss": 0.9545, "step": 1375 }, { "epoch": 6.026200873362446, "grad_norm": 1.4252192004046074, "learning_rate": 8.173523259220521e-05, "loss": 0.8584, "step": 1380 }, { "epoch": 6.048034934497816, "grad_norm": 1.37449264500959, "learning_rate": 8.098643841205685e-05, "loss": 0.8417, "step": 1385 }, { "epoch": 6.069868995633188, "grad_norm": 1.1455224268481572, "learning_rate": 8.023874867958027e-05, "loss": 0.8365, "step": 1390 }, { "epoch": 6.091703056768559, "grad_norm": 1.4517829308762, "learning_rate": 7.949220682610109e-05, "loss": 0.8772, "step": 1395 }, { "epoch": 6.11353711790393, "grad_norm": 1.2198167511326543, "learning_rate": 7.874685621626767e-05, "loss": 0.7638, "step": 1400 }, { "epoch": 6.135371179039302, "grad_norm": 1.2285129578196883, "learning_rate": 7.80027401455321e-05, "loss": 0.8632, "step": 1405 }, { "epoch": 6.157205240174672, "grad_norm": 1.3128304307256697, "learning_rate": 7.725990183763541e-05, "loss": 0.7864, "step": 1410 }, { "epoch": 6.179039301310044, "grad_norm": 1.2223078968334138, "learning_rate": 7.651838444209678e-05, "loss": 0.8107, "step": 1415 }, { "epoch": 6.200873362445415, "grad_norm": 1.2116494464520935, "learning_rate": 7.577823103170695e-05, "loss": 0.7665, "step": 1420 }, { "epoch": 6.222707423580786, "grad_norm": 1.2710441404657735, "learning_rate": 7.503948460002651e-05, "loss": 0.8755, "step": 1425 }, { "epoch": 6.244541484716157, "grad_norm": 1.3090624013120402, "learning_rate": 7.430218805888831e-05, "loss": 0.8635, "step": 1430 }, { "epoch": 6.2663755458515285, "grad_norm": 1.3421074934086965, "learning_rate": 7.356638423590485e-05, "loss": 0.8408, "step": 1435 }, { "epoch": 6.2882096069869, "grad_norm": 1.2190845509165837, "learning_rate": 7.283211587198056e-05, "loss": 0.901, "step": 1440 }, { "epoch": 6.310043668122271, "grad_norm": 1.2848215636192764, "learning_rate": 7.209942561882914e-05, "loss": 0.8183, "step": 1445 }, { "epoch": 6.331877729257642, "grad_norm": 1.2782891875098124, "learning_rate": 7.136835603649599e-05, "loss": 0.8144, "step": 1450 }, { "epoch": 6.353711790393013, "grad_norm": 1.3643650683015465, "learning_rate": 7.0638949590886e-05, "loss": 0.815, "step": 1455 }, { "epoch": 6.3755458515283845, "grad_norm": 1.3747527932453862, "learning_rate": 6.991124865129683e-05, "loss": 0.8058, "step": 1460 }, { "epoch": 6.397379912663755, "grad_norm": 1.324272429862148, "learning_rate": 6.918529548795781e-05, "loss": 0.8359, "step": 1465 }, { "epoch": 6.419213973799127, "grad_norm": 1.329733488330745, "learning_rate": 6.846113226957456e-05, "loss": 0.8081, "step": 1470 }, { "epoch": 6.441048034934497, "grad_norm": 1.3973165024504683, "learning_rate": 6.773880106087945e-05, "loss": 0.9255, "step": 1475 }, { "epoch": 6.462882096069869, "grad_norm": 1.2579802684349493, "learning_rate": 6.701834382018832e-05, "loss": 0.8932, "step": 1480 }, { "epoch": 6.4847161572052405, "grad_norm": 1.3102021281865468, "learning_rate": 6.629980239696315e-05, "loss": 0.8651, "step": 1485 }, { "epoch": 6.506550218340611, "grad_norm": 1.3096833467828455, "learning_rate": 6.558321852938099e-05, "loss": 0.8145, "step": 1490 }, { "epoch": 6.528384279475983, "grad_norm": 1.3466234774293075, "learning_rate": 6.486863384190987e-05, "loss": 0.8885, "step": 1495 }, { "epoch": 6.550218340611353, "grad_norm": 1.296177045867574, "learning_rate": 6.415608984289052e-05, "loss": 0.8546, "step": 1500 }, { "epoch": 6.572052401746725, "grad_norm": 1.2633240958805778, "learning_rate": 6.344562792212554e-05, "loss": 0.8685, "step": 1505 }, { "epoch": 6.593886462882097, "grad_norm": 1.3534959976317207, "learning_rate": 6.273728934847516e-05, "loss": 0.7986, "step": 1510 }, { "epoch": 6.615720524017467, "grad_norm": 1.252619828877404, "learning_rate": 6.203111526745985e-05, "loss": 0.8332, "step": 1515 }, { "epoch": 6.637554585152839, "grad_norm": 1.3249258619095206, "learning_rate": 6.132714669887044e-05, "loss": 0.8308, "step": 1520 }, { "epoch": 6.6593886462882095, "grad_norm": 1.2054299066608396, "learning_rate": 6.0625424534385425e-05, "loss": 0.8697, "step": 1525 }, { "epoch": 6.681222707423581, "grad_norm": 1.1955192517041973, "learning_rate": 5.99259895351955e-05, "loss": 0.8591, "step": 1530 }, { "epoch": 6.703056768558952, "grad_norm": 1.2570222378899258, "learning_rate": 5.9228882329636094e-05, "loss": 0.7953, "step": 1535 }, { "epoch": 6.724890829694323, "grad_norm": 1.3956159571082007, "learning_rate": 5.8534143410827104e-05, "loss": 0.8367, "step": 1540 }, { "epoch": 6.746724890829694, "grad_norm": 1.3649251909287845, "learning_rate": 5.7841813134320975e-05, "loss": 0.8553, "step": 1545 }, { "epoch": 6.7685589519650655, "grad_norm": 1.3726673140188062, "learning_rate": 5.715193171575842e-05, "loss": 0.8649, "step": 1550 }, { "epoch": 6.790393013100436, "grad_norm": 1.2184325480564675, "learning_rate": 5.64645392285325e-05, "loss": 0.8222, "step": 1555 }, { "epoch": 6.812227074235808, "grad_norm": 1.3383861408747, "learning_rate": 5.577967560146077e-05, "loss": 0.851, "step": 1560 }, { "epoch": 6.834061135371179, "grad_norm": 1.3171908370457348, "learning_rate": 5.5097380616466057e-05, "loss": 0.8662, "step": 1565 }, { "epoch": 6.85589519650655, "grad_norm": 1.3054511507141975, "learning_rate": 5.4417693906265365e-05, "loss": 0.8979, "step": 1570 }, { "epoch": 6.877729257641922, "grad_norm": 1.2387745464762083, "learning_rate": 5.374065495206805e-05, "loss": 0.8119, "step": 1575 }, { "epoch": 6.899563318777292, "grad_norm": 1.3948067425415336, "learning_rate": 5.306630308128229e-05, "loss": 0.8409, "step": 1580 }, { "epoch": 6.921397379912664, "grad_norm": 1.3502073303444249, "learning_rate": 5.239467746523048e-05, "loss": 0.8391, "step": 1585 }, { "epoch": 6.9432314410480345, "grad_norm": 1.3334535387387385, "learning_rate": 5.172581711687438e-05, "loss": 0.8577, "step": 1590 }, { "epoch": 6.965065502183406, "grad_norm": 1.3381286547460995, "learning_rate": 5.105976088854842e-05, "loss": 0.8925, "step": 1595 }, { "epoch": 6.986899563318778, "grad_norm": 1.2107404270853181, "learning_rate": 5.0396547469703106e-05, "loss": 0.8894, "step": 1600 }, { "epoch": 7.008733624454148, "grad_norm": 1.2856577830397042, "learning_rate": 4.973621538465768e-05, "loss": 0.8269, "step": 1605 }, { "epoch": 7.03056768558952, "grad_norm": 1.5720811121732976, "learning_rate": 4.907880299036234e-05, "loss": 0.6532, "step": 1610 }, { "epoch": 7.0524017467248905, "grad_norm": 1.513790910492325, "learning_rate": 4.8424348474170014e-05, "loss": 0.6398, "step": 1615 }, { "epoch": 7.074235807860262, "grad_norm": 1.3731923874867311, "learning_rate": 4.7772889851618405e-05, "loss": 0.7323, "step": 1620 }, { "epoch": 7.096069868995633, "grad_norm": 1.3538833010462115, "learning_rate": 4.712446496422165e-05, "loss": 0.6906, "step": 1625 }, { "epoch": 7.117903930131004, "grad_norm": 1.4454754507310241, "learning_rate": 4.647911147727209e-05, "loss": 0.7328, "step": 1630 }, { "epoch": 7.139737991266376, "grad_norm": 1.449171514767732, "learning_rate": 4.583686687765264e-05, "loss": 0.6782, "step": 1635 }, { "epoch": 7.1615720524017465, "grad_norm": 1.6050731544814476, "learning_rate": 4.5197768471659104e-05, "loss": 0.7385, "step": 1640 }, { "epoch": 7.183406113537118, "grad_norm": 1.3388820740639182, "learning_rate": 4.4561853382833206e-05, "loss": 0.6937, "step": 1645 }, { "epoch": 7.205240174672489, "grad_norm": 1.3724590934014314, "learning_rate": 4.3929158549806096e-05, "loss": 0.6899, "step": 1650 }, { "epoch": 7.22707423580786, "grad_norm": 1.5165378554181088, "learning_rate": 4.32997207241528e-05, "loss": 0.7044, "step": 1655 }, { "epoch": 7.248908296943231, "grad_norm": 1.6117091341741596, "learning_rate": 4.267357646825746e-05, "loss": 0.7093, "step": 1660 }, { "epoch": 7.270742358078603, "grad_norm": 1.4490430208656846, "learning_rate": 4.205076215318925e-05, "loss": 0.6967, "step": 1665 }, { "epoch": 7.292576419213974, "grad_norm": 1.469380632694007, "learning_rate": 4.143131395658996e-05, "loss": 0.7164, "step": 1670 }, { "epoch": 7.314410480349345, "grad_norm": 1.519813749480573, "learning_rate": 4.081526786057254e-05, "loss": 0.6724, "step": 1675 }, { "epoch": 7.336244541484716, "grad_norm": 1.4588612796193572, "learning_rate": 4.020265964963066e-05, "loss": 0.731, "step": 1680 }, { "epoch": 7.358078602620087, "grad_norm": 1.429975103899558, "learning_rate": 3.9593524908560464e-05, "loss": 0.7327, "step": 1685 }, { "epoch": 7.379912663755459, "grad_norm": 1.6113371519439155, "learning_rate": 3.898789902039338e-05, "loss": 0.709, "step": 1690 }, { "epoch": 7.401746724890829, "grad_norm": 1.5463250020435173, "learning_rate": 3.8385817164340723e-05, "loss": 0.7246, "step": 1695 }, { "epoch": 7.423580786026201, "grad_norm": 1.463616658449462, "learning_rate": 3.778731431375041e-05, "loss": 0.7013, "step": 1700 }, { "epoch": 7.445414847161572, "grad_norm": 1.5566283347360372, "learning_rate": 3.719242523407539e-05, "loss": 0.7344, "step": 1705 }, { "epoch": 7.467248908296943, "grad_norm": 1.4398876621434327, "learning_rate": 3.6601184480854066e-05, "loss": 0.7323, "step": 1710 }, { "epoch": 7.489082969432315, "grad_norm": 1.5252195085267406, "learning_rate": 3.601362639770328e-05, "loss": 0.7091, "step": 1715 }, { "epoch": 7.510917030567685, "grad_norm": 1.6881923973765582, "learning_rate": 3.542978511432325e-05, "loss": 0.7585, "step": 1720 }, { "epoch": 7.532751091703057, "grad_norm": 1.450411669170985, "learning_rate": 3.484969454451511e-05, "loss": 0.7258, "step": 1725 }, { "epoch": 7.554585152838428, "grad_norm": 1.523075629927926, "learning_rate": 3.4273388384210855e-05, "loss": 0.6716, "step": 1730 }, { "epoch": 7.576419213973799, "grad_norm": 1.4565661875305633, "learning_rate": 3.3700900109516184e-05, "loss": 0.6586, "step": 1735 }, { "epoch": 7.598253275109171, "grad_norm": 1.6164061159781367, "learning_rate": 3.3132262974765906e-05, "loss": 0.7123, "step": 1740 }, { "epoch": 7.620087336244541, "grad_norm": 1.5157276652461606, "learning_rate": 3.256751001059214e-05, "loss": 0.723, "step": 1745 }, { "epoch": 7.641921397379913, "grad_norm": 1.5600828466807661, "learning_rate": 3.200667402200586e-05, "loss": 0.7477, "step": 1750 }, { "epoch": 7.663755458515284, "grad_norm": 1.5728766811598356, "learning_rate": 3.144978758649133e-05, "loss": 0.7001, "step": 1755 }, { "epoch": 7.685589519650655, "grad_norm": 1.458458820326098, "learning_rate": 3.0896883052113525e-05, "loss": 0.7066, "step": 1760 }, { "epoch": 7.707423580786026, "grad_norm": 1.5980960571332508, "learning_rate": 3.034799253563939e-05, "loss": 0.6878, "step": 1765 }, { "epoch": 7.729257641921397, "grad_norm": 1.544990160417705, "learning_rate": 2.9803147920672146e-05, "loss": 0.6894, "step": 1770 }, { "epoch": 7.751091703056769, "grad_norm": 1.6353637199811462, "learning_rate": 2.9262380855799164e-05, "loss": 0.7297, "step": 1775 }, { "epoch": 7.77292576419214, "grad_norm": 1.5830327061313785, "learning_rate": 2.872572275275379e-05, "loss": 0.6983, "step": 1780 }, { "epoch": 7.794759825327511, "grad_norm": 1.4630236788620592, "learning_rate": 2.8193204784590597e-05, "loss": 0.7176, "step": 1785 }, { "epoch": 7.816593886462882, "grad_norm": 1.3928429612080049, "learning_rate": 2.766485788387455e-05, "loss": 0.7269, "step": 1790 }, { "epoch": 7.8384279475982535, "grad_norm": 1.5318894677152983, "learning_rate": 2.7140712740884376e-05, "loss": 0.7094, "step": 1795 }, { "epoch": 7.860262008733624, "grad_norm": 1.5006081477924398, "learning_rate": 2.6620799801829765e-05, "loss": 0.7356, "step": 1800 }, { "epoch": 7.882096069868996, "grad_norm": 1.4913103972222401, "learning_rate": 2.610514926708285e-05, "loss": 0.7563, "step": 1805 }, { "epoch": 7.903930131004367, "grad_norm": 1.5640971425352013, "learning_rate": 2.5593791089423858e-05, "loss": 0.6974, "step": 1810 }, { "epoch": 7.925764192139738, "grad_norm": 1.5288946416263425, "learning_rate": 2.5086754972301384e-05, "loss": 0.7597, "step": 1815 }, { "epoch": 7.9475982532751095, "grad_norm": 1.4850580236939783, "learning_rate": 2.4584070368106928e-05, "loss": 0.731, "step": 1820 }, { "epoch": 7.96943231441048, "grad_norm": 1.5394321618591782, "learning_rate": 2.4085766476463967e-05, "loss": 0.712, "step": 1825 }, { "epoch": 7.991266375545852, "grad_norm": 1.6849009773020913, "learning_rate": 2.3591872242532066e-05, "loss": 0.7327, "step": 1830 }, { "epoch": 8.013100436681222, "grad_norm": 1.4739518847631212, "learning_rate": 2.310241635532531e-05, "loss": 0.6777, "step": 1835 }, { "epoch": 8.034934497816593, "grad_norm": 1.4710344726002955, "learning_rate": 2.2617427246045973e-05, "loss": 0.5886, "step": 1840 }, { "epoch": 8.056768558951966, "grad_norm": 1.8620287227429924, "learning_rate": 2.2136933086432955e-05, "loss": 0.6258, "step": 1845 }, { "epoch": 8.078602620087336, "grad_norm": 1.4930799825826104, "learning_rate": 2.1660961787125388e-05, "loss": 0.6041, "step": 1850 }, { "epoch": 8.100436681222707, "grad_norm": 1.469008850941141, "learning_rate": 2.1189540996041313e-05, "loss": 0.647, "step": 1855 }, { "epoch": 8.12227074235808, "grad_norm": 1.6154095310822976, "learning_rate": 2.0722698096771832e-05, "loss": 0.5866, "step": 1860 }, { "epoch": 8.14410480349345, "grad_norm": 1.7217187185906804, "learning_rate": 2.026046020699035e-05, "loss": 0.6718, "step": 1865 }, { "epoch": 8.16593886462882, "grad_norm": 1.5816769270478201, "learning_rate": 1.980285417687735e-05, "loss": 0.6303, "step": 1870 }, { "epoch": 8.187772925764191, "grad_norm": 1.5330680888020691, "learning_rate": 1.9349906587560862e-05, "loss": 0.6166, "step": 1875 }, { "epoch": 8.209606986899564, "grad_norm": 1.4811035124378678, "learning_rate": 1.8901643749572374e-05, "loss": 0.6245, "step": 1880 }, { "epoch": 8.231441048034934, "grad_norm": 1.6432943675024339, "learning_rate": 1.8458091701318504e-05, "loss": 0.6261, "step": 1885 }, { "epoch": 8.253275109170305, "grad_norm": 1.6763677385421005, "learning_rate": 1.801927620756847e-05, "loss": 0.6468, "step": 1890 }, { "epoch": 8.275109170305678, "grad_norm": 1.5425807262553166, "learning_rate": 1.7585222757957576e-05, "loss": 0.6059, "step": 1895 }, { "epoch": 8.296943231441048, "grad_norm": 1.7341819757802275, "learning_rate": 1.7155956565506547e-05, "loss": 0.6728, "step": 1900 }, { "epoch": 8.318777292576419, "grad_norm": 1.4483276727621572, "learning_rate": 1.6731502565156875e-05, "loss": 0.6033, "step": 1905 }, { "epoch": 8.34061135371179, "grad_norm": 1.5596252995782947, "learning_rate": 1.6311885412322602e-05, "loss": 0.63, "step": 1910 }, { "epoch": 8.362445414847162, "grad_norm": 1.747814876036389, "learning_rate": 1.5897129481457996e-05, "loss": 0.5621, "step": 1915 }, { "epoch": 8.384279475982533, "grad_norm": 1.5550499771354138, "learning_rate": 1.5487258864641717e-05, "loss": 0.6306, "step": 1920 }, { "epoch": 8.406113537117903, "grad_norm": 1.5708977251660243, "learning_rate": 1.50822973701775e-05, "loss": 0.6281, "step": 1925 }, { "epoch": 8.427947598253276, "grad_norm": 1.5052322890768695, "learning_rate": 1.4682268521211073e-05, "loss": 0.5805, "step": 1930 }, { "epoch": 8.449781659388647, "grad_norm": 1.5682376306714874, "learning_rate": 1.4287195554363718e-05, "loss": 0.6103, "step": 1935 }, { "epoch": 8.471615720524017, "grad_norm": 1.5926672273219815, "learning_rate": 1.3897101418382663e-05, "loss": 0.6086, "step": 1940 }, { "epoch": 8.493449781659388, "grad_norm": 1.6920842457267133, "learning_rate": 1.3512008772807993e-05, "loss": 0.6075, "step": 1945 }, { "epoch": 8.51528384279476, "grad_norm": 1.7066301478594386, "learning_rate": 1.3131939986656305e-05, "loss": 0.6037, "step": 1950 }, { "epoch": 8.537117903930131, "grad_norm": 1.6263309583877439, "learning_rate": 1.2756917137121527e-05, "loss": 0.6137, "step": 1955 }, { "epoch": 8.558951965065502, "grad_norm": 1.63476708848148, "learning_rate": 1.2386962008292413e-05, "loss": 0.5858, "step": 1960 }, { "epoch": 8.580786026200874, "grad_norm": 1.558434256481925, "learning_rate": 1.2022096089887191e-05, "loss": 0.6426, "step": 1965 }, { "epoch": 8.602620087336245, "grad_norm": 1.551619834712178, "learning_rate": 1.1662340576005216e-05, "loss": 0.6084, "step": 1970 }, { "epoch": 8.624454148471616, "grad_norm": 1.555097964766268, "learning_rate": 1.130771636389596e-05, "loss": 0.6687, "step": 1975 }, { "epoch": 8.646288209606986, "grad_norm": 1.6643047208497839, "learning_rate": 1.0958244052745126e-05, "loss": 0.6155, "step": 1980 }, { "epoch": 8.668122270742359, "grad_norm": 1.611005114630636, "learning_rate": 1.0613943942478e-05, "loss": 0.6089, "step": 1985 }, { "epoch": 8.68995633187773, "grad_norm": 1.5244955674562928, "learning_rate": 1.0274836032580415e-05, "loss": 0.6487, "step": 1990 }, { "epoch": 8.7117903930131, "grad_norm": 1.6030907080208872, "learning_rate": 9.940940020936951e-06, "loss": 0.6293, "step": 1995 }, { "epoch": 8.733624454148472, "grad_norm": 1.5700446980317904, "learning_rate": 9.612275302686713e-06, "loss": 0.6326, "step": 2000 }, { "epoch": 8.755458515283843, "grad_norm": 1.5394752226532917, "learning_rate": 9.288860969096857e-06, "loss": 0.6107, "step": 2005 }, { "epoch": 8.777292576419214, "grad_norm": 1.667355715027956, "learning_rate": 8.970715806453489e-06, "loss": 0.636, "step": 2010 }, { "epoch": 8.799126637554584, "grad_norm": 1.7421757272877927, "learning_rate": 8.657858294970412e-06, "loss": 0.6358, "step": 2015 }, { "epoch": 8.820960698689957, "grad_norm": 1.48526713960321, "learning_rate": 8.350306607715774e-06, "loss": 0.6456, "step": 2020 }, { "epoch": 8.842794759825328, "grad_norm": 1.7150442937256956, "learning_rate": 8.048078609556386e-06, "loss": 0.6443, "step": 2025 }, { "epoch": 8.864628820960698, "grad_norm": 1.6511037454437418, "learning_rate": 7.751191856119932e-06, "loss": 0.671, "step": 2030 }, { "epoch": 8.886462882096069, "grad_norm": 1.6987081532253472, "learning_rate": 7.459663592775334e-06, "loss": 0.6577, "step": 2035 }, { "epoch": 8.908296943231441, "grad_norm": 1.534952723839, "learning_rate": 7.173510753630919e-06, "loss": 0.6233, "step": 2040 }, { "epoch": 8.930131004366812, "grad_norm": 1.641736331583932, "learning_rate": 6.892749960550815e-06, "loss": 0.6289, "step": 2045 }, { "epoch": 8.951965065502183, "grad_norm": 1.5438647483260877, "learning_rate": 6.6173975221893615e-06, "loss": 0.5888, "step": 2050 }, { "epoch": 8.973799126637555, "grad_norm": 1.5243321966646095, "learning_rate": 6.347469433043851e-06, "loss": 0.6707, "step": 2055 }, { "epoch": 8.995633187772926, "grad_norm": 1.6062799594767991, "learning_rate": 6.082981372525487e-06, "loss": 0.5971, "step": 2060 }, { "epoch": 9.017467248908297, "grad_norm": 1.462091614630793, "learning_rate": 5.823948704048443e-06, "loss": 0.5631, "step": 2065 }, { "epoch": 9.039301310043669, "grad_norm": 1.4923902769131498, "learning_rate": 5.570386474137623e-06, "loss": 0.5617, "step": 2070 }, { "epoch": 9.06113537117904, "grad_norm": 1.6248158642620525, "learning_rate": 5.322309411554582e-06, "loss": 0.6111, "step": 2075 }, { "epoch": 9.08296943231441, "grad_norm": 1.6615679454812948, "learning_rate": 5.0797319264419105e-06, "loss": 0.563, "step": 2080 }, { "epoch": 9.104803493449781, "grad_norm": 1.6126706246375568, "learning_rate": 4.84266810948627e-06, "loss": 0.5686, "step": 2085 }, { "epoch": 9.126637554585153, "grad_norm": 1.5809643369161743, "learning_rate": 4.611131731099905e-06, "loss": 0.5533, "step": 2090 }, { "epoch": 9.148471615720524, "grad_norm": 1.6161716990207524, "learning_rate": 4.385136240620657e-06, "loss": 0.5962, "step": 2095 }, { "epoch": 9.170305676855895, "grad_norm": 1.5154717234389132, "learning_rate": 4.164694765530841e-06, "loss": 0.5946, "step": 2100 }, { "epoch": 9.192139737991265, "grad_norm": 1.5768496797034472, "learning_rate": 3.94982011069468e-06, "loss": 0.5383, "step": 2105 }, { "epoch": 9.213973799126638, "grad_norm": 1.6583634682510404, "learning_rate": 3.7405247576144054e-06, "loss": 0.6018, "step": 2110 }, { "epoch": 9.235807860262009, "grad_norm": 1.5801691223578864, "learning_rate": 3.5368208637053702e-06, "loss": 0.5564, "step": 2115 }, { "epoch": 9.25764192139738, "grad_norm": 1.702349073192532, "learning_rate": 3.338720261589823e-06, "loss": 0.578, "step": 2120 }, { "epoch": 9.279475982532752, "grad_norm": 1.5184620025021562, "learning_rate": 3.146234458409525e-06, "loss": 0.5649, "step": 2125 }, { "epoch": 9.301310043668122, "grad_norm": 1.4782870440067606, "learning_rate": 2.959374635157364e-06, "loss": 0.5708, "step": 2130 }, { "epoch": 9.323144104803493, "grad_norm": 1.5908300327141753, "learning_rate": 2.7781516460279157e-06, "loss": 0.5719, "step": 2135 }, { "epoch": 9.344978165938866, "grad_norm": 1.653067982991365, "learning_rate": 2.6025760177869063e-06, "loss": 0.5914, "step": 2140 }, { "epoch": 9.366812227074236, "grad_norm": 1.5475103396862675, "learning_rate": 2.4326579491597333e-06, "loss": 0.5931, "step": 2145 }, { "epoch": 9.388646288209607, "grad_norm": 1.6475600779910462, "learning_rate": 2.2684073102391066e-06, "loss": 0.5978, "step": 2150 }, { "epoch": 9.410480349344978, "grad_norm": 1.5891384582888524, "learning_rate": 2.1098336419116625e-06, "loss": 0.5656, "step": 2155 }, { "epoch": 9.43231441048035, "grad_norm": 1.5178759944544706, "learning_rate": 1.956946155303785e-06, "loss": 0.6198, "step": 2160 }, { "epoch": 9.45414847161572, "grad_norm": 1.641246543950594, "learning_rate": 1.809753731246544e-06, "loss": 0.5829, "step": 2165 }, { "epoch": 9.475982532751091, "grad_norm": 1.665331469610374, "learning_rate": 1.6682649197598433e-06, "loss": 0.5871, "step": 2170 }, { "epoch": 9.497816593886462, "grad_norm": 1.823654408381571, "learning_rate": 1.5324879395557933e-06, "loss": 0.5906, "step": 2175 }, { "epoch": 9.519650655021834, "grad_norm": 1.564912693530831, "learning_rate": 1.4024306775612283e-06, "loss": 0.6207, "step": 2180 }, { "epoch": 9.541484716157205, "grad_norm": 1.5863861199272236, "learning_rate": 1.2781006884596825e-06, "loss": 0.6267, "step": 2185 }, { "epoch": 9.563318777292576, "grad_norm": 1.6063472205433305, "learning_rate": 1.1595051942524637e-06, "loss": 0.5755, "step": 2190 }, { "epoch": 9.585152838427948, "grad_norm": 1.5366202264694289, "learning_rate": 1.0466510838392229e-06, "loss": 0.5384, "step": 2195 }, { "epoch": 9.606986899563319, "grad_norm": 1.61763818006577, "learning_rate": 9.395449126177291e-07, "loss": 0.6435, "step": 2200 }, { "epoch": 9.62882096069869, "grad_norm": 1.5959225303743334, "learning_rate": 8.381929021031409e-07, "loss": 0.5587, "step": 2205 }, { "epoch": 9.65065502183406, "grad_norm": 1.5317193865637022, "learning_rate": 7.426009395665734e-07, "loss": 0.6166, "step": 2210 }, { "epoch": 9.672489082969433, "grad_norm": 1.7050859904255076, "learning_rate": 6.527745776931382e-07, "loss": 0.6078, "step": 2215 }, { "epoch": 9.694323144104803, "grad_norm": 1.626127532872475, "learning_rate": 5.687190342594239e-07, "loss": 0.6022, "step": 2220 }, { "epoch": 9.716157205240174, "grad_norm": 1.6752520068014993, "learning_rate": 4.904391918303608e-07, "loss": 0.6126, "step": 2225 }, { "epoch": 9.737991266375547, "grad_norm": 1.6226696728292287, "learning_rate": 4.1793959747565836e-07, "loss": 0.5671, "step": 2230 }, { "epoch": 9.759825327510917, "grad_norm": 1.6182868533567345, "learning_rate": 3.5122446250562825e-07, "loss": 0.5859, "step": 2235 }, { "epoch": 9.781659388646288, "grad_norm": 1.686162675556006, "learning_rate": 2.902976622265907e-07, "loss": 0.5634, "step": 2240 }, { "epoch": 9.803493449781659, "grad_norm": 1.5169542450593143, "learning_rate": 2.3516273571577708e-07, "loss": 0.5578, "step": 2245 }, { "epoch": 9.825327510917031, "grad_norm": 1.6287222229266196, "learning_rate": 1.8582288561573847e-07, "loss": 0.5543, "step": 2250 }, { "epoch": 9.847161572052402, "grad_norm": 1.4791363351235698, "learning_rate": 1.4228097794828366e-07, "loss": 0.5705, "step": 2255 }, { "epoch": 9.868995633187772, "grad_norm": 1.5782969298269662, "learning_rate": 1.045395419480677e-07, "loss": 0.5742, "step": 2260 }, { "epoch": 9.890829694323145, "grad_norm": 1.5934772169541918, "learning_rate": 7.260076991560949e-08, "loss": 0.6509, "step": 2265 }, { "epoch": 9.912663755458516, "grad_norm": 1.5072353913111483, "learning_rate": 4.646651708998251e-08, "loss": 0.5509, "step": 2270 }, { "epoch": 9.934497816593886, "grad_norm": 1.5448684833958992, "learning_rate": 2.6138301541056564e-08, "loss": 0.5666, "step": 2275 }, { "epoch": 9.956331877729257, "grad_norm": 1.504682951530182, "learning_rate": 1.1617304081268376e-08, "loss": 0.5865, "step": 2280 }, { "epoch": 9.97816593886463, "grad_norm": 1.5933513163764192, "learning_rate": 2.9043681970875035e-09, "loss": 0.5653, "step": 2285 }, { "epoch": 10.0, "grad_norm": 1.5918112804762306, "learning_rate": 0.0, "loss": 0.5502, "step": 2290 }, { "epoch": 10.0, "step": 2290, "total_flos": 5211452815179776.0, "train_loss": 1.1147898718779785, "train_runtime": 4502.6314, "train_samples_per_second": 32.519, "train_steps_per_second": 0.509 } ], "logging_steps": 5, "max_steps": 2290, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5211452815179776.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }