{
  "best_metric": 1.3783499002456665,
  "best_model_checkpoint": "../checkpoints/sft/checkpoint-2000",
  "epoch": 1.8601767167880947,
  "eval_steps": 500,
  "global_step": 6000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01550147263990079,
      "grad_norm": 0.0904994085431099,
      "learning_rate": 4.9741602067183466e-05,
      "loss": 2.0913,
      "step": 50
    },
    {
      "epoch": 0.03100294527980158,
      "grad_norm": 0.09595159441232681,
      "learning_rate": 4.948320413436693e-05,
      "loss": 1.9736,
      "step": 100
    },
    {
      "epoch": 0.04650441791970237,
      "grad_norm": 0.09749136120080948,
      "learning_rate": 4.922480620155039e-05,
      "loss": 1.9502,
      "step": 150
    },
    {
      "epoch": 0.06200589055960316,
      "grad_norm": 0.08286518603563309,
      "learning_rate": 4.8966408268733855e-05,
      "loss": 1.9324,
      "step": 200
    },
    {
      "epoch": 0.07750736319950395,
      "grad_norm": 0.10390572249889374,
      "learning_rate": 4.870801033591731e-05,
      "loss": 1.9413,
      "step": 250
    },
    {
      "epoch": 0.09300883583940474,
      "grad_norm": 0.08579128980636597,
      "learning_rate": 4.8449612403100775e-05,
      "loss": 1.9437,
      "step": 300
    },
    {
      "epoch": 0.10851030847930554,
      "grad_norm": 0.11584578454494476,
      "learning_rate": 4.819121447028424e-05,
      "loss": 1.939,
      "step": 350
    },
    {
      "epoch": 0.12401178111920633,
      "grad_norm": 0.1426805555820465,
      "learning_rate": 4.79328165374677e-05,
      "loss": 1.9428,
      "step": 400
    },
    {
      "epoch": 0.13951325375910711,
      "grad_norm": 0.08612250536680222,
      "learning_rate": 4.7674418604651164e-05,
      "loss": 1.9212,
      "step": 450
    },
    {
      "epoch": 0.1550147263990079,
      "grad_norm": 0.08324660360813141,
      "learning_rate": 4.741602067183463e-05,
      "loss": 1.9186,
      "step": 500
    },
    {
      "epoch": 0.1550147263990079,
      "eval_loss": 1.3784544467926025,
      "eval_runtime": 10.968,
      "eval_samples_per_second": 32.458,
      "eval_steps_per_second": 4.103,
      "step": 500
    },
    {
      "epoch": 0.1705161990389087,
      "grad_norm": 0.09479448199272156,
      "learning_rate": 4.715762273901809e-05,
      "loss": 1.9346,
      "step": 550
    },
    {
      "epoch": 0.18601767167880948,
      "grad_norm": 0.10014442354440689,
      "learning_rate": 4.6899224806201553e-05,
      "loss": 1.9093,
      "step": 600
    },
    {
      "epoch": 0.20151914431871026,
      "grad_norm": 0.08976327627897263,
      "learning_rate": 4.664082687338501e-05,
      "loss": 1.935,
      "step": 650
    },
    {
      "epoch": 0.21702061695861108,
      "grad_norm": 0.10600723326206207,
      "learning_rate": 4.638242894056848e-05,
      "loss": 1.9172,
      "step": 700
    },
    {
      "epoch": 0.23252208959851187,
      "grad_norm": 0.18757325410842896,
      "learning_rate": 4.6124031007751936e-05,
      "loss": 1.9177,
      "step": 750
    },
    {
      "epoch": 0.24802356223841265,
      "grad_norm": 0.10721405595541,
      "learning_rate": 4.5865633074935406e-05,
      "loss": 1.9098,
      "step": 800
    },
    {
      "epoch": 0.2635250348783134,
      "grad_norm": 0.09476891160011292,
      "learning_rate": 4.560723514211886e-05,
      "loss": 1.9307,
      "step": 850
    },
    {
      "epoch": 0.27902650751821423,
      "grad_norm": 0.09466688334941864,
      "learning_rate": 4.5348837209302326e-05,
      "loss": 1.9178,
      "step": 900
    },
    {
      "epoch": 0.29452798015811504,
      "grad_norm": 0.09889288246631622,
      "learning_rate": 4.509043927648579e-05,
      "loss": 1.9244,
      "step": 950
    },
    {
      "epoch": 0.3100294527980158,
      "grad_norm": 0.14233581721782684,
      "learning_rate": 4.483204134366925e-05,
      "loss": 1.9153,
      "step": 1000
    },
    {
      "epoch": 0.3100294527980158,
      "eval_loss": 1.411450982093811,
      "eval_runtime": 10.6228,
      "eval_samples_per_second": 33.513,
      "eval_steps_per_second": 4.236,
      "step": 1000
    },
    {
      "epoch": 0.3255309254379166,
      "grad_norm": 0.08776664733886719,
      "learning_rate": 4.4573643410852715e-05,
      "loss": 1.9278,
      "step": 1050
    },
    {
      "epoch": 0.3410323980778174,
      "grad_norm": 0.09803825616836548,
      "learning_rate": 4.431524547803618e-05,
      "loss": 1.9199,
      "step": 1100
    },
    {
      "epoch": 0.3565338707177182,
      "grad_norm": 0.13931065797805786,
      "learning_rate": 4.405684754521964e-05,
      "loss": 1.9225,
      "step": 1150
    },
    {
      "epoch": 0.37203534335761895,
      "grad_norm": 0.09357253462076187,
      "learning_rate": 4.3798449612403104e-05,
      "loss": 1.9317,
      "step": 1200
    },
    {
      "epoch": 0.38753681599751977,
      "grad_norm": 0.10508549958467484,
      "learning_rate": 4.354005167958656e-05,
      "loss": 1.9255,
      "step": 1250
    },
    {
      "epoch": 0.4030382886374205,
      "grad_norm": 0.10250881314277649,
      "learning_rate": 4.328165374677003e-05,
      "loss": 1.9137,
      "step": 1300
    },
    {
      "epoch": 0.41853976127732134,
      "grad_norm": 0.10409200936555862,
      "learning_rate": 4.302325581395349e-05,
      "loss": 1.8974,
      "step": 1350
    },
    {
      "epoch": 0.43404123391722216,
      "grad_norm": 0.09141243994235992,
      "learning_rate": 4.276485788113696e-05,
      "loss": 1.9103,
      "step": 1400
    },
    {
      "epoch": 0.4495427065571229,
      "grad_norm": 0.09040896594524384,
      "learning_rate": 4.2506459948320413e-05,
      "loss": 1.9057,
      "step": 1450
    },
    {
      "epoch": 0.46504417919702373,
      "grad_norm": 0.09726119041442871,
      "learning_rate": 4.2248062015503877e-05,
      "loss": 1.9014,
      "step": 1500
    },
    {
      "epoch": 0.46504417919702373,
      "eval_loss": 1.3864915370941162,
      "eval_runtime": 10.6078,
      "eval_samples_per_second": 33.56,
      "eval_steps_per_second": 4.242,
      "step": 1500
    },
    {
      "epoch": 0.4805456518369245,
      "grad_norm": 0.11088709533214569,
      "learning_rate": 4.198966408268734e-05,
      "loss": 1.9152,
      "step": 1550
    },
    {
      "epoch": 0.4960471244768253,
      "grad_norm": 0.11300063878297806,
      "learning_rate": 4.17312661498708e-05,
      "loss": 1.9168,
      "step": 1600
    },
    {
      "epoch": 0.5115485971167261,
      "grad_norm": 0.10414065420627594,
      "learning_rate": 4.1472868217054266e-05,
      "loss": 1.9172,
      "step": 1650
    },
    {
      "epoch": 0.5270500697566268,
      "grad_norm": 0.09191906452178955,
      "learning_rate": 4.121447028423773e-05,
      "loss": 1.8968,
      "step": 1700
    },
    {
      "epoch": 0.5425515423965277,
      "grad_norm": 0.10366778820753098,
      "learning_rate": 4.095607235142119e-05,
      "loss": 1.8976,
      "step": 1750
    },
    {
      "epoch": 0.5580530150364285,
      "grad_norm": 0.09124482423067093,
      "learning_rate": 4.0697674418604655e-05,
      "loss": 1.8896,
      "step": 1800
    },
    {
      "epoch": 0.5735544876763292,
      "grad_norm": 0.10119319707155228,
      "learning_rate": 4.043927648578811e-05,
      "loss": 1.8881,
      "step": 1850
    },
    {
      "epoch": 0.5890559603162301,
      "grad_norm": 0.1147024929523468,
      "learning_rate": 4.018087855297158e-05,
      "loss": 1.9097,
      "step": 1900
    },
    {
      "epoch": 0.6045574329561308,
      "grad_norm": 0.09813511371612549,
      "learning_rate": 3.992248062015504e-05,
      "loss": 1.908,
      "step": 1950
    },
    {
      "epoch": 0.6200589055960316,
      "grad_norm": 0.10509663820266724,
      "learning_rate": 3.966408268733851e-05,
      "loss": 1.9018,
      "step": 2000
    },
    {
      "epoch": 0.6200589055960316,
      "eval_loss": 1.3783499002456665,
      "eval_runtime": 10.6185,
      "eval_samples_per_second": 33.526,
      "eval_steps_per_second": 4.238,
      "step": 2000
    },
    {
      "epoch": 0.6355603782359324,
      "grad_norm": 0.1106325089931488,
      "learning_rate": 3.9405684754521964e-05,
      "loss": 1.8955,
      "step": 2050
    },
    {
      "epoch": 0.6510618508758332,
      "grad_norm": 0.09004335105419159,
      "learning_rate": 3.914728682170543e-05,
      "loss": 1.881,
      "step": 2100
    },
    {
      "epoch": 0.666563323515734,
      "grad_norm": 0.10601779073476791,
      "learning_rate": 3.888888888888889e-05,
      "loss": 1.8996,
      "step": 2150
    },
    {
      "epoch": 0.6820647961556348,
      "grad_norm": 0.09966927021741867,
      "learning_rate": 3.8630490956072354e-05,
      "loss": 1.8806,
      "step": 2200
    },
    {
      "epoch": 0.6975662687955356,
      "grad_norm": 0.10469923913478851,
      "learning_rate": 3.837209302325582e-05,
      "loss": 1.8948,
      "step": 2250
    },
    {
      "epoch": 0.7130677414354364,
      "grad_norm": 0.10708159953355789,
      "learning_rate": 3.811369509043928e-05,
      "loss": 1.8917,
      "step": 2300
    },
    {
      "epoch": 0.7285692140753371,
      "grad_norm": 0.09772736579179764,
      "learning_rate": 3.785529715762274e-05,
      "loss": 1.9049,
      "step": 2350
    },
    {
      "epoch": 0.7440706867152379,
      "grad_norm": 0.10732176899909973,
      "learning_rate": 3.7596899224806207e-05,
      "loss": 1.911,
      "step": 2400
    },
    {
      "epoch": 0.7595721593551388,
      "grad_norm": 0.10482428967952728,
      "learning_rate": 3.733850129198966e-05,
      "loss": 1.8946,
      "step": 2450
    },
    {
      "epoch": 0.7750736319950395,
      "grad_norm": 0.10157684981822968,
      "learning_rate": 3.708010335917313e-05,
      "loss": 1.8859,
      "step": 2500
    },
    {
      "epoch": 0.7750736319950395,
      "eval_loss": 1.3911091089248657,
      "eval_runtime": 10.6093,
      "eval_samples_per_second": 33.555,
      "eval_steps_per_second": 4.242,
      "step": 2500
    },
    {
      "epoch": 0.7905751046349403,
      "grad_norm": 0.10432788729667664,
      "learning_rate": 3.682170542635659e-05,
      "loss": 1.894,
      "step": 2550
    },
    {
      "epoch": 0.806076577274841,
      "grad_norm": 0.10409342497587204,
      "learning_rate": 3.656330749354005e-05,
      "loss": 1.8882,
      "step": 2600
    },
    {
      "epoch": 0.8215780499147419,
      "grad_norm": 0.11120035499334335,
      "learning_rate": 3.6304909560723515e-05,
      "loss": 1.8827,
      "step": 2650
    },
    {
      "epoch": 0.8370795225546427,
      "grad_norm": 0.11159931123256683,
      "learning_rate": 3.604651162790698e-05,
      "loss": 1.8902,
      "step": 2700
    },
    {
      "epoch": 0.8525809951945434,
      "grad_norm": 0.09784770011901855,
      "learning_rate": 3.578811369509044e-05,
      "loss": 1.8881,
      "step": 2750
    },
    {
      "epoch": 0.8680824678344443,
      "grad_norm": 0.09907621890306473,
      "learning_rate": 3.5529715762273905e-05,
      "loss": 1.9006,
      "step": 2800
    },
    {
      "epoch": 0.8835839404743451,
      "grad_norm": 0.10387194156646729,
      "learning_rate": 3.527131782945737e-05,
      "loss": 1.9085,
      "step": 2850
    },
    {
      "epoch": 0.8990854131142458,
      "grad_norm": 0.09763891249895096,
      "learning_rate": 3.5012919896640824e-05,
      "loss": 1.8904,
      "step": 2900
    },
    {
      "epoch": 0.9145868857541466,
      "grad_norm": 0.10186181217432022,
      "learning_rate": 3.4754521963824294e-05,
      "loss": 1.8732,
      "step": 2950
    },
    {
      "epoch": 0.9300883583940475,
      "grad_norm": 0.11027953773736954,
      "learning_rate": 3.449612403100775e-05,
      "loss": 1.8886,
      "step": 3000
    },
    {
      "epoch": 0.9300883583940475,
      "eval_loss": 1.4133652448654175,
      "eval_runtime": 10.6201,
      "eval_samples_per_second": 33.521,
      "eval_steps_per_second": 4.237,
      "step": 3000
    },
    {
      "epoch": 0.9455898310339482,
      "grad_norm": 0.1076328456401825,
      "learning_rate": 3.4237726098191214e-05,
      "loss": 1.9007,
      "step": 3050
    },
    {
      "epoch": 0.961091303673849,
      "grad_norm": 0.10457822680473328,
      "learning_rate": 3.397932816537468e-05,
      "loss": 1.8894,
      "step": 3100
    },
    {
      "epoch": 0.9765927763137499,
      "grad_norm": 0.1240358054637909,
      "learning_rate": 3.372093023255814e-05,
      "loss": 1.8962,
      "step": 3150
    },
    {
      "epoch": 0.9920942489536506,
      "grad_norm": 0.1087583601474762,
      "learning_rate": 3.34625322997416e-05,
      "loss": 1.8849,
      "step": 3200
    },
    {
      "epoch": 1.0075957215935514,
      "grad_norm": 0.10473156720399857,
      "learning_rate": 3.3204134366925067e-05,
      "loss": 1.8825,
      "step": 3250
    },
    {
      "epoch": 1.0230971942334521,
      "grad_norm": 0.11174094676971436,
      "learning_rate": 3.294573643410852e-05,
      "loss": 1.8804,
      "step": 3300
    },
    {
      "epoch": 1.038598666873353,
      "grad_norm": 0.10820575803518295,
      "learning_rate": 3.268733850129199e-05,
      "loss": 1.874,
      "step": 3350
    },
    {
      "epoch": 1.0541001395132537,
      "grad_norm": 0.11375240236520767,
      "learning_rate": 3.242894056847545e-05,
      "loss": 1.892,
      "step": 3400
    },
    {
      "epoch": 1.0696016121531546,
      "grad_norm": 0.10983241349458694,
      "learning_rate": 3.217054263565892e-05,
      "loss": 1.8864,
      "step": 3450
    },
    {
      "epoch": 1.0851030847930554,
      "grad_norm": 0.11412903666496277,
      "learning_rate": 3.1912144702842375e-05,
      "loss": 1.8841,
      "step": 3500
    },
    {
      "epoch": 1.0851030847930554,
      "eval_loss": 1.409255862236023,
      "eval_runtime": 10.6154,
      "eval_samples_per_second": 33.536,
      "eval_steps_per_second": 4.239,
      "step": 3500
    },
    {
      "epoch": 1.1006045574329562,
      "grad_norm": 0.11279023438692093,
      "learning_rate": 3.1653746770025845e-05,
      "loss": 1.8809,
      "step": 3550
    },
    {
      "epoch": 1.116106030072857,
      "grad_norm": 0.12842436134815216,
      "learning_rate": 3.13953488372093e-05,
      "loss": 1.8822,
      "step": 3600
    },
    {
      "epoch": 1.1316075027127577,
      "grad_norm": 0.11607849597930908,
      "learning_rate": 3.1136950904392765e-05,
      "loss": 1.8677,
      "step": 3650
    },
    {
      "epoch": 1.1471089753526584,
      "grad_norm": 0.11454817652702332,
      "learning_rate": 3.087855297157623e-05,
      "loss": 1.8785,
      "step": 3700
    },
    {
      "epoch": 1.1626104479925594,
      "grad_norm": 0.12619228661060333,
      "learning_rate": 3.062015503875969e-05,
      "loss": 1.8735,
      "step": 3750
    },
    {
      "epoch": 1.1781119206324602,
      "grad_norm": 0.12233872711658478,
      "learning_rate": 3.0361757105943154e-05,
      "loss": 1.8625,
      "step": 3800
    },
    {
      "epoch": 1.193613393272361,
      "grad_norm": 0.1316874474287033,
      "learning_rate": 3.0103359173126618e-05,
      "loss": 1.8775,
      "step": 3850
    },
    {
      "epoch": 1.2091148659122617,
      "grad_norm": 0.12887555360794067,
      "learning_rate": 2.9844961240310077e-05,
      "loss": 1.8868,
      "step": 3900
    },
    {
      "epoch": 1.2246163385521625,
      "grad_norm": 0.11875531822443008,
      "learning_rate": 2.9586563307493544e-05,
      "loss": 1.8645,
      "step": 3950
    },
    {
      "epoch": 1.2401178111920632,
      "grad_norm": 0.11634934693574905,
      "learning_rate": 2.9328165374677004e-05,
      "loss": 1.8803,
      "step": 4000
    },
    {
      "epoch": 1.2401178111920632,
      "eval_loss": 1.4193403720855713,
      "eval_runtime": 10.6141,
      "eval_samples_per_second": 33.54,
      "eval_steps_per_second": 4.24,
      "step": 4000
    },
    {
      "epoch": 1.255619283831964,
      "grad_norm": 0.12749060988426208,
      "learning_rate": 2.9069767441860467e-05,
      "loss": 1.8699,
      "step": 4050
    },
    {
      "epoch": 1.271120756471865,
      "grad_norm": 0.12758812308311462,
      "learning_rate": 2.881136950904393e-05,
      "loss": 1.8624,
      "step": 4100
    },
    {
      "epoch": 1.2866222291117655,
      "grad_norm": 0.11913730949163437,
      "learning_rate": 2.8552971576227393e-05,
      "loss": 1.8731,
      "step": 4150
    },
    {
      "epoch": 1.3021237017516665,
      "grad_norm": 0.11662113666534424,
      "learning_rate": 2.8294573643410853e-05,
      "loss": 1.8936,
      "step": 4200
    },
    {
      "epoch": 1.3176251743915672,
      "grad_norm": 0.13116636872291565,
      "learning_rate": 2.8036175710594316e-05,
      "loss": 1.8788,
      "step": 4250
    },
    {
      "epoch": 1.333126647031468,
      "grad_norm": 0.12884733080863953,
      "learning_rate": 2.777777777777778e-05,
      "loss": 1.8788,
      "step": 4300
    },
    {
      "epoch": 1.3486281196713688,
      "grad_norm": 0.12196724116802216,
      "learning_rate": 2.751937984496124e-05,
      "loss": 1.8689,
      "step": 4350
    },
    {
      "epoch": 1.3641295923112695,
      "grad_norm": 0.13119535148143768,
      "learning_rate": 2.7260981912144705e-05,
      "loss": 1.8702,
      "step": 4400
    },
    {
      "epoch": 1.3796310649511705,
      "grad_norm": 0.12181399017572403,
      "learning_rate": 2.7002583979328165e-05,
      "loss": 1.8708,
      "step": 4450
    },
    {
      "epoch": 1.395132537591071,
      "grad_norm": 0.1291220635175705,
      "learning_rate": 2.674418604651163e-05,
      "loss": 1.8509,
      "step": 4500
    },
    {
      "epoch": 1.395132537591071,
      "eval_loss": 1.4268625974655151,
      "eval_runtime": 10.6177,
      "eval_samples_per_second": 33.529,
      "eval_steps_per_second": 4.238,
      "step": 4500
    },
    {
      "epoch": 1.410634010230972,
      "grad_norm": 0.1215544119477272,
      "learning_rate": 2.648578811369509e-05,
      "loss": 1.868,
      "step": 4550
    },
    {
      "epoch": 1.4261354828708728,
      "grad_norm": 0.12441984564065933,
      "learning_rate": 2.622739018087855e-05,
      "loss": 1.8624,
      "step": 4600
    },
    {
      "epoch": 1.4416369555107735,
      "grad_norm": 0.12983018159866333,
      "learning_rate": 2.5968992248062018e-05,
      "loss": 1.875,
      "step": 4650
    },
    {
      "epoch": 1.4571384281506743,
      "grad_norm": 0.14041350781917572,
      "learning_rate": 2.5710594315245478e-05,
      "loss": 1.8634,
      "step": 4700
    },
    {
      "epoch": 1.472639900790575,
      "grad_norm": 0.1323089897632599,
      "learning_rate": 2.5452196382428944e-05,
      "loss": 1.8701,
      "step": 4750
    },
    {
      "epoch": 1.488141373430476,
      "grad_norm": 0.13671238720417023,
      "learning_rate": 2.5193798449612404e-05,
      "loss": 1.8722,
      "step": 4800
    },
    {
      "epoch": 1.5036428460703766,
      "grad_norm": 0.12991730868816376,
      "learning_rate": 2.4935400516795867e-05,
      "loss": 1.8548,
      "step": 4850
    },
    {
      "epoch": 1.5191443187102776,
      "grad_norm": 0.12073440849781036,
      "learning_rate": 2.467700258397933e-05,
      "loss": 1.8817,
      "step": 4900
    },
    {
      "epoch": 1.5346457913501783,
      "grad_norm": 0.1927834451198578,
      "learning_rate": 2.441860465116279e-05,
      "loss": 1.8624,
      "step": 4950
    },
    {
      "epoch": 1.550147263990079,
      "grad_norm": 0.1434555947780609,
      "learning_rate": 2.4160206718346253e-05,
      "loss": 1.8571,
      "step": 5000
    },
    {
      "epoch": 1.550147263990079,
      "eval_loss": 1.4204214811325073,
      "eval_runtime": 10.6279,
      "eval_samples_per_second": 33.497,
      "eval_steps_per_second": 4.234,
      "step": 5000
    },
    {
      "epoch": 1.5656487366299798,
      "grad_norm": 0.1407179832458496,
      "learning_rate": 2.3901808785529716e-05,
      "loss": 1.8693,
      "step": 5050
    },
    {
      "epoch": 1.5811502092698806,
      "grad_norm": 0.13802586495876312,
      "learning_rate": 2.364341085271318e-05,
      "loss": 1.8653,
      "step": 5100
    },
    {
      "epoch": 1.5966516819097816,
      "grad_norm": 0.13259877264499664,
      "learning_rate": 2.3385012919896642e-05,
      "loss": 1.85,
      "step": 5150
    },
    {
      "epoch": 1.612153154549682,
      "grad_norm": 0.14958499372005463,
      "learning_rate": 2.3131782945736435e-05,
      "loss": 1.8707,
      "step": 5200
    },
    {
      "epoch": 1.627654627189583,
      "grad_norm": 0.13671304285526276,
      "learning_rate": 2.2873385012919898e-05,
      "loss": 1.8642,
      "step": 5250
    },
    {
      "epoch": 1.6431560998294839,
      "grad_norm": 0.12920096516609192,
      "learning_rate": 2.261498708010336e-05,
      "loss": 1.856,
      "step": 5300
    },
    {
      "epoch": 1.6586575724693846,
      "grad_norm": 0.1263495683670044,
      "learning_rate": 2.235658914728682e-05,
      "loss": 1.8639,
      "step": 5350
    },
    {
      "epoch": 1.6741590451092854,
      "grad_norm": 0.1388077437877655,
      "learning_rate": 2.2098191214470284e-05,
      "loss": 1.8662,
      "step": 5400
    },
    {
      "epoch": 1.6896605177491861,
      "grad_norm": 0.13600564002990723,
      "learning_rate": 2.1839793281653747e-05,
      "loss": 1.8667,
      "step": 5450
    },
    {
      "epoch": 1.7051619903890871,
      "grad_norm": 0.1285238116979599,
      "learning_rate": 2.158139534883721e-05,
      "loss": 1.8691,
      "step": 5500
    },
    {
      "epoch": 1.7051619903890871,
      "eval_loss": 1.4164341688156128,
      "eval_runtime": 10.618,
      "eval_samples_per_second": 33.528,
      "eval_steps_per_second": 4.238,
      "step": 5500
    },
    {
      "epoch": 1.7206634630289876,
      "grad_norm": 0.14896276593208313,
      "learning_rate": 2.1322997416020673e-05,
      "loss": 1.8526,
      "step": 5550
    },
    {
      "epoch": 1.7361649356688886,
      "grad_norm": 0.13365153968334198,
      "learning_rate": 2.1064599483204136e-05,
      "loss": 1.8592,
      "step": 5600
    },
    {
      "epoch": 1.7516664083087892,
      "grad_norm": 0.1381397843360901,
      "learning_rate": 2.0806201550387596e-05,
      "loss": 1.8563,
      "step": 5650
    },
    {
      "epoch": 1.7671678809486902,
      "grad_norm": 0.14912950992584229,
      "learning_rate": 2.054780361757106e-05,
      "loss": 1.8562,
      "step": 5700
    },
    {
      "epoch": 1.782669353588591,
      "grad_norm": 0.13218577206134796,
      "learning_rate": 2.0289405684754523e-05,
      "loss": 1.8617,
      "step": 5750
    },
    {
      "epoch": 1.7981708262284917,
      "grad_norm": 0.13850583136081696,
      "learning_rate": 2.0031007751937986e-05,
      "loss": 1.8563,
      "step": 5800
    },
    {
      "epoch": 1.8136722988683927,
      "grad_norm": 0.13015194237232208,
      "learning_rate": 1.977260981912145e-05,
      "loss": 1.8473,
      "step": 5850
    },
    {
      "epoch": 1.8291737715082932,
      "grad_norm": 0.13473467528820038,
      "learning_rate": 1.9514211886304912e-05,
      "loss": 1.8549,
      "step": 5900
    },
    {
      "epoch": 1.8446752441481942,
      "grad_norm": 0.13621702790260315,
      "learning_rate": 1.9255813953488372e-05,
      "loss": 1.8565,
      "step": 5950
    },
    {
      "epoch": 1.8601767167880947,
      "grad_norm": 0.1430806815624237,
      "learning_rate": 1.8997416020671835e-05,
      "loss": 1.8533,
      "step": 6000
    },
    {
      "epoch": 1.8601767167880947,
      "eval_loss": 1.4264311790466309,
      "eval_runtime": 10.6098,
      "eval_samples_per_second": 33.554,
      "eval_steps_per_second": 4.241,
      "step": 6000
    }
  ],
  "logging_steps": 50,
  "max_steps": 9675,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 1.108324190905172e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|
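A note on reading this state dump: `eval_loss` bottoms out at 1.3783 at step 2000 (matching `best_metric` and `best_model_checkpoint`) and drifts back up to ~1.42-1.43 while the training loss keeps falling, a typical early-overfitting signature, so the checkpoint-2000 weights are the ones worth keeping. Below is a minimal sketch, not part of the training run itself, for pulling the two curves out of `log_history`; the file path is illustrative, and matplotlib is assumed to be installed.

```python
import json
import matplotlib.pyplot as plt

# Illustrative path; point this at the trainer_state.json shown above.
with open("trainer_state.json") as f:
    state = json.load(f)

# The Trainer interleaves train logs ("loss") and eval logs ("eval_loss")
# in one list, so split them by which key each entry carries.
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

# Recover the best checkpoint from the eval curve.
best_step, best_loss = min(evals, key=lambda x: x[1])
print(f"best eval_loss {best_loss:.4f} at step {best_step}")  # 1.3783 at 2000

plt.plot(*zip(*train), label="train loss")
plt.plot(*zip(*evals), marker="o", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("sft_loss.png")
```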