Phi-3-mini-4k-instruct-sft / trainer_state.json
{
"best_metric": 1.3783499002456665,
"best_model_checkpoint": "../checkpoints/sft/checkpoint-2000",
"epoch": 1.8601767167880947,
"eval_steps": 500,
"global_step": 6000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01550147263990079,
"grad_norm": 0.0904994085431099,
"learning_rate": 4.9741602067183466e-05,
"loss": 2.0913,
"step": 50
},
{
"epoch": 0.03100294527980158,
"grad_norm": 0.09595159441232681,
"learning_rate": 4.948320413436693e-05,
"loss": 1.9736,
"step": 100
},
{
"epoch": 0.04650441791970237,
"grad_norm": 0.09749136120080948,
"learning_rate": 4.922480620155039e-05,
"loss": 1.9502,
"step": 150
},
{
"epoch": 0.06200589055960316,
"grad_norm": 0.08286518603563309,
"learning_rate": 4.8966408268733855e-05,
"loss": 1.9324,
"step": 200
},
{
"epoch": 0.07750736319950395,
"grad_norm": 0.10390572249889374,
"learning_rate": 4.870801033591731e-05,
"loss": 1.9413,
"step": 250
},
{
"epoch": 0.09300883583940474,
"grad_norm": 0.08579128980636597,
"learning_rate": 4.8449612403100775e-05,
"loss": 1.9437,
"step": 300
},
{
"epoch": 0.10851030847930554,
"grad_norm": 0.11584578454494476,
"learning_rate": 4.819121447028424e-05,
"loss": 1.939,
"step": 350
},
{
"epoch": 0.12401178111920633,
"grad_norm": 0.1426805555820465,
"learning_rate": 4.79328165374677e-05,
"loss": 1.9428,
"step": 400
},
{
"epoch": 0.13951325375910711,
"grad_norm": 0.08612250536680222,
"learning_rate": 4.7674418604651164e-05,
"loss": 1.9212,
"step": 450
},
{
"epoch": 0.1550147263990079,
"grad_norm": 0.08324660360813141,
"learning_rate": 4.741602067183463e-05,
"loss": 1.9186,
"step": 500
},
{
"epoch": 0.1550147263990079,
"eval_loss": 1.3784544467926025,
"eval_runtime": 10.968,
"eval_samples_per_second": 32.458,
"eval_steps_per_second": 4.103,
"step": 500
},
{
"epoch": 0.1705161990389087,
"grad_norm": 0.09479448199272156,
"learning_rate": 4.715762273901809e-05,
"loss": 1.9346,
"step": 550
},
{
"epoch": 0.18601767167880948,
"grad_norm": 0.10014442354440689,
"learning_rate": 4.6899224806201553e-05,
"loss": 1.9093,
"step": 600
},
{
"epoch": 0.20151914431871026,
"grad_norm": 0.08976327627897263,
"learning_rate": 4.664082687338501e-05,
"loss": 1.935,
"step": 650
},
{
"epoch": 0.21702061695861108,
"grad_norm": 0.10600723326206207,
"learning_rate": 4.638242894056848e-05,
"loss": 1.9172,
"step": 700
},
{
"epoch": 0.23252208959851187,
"grad_norm": 0.18757325410842896,
"learning_rate": 4.6124031007751936e-05,
"loss": 1.9177,
"step": 750
},
{
"epoch": 0.24802356223841265,
"grad_norm": 0.10721405595541,
"learning_rate": 4.5865633074935406e-05,
"loss": 1.9098,
"step": 800
},
{
"epoch": 0.2635250348783134,
"grad_norm": 0.09476891160011292,
"learning_rate": 4.560723514211886e-05,
"loss": 1.9307,
"step": 850
},
{
"epoch": 0.27902650751821423,
"grad_norm": 0.09466688334941864,
"learning_rate": 4.5348837209302326e-05,
"loss": 1.9178,
"step": 900
},
{
"epoch": 0.29452798015811504,
"grad_norm": 0.09889288246631622,
"learning_rate": 4.509043927648579e-05,
"loss": 1.9244,
"step": 950
},
{
"epoch": 0.3100294527980158,
"grad_norm": 0.14233581721782684,
"learning_rate": 4.483204134366925e-05,
"loss": 1.9153,
"step": 1000
},
{
"epoch": 0.3100294527980158,
"eval_loss": 1.411450982093811,
"eval_runtime": 10.6228,
"eval_samples_per_second": 33.513,
"eval_steps_per_second": 4.236,
"step": 1000
},
{
"epoch": 0.3255309254379166,
"grad_norm": 0.08776664733886719,
"learning_rate": 4.4573643410852715e-05,
"loss": 1.9278,
"step": 1050
},
{
"epoch": 0.3410323980778174,
"grad_norm": 0.09803825616836548,
"learning_rate": 4.431524547803618e-05,
"loss": 1.9199,
"step": 1100
},
{
"epoch": 0.3565338707177182,
"grad_norm": 0.13931065797805786,
"learning_rate": 4.405684754521964e-05,
"loss": 1.9225,
"step": 1150
},
{
"epoch": 0.37203534335761895,
"grad_norm": 0.09357253462076187,
"learning_rate": 4.3798449612403104e-05,
"loss": 1.9317,
"step": 1200
},
{
"epoch": 0.38753681599751977,
"grad_norm": 0.10508549958467484,
"learning_rate": 4.354005167958656e-05,
"loss": 1.9255,
"step": 1250
},
{
"epoch": 0.4030382886374205,
"grad_norm": 0.10250881314277649,
"learning_rate": 4.328165374677003e-05,
"loss": 1.9137,
"step": 1300
},
{
"epoch": 0.41853976127732134,
"grad_norm": 0.10409200936555862,
"learning_rate": 4.302325581395349e-05,
"loss": 1.8974,
"step": 1350
},
{
"epoch": 0.43404123391722216,
"grad_norm": 0.09141243994235992,
"learning_rate": 4.276485788113696e-05,
"loss": 1.9103,
"step": 1400
},
{
"epoch": 0.4495427065571229,
"grad_norm": 0.09040896594524384,
"learning_rate": 4.2506459948320413e-05,
"loss": 1.9057,
"step": 1450
},
{
"epoch": 0.46504417919702373,
"grad_norm": 0.09726119041442871,
"learning_rate": 4.2248062015503877e-05,
"loss": 1.9014,
"step": 1500
},
{
"epoch": 0.46504417919702373,
"eval_loss": 1.3864915370941162,
"eval_runtime": 10.6078,
"eval_samples_per_second": 33.56,
"eval_steps_per_second": 4.242,
"step": 1500
},
{
"epoch": 0.4805456518369245,
"grad_norm": 0.11088709533214569,
"learning_rate": 4.198966408268734e-05,
"loss": 1.9152,
"step": 1550
},
{
"epoch": 0.4960471244768253,
"grad_norm": 0.11300063878297806,
"learning_rate": 4.17312661498708e-05,
"loss": 1.9168,
"step": 1600
},
{
"epoch": 0.5115485971167261,
"grad_norm": 0.10414065420627594,
"learning_rate": 4.1472868217054266e-05,
"loss": 1.9172,
"step": 1650
},
{
"epoch": 0.5270500697566268,
"grad_norm": 0.09191906452178955,
"learning_rate": 4.121447028423773e-05,
"loss": 1.8968,
"step": 1700
},
{
"epoch": 0.5425515423965277,
"grad_norm": 0.10366778820753098,
"learning_rate": 4.095607235142119e-05,
"loss": 1.8976,
"step": 1750
},
{
"epoch": 0.5580530150364285,
"grad_norm": 0.09124482423067093,
"learning_rate": 4.0697674418604655e-05,
"loss": 1.8896,
"step": 1800
},
{
"epoch": 0.5735544876763292,
"grad_norm": 0.10119319707155228,
"learning_rate": 4.043927648578811e-05,
"loss": 1.8881,
"step": 1850
},
{
"epoch": 0.5890559603162301,
"grad_norm": 0.1147024929523468,
"learning_rate": 4.018087855297158e-05,
"loss": 1.9097,
"step": 1900
},
{
"epoch": 0.6045574329561308,
"grad_norm": 0.09813511371612549,
"learning_rate": 3.992248062015504e-05,
"loss": 1.908,
"step": 1950
},
{
"epoch": 0.6200589055960316,
"grad_norm": 0.10509663820266724,
"learning_rate": 3.966408268733851e-05,
"loss": 1.9018,
"step": 2000
},
{
"epoch": 0.6200589055960316,
"eval_loss": 1.3783499002456665,
"eval_runtime": 10.6185,
"eval_samples_per_second": 33.526,
"eval_steps_per_second": 4.238,
"step": 2000
},
{
"epoch": 0.6355603782359324,
"grad_norm": 0.1106325089931488,
"learning_rate": 3.9405684754521964e-05,
"loss": 1.8955,
"step": 2050
},
{
"epoch": 0.6510618508758332,
"grad_norm": 0.09004335105419159,
"learning_rate": 3.914728682170543e-05,
"loss": 1.881,
"step": 2100
},
{
"epoch": 0.666563323515734,
"grad_norm": 0.10601779073476791,
"learning_rate": 3.888888888888889e-05,
"loss": 1.8996,
"step": 2150
},
{
"epoch": 0.6820647961556348,
"grad_norm": 0.09966927021741867,
"learning_rate": 3.8630490956072354e-05,
"loss": 1.8806,
"step": 2200
},
{
"epoch": 0.6975662687955356,
"grad_norm": 0.10469923913478851,
"learning_rate": 3.837209302325582e-05,
"loss": 1.8948,
"step": 2250
},
{
"epoch": 0.7130677414354364,
"grad_norm": 0.10708159953355789,
"learning_rate": 3.811369509043928e-05,
"loss": 1.8917,
"step": 2300
},
{
"epoch": 0.7285692140753371,
"grad_norm": 0.09772736579179764,
"learning_rate": 3.785529715762274e-05,
"loss": 1.9049,
"step": 2350
},
{
"epoch": 0.7440706867152379,
"grad_norm": 0.10732176899909973,
"learning_rate": 3.7596899224806207e-05,
"loss": 1.911,
"step": 2400
},
{
"epoch": 0.7595721593551388,
"grad_norm": 0.10482428967952728,
"learning_rate": 3.733850129198966e-05,
"loss": 1.8946,
"step": 2450
},
{
"epoch": 0.7750736319950395,
"grad_norm": 0.10157684981822968,
"learning_rate": 3.708010335917313e-05,
"loss": 1.8859,
"step": 2500
},
{
"epoch": 0.7750736319950395,
"eval_loss": 1.3911091089248657,
"eval_runtime": 10.6093,
"eval_samples_per_second": 33.555,
"eval_steps_per_second": 4.242,
"step": 2500
},
{
"epoch": 0.7905751046349403,
"grad_norm": 0.10432788729667664,
"learning_rate": 3.682170542635659e-05,
"loss": 1.894,
"step": 2550
},
{
"epoch": 0.806076577274841,
"grad_norm": 0.10409342497587204,
"learning_rate": 3.656330749354005e-05,
"loss": 1.8882,
"step": 2600
},
{
"epoch": 0.8215780499147419,
"grad_norm": 0.11120035499334335,
"learning_rate": 3.6304909560723515e-05,
"loss": 1.8827,
"step": 2650
},
{
"epoch": 0.8370795225546427,
"grad_norm": 0.11159931123256683,
"learning_rate": 3.604651162790698e-05,
"loss": 1.8902,
"step": 2700
},
{
"epoch": 0.8525809951945434,
"grad_norm": 0.09784770011901855,
"learning_rate": 3.578811369509044e-05,
"loss": 1.8881,
"step": 2750
},
{
"epoch": 0.8680824678344443,
"grad_norm": 0.09907621890306473,
"learning_rate": 3.5529715762273905e-05,
"loss": 1.9006,
"step": 2800
},
{
"epoch": 0.8835839404743451,
"grad_norm": 0.10387194156646729,
"learning_rate": 3.527131782945737e-05,
"loss": 1.9085,
"step": 2850
},
{
"epoch": 0.8990854131142458,
"grad_norm": 0.09763891249895096,
"learning_rate": 3.5012919896640824e-05,
"loss": 1.8904,
"step": 2900
},
{
"epoch": 0.9145868857541466,
"grad_norm": 0.10186181217432022,
"learning_rate": 3.4754521963824294e-05,
"loss": 1.8732,
"step": 2950
},
{
"epoch": 0.9300883583940475,
"grad_norm": 0.11027953773736954,
"learning_rate": 3.449612403100775e-05,
"loss": 1.8886,
"step": 3000
},
{
"epoch": 0.9300883583940475,
"eval_loss": 1.4133652448654175,
"eval_runtime": 10.6201,
"eval_samples_per_second": 33.521,
"eval_steps_per_second": 4.237,
"step": 3000
},
{
"epoch": 0.9455898310339482,
"grad_norm": 0.1076328456401825,
"learning_rate": 3.4237726098191214e-05,
"loss": 1.9007,
"step": 3050
},
{
"epoch": 0.961091303673849,
"grad_norm": 0.10457822680473328,
"learning_rate": 3.397932816537468e-05,
"loss": 1.8894,
"step": 3100
},
{
"epoch": 0.9765927763137499,
"grad_norm": 0.1240358054637909,
"learning_rate": 3.372093023255814e-05,
"loss": 1.8962,
"step": 3150
},
{
"epoch": 0.9920942489536506,
"grad_norm": 0.1087583601474762,
"learning_rate": 3.34625322997416e-05,
"loss": 1.8849,
"step": 3200
},
{
"epoch": 1.0075957215935514,
"grad_norm": 0.10473156720399857,
"learning_rate": 3.3204134366925067e-05,
"loss": 1.8825,
"step": 3250
},
{
"epoch": 1.0230971942334521,
"grad_norm": 0.11174094676971436,
"learning_rate": 3.294573643410852e-05,
"loss": 1.8804,
"step": 3300
},
{
"epoch": 1.038598666873353,
"grad_norm": 0.10820575803518295,
"learning_rate": 3.268733850129199e-05,
"loss": 1.874,
"step": 3350
},
{
"epoch": 1.0541001395132537,
"grad_norm": 0.11375240236520767,
"learning_rate": 3.242894056847545e-05,
"loss": 1.892,
"step": 3400
},
{
"epoch": 1.0696016121531546,
"grad_norm": 0.10983241349458694,
"learning_rate": 3.217054263565892e-05,
"loss": 1.8864,
"step": 3450
},
{
"epoch": 1.0851030847930554,
"grad_norm": 0.11412903666496277,
"learning_rate": 3.1912144702842375e-05,
"loss": 1.8841,
"step": 3500
},
{
"epoch": 1.0851030847930554,
"eval_loss": 1.409255862236023,
"eval_runtime": 10.6154,
"eval_samples_per_second": 33.536,
"eval_steps_per_second": 4.239,
"step": 3500
},
{
"epoch": 1.1006045574329562,
"grad_norm": 0.11279023438692093,
"learning_rate": 3.1653746770025845e-05,
"loss": 1.8809,
"step": 3550
},
{
"epoch": 1.116106030072857,
"grad_norm": 0.12842436134815216,
"learning_rate": 3.13953488372093e-05,
"loss": 1.8822,
"step": 3600
},
{
"epoch": 1.1316075027127577,
"grad_norm": 0.11607849597930908,
"learning_rate": 3.1136950904392765e-05,
"loss": 1.8677,
"step": 3650
},
{
"epoch": 1.1471089753526584,
"grad_norm": 0.11454817652702332,
"learning_rate": 3.087855297157623e-05,
"loss": 1.8785,
"step": 3700
},
{
"epoch": 1.1626104479925594,
"grad_norm": 0.12619228661060333,
"learning_rate": 3.062015503875969e-05,
"loss": 1.8735,
"step": 3750
},
{
"epoch": 1.1781119206324602,
"grad_norm": 0.12233872711658478,
"learning_rate": 3.0361757105943154e-05,
"loss": 1.8625,
"step": 3800
},
{
"epoch": 1.193613393272361,
"grad_norm": 0.1316874474287033,
"learning_rate": 3.0103359173126618e-05,
"loss": 1.8775,
"step": 3850
},
{
"epoch": 1.2091148659122617,
"grad_norm": 0.12887555360794067,
"learning_rate": 2.9844961240310077e-05,
"loss": 1.8868,
"step": 3900
},
{
"epoch": 1.2246163385521625,
"grad_norm": 0.11875531822443008,
"learning_rate": 2.9586563307493544e-05,
"loss": 1.8645,
"step": 3950
},
{
"epoch": 1.2401178111920632,
"grad_norm": 0.11634934693574905,
"learning_rate": 2.9328165374677004e-05,
"loss": 1.8803,
"step": 4000
},
{
"epoch": 1.2401178111920632,
"eval_loss": 1.4193403720855713,
"eval_runtime": 10.6141,
"eval_samples_per_second": 33.54,
"eval_steps_per_second": 4.24,
"step": 4000
},
{
"epoch": 1.255619283831964,
"grad_norm": 0.12749060988426208,
"learning_rate": 2.9069767441860467e-05,
"loss": 1.8699,
"step": 4050
},
{
"epoch": 1.271120756471865,
"grad_norm": 0.12758812308311462,
"learning_rate": 2.881136950904393e-05,
"loss": 1.8624,
"step": 4100
},
{
"epoch": 1.2866222291117655,
"grad_norm": 0.11913730949163437,
"learning_rate": 2.8552971576227393e-05,
"loss": 1.8731,
"step": 4150
},
{
"epoch": 1.3021237017516665,
"grad_norm": 0.11662113666534424,
"learning_rate": 2.8294573643410853e-05,
"loss": 1.8936,
"step": 4200
},
{
"epoch": 1.3176251743915672,
"grad_norm": 0.13116636872291565,
"learning_rate": 2.8036175710594316e-05,
"loss": 1.8788,
"step": 4250
},
{
"epoch": 1.333126647031468,
"grad_norm": 0.12884733080863953,
"learning_rate": 2.777777777777778e-05,
"loss": 1.8788,
"step": 4300
},
{
"epoch": 1.3486281196713688,
"grad_norm": 0.12196724116802216,
"learning_rate": 2.751937984496124e-05,
"loss": 1.8689,
"step": 4350
},
{
"epoch": 1.3641295923112695,
"grad_norm": 0.13119535148143768,
"learning_rate": 2.7260981912144705e-05,
"loss": 1.8702,
"step": 4400
},
{
"epoch": 1.3796310649511705,
"grad_norm": 0.12181399017572403,
"learning_rate": 2.7002583979328165e-05,
"loss": 1.8708,
"step": 4450
},
{
"epoch": 1.395132537591071,
"grad_norm": 0.1291220635175705,
"learning_rate": 2.674418604651163e-05,
"loss": 1.8509,
"step": 4500
},
{
"epoch": 1.395132537591071,
"eval_loss": 1.4268625974655151,
"eval_runtime": 10.6177,
"eval_samples_per_second": 33.529,
"eval_steps_per_second": 4.238,
"step": 4500
},
{
"epoch": 1.410634010230972,
"grad_norm": 0.1215544119477272,
"learning_rate": 2.648578811369509e-05,
"loss": 1.868,
"step": 4550
},
{
"epoch": 1.4261354828708728,
"grad_norm": 0.12441984564065933,
"learning_rate": 2.622739018087855e-05,
"loss": 1.8624,
"step": 4600
},
{
"epoch": 1.4416369555107735,
"grad_norm": 0.12983018159866333,
"learning_rate": 2.5968992248062018e-05,
"loss": 1.875,
"step": 4650
},
{
"epoch": 1.4571384281506743,
"grad_norm": 0.14041350781917572,
"learning_rate": 2.5710594315245478e-05,
"loss": 1.8634,
"step": 4700
},
{
"epoch": 1.472639900790575,
"grad_norm": 0.1323089897632599,
"learning_rate": 2.5452196382428944e-05,
"loss": 1.8701,
"step": 4750
},
{
"epoch": 1.488141373430476,
"grad_norm": 0.13671238720417023,
"learning_rate": 2.5193798449612404e-05,
"loss": 1.8722,
"step": 4800
},
{
"epoch": 1.5036428460703766,
"grad_norm": 0.12991730868816376,
"learning_rate": 2.4935400516795867e-05,
"loss": 1.8548,
"step": 4850
},
{
"epoch": 1.5191443187102776,
"grad_norm": 0.12073440849781036,
"learning_rate": 2.467700258397933e-05,
"loss": 1.8817,
"step": 4900
},
{
"epoch": 1.5346457913501783,
"grad_norm": 0.1927834451198578,
"learning_rate": 2.441860465116279e-05,
"loss": 1.8624,
"step": 4950
},
{
"epoch": 1.550147263990079,
"grad_norm": 0.1434555947780609,
"learning_rate": 2.4160206718346253e-05,
"loss": 1.8571,
"step": 5000
},
{
"epoch": 1.550147263990079,
"eval_loss": 1.4204214811325073,
"eval_runtime": 10.6279,
"eval_samples_per_second": 33.497,
"eval_steps_per_second": 4.234,
"step": 5000
},
{
"epoch": 1.5656487366299798,
"grad_norm": 0.1407179832458496,
"learning_rate": 2.3901808785529716e-05,
"loss": 1.8693,
"step": 5050
},
{
"epoch": 1.5811502092698806,
"grad_norm": 0.13802586495876312,
"learning_rate": 2.364341085271318e-05,
"loss": 1.8653,
"step": 5100
},
{
"epoch": 1.5966516819097816,
"grad_norm": 0.13259877264499664,
"learning_rate": 2.3385012919896642e-05,
"loss": 1.85,
"step": 5150
},
{
"epoch": 1.612153154549682,
"grad_norm": 0.14958499372005463,
"learning_rate": 2.3131782945736435e-05,
"loss": 1.8707,
"step": 5200
},
{
"epoch": 1.627654627189583,
"grad_norm": 0.13671304285526276,
"learning_rate": 2.2873385012919898e-05,
"loss": 1.8642,
"step": 5250
},
{
"epoch": 1.6431560998294839,
"grad_norm": 0.12920096516609192,
"learning_rate": 2.261498708010336e-05,
"loss": 1.856,
"step": 5300
},
{
"epoch": 1.6586575724693846,
"grad_norm": 0.1263495683670044,
"learning_rate": 2.235658914728682e-05,
"loss": 1.8639,
"step": 5350
},
{
"epoch": 1.6741590451092854,
"grad_norm": 0.1388077437877655,
"learning_rate": 2.2098191214470284e-05,
"loss": 1.8662,
"step": 5400
},
{
"epoch": 1.6896605177491861,
"grad_norm": 0.13600564002990723,
"learning_rate": 2.1839793281653747e-05,
"loss": 1.8667,
"step": 5450
},
{
"epoch": 1.7051619903890871,
"grad_norm": 0.1285238116979599,
"learning_rate": 2.158139534883721e-05,
"loss": 1.8691,
"step": 5500
},
{
"epoch": 1.7051619903890871,
"eval_loss": 1.4164341688156128,
"eval_runtime": 10.618,
"eval_samples_per_second": 33.528,
"eval_steps_per_second": 4.238,
"step": 5500
},
{
"epoch": 1.7206634630289876,
"grad_norm": 0.14896276593208313,
"learning_rate": 2.1322997416020673e-05,
"loss": 1.8526,
"step": 5550
},
{
"epoch": 1.7361649356688886,
"grad_norm": 0.13365153968334198,
"learning_rate": 2.1064599483204136e-05,
"loss": 1.8592,
"step": 5600
},
{
"epoch": 1.7516664083087892,
"grad_norm": 0.1381397843360901,
"learning_rate": 2.0806201550387596e-05,
"loss": 1.8563,
"step": 5650
},
{
"epoch": 1.7671678809486902,
"grad_norm": 0.14912950992584229,
"learning_rate": 2.054780361757106e-05,
"loss": 1.8562,
"step": 5700
},
{
"epoch": 1.782669353588591,
"grad_norm": 0.13218577206134796,
"learning_rate": 2.0289405684754523e-05,
"loss": 1.8617,
"step": 5750
},
{
"epoch": 1.7981708262284917,
"grad_norm": 0.13850583136081696,
"learning_rate": 2.0031007751937986e-05,
"loss": 1.8563,
"step": 5800
},
{
"epoch": 1.8136722988683927,
"grad_norm": 0.13015194237232208,
"learning_rate": 1.977260981912145e-05,
"loss": 1.8473,
"step": 5850
},
{
"epoch": 1.8291737715082932,
"grad_norm": 0.13473467528820038,
"learning_rate": 1.9514211886304912e-05,
"loss": 1.8549,
"step": 5900
},
{
"epoch": 1.8446752441481942,
"grad_norm": 0.13621702790260315,
"learning_rate": 1.9255813953488372e-05,
"loss": 1.8565,
"step": 5950
},
{
"epoch": 1.8601767167880947,
"grad_norm": 0.1430806815624237,
"learning_rate": 1.8997416020671835e-05,
"loss": 1.8533,
"step": 6000
},
{
"epoch": 1.8601767167880947,
"eval_loss": 1.4264311790466309,
"eval_runtime": 10.6098,
"eval_samples_per_second": 33.554,
"eval_steps_per_second": 4.241,
"step": 6000
}
],
"logging_steps": 50,
"max_steps": 9675,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 1.108324190905172e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}