{ "best_metric": 0.443807452917099, "best_model_checkpoint": "CTCLLMs_self_tokenizer/checkpoints/LongSpeech_CTC-Shrink_augment_data_self_tokenizer_addMLS_projector_restore/checkpoint-30000", "epoch": 1.0, "eval_steps": 1000, "global_step": 31479, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006353441977191143, "grad_norm": 45.06840896606445, "learning_rate": 3.597883597883598e-06, "loss": 72.1477, "step": 20 }, { "epoch": 0.0012706883954382287, "grad_norm": 56.45563507080078, "learning_rate": 7.830687830687831e-06, "loss": 71.8917, "step": 40 }, { "epoch": 0.001906032593157343, "grad_norm": 62.59088897705078, "learning_rate": 1.1851851851851853e-05, "loss": 71.7764, "step": 60 }, { "epoch": 0.0025413767908764573, "grad_norm": 75.64707946777344, "learning_rate": 1.6084656084656086e-05, "loss": 70.9277, "step": 80 }, { "epoch": 0.003176720988595572, "grad_norm": 73.5933837890625, "learning_rate": 2.031746031746032e-05, "loss": 68.0688, "step": 100 }, { "epoch": 0.003812065186314686, "grad_norm": 77.9434814453125, "learning_rate": 2.4550264550264552e-05, "loss": 65.4844, "step": 120 }, { "epoch": 0.004447409384033801, "grad_norm": 81.92144775390625, "learning_rate": 2.8783068783068785e-05, "loss": 61.2486, "step": 140 }, { "epoch": 0.005082753581752915, "grad_norm": 91.82105255126953, "learning_rate": 3.3015873015873014e-05, "loss": 55.9783, "step": 160 }, { "epoch": 0.005718097779472029, "grad_norm": 103.17108917236328, "learning_rate": 3.724867724867725e-05, "loss": 51.7487, "step": 180 }, { "epoch": 0.006353441977191144, "grad_norm": 98.97240447998047, "learning_rate": 4.148148148148148e-05, "loss": 45.0213, "step": 200 }, { "epoch": 0.006988786174910258, "grad_norm": 81.4900894165039, "learning_rate": 4.5714285714285716e-05, "loss": 38.3125, "step": 220 }, { "epoch": 0.007624130372629372, "grad_norm": 71.47420501708984, "learning_rate": 4.9947089947089946e-05, "loss": 33.2395, "step": 240 }, { "epoch": 0.008259474570348486, "grad_norm": 63.618309020996094, "learning_rate": 5.417989417989419e-05, "loss": 28.4421, "step": 260 }, { "epoch": 0.008894818768067601, "grad_norm": 58.004974365234375, "learning_rate": 5.841269841269842e-05, "loss": 25.048, "step": 280 }, { "epoch": 0.009530162965786714, "grad_norm": 46.489200592041016, "learning_rate": 6.264550264550265e-05, "loss": 21.9312, "step": 300 }, { "epoch": 0.01016550716350583, "grad_norm": 37.90148162841797, "learning_rate": 6.687830687830688e-05, "loss": 19.0696, "step": 320 }, { "epoch": 0.010800851361224944, "grad_norm": 36.47368240356445, "learning_rate": 7.111111111111112e-05, "loss": 17.0151, "step": 340 }, { "epoch": 0.011436195558944057, "grad_norm": 32.80181884765625, "learning_rate": 7.534391534391536e-05, "loss": 15.5522, "step": 360 }, { "epoch": 0.012071539756663172, "grad_norm": 25.543760299682617, "learning_rate": 7.957671957671958e-05, "loss": 14.1982, "step": 380 }, { "epoch": 0.012706883954382287, "grad_norm": 22.31871223449707, "learning_rate": 8.380952380952382e-05, "loss": 13.2314, "step": 400 }, { "epoch": 0.0133422281521014, "grad_norm": 18.374950408935547, "learning_rate": 8.804232804232805e-05, "loss": 12.4637, "step": 420 }, { "epoch": 0.013977572349820515, "grad_norm": 18.497610092163086, "learning_rate": 9.227513227513229e-05, "loss": 11.9765, "step": 440 }, { "epoch": 0.01461291654753963, "grad_norm": 14.529912948608398, "learning_rate": 9.650793650793651e-05, "loss": 11.2678, "step": 460 }, { "epoch": 0.015248260745258743, "grad_norm": 12.937056541442871, "learning_rate": 0.00010074074074074073, "loss": 10.6223, "step": 480 }, { "epoch": 0.015883604942977858, "grad_norm": 12.284934043884277, "learning_rate": 0.00010497354497354497, "loss": 10.189, "step": 500 }, { "epoch": 0.016518949140696973, "grad_norm": 9.824132919311523, "learning_rate": 0.0001092063492063492, "loss": 9.8138, "step": 520 }, { "epoch": 0.017154293338416088, "grad_norm": 8.129488945007324, "learning_rate": 0.00011343915343915343, "loss": 9.4242, "step": 540 }, { "epoch": 0.017789637536135203, "grad_norm": 9.27999496459961, "learning_rate": 0.00011767195767195766, "loss": 9.1365, "step": 560 }, { "epoch": 0.018424981733854314, "grad_norm": 5.250537872314453, "learning_rate": 0.00012190476190476193, "loss": 8.8276, "step": 580 }, { "epoch": 0.01906032593157343, "grad_norm": 5.430091381072998, "learning_rate": 0.00012613756613756615, "loss": 8.5892, "step": 600 }, { "epoch": 0.019695670129292544, "grad_norm": 3.3930234909057617, "learning_rate": 0.0001303703703703704, "loss": 8.3652, "step": 620 }, { "epoch": 0.02033101432701166, "grad_norm": 2.841287136077881, "learning_rate": 0.00013460317460317462, "loss": 8.1527, "step": 640 }, { "epoch": 0.020966358524730774, "grad_norm": 2.188707113265991, "learning_rate": 0.00013883597883597885, "loss": 7.9891, "step": 660 }, { "epoch": 0.02160170272244989, "grad_norm": 2.6337716579437256, "learning_rate": 0.0001430687830687831, "loss": 7.8345, "step": 680 }, { "epoch": 0.022237046920169, "grad_norm": 1.7390124797821045, "learning_rate": 0.00014730158730158732, "loss": 7.6817, "step": 700 }, { "epoch": 0.022872391117888115, "grad_norm": 1.6422362327575684, "learning_rate": 0.00015153439153439154, "loss": 7.5748, "step": 720 }, { "epoch": 0.02350773531560723, "grad_norm": 1.6876453161239624, "learning_rate": 0.0001557671957671958, "loss": 7.3896, "step": 740 }, { "epoch": 0.024143079513326345, "grad_norm": 1.230586290359497, "learning_rate": 0.00016, "loss": 7.3337, "step": 760 }, { "epoch": 0.02477842371104546, "grad_norm": 1.2059415578842163, "learning_rate": 0.00016423280423280424, "loss": 7.2545, "step": 780 }, { "epoch": 0.025413767908764574, "grad_norm": 1.5651260614395142, "learning_rate": 0.00016846560846560849, "loss": 7.1927, "step": 800 }, { "epoch": 0.02604911210648369, "grad_norm": 2.234393358230591, "learning_rate": 0.0001726984126984127, "loss": 7.1617, "step": 820 }, { "epoch": 0.0266844563042028, "grad_norm": 1.6703732013702393, "learning_rate": 0.00017693121693121696, "loss": 7.093, "step": 840 }, { "epoch": 0.027319800501921915, "grad_norm": 0.796870231628418, "learning_rate": 0.00018116402116402118, "loss": 7.0105, "step": 860 }, { "epoch": 0.02795514469964103, "grad_norm": 1.0919573307037354, "learning_rate": 0.0001853968253968254, "loss": 6.9911, "step": 880 }, { "epoch": 0.028590488897360145, "grad_norm": 1.3225408792495728, "learning_rate": 0.00018962962962962965, "loss": 6.9353, "step": 900 }, { "epoch": 0.02922583309507926, "grad_norm": 0.9445711970329285, "learning_rate": 0.00019386243386243388, "loss": 6.9075, "step": 920 }, { "epoch": 0.029861177292798375, "grad_norm": 1.0021796226501465, "learning_rate": 0.0001980952380952381, "loss": 6.8545, "step": 940 }, { "epoch": 0.030496521490517486, "grad_norm": 1.147709608078003, "learning_rate": 0.00019999993595464, "loss": 6.8145, "step": 960 }, { "epoch": 0.0311318656882366, "grad_norm": 1.4438824653625488, "learning_rate": 0.00019999949134260042, "loss": 6.7156, "step": 980 }, { "epoch": 0.031767209885955716, "grad_norm": 1.4000093936920166, "learning_rate": 0.0001999986232924222, "loss": 6.6363, "step": 1000 }, { "epoch": 0.031767209885955716, "eval_loss": 6.87591028213501, "eval_runtime": 46.4669, "eval_samples_per_second": 58.17, "eval_steps_per_second": 29.096, "step": 1000 }, { "epoch": 0.03240255408367483, "grad_norm": 2.151993989944458, "learning_rate": 0.00019999733180778103, "loss": 6.5176, "step": 1020 }, { "epoch": 0.033037898281393946, "grad_norm": 1.611135721206665, "learning_rate": 0.00019999561689414561, "loss": 6.4132, "step": 1040 }, { "epoch": 0.03367324247911306, "grad_norm": 2.1010184288024902, "learning_rate": 0.00019999347855877755, "loss": 6.2465, "step": 1060 }, { "epoch": 0.034308586676832176, "grad_norm": 1.5021122694015503, "learning_rate": 0.0001999909168107314, "loss": 6.1662, "step": 1080 }, { "epoch": 0.03494393087455129, "grad_norm": 1.4672967195510864, "learning_rate": 0.0001999879316608547, "loss": 6.0509, "step": 1100 }, { "epoch": 0.035579275072270405, "grad_norm": 1.4146413803100586, "learning_rate": 0.0001999845231217877, "loss": 5.9012, "step": 1120 }, { "epoch": 0.03621461926998951, "grad_norm": 1.252382755279541, "learning_rate": 0.00019998069120796358, "loss": 5.815, "step": 1140 }, { "epoch": 0.03684996346770863, "grad_norm": 1.6317933797836304, "learning_rate": 0.0001999764359356082, "loss": 5.771, "step": 1160 }, { "epoch": 0.03748530766542774, "grad_norm": 1.2354493141174316, "learning_rate": 0.0001999717573227401, "loss": 5.6189, "step": 1180 }, { "epoch": 0.03812065186314686, "grad_norm": 1.1442275047302246, "learning_rate": 0.0001999666553891704, "loss": 5.5078, "step": 1200 }, { "epoch": 0.03875599606086597, "grad_norm": 1.3596833944320679, "learning_rate": 0.0001999611301565027, "loss": 5.4507, "step": 1220 }, { "epoch": 0.03939134025858509, "grad_norm": 1.5420782566070557, "learning_rate": 0.00019995518164813315, "loss": 5.3225, "step": 1240 }, { "epoch": 0.0400266844563042, "grad_norm": 2.335935354232788, "learning_rate": 0.00019994880988925007, "loss": 5.3398, "step": 1260 }, { "epoch": 0.04066202865402332, "grad_norm": 1.2030448913574219, "learning_rate": 0.00019994201490683406, "loss": 5.2367, "step": 1280 }, { "epoch": 0.04129737285174243, "grad_norm": 1.1881422996520996, "learning_rate": 0.00019993479672965783, "loss": 5.2073, "step": 1300 }, { "epoch": 0.04193271704946155, "grad_norm": 1.2961896657943726, "learning_rate": 0.00019992715538828609, "loss": 5.157, "step": 1320 }, { "epoch": 0.04256806124718066, "grad_norm": 0.9343932271003723, "learning_rate": 0.00019991909091507525, "loss": 5.0156, "step": 1340 }, { "epoch": 0.04320340544489978, "grad_norm": 0.9654686450958252, "learning_rate": 0.00019991060334417364, "loss": 5.054, "step": 1360 }, { "epoch": 0.04383874964261889, "grad_norm": 1.4537482261657715, "learning_rate": 0.00019990169271152098, "loss": 4.9824, "step": 1380 }, { "epoch": 0.044474093840338, "grad_norm": 1.0155112743377686, "learning_rate": 0.00019989235905484853, "loss": 4.8496, "step": 1400 }, { "epoch": 0.045109438038057115, "grad_norm": 0.8903729915618896, "learning_rate": 0.00019988260241367875, "loss": 4.8407, "step": 1420 }, { "epoch": 0.04574478223577623, "grad_norm": 1.0020333528518677, "learning_rate": 0.00019987242282932518, "loss": 4.7753, "step": 1440 }, { "epoch": 0.046380126433495344, "grad_norm": 1.2074095010757446, "learning_rate": 0.0001998618203448923, "loss": 4.6939, "step": 1460 }, { "epoch": 0.04701547063121446, "grad_norm": 2.5281686782836914, "learning_rate": 0.00019985079500527527, "loss": 4.6567, "step": 1480 }, { "epoch": 0.047650814828933574, "grad_norm": 1.257580280303955, "learning_rate": 0.00019983934685715982, "loss": 4.5615, "step": 1500 }, { "epoch": 0.04828615902665269, "grad_norm": 1.5581581592559814, "learning_rate": 0.00019982747594902203, "loss": 4.6081, "step": 1520 }, { "epoch": 0.048921503224371804, "grad_norm": 1.029440999031067, "learning_rate": 0.0001998151823311281, "loss": 4.491, "step": 1540 }, { "epoch": 0.04955684742209092, "grad_norm": 0.9729529023170471, "learning_rate": 0.0001998024660555342, "loss": 4.4692, "step": 1560 }, { "epoch": 0.050192191619810034, "grad_norm": 1.1230270862579346, "learning_rate": 0.00019978932717608613, "loss": 4.3839, "step": 1580 }, { "epoch": 0.05082753581752915, "grad_norm": 1.048663854598999, "learning_rate": 0.0001997757657484192, "loss": 4.3907, "step": 1600 }, { "epoch": 0.051462880015248263, "grad_norm": 1.2080233097076416, "learning_rate": 0.000199761781829958, "loss": 4.3147, "step": 1620 }, { "epoch": 0.05209822421296738, "grad_norm": 1.1026450395584106, "learning_rate": 0.000199747375479916, "loss": 4.2496, "step": 1640 }, { "epoch": 0.052733568410686486, "grad_norm": 1.037937879562378, "learning_rate": 0.00019973254675929554, "loss": 4.2614, "step": 1660 }, { "epoch": 0.0533689126084056, "grad_norm": 1.1000276803970337, "learning_rate": 0.00019971729573088742, "loss": 4.1367, "step": 1680 }, { "epoch": 0.054004256806124716, "grad_norm": 1.4259387254714966, "learning_rate": 0.0001997016224592706, "loss": 4.1126, "step": 1700 }, { "epoch": 0.05463960100384383, "grad_norm": 1.2918739318847656, "learning_rate": 0.00019968552701081203, "loss": 4.0945, "step": 1720 }, { "epoch": 0.055274945201562946, "grad_norm": 1.0148296356201172, "learning_rate": 0.00019966900945366634, "loss": 3.9981, "step": 1740 }, { "epoch": 0.05591028939928206, "grad_norm": 1.4177788496017456, "learning_rate": 0.0001996520698577755, "loss": 3.9247, "step": 1760 }, { "epoch": 0.056545633597001176, "grad_norm": 1.1384249925613403, "learning_rate": 0.00019963470829486858, "loss": 3.9204, "step": 1780 }, { "epoch": 0.05718097779472029, "grad_norm": 1.2175607681274414, "learning_rate": 0.0001996169248384615, "loss": 3.9023, "step": 1800 }, { "epoch": 0.057816321992439405, "grad_norm": 1.7040660381317139, "learning_rate": 0.0001995987195638565, "loss": 3.8349, "step": 1820 }, { "epoch": 0.05845166619015852, "grad_norm": 1.4229464530944824, "learning_rate": 0.0001995800925481421, "loss": 3.7969, "step": 1840 }, { "epoch": 0.059087010387877635, "grad_norm": 1.1412523984909058, "learning_rate": 0.0001995610438701925, "loss": 3.6494, "step": 1860 }, { "epoch": 0.05972235458559675, "grad_norm": 1.3119606971740723, "learning_rate": 0.00019954157361066764, "loss": 3.6137, "step": 1880 }, { "epoch": 0.06035769878331586, "grad_norm": 1.260469675064087, "learning_rate": 0.0001995216818520123, "loss": 3.5703, "step": 1900 }, { "epoch": 0.06099304298103497, "grad_norm": 1.6222745180130005, "learning_rate": 0.00019950136867845627, "loss": 3.4526, "step": 1920 }, { "epoch": 0.06162838717875409, "grad_norm": 1.399109125137329, "learning_rate": 0.00019948063417601369, "loss": 3.4467, "step": 1940 }, { "epoch": 0.0622637313764732, "grad_norm": 1.1804718971252441, "learning_rate": 0.00019945947843248276, "loss": 3.3017, "step": 1960 }, { "epoch": 0.06289907557419232, "grad_norm": 1.1146492958068848, "learning_rate": 0.0001994379015374455, "loss": 3.2564, "step": 1980 }, { "epoch": 0.06353441977191143, "grad_norm": 1.3201006650924683, "learning_rate": 0.00019941590358226713, "loss": 3.2076, "step": 2000 }, { "epoch": 0.06353441977191143, "eval_loss": 3.1886417865753174, "eval_runtime": 45.0925, "eval_samples_per_second": 59.943, "eval_steps_per_second": 29.983, "step": 2000 }, { "epoch": 0.06416976396963055, "grad_norm": 1.4352892637252808, "learning_rate": 0.00019939348466009588, "loss": 3.1246, "step": 2020 }, { "epoch": 0.06480510816734966, "grad_norm": 1.4391227960586548, "learning_rate": 0.0001993706448658625, "loss": 3.1187, "step": 2040 }, { "epoch": 0.06544045236506878, "grad_norm": 1.2951711416244507, "learning_rate": 0.0001993473842962798, "loss": 3.0175, "step": 2060 }, { "epoch": 0.06607579656278789, "grad_norm": 1.559552550315857, "learning_rate": 0.00019932370304984255, "loss": 2.8894, "step": 2080 }, { "epoch": 0.066711140760507, "grad_norm": 1.2822929620742798, "learning_rate": 0.00019929960122682655, "loss": 2.8483, "step": 2100 }, { "epoch": 0.06734648495822612, "grad_norm": 1.4227052927017212, "learning_rate": 0.00019927507892928873, "loss": 2.8691, "step": 2120 }, { "epoch": 0.06798182915594524, "grad_norm": 1.643660306930542, "learning_rate": 0.00019925013626106633, "loss": 2.8578, "step": 2140 }, { "epoch": 0.06861717335366435, "grad_norm": 1.1360414028167725, "learning_rate": 0.00019922477332777664, "loss": 2.7094, "step": 2160 }, { "epoch": 0.06925251755138347, "grad_norm": 1.224853277206421, "learning_rate": 0.00019919899023681658, "loss": 2.6953, "step": 2180 }, { "epoch": 0.06988786174910258, "grad_norm": 1.093682885169983, "learning_rate": 0.00019917278709736212, "loss": 2.6255, "step": 2200 }, { "epoch": 0.0705232059468217, "grad_norm": 1.238864779472351, "learning_rate": 0.00019914616402036796, "loss": 2.5893, "step": 2220 }, { "epoch": 0.07115855014454081, "grad_norm": 1.1016559600830078, "learning_rate": 0.00019911912111856688, "loss": 2.4743, "step": 2240 }, { "epoch": 0.07179389434225993, "grad_norm": 1.12881600856781, "learning_rate": 0.00019909165850646941, "loss": 2.5057, "step": 2260 }, { "epoch": 0.07242923853997903, "grad_norm": 1.216238021850586, "learning_rate": 0.00019906377630036338, "loss": 2.4624, "step": 2280 }, { "epoch": 0.07306458273769814, "grad_norm": 1.1429589986801147, "learning_rate": 0.00019903547461831323, "loss": 2.3835, "step": 2300 }, { "epoch": 0.07369992693541726, "grad_norm": 0.9367678165435791, "learning_rate": 0.00019900675358015967, "loss": 2.3971, "step": 2320 }, { "epoch": 0.07433527113313637, "grad_norm": 1.0869677066802979, "learning_rate": 0.00019897761330751922, "loss": 2.3241, "step": 2340 }, { "epoch": 0.07497061533085549, "grad_norm": 0.958840548992157, "learning_rate": 0.0001989480539237835, "loss": 2.2828, "step": 2360 }, { "epoch": 0.0756059595285746, "grad_norm": 0.9724891781806946, "learning_rate": 0.00019891807555411884, "loss": 2.2858, "step": 2380 }, { "epoch": 0.07624130372629372, "grad_norm": 1.045828104019165, "learning_rate": 0.00019888767832546572, "loss": 2.2949, "step": 2400 }, { "epoch": 0.07687664792401283, "grad_norm": 1.0283712148666382, "learning_rate": 0.0001988568623665383, "loss": 2.2034, "step": 2420 }, { "epoch": 0.07751199212173195, "grad_norm": 1.0930371284484863, "learning_rate": 0.00019882562780782376, "loss": 2.2283, "step": 2440 }, { "epoch": 0.07814733631945106, "grad_norm": 0.892132580280304, "learning_rate": 0.00019879397478158177, "loss": 2.1872, "step": 2460 }, { "epoch": 0.07878268051717018, "grad_norm": 1.0107035636901855, "learning_rate": 0.00019876190342184402, "loss": 2.1874, "step": 2480 }, { "epoch": 0.07941802471488929, "grad_norm": 1.1195555925369263, "learning_rate": 0.00019872941386441358, "loss": 2.0823, "step": 2500 }, { "epoch": 0.0800533689126084, "grad_norm": 1.2803888320922852, "learning_rate": 0.0001986965062468643, "loss": 2.0905, "step": 2520 }, { "epoch": 0.08068871311032752, "grad_norm": 1.0955703258514404, "learning_rate": 0.00019866318070854033, "loss": 2.0645, "step": 2540 }, { "epoch": 0.08132405730804664, "grad_norm": 1.117477297782898, "learning_rate": 0.00019862943739055536, "loss": 2.0259, "step": 2560 }, { "epoch": 0.08195940150576575, "grad_norm": 0.9660820960998535, "learning_rate": 0.0001985952764357923, "loss": 1.9881, "step": 2580 }, { "epoch": 0.08259474570348486, "grad_norm": 0.9186820983886719, "learning_rate": 0.0001985606979889023, "loss": 1.9571, "step": 2600 }, { "epoch": 0.08323008990120398, "grad_norm": 1.1236801147460938, "learning_rate": 0.00019852570219630445, "loss": 1.9506, "step": 2620 }, { "epoch": 0.0838654340989231, "grad_norm": 0.9719575047492981, "learning_rate": 0.0001984902892061851, "loss": 1.9359, "step": 2640 }, { "epoch": 0.08450077829664221, "grad_norm": 1.3401118516921997, "learning_rate": 0.00019845445916849704, "loss": 1.9707, "step": 2660 }, { "epoch": 0.08513612249436132, "grad_norm": 0.980446457862854, "learning_rate": 0.00019841821223495916, "loss": 1.88, "step": 2680 }, { "epoch": 0.08577146669208044, "grad_norm": 1.178143858909607, "learning_rate": 0.00019838154855905552, "loss": 1.8629, "step": 2700 }, { "epoch": 0.08640681088979955, "grad_norm": 0.9232170581817627, "learning_rate": 0.00019834446829603494, "loss": 1.8467, "step": 2720 }, { "epoch": 0.08704215508751867, "grad_norm": 1.7343891859054565, "learning_rate": 0.00019830697160291017, "loss": 1.8194, "step": 2740 }, { "epoch": 0.08767749928523778, "grad_norm": 0.878983199596405, "learning_rate": 0.0001982690586384573, "loss": 1.8232, "step": 2760 }, { "epoch": 0.0883128434829569, "grad_norm": 1.0917317867279053, "learning_rate": 0.00019823072956321513, "loss": 1.7668, "step": 2780 }, { "epoch": 0.088948187680676, "grad_norm": 1.0753387212753296, "learning_rate": 0.00019819198453948443, "loss": 1.7968, "step": 2800 }, { "epoch": 0.08958353187839511, "grad_norm": 1.0904388427734375, "learning_rate": 0.00019815282373132718, "loss": 1.7834, "step": 2820 }, { "epoch": 0.09021887607611423, "grad_norm": 0.9622576236724854, "learning_rate": 0.00019811324730456607, "loss": 1.7773, "step": 2840 }, { "epoch": 0.09085422027383334, "grad_norm": 0.8677240610122681, "learning_rate": 0.0001980732554267836, "loss": 1.7322, "step": 2860 }, { "epoch": 0.09148956447155246, "grad_norm": 1.0953987836837769, "learning_rate": 0.0001980328482673215, "loss": 1.7123, "step": 2880 }, { "epoch": 0.09212490866927157, "grad_norm": 1.0277127027511597, "learning_rate": 0.00019799202599727998, "loss": 1.7558, "step": 2900 }, { "epoch": 0.09276025286699069, "grad_norm": 1.1174383163452148, "learning_rate": 0.000197950788789517, "loss": 1.7222, "step": 2920 }, { "epoch": 0.0933955970647098, "grad_norm": 0.9651451706886292, "learning_rate": 0.00019790913681864747, "loss": 1.6652, "step": 2940 }, { "epoch": 0.09403094126242892, "grad_norm": 0.9669461250305176, "learning_rate": 0.00019786707026104265, "loss": 1.6381, "step": 2960 }, { "epoch": 0.09466628546014803, "grad_norm": 0.9406834244728088, "learning_rate": 0.0001978245892948293, "loss": 1.6276, "step": 2980 }, { "epoch": 0.09530162965786715, "grad_norm": 0.9768303632736206, "learning_rate": 0.0001977816940998889, "loss": 1.6071, "step": 3000 }, { "epoch": 0.09530162965786715, "eval_loss": 1.5878759622573853, "eval_runtime": 45.9166, "eval_samples_per_second": 58.868, "eval_steps_per_second": 29.445, "step": 3000 }, { "epoch": 0.09593697385558626, "grad_norm": 1.083208441734314, "learning_rate": 0.00019773838485785702, "loss": 1.6341, "step": 3020 }, { "epoch": 0.09657231805330538, "grad_norm": 0.9333330988883972, "learning_rate": 0.00019769466175212244, "loss": 1.5931, "step": 3040 }, { "epoch": 0.0972076622510245, "grad_norm": 0.9718533754348755, "learning_rate": 0.00019765052496782638, "loss": 1.5735, "step": 3060 }, { "epoch": 0.09784300644874361, "grad_norm": 1.2169800996780396, "learning_rate": 0.00019760597469186184, "loss": 1.5507, "step": 3080 }, { "epoch": 0.09847835064646272, "grad_norm": 0.9822967648506165, "learning_rate": 0.00019756101111287257, "loss": 1.5784, "step": 3100 }, { "epoch": 0.09911369484418184, "grad_norm": 0.9830970168113708, "learning_rate": 0.0001975156344212525, "loss": 1.5473, "step": 3120 }, { "epoch": 0.09974903904190095, "grad_norm": 0.8926035761833191, "learning_rate": 0.00019746984480914484, "loss": 1.5141, "step": 3140 }, { "epoch": 0.10038438323962007, "grad_norm": 0.8814927339553833, "learning_rate": 0.00019742364247044125, "loss": 1.5164, "step": 3160 }, { "epoch": 0.10101972743733918, "grad_norm": 0.8626115322113037, "learning_rate": 0.00019737702760078105, "loss": 1.4495, "step": 3180 }, { "epoch": 0.1016550716350583, "grad_norm": 1.0857669115066528, "learning_rate": 0.00019733000039755036, "loss": 1.511, "step": 3200 }, { "epoch": 0.10229041583277741, "grad_norm": 0.8834457397460938, "learning_rate": 0.00019728256105988132, "loss": 1.4764, "step": 3220 }, { "epoch": 0.10292576003049653, "grad_norm": 0.8241048455238342, "learning_rate": 0.00019723470978865118, "loss": 1.4253, "step": 3240 }, { "epoch": 0.10356110422821564, "grad_norm": 0.9844352006912231, "learning_rate": 0.00019718644678648158, "loss": 1.4595, "step": 3260 }, { "epoch": 0.10419644842593476, "grad_norm": 0.8982945084571838, "learning_rate": 0.00019713777225773745, "loss": 1.3535, "step": 3280 }, { "epoch": 0.10483179262365386, "grad_norm": 1.2204469442367554, "learning_rate": 0.0001970886864085263, "loss": 1.4283, "step": 3300 }, { "epoch": 0.10546713682137297, "grad_norm": 1.0676652193069458, "learning_rate": 0.00019703918944669754, "loss": 1.3858, "step": 3320 }, { "epoch": 0.10610248101909209, "grad_norm": 1.17191743850708, "learning_rate": 0.00019698928158184116, "loss": 1.4426, "step": 3340 }, { "epoch": 0.1067378252168112, "grad_norm": 0.9601316452026367, "learning_rate": 0.00019693896302528716, "loss": 1.3621, "step": 3360 }, { "epoch": 0.10737316941453032, "grad_norm": 0.9755037426948547, "learning_rate": 0.00019688823399010463, "loss": 1.3901, "step": 3380 }, { "epoch": 0.10800851361224943, "grad_norm": 1.0111849308013916, "learning_rate": 0.0001968370946911007, "loss": 1.3748, "step": 3400 }, { "epoch": 0.10864385780996855, "grad_norm": 0.8471179604530334, "learning_rate": 0.00019678554534481978, "loss": 1.3227, "step": 3420 }, { "epoch": 0.10927920200768766, "grad_norm": 0.9206441640853882, "learning_rate": 0.0001967335861695426, "loss": 1.3493, "step": 3440 }, { "epoch": 0.10991454620540678, "grad_norm": 1.055109977722168, "learning_rate": 0.0001966812173852852, "loss": 1.3549, "step": 3460 }, { "epoch": 0.11054989040312589, "grad_norm": 0.995614767074585, "learning_rate": 0.00019662843921379816, "loss": 1.3468, "step": 3480 }, { "epoch": 0.111185234600845, "grad_norm": 0.8873100876808167, "learning_rate": 0.0001965752518785655, "loss": 1.3129, "step": 3500 }, { "epoch": 0.11182057879856412, "grad_norm": 0.9802286624908447, "learning_rate": 0.00019652165560480383, "loss": 1.34, "step": 3520 }, { "epoch": 0.11245592299628324, "grad_norm": 0.9177120923995972, "learning_rate": 0.00019646765061946133, "loss": 1.3321, "step": 3540 }, { "epoch": 0.11309126719400235, "grad_norm": 1.0982646942138672, "learning_rate": 0.00019641323715121692, "loss": 1.292, "step": 3560 }, { "epoch": 0.11372661139172147, "grad_norm": 1.1567240953445435, "learning_rate": 0.00019635841543047918, "loss": 1.3052, "step": 3580 }, { "epoch": 0.11436195558944058, "grad_norm": 0.8516421914100647, "learning_rate": 0.00019630318568938528, "loss": 1.3189, "step": 3600 }, { "epoch": 0.1149972997871597, "grad_norm": 0.9710924029350281, "learning_rate": 0.00019624754816180022, "loss": 1.2644, "step": 3620 }, { "epoch": 0.11563264398487881, "grad_norm": 0.9252649545669556, "learning_rate": 0.00019619150308331572, "loss": 1.2517, "step": 3640 }, { "epoch": 0.11626798818259793, "grad_norm": 0.973948061466217, "learning_rate": 0.0001961350506912493, "loss": 1.2651, "step": 3660 }, { "epoch": 0.11690333238031704, "grad_norm": 0.9078177213668823, "learning_rate": 0.000196078191224643, "loss": 1.2089, "step": 3680 }, { "epoch": 0.11753867657803616, "grad_norm": 0.8456325531005859, "learning_rate": 0.0001960209249242628, "loss": 1.2503, "step": 3700 }, { "epoch": 0.11817402077575527, "grad_norm": 1.2014869451522827, "learning_rate": 0.00019596325203259722, "loss": 1.2287, "step": 3720 }, { "epoch": 0.11880936497347439, "grad_norm": 0.903296172618866, "learning_rate": 0.0001959051727938566, "loss": 1.1999, "step": 3740 }, { "epoch": 0.1194447091711935, "grad_norm": 0.9159349799156189, "learning_rate": 0.00019584668745397182, "loss": 1.2077, "step": 3760 }, { "epoch": 0.12008005336891261, "grad_norm": 1.0457518100738525, "learning_rate": 0.00019578779626059332, "loss": 1.2395, "step": 3780 }, { "epoch": 0.12071539756663172, "grad_norm": 0.8328551650047302, "learning_rate": 0.0001957284994630902, "loss": 1.2039, "step": 3800 }, { "epoch": 0.12135074176435083, "grad_norm": 0.9112881422042847, "learning_rate": 0.00019566879731254902, "loss": 1.1987, "step": 3820 }, { "epoch": 0.12198608596206995, "grad_norm": 2.0256752967834473, "learning_rate": 0.00019560869006177262, "loss": 1.1923, "step": 3840 }, { "epoch": 0.12262143015978906, "grad_norm": 0.9714537262916565, "learning_rate": 0.00019554817796527943, "loss": 1.1726, "step": 3860 }, { "epoch": 0.12325677435750818, "grad_norm": 0.8522310256958008, "learning_rate": 0.00019548726127930198, "loss": 1.1985, "step": 3880 }, { "epoch": 0.12389211855522729, "grad_norm": 0.8728988766670227, "learning_rate": 0.00019542594026178612, "loss": 1.1662, "step": 3900 }, { "epoch": 0.1245274627529464, "grad_norm": 0.9155168533325195, "learning_rate": 0.00019536421517238973, "loss": 1.1529, "step": 3920 }, { "epoch": 0.12516280695066553, "grad_norm": 1.05704665184021, "learning_rate": 0.0001953020862724817, "loss": 1.1415, "step": 3940 }, { "epoch": 0.12579815114838463, "grad_norm": 0.7793872952461243, "learning_rate": 0.0001952395538251408, "loss": 1.1387, "step": 3960 }, { "epoch": 0.12643349534610376, "grad_norm": 0.9358331561088562, "learning_rate": 0.00019517661809515465, "loss": 1.1816, "step": 3980 }, { "epoch": 0.12706883954382286, "grad_norm": 0.8175097107887268, "learning_rate": 0.00019511327934901846, "loss": 1.126, "step": 4000 }, { "epoch": 0.12706883954382286, "eval_loss": 1.1081569194793701, "eval_runtime": 128.6977, "eval_samples_per_second": 21.003, "eval_steps_per_second": 10.505, "step": 4000 }, { "epoch": 0.127704183741542, "grad_norm": 0.9568232893943787, "learning_rate": 0.000195049537854934, "loss": 1.1002, "step": 4020 }, { "epoch": 0.1283395279392611, "grad_norm": 0.9011651277542114, "learning_rate": 0.00019498539388280848, "loss": 1.129, "step": 4040 }, { "epoch": 0.1289748721369802, "grad_norm": 1.045811653137207, "learning_rate": 0.00019492084770425327, "loss": 1.0945, "step": 4060 }, { "epoch": 0.12961021633469932, "grad_norm": 0.8668608069419861, "learning_rate": 0.00019485589959258292, "loss": 1.0601, "step": 4080 }, { "epoch": 0.13024556053241843, "grad_norm": 0.9976728558540344, "learning_rate": 0.00019479054982281393, "loss": 1.1127, "step": 4100 }, { "epoch": 0.13088090473013755, "grad_norm": 0.9135074019432068, "learning_rate": 0.00019472479867166354, "loss": 1.0708, "step": 4120 }, { "epoch": 0.13151624892785665, "grad_norm": 0.8302998542785645, "learning_rate": 0.0001946586464175486, "loss": 1.0925, "step": 4140 }, { "epoch": 0.13215159312557578, "grad_norm": 0.9594709277153015, "learning_rate": 0.0001945920933405844, "loss": 1.0879, "step": 4160 }, { "epoch": 0.13278693732329488, "grad_norm": 1.3145122528076172, "learning_rate": 0.00019452513972258352, "loss": 1.0706, "step": 4180 }, { "epoch": 0.133422281521014, "grad_norm": 1.0521440505981445, "learning_rate": 0.00019445778584705452, "loss": 1.1089, "step": 4200 }, { "epoch": 0.13405762571873311, "grad_norm": 1.1046104431152344, "learning_rate": 0.00019439003199920088, "loss": 1.0965, "step": 4220 }, { "epoch": 0.13469296991645224, "grad_norm": 1.1228617429733276, "learning_rate": 0.00019432187846591967, "loss": 1.0747, "step": 4240 }, { "epoch": 0.13532831411417134, "grad_norm": 0.8399156332015991, "learning_rate": 0.00019425332553580044, "loss": 1.0239, "step": 4260 }, { "epoch": 0.13596365831189047, "grad_norm": 0.9118017554283142, "learning_rate": 0.00019418437349912385, "loss": 1.0557, "step": 4280 }, { "epoch": 0.13659900250960957, "grad_norm": 1.1154282093048096, "learning_rate": 0.00019411502264786069, "loss": 1.0846, "step": 4300 }, { "epoch": 0.1372343467073287, "grad_norm": 0.8457648158073425, "learning_rate": 0.00019404527327567035, "loss": 1.0438, "step": 4320 }, { "epoch": 0.1378696909050478, "grad_norm": 0.9336498975753784, "learning_rate": 0.0001939751256778998, "loss": 1.0403, "step": 4340 }, { "epoch": 0.13850503510276693, "grad_norm": 0.9318077564239502, "learning_rate": 0.0001939045801515822, "loss": 1.0375, "step": 4360 }, { "epoch": 0.13914037930048603, "grad_norm": 0.9146689176559448, "learning_rate": 0.0001938336369954358, "loss": 1.0394, "step": 4380 }, { "epoch": 0.13977572349820516, "grad_norm": 1.2244622707366943, "learning_rate": 0.00019376229650986245, "loss": 1.0305, "step": 4400 }, { "epoch": 0.14041106769592426, "grad_norm": 0.9721834659576416, "learning_rate": 0.00019369055899694652, "loss": 1.0133, "step": 4420 }, { "epoch": 0.1410464118936434, "grad_norm": 0.8538774251937866, "learning_rate": 0.00019361842476045356, "loss": 1.0272, "step": 4440 }, { "epoch": 0.1416817560913625, "grad_norm": 0.7733943462371826, "learning_rate": 0.000193545894105829, "loss": 1.0328, "step": 4460 }, { "epoch": 0.14231710028908162, "grad_norm": 1.0937755107879639, "learning_rate": 0.00019347296734019683, "loss": 1.0501, "step": 4480 }, { "epoch": 0.14295244448680072, "grad_norm": 0.8855345845222473, "learning_rate": 0.00019339964477235836, "loss": 0.9979, "step": 4500 }, { "epoch": 0.14358778868451985, "grad_norm": 0.9113184213638306, "learning_rate": 0.0001933259267127909, "loss": 0.967, "step": 4520 }, { "epoch": 0.14422313288223895, "grad_norm": 0.9671328663825989, "learning_rate": 0.00019325181347364643, "loss": 1.016, "step": 4540 }, { "epoch": 0.14485847707995805, "grad_norm": 0.8655368685722351, "learning_rate": 0.00019317730536875022, "loss": 1.0005, "step": 4560 }, { "epoch": 0.14549382127767718, "grad_norm": 0.8673165440559387, "learning_rate": 0.00019310240271359967, "loss": 0.9697, "step": 4580 }, { "epoch": 0.14612916547539628, "grad_norm": 1.0993086099624634, "learning_rate": 0.00019302710582536276, "loss": 0.9832, "step": 4600 }, { "epoch": 0.1467645096731154, "grad_norm": 1.1561827659606934, "learning_rate": 0.00019295141502287687, "loss": 0.9603, "step": 4620 }, { "epoch": 0.1473998538708345, "grad_norm": 1.0052567720413208, "learning_rate": 0.00019287533062664733, "loss": 0.9808, "step": 4640 }, { "epoch": 0.14803519806855364, "grad_norm": 0.9202858209609985, "learning_rate": 0.00019279885295884618, "loss": 0.9564, "step": 4660 }, { "epoch": 0.14867054226627274, "grad_norm": 0.8606549501419067, "learning_rate": 0.0001927219823433106, "loss": 0.9936, "step": 4680 }, { "epoch": 0.14930588646399187, "grad_norm": 0.9188569784164429, "learning_rate": 0.00019264471910554183, "loss": 0.9833, "step": 4700 }, { "epoch": 0.14994123066171097, "grad_norm": 0.7773941159248352, "learning_rate": 0.0001925670635727035, "loss": 0.9272, "step": 4720 }, { "epoch": 0.1505765748594301, "grad_norm": 0.8689327836036682, "learning_rate": 0.00019248901607362047, "loss": 0.9462, "step": 4740 }, { "epoch": 0.1512119190571492, "grad_norm": 0.800255298614502, "learning_rate": 0.00019241057693877725, "loss": 0.9222, "step": 4760 }, { "epoch": 0.15184726325486833, "grad_norm": 0.9326597452163696, "learning_rate": 0.0001923317465003168, "loss": 0.961, "step": 4780 }, { "epoch": 0.15248260745258743, "grad_norm": 1.072416067123413, "learning_rate": 0.00019225252509203888, "loss": 0.9464, "step": 4800 }, { "epoch": 0.15311795165030656, "grad_norm": 0.9187152981758118, "learning_rate": 0.0001921729130493989, "loss": 0.9461, "step": 4820 }, { "epoch": 0.15375329584802566, "grad_norm": 0.8737976551055908, "learning_rate": 0.00019209291070950633, "loss": 0.8771, "step": 4840 }, { "epoch": 0.1543886400457448, "grad_norm": 0.9321054220199585, "learning_rate": 0.0001920125184111233, "loss": 0.9179, "step": 4860 }, { "epoch": 0.1550239842434639, "grad_norm": 0.7673978209495544, "learning_rate": 0.00019193173649466322, "loss": 0.8711, "step": 4880 }, { "epoch": 0.15565932844118302, "grad_norm": 1.0326552391052246, "learning_rate": 0.00019185056530218923, "loss": 0.9494, "step": 4900 }, { "epoch": 0.15629467263890212, "grad_norm": 0.8184536695480347, "learning_rate": 0.0001917690051774129, "loss": 0.9201, "step": 4920 }, { "epoch": 0.15693001683662125, "grad_norm": 0.8319898247718811, "learning_rate": 0.0001916870564656926, "loss": 0.9167, "step": 4940 }, { "epoch": 0.15756536103434035, "grad_norm": 1.0563160181045532, "learning_rate": 0.0001916047195140323, "loss": 0.8993, "step": 4960 }, { "epoch": 0.15820070523205948, "grad_norm": 0.8466194868087769, "learning_rate": 0.00019152199467107974, "loss": 0.9198, "step": 4980 }, { "epoch": 0.15883604942977858, "grad_norm": 1.1115593910217285, "learning_rate": 0.00019143888228712527, "loss": 0.8749, "step": 5000 }, { "epoch": 0.15883604942977858, "eval_loss": 0.8843944668769836, "eval_runtime": 127.8707, "eval_samples_per_second": 21.139, "eval_steps_per_second": 10.573, "step": 5000 }, { "epoch": 0.1594713936274977, "grad_norm": 0.9679493308067322, "learning_rate": 0.00019135538271410022, "loss": 0.9212, "step": 5020 }, { "epoch": 0.1601067378252168, "grad_norm": 0.8485816121101379, "learning_rate": 0.0001912714963055754, "loss": 0.9054, "step": 5040 }, { "epoch": 0.16074208202293594, "grad_norm": 1.0210843086242676, "learning_rate": 0.0001911872234167597, "loss": 0.917, "step": 5060 }, { "epoch": 0.16137742622065504, "grad_norm": 1.0072481632232666, "learning_rate": 0.00019110256440449844, "loss": 0.9014, "step": 5080 }, { "epoch": 0.16201277041837414, "grad_norm": 0.9833612442016602, "learning_rate": 0.00019101751962727204, "loss": 0.891, "step": 5100 }, { "epoch": 0.16264811461609327, "grad_norm": 1.0564861297607422, "learning_rate": 0.0001909320894451943, "loss": 0.8581, "step": 5120 }, { "epoch": 0.16328345881381237, "grad_norm": 1.1205075979232788, "learning_rate": 0.0001908462742200111, "loss": 0.8884, "step": 5140 }, { "epoch": 0.1639188030115315, "grad_norm": 0.9841699004173279, "learning_rate": 0.0001907600743150986, "loss": 0.8815, "step": 5160 }, { "epoch": 0.1645541472092506, "grad_norm": 0.852820098400116, "learning_rate": 0.00019067349009546197, "loss": 0.8594, "step": 5180 }, { "epoch": 0.16518949140696973, "grad_norm": 0.8630360960960388, "learning_rate": 0.00019058652192773372, "loss": 0.8653, "step": 5200 }, { "epoch": 0.16582483560468883, "grad_norm": 1.0112591981887817, "learning_rate": 0.00019049917018017207, "loss": 0.8715, "step": 5220 }, { "epoch": 0.16646017980240796, "grad_norm": 0.9182717204093933, "learning_rate": 0.00019041143522265948, "loss": 0.8875, "step": 5240 }, { "epoch": 0.16709552400012706, "grad_norm": 1.190596103668213, "learning_rate": 0.0001903233174267012, "loss": 0.9027, "step": 5260 }, { "epoch": 0.1677308681978462, "grad_norm": 0.8345910310745239, "learning_rate": 0.00019023481716542342, "loss": 0.8819, "step": 5280 }, { "epoch": 0.1683662123955653, "grad_norm": 0.8964826464653015, "learning_rate": 0.00019014593481357192, "loss": 0.845, "step": 5300 }, { "epoch": 0.16900155659328442, "grad_norm": 1.1423965692520142, "learning_rate": 0.0001900566707475104, "loss": 0.8463, "step": 5320 }, { "epoch": 0.16963690079100352, "grad_norm": 0.895899772644043, "learning_rate": 0.00018996702534521888, "loss": 0.8631, "step": 5340 }, { "epoch": 0.17027224498872265, "grad_norm": 1.0254230499267578, "learning_rate": 0.00018987699898629208, "loss": 0.8489, "step": 5360 }, { "epoch": 0.17090758918644175, "grad_norm": 0.9370276927947998, "learning_rate": 0.00018978659205193794, "loss": 0.8822, "step": 5380 }, { "epoch": 0.17154293338416088, "grad_norm": 1.1030024290084839, "learning_rate": 0.00018969580492497577, "loss": 0.8834, "step": 5400 }, { "epoch": 0.17217827758187998, "grad_norm": 0.9148856997489929, "learning_rate": 0.00018960463798983494, "loss": 0.8198, "step": 5420 }, { "epoch": 0.1728136217795991, "grad_norm": 0.8851357102394104, "learning_rate": 0.00018951309163255288, "loss": 0.8077, "step": 5440 }, { "epoch": 0.1734489659773182, "grad_norm": 0.9701651334762573, "learning_rate": 0.00018942116624077386, "loss": 0.8687, "step": 5460 }, { "epoch": 0.17408431017503734, "grad_norm": 0.9508700966835022, "learning_rate": 0.00018932886220374696, "loss": 0.8764, "step": 5480 }, { "epoch": 0.17471965437275644, "grad_norm": 0.9914870858192444, "learning_rate": 0.00018923617991232466, "loss": 0.8157, "step": 5500 }, { "epoch": 0.17535499857047557, "grad_norm": 1.010511040687561, "learning_rate": 0.00018914311975896117, "loss": 0.839, "step": 5520 }, { "epoch": 0.17599034276819467, "grad_norm": 0.8063015937805176, "learning_rate": 0.00018904968213771065, "loss": 0.8308, "step": 5540 }, { "epoch": 0.1766256869659138, "grad_norm": 0.8653827905654907, "learning_rate": 0.00018895586744422564, "loss": 0.8304, "step": 5560 }, { "epoch": 0.1772610311636329, "grad_norm": 1.0596357583999634, "learning_rate": 0.00018886167607575532, "loss": 0.8346, "step": 5580 }, { "epoch": 0.177896375361352, "grad_norm": 1.0251786708831787, "learning_rate": 0.00018876710843114398, "loss": 0.8639, "step": 5600 }, { "epoch": 0.17853171955907113, "grad_norm": 0.8897235989570618, "learning_rate": 0.00018867216491082905, "loss": 0.8286, "step": 5620 }, { "epoch": 0.17916706375679023, "grad_norm": 0.8118072748184204, "learning_rate": 0.00018857684591683967, "loss": 0.8597, "step": 5640 }, { "epoch": 0.17980240795450936, "grad_norm": 0.8698698878288269, "learning_rate": 0.0001884811518527949, "loss": 0.7894, "step": 5660 }, { "epoch": 0.18043775215222846, "grad_norm": 0.8228470087051392, "learning_rate": 0.00018838508312390192, "loss": 0.8302, "step": 5680 }, { "epoch": 0.1810730963499476, "grad_norm": 1.1411319971084595, "learning_rate": 0.00018828864013695448, "loss": 0.8313, "step": 5700 }, { "epoch": 0.1817084405476667, "grad_norm": 0.8076447248458862, "learning_rate": 0.00018819182330033103, "loss": 0.798, "step": 5720 }, { "epoch": 0.18234378474538582, "grad_norm": 0.8669622540473938, "learning_rate": 0.00018809463302399304, "loss": 0.7911, "step": 5740 }, { "epoch": 0.18297912894310492, "grad_norm": 0.8435181975364685, "learning_rate": 0.0001879970697194833, "loss": 0.7951, "step": 5760 }, { "epoch": 0.18361447314082405, "grad_norm": 1.1023324728012085, "learning_rate": 0.00018789913379992418, "loss": 0.8253, "step": 5780 }, { "epoch": 0.18424981733854315, "grad_norm": 0.9319256544113159, "learning_rate": 0.00018780082568001585, "loss": 0.7625, "step": 5800 }, { "epoch": 0.18488516153626228, "grad_norm": 0.8259923458099365, "learning_rate": 0.00018770214577603443, "loss": 0.8079, "step": 5820 }, { "epoch": 0.18552050573398138, "grad_norm": 0.8953514695167542, "learning_rate": 0.00018760309450583043, "loss": 0.7647, "step": 5840 }, { "epoch": 0.1861558499317005, "grad_norm": 0.8347587585449219, "learning_rate": 0.00018750367228882685, "loss": 0.8089, "step": 5860 }, { "epoch": 0.1867911941294196, "grad_norm": 0.9788545966148376, "learning_rate": 0.00018740387954601742, "loss": 0.7737, "step": 5880 }, { "epoch": 0.18742653832713874, "grad_norm": 0.9509750008583069, "learning_rate": 0.00018730371669996478, "loss": 0.8073, "step": 5900 }, { "epoch": 0.18806188252485784, "grad_norm": 0.9388551115989685, "learning_rate": 0.0001872031841747988, "loss": 0.7585, "step": 5920 }, { "epoch": 0.18869722672257697, "grad_norm": 0.8342726826667786, "learning_rate": 0.00018710228239621476, "loss": 0.8025, "step": 5940 }, { "epoch": 0.18933257092029607, "grad_norm": 1.0455151796340942, "learning_rate": 0.00018700101179147134, "loss": 0.7603, "step": 5960 }, { "epoch": 0.1899679151180152, "grad_norm": 0.820931077003479, "learning_rate": 0.00018689937278938915, "loss": 0.7972, "step": 5980 }, { "epoch": 0.1906032593157343, "grad_norm": 0.8494334816932678, "learning_rate": 0.00018679736582034867, "loss": 0.7663, "step": 6000 }, { "epoch": 0.1906032593157343, "eval_loss": 0.7605160474777222, "eval_runtime": 45.0866, "eval_samples_per_second": 59.951, "eval_steps_per_second": 29.987, "step": 6000 }, { "epoch": 0.19123860351345343, "grad_norm": 0.9915199279785156, "learning_rate": 0.00018669499131628847, "loss": 0.7911, "step": 6020 }, { "epoch": 0.19187394771117253, "grad_norm": 1.009752869606018, "learning_rate": 0.00018659739550293418, "loss": 0.7791, "step": 6040 }, { "epoch": 0.19250929190889166, "grad_norm": 1.008296012878418, "learning_rate": 0.00018649430555384115, "loss": 0.7741, "step": 6060 }, { "epoch": 0.19314463610661076, "grad_norm": 0.9730678200721741, "learning_rate": 0.0001863908493530077, "loss": 0.8028, "step": 6080 }, { "epoch": 0.19377998030432986, "grad_norm": 0.8386117815971375, "learning_rate": 0.0001862870273385091, "loss": 0.789, "step": 6100 }, { "epoch": 0.194415324502049, "grad_norm": 0.8517867922782898, "learning_rate": 0.00018618283994996954, "loss": 0.7472, "step": 6120 }, { "epoch": 0.1950506686997681, "grad_norm": 0.8791770339012146, "learning_rate": 0.00018607828762856046, "loss": 0.7871, "step": 6140 }, { "epoch": 0.19568601289748722, "grad_norm": 0.9248822331428528, "learning_rate": 0.00018597337081699848, "loss": 0.762, "step": 6160 }, { "epoch": 0.19632135709520632, "grad_norm": 0.8059686422348022, "learning_rate": 0.00018586808995954367, "loss": 0.7345, "step": 6180 }, { "epoch": 0.19695670129292545, "grad_norm": 0.7610188126564026, "learning_rate": 0.00018576244550199758, "loss": 0.7478, "step": 6200 }, { "epoch": 0.19759204549064455, "grad_norm": 0.7763079404830933, "learning_rate": 0.00018565643789170144, "loss": 0.7552, "step": 6220 }, { "epoch": 0.19822738968836368, "grad_norm": 1.1734811067581177, "learning_rate": 0.00018555006757753418, "loss": 0.7645, "step": 6240 }, { "epoch": 0.19886273388608278, "grad_norm": 0.7641186714172363, "learning_rate": 0.00018544333500991053, "loss": 0.7267, "step": 6260 }, { "epoch": 0.1994980780838019, "grad_norm": 0.8322380781173706, "learning_rate": 0.00018533624064077922, "loss": 0.7601, "step": 6280 }, { "epoch": 0.200133422281521, "grad_norm": 0.9059064388275146, "learning_rate": 0.00018522878492362096, "loss": 0.7716, "step": 6300 }, { "epoch": 0.20076876647924013, "grad_norm": 0.7728195786476135, "learning_rate": 0.00018512096831344653, "loss": 0.7435, "step": 6320 }, { "epoch": 0.20140411067695924, "grad_norm": 0.9880885481834412, "learning_rate": 0.00018501279126679495, "loss": 0.7378, "step": 6340 }, { "epoch": 0.20203945487467836, "grad_norm": 0.8192346096038818, "learning_rate": 0.00018490425424173138, "loss": 0.7376, "step": 6360 }, { "epoch": 0.20267479907239747, "grad_norm": 1.175627589225769, "learning_rate": 0.0001847953576978453, "loss": 0.7672, "step": 6380 }, { "epoch": 0.2033101432701166, "grad_norm": 0.7959802746772766, "learning_rate": 0.0001846861020962486, "loss": 0.7331, "step": 6400 }, { "epoch": 0.2039454874678357, "grad_norm": 0.8343777060508728, "learning_rate": 0.0001845764878995735, "loss": 0.7142, "step": 6420 }, { "epoch": 0.20458083166555482, "grad_norm": 0.9900172352790833, "learning_rate": 0.00018446651557197066, "loss": 0.7819, "step": 6440 }, { "epoch": 0.20521617586327393, "grad_norm": 1.111018180847168, "learning_rate": 0.00018435618557910725, "loss": 0.7226, "step": 6460 }, { "epoch": 0.20585152006099305, "grad_norm": 0.9301121830940247, "learning_rate": 0.00018424549838816492, "loss": 0.7295, "step": 6480 }, { "epoch": 0.20648686425871215, "grad_norm": 0.894797146320343, "learning_rate": 0.0001841344544678378, "loss": 0.7199, "step": 6500 }, { "epoch": 0.20712220845643128, "grad_norm": 1.041779637336731, "learning_rate": 0.0001840230542883306, "loss": 0.7213, "step": 6520 }, { "epoch": 0.20775755265415038, "grad_norm": 0.9267428517341614, "learning_rate": 0.00018391129832135659, "loss": 0.7463, "step": 6540 }, { "epoch": 0.2083928968518695, "grad_norm": 0.8043299913406372, "learning_rate": 0.00018379918704013556, "loss": 0.6909, "step": 6560 }, { "epoch": 0.20902824104958861, "grad_norm": 0.8037667870521545, "learning_rate": 0.0001836867209193918, "loss": 0.7307, "step": 6580 }, { "epoch": 0.20966358524730772, "grad_norm": 0.9795257449150085, "learning_rate": 0.00018357390043535228, "loss": 0.7625, "step": 6600 }, { "epoch": 0.21029892944502684, "grad_norm": 1.0763206481933594, "learning_rate": 0.0001834607260657443, "loss": 0.7457, "step": 6620 }, { "epoch": 0.21093427364274595, "grad_norm": 0.8083770275115967, "learning_rate": 0.00018334719828979373, "loss": 0.7398, "step": 6640 }, { "epoch": 0.21156961784046507, "grad_norm": 0.8648799657821655, "learning_rate": 0.00018323331758822299, "loss": 0.7392, "step": 6660 }, { "epoch": 0.21220496203818418, "grad_norm": 1.322874903678894, "learning_rate": 0.0001831190844432488, "loss": 0.767, "step": 6680 }, { "epoch": 0.2128403062359033, "grad_norm": 0.8415853977203369, "learning_rate": 0.00018300449933858034, "loss": 0.7123, "step": 6700 }, { "epoch": 0.2134756504336224, "grad_norm": 0.8832991123199463, "learning_rate": 0.00018288956275941713, "loss": 0.7329, "step": 6720 }, { "epoch": 0.21411099463134153, "grad_norm": 0.8079715967178345, "learning_rate": 0.00018277427519244692, "loss": 0.6988, "step": 6740 }, { "epoch": 0.21474633882906063, "grad_norm": 0.9029518365859985, "learning_rate": 0.00018265863712584377, "loss": 0.6943, "step": 6760 }, { "epoch": 0.21538168302677976, "grad_norm": 0.9082062244415283, "learning_rate": 0.0001825426490492658, "loss": 0.7517, "step": 6780 }, { "epoch": 0.21601702722449886, "grad_norm": 0.9031996726989746, "learning_rate": 0.00018242631145385329, "loss": 0.7108, "step": 6800 }, { "epoch": 0.216652371422218, "grad_norm": 0.9114848375320435, "learning_rate": 0.00018230962483222648, "loss": 0.7151, "step": 6820 }, { "epoch": 0.2172877156199371, "grad_norm": 0.8056477308273315, "learning_rate": 0.00018219258967848355, "loss": 0.7154, "step": 6840 }, { "epoch": 0.21792305981765622, "grad_norm": 0.9029595255851746, "learning_rate": 0.0001820752064881985, "loss": 0.728, "step": 6860 }, { "epoch": 0.21855840401537532, "grad_norm": 0.9304366707801819, "learning_rate": 0.00018195747575841905, "loss": 0.7298, "step": 6880 }, { "epoch": 0.21919374821309445, "grad_norm": 1.2549713850021362, "learning_rate": 0.00018183939798766452, "loss": 0.7166, "step": 6900 }, { "epoch": 0.21982909241081355, "grad_norm": 0.8609549403190613, "learning_rate": 0.0001817209736759238, "loss": 0.7222, "step": 6920 }, { "epoch": 0.22046443660853268, "grad_norm": 0.9668901562690735, "learning_rate": 0.00018160220332465315, "loss": 0.706, "step": 6940 }, { "epoch": 0.22109978080625178, "grad_norm": 0.9426187872886658, "learning_rate": 0.00018148308743677407, "loss": 0.7549, "step": 6960 }, { "epoch": 0.2217351250039709, "grad_norm": 1.0274590253829956, "learning_rate": 0.00018136362651667123, "loss": 0.7118, "step": 6980 }, { "epoch": 0.22237046920169, "grad_norm": 1.0056123733520508, "learning_rate": 0.00018124382107019028, "loss": 0.7284, "step": 7000 }, { "epoch": 0.22237046920169, "eval_loss": 0.6820850968360901, "eval_runtime": 44.1137, "eval_samples_per_second": 61.274, "eval_steps_per_second": 30.648, "step": 7000 }, { "epoch": 0.22300581339940914, "grad_norm": 1.01372492313385, "learning_rate": 0.0001811236716046358, "loss": 0.7306, "step": 7020 }, { "epoch": 0.22364115759712824, "grad_norm": 0.8217781782150269, "learning_rate": 0.000181003178628769, "loss": 0.7216, "step": 7040 }, { "epoch": 0.22427650179484737, "grad_norm": 0.9484082460403442, "learning_rate": 0.00018088234265280573, "loss": 0.7164, "step": 7060 }, { "epoch": 0.22491184599256647, "grad_norm": 1.2144994735717773, "learning_rate": 0.0001807672312378185, "loss": 0.7248, "step": 7080 }, { "epoch": 0.22554719019028557, "grad_norm": 0.9574259519577026, "learning_rate": 0.00018064572788467363, "loss": 0.689, "step": 7100 }, { "epoch": 0.2261825343880047, "grad_norm": 0.7626876831054688, "learning_rate": 0.00018052998338935085, "loss": 0.748, "step": 7120 }, { "epoch": 0.2268178785857238, "grad_norm": 0.8534376621246338, "learning_rate": 0.00018040781461538648, "loss": 0.6947, "step": 7140 }, { "epoch": 0.22745322278344293, "grad_norm": 1.0029544830322266, "learning_rate": 0.00018028530536233676, "loss": 0.7319, "step": 7160 }, { "epoch": 0.22808856698116203, "grad_norm": 0.925713300704956, "learning_rate": 0.00018016245614895518, "loss": 0.7092, "step": 7180 }, { "epoch": 0.22872391117888116, "grad_norm": 0.8006899952888489, "learning_rate": 0.00018003926749543488, "loss": 0.6879, "step": 7200 }, { "epoch": 0.22935925537660026, "grad_norm": 0.8886255025863647, "learning_rate": 0.00017991573992340616, "loss": 0.6784, "step": 7220 }, { "epoch": 0.2299945995743194, "grad_norm": 0.8108293414115906, "learning_rate": 0.00017979187395593459, "loss": 0.7094, "step": 7240 }, { "epoch": 0.2306299437720385, "grad_norm": 1.0475900173187256, "learning_rate": 0.00017966767011751858, "loss": 0.696, "step": 7260 }, { "epoch": 0.23126528796975762, "grad_norm": 0.9214044809341431, "learning_rate": 0.0001795431289340872, "loss": 0.7125, "step": 7280 }, { "epoch": 0.23190063216747672, "grad_norm": 0.996101975440979, "learning_rate": 0.00017941825093299802, "loss": 0.6635, "step": 7300 }, { "epoch": 0.23253597636519585, "grad_norm": 0.9577082991600037, "learning_rate": 0.00017929303664303482, "loss": 0.6753, "step": 7320 }, { "epoch": 0.23317132056291495, "grad_norm": 1.0278524160385132, "learning_rate": 0.00017916748659440533, "loss": 0.7024, "step": 7340 }, { "epoch": 0.23380666476063408, "grad_norm": 0.758007287979126, "learning_rate": 0.00017904160131873906, "loss": 0.6877, "step": 7360 }, { "epoch": 0.23444200895835318, "grad_norm": 0.8926889300346375, "learning_rate": 0.00017891538134908502, "loss": 0.7123, "step": 7380 }, { "epoch": 0.2350773531560723, "grad_norm": 0.8747749924659729, "learning_rate": 0.00017878882721990936, "loss": 0.656, "step": 7400 }, { "epoch": 0.2357126973537914, "grad_norm": 1.012324333190918, "learning_rate": 0.00017866193946709327, "loss": 0.6885, "step": 7420 }, { "epoch": 0.23634804155151054, "grad_norm": 0.7973082065582275, "learning_rate": 0.00017853471862793068, "loss": 0.6627, "step": 7440 }, { "epoch": 0.23698338574922964, "grad_norm": 0.8259735107421875, "learning_rate": 0.00017840716524112582, "loss": 0.6861, "step": 7460 }, { "epoch": 0.23761872994694877, "grad_norm": 0.7817295789718628, "learning_rate": 0.00017827927984679113, "loss": 0.6808, "step": 7480 }, { "epoch": 0.23825407414466787, "grad_norm": 0.8139945864677429, "learning_rate": 0.00017815106298644495, "loss": 0.6891, "step": 7500 }, { "epoch": 0.238889418342387, "grad_norm": 1.0507733821868896, "learning_rate": 0.00017802251520300906, "loss": 0.6936, "step": 7520 }, { "epoch": 0.2395247625401061, "grad_norm": 0.929937481880188, "learning_rate": 0.0001778936370408066, "loss": 0.687, "step": 7540 }, { "epoch": 0.24016010673782523, "grad_norm": 1.0632777214050293, "learning_rate": 0.00017776442904555962, "loss": 0.6656, "step": 7560 }, { "epoch": 0.24079545093554433, "grad_norm": 1.1247339248657227, "learning_rate": 0.00017763489176438686, "loss": 0.6645, "step": 7580 }, { "epoch": 0.24143079513326343, "grad_norm": 0.8897901773452759, "learning_rate": 0.00017750502574580135, "loss": 0.6832, "step": 7600 }, { "epoch": 0.24206613933098256, "grad_norm": 0.9285283088684082, "learning_rate": 0.00017737483153970816, "loss": 0.6841, "step": 7620 }, { "epoch": 0.24270148352870166, "grad_norm": 0.8733476400375366, "learning_rate": 0.00017724430969740196, "loss": 0.6567, "step": 7640 }, { "epoch": 0.2433368277264208, "grad_norm": 0.9532790184020996, "learning_rate": 0.0001771134607715649, "loss": 0.6795, "step": 7660 }, { "epoch": 0.2439721719241399, "grad_norm": 1.0881035327911377, "learning_rate": 0.00017698228531626398, "loss": 0.693, "step": 7680 }, { "epoch": 0.24460751612185902, "grad_norm": 1.0936851501464844, "learning_rate": 0.00017685078388694897, "loss": 0.6852, "step": 7700 }, { "epoch": 0.24524286031957812, "grad_norm": 1.0439817905426025, "learning_rate": 0.0001767189570404499, "loss": 0.6746, "step": 7720 }, { "epoch": 0.24587820451729725, "grad_norm": 0.8599082231521606, "learning_rate": 0.00017658680533497477, "loss": 0.6719, "step": 7740 }, { "epoch": 0.24651354871501635, "grad_norm": 0.9633190035820007, "learning_rate": 0.00017645432933010712, "loss": 0.7091, "step": 7760 }, { "epoch": 0.24714889291273548, "grad_norm": 0.8989465236663818, "learning_rate": 0.00017632152958680378, "loss": 0.6649, "step": 7780 }, { "epoch": 0.24778423711045458, "grad_norm": 0.8468721508979797, "learning_rate": 0.00017618840666739228, "loss": 0.6789, "step": 7800 }, { "epoch": 0.2484195813081737, "grad_norm": 0.8482181429862976, "learning_rate": 0.00017605496113556882, "loss": 0.6902, "step": 7820 }, { "epoch": 0.2490549255058928, "grad_norm": 0.8012595176696777, "learning_rate": 0.00017592119355639544, "loss": 0.6733, "step": 7840 }, { "epoch": 0.24969026970361194, "grad_norm": 0.8117650151252747, "learning_rate": 0.00017578710449629804, "loss": 0.6916, "step": 7860 }, { "epoch": 0.25032561390133107, "grad_norm": 0.9711939096450806, "learning_rate": 0.00017565269452306364, "loss": 0.6701, "step": 7880 }, { "epoch": 0.25096095809905017, "grad_norm": 0.8234876394271851, "learning_rate": 0.00017551796420583833, "loss": 0.62, "step": 7900 }, { "epoch": 0.25159630229676927, "grad_norm": 0.8263707756996155, "learning_rate": 0.00017538967420545803, "loss": 0.6907, "step": 7920 }, { "epoch": 0.25223164649448837, "grad_norm": 1.2548505067825317, "learning_rate": 0.00017525432085959138, "loss": 0.6644, "step": 7940 }, { "epoch": 0.2528669906922075, "grad_norm": 1.1948567628860474, "learning_rate": 0.00017511864885660835, "loss": 0.6609, "step": 7960 }, { "epoch": 0.25350233488992663, "grad_norm": 0.9310169219970703, "learning_rate": 0.0001749826587709989, "loss": 0.6757, "step": 7980 }, { "epoch": 0.25413767908764573, "grad_norm": 0.8832531571388245, "learning_rate": 0.00017484635117859983, "loss": 0.6552, "step": 8000 }, { "epoch": 0.25413767908764573, "eval_loss": 0.6333429217338562, "eval_runtime": 44.406, "eval_samples_per_second": 60.87, "eval_steps_per_second": 30.446, "step": 8000 }, { "epoch": 0.25477302328536483, "grad_norm": 0.7624004483222961, "learning_rate": 0.00017470972665659245, "loss": 0.6567, "step": 8020 }, { "epoch": 0.255408367483084, "grad_norm": 0.9134401082992554, "learning_rate": 0.00017457278578350002, "loss": 0.6681, "step": 8040 }, { "epoch": 0.2560437116808031, "grad_norm": 0.9597674608230591, "learning_rate": 0.00017443552913918534, "loss": 0.6818, "step": 8060 }, { "epoch": 0.2566790558785222, "grad_norm": 0.961934506893158, "learning_rate": 0.00017429795730484836, "loss": 0.6833, "step": 8080 }, { "epoch": 0.2573144000762413, "grad_norm": 0.9118033647537231, "learning_rate": 0.00017416007086302367, "loss": 0.6607, "step": 8100 }, { "epoch": 0.2579497442739604, "grad_norm": 0.8447214961051941, "learning_rate": 0.00017402187039757805, "loss": 0.6409, "step": 8120 }, { "epoch": 0.25858508847167955, "grad_norm": 1.010040044784546, "learning_rate": 0.0001738833564937079, "loss": 0.6761, "step": 8140 }, { "epoch": 0.25922043266939865, "grad_norm": 0.8686466217041016, "learning_rate": 0.00017374452973793693, "loss": 0.6575, "step": 8160 }, { "epoch": 0.25985577686711775, "grad_norm": 1.0445839166641235, "learning_rate": 0.00017360539071811356, "loss": 0.667, "step": 8180 }, { "epoch": 0.26049112106483685, "grad_norm": 1.1015607118606567, "learning_rate": 0.00017346594002340843, "loss": 0.6468, "step": 8200 }, { "epoch": 0.261126465262556, "grad_norm": 1.4550483226776123, "learning_rate": 0.00017332617824431204, "loss": 0.6642, "step": 8220 }, { "epoch": 0.2617618094602751, "grad_norm": 0.8968580961227417, "learning_rate": 0.000173186105972632, "loss": 0.6695, "step": 8240 }, { "epoch": 0.2623971536579942, "grad_norm": 0.9802786111831665, "learning_rate": 0.00017304572380149078, "loss": 0.6516, "step": 8260 }, { "epoch": 0.2630324978557133, "grad_norm": 0.8785617351531982, "learning_rate": 0.00017290503232532305, "loss": 0.6857, "step": 8280 }, { "epoch": 0.26366784205343247, "grad_norm": 0.8675135970115662, "learning_rate": 0.00017276403213987323, "loss": 0.6493, "step": 8300 }, { "epoch": 0.26430318625115157, "grad_norm": 0.8159687519073486, "learning_rate": 0.0001726227238421929, "loss": 0.6445, "step": 8320 }, { "epoch": 0.26493853044887067, "grad_norm": 0.8598359823226929, "learning_rate": 0.00017248110803063833, "loss": 0.6515, "step": 8340 }, { "epoch": 0.26557387464658977, "grad_norm": 1.0304324626922607, "learning_rate": 0.00017233918530486792, "loss": 0.6431, "step": 8360 }, { "epoch": 0.2662092188443089, "grad_norm": 0.933110773563385, "learning_rate": 0.0001722040749834389, "loss": 0.6958, "step": 8380 }, { "epoch": 0.266844563042028, "grad_norm": 0.9690568447113037, "learning_rate": 0.0001720615555046345, "loss": 0.5922, "step": 8400 }, { "epoch": 0.26747990723974713, "grad_norm": 0.9293822646141052, "learning_rate": 0.0001719187308881687, "loss": 0.6407, "step": 8420 }, { "epoch": 0.26811525143746623, "grad_norm": 0.8957870602607727, "learning_rate": 0.00017177560173881846, "loss": 0.662, "step": 8440 }, { "epoch": 0.2687505956351854, "grad_norm": 1.0288225412368774, "learning_rate": 0.0001716321686626503, "loss": 0.6395, "step": 8460 }, { "epoch": 0.2693859398329045, "grad_norm": 0.838657021522522, "learning_rate": 0.00017148843226701764, "loss": 0.6313, "step": 8480 }, { "epoch": 0.2700212840306236, "grad_norm": 0.8575971722602844, "learning_rate": 0.00017134439316055834, "loss": 0.6655, "step": 8500 }, { "epoch": 0.2706566282283427, "grad_norm": 0.9840354919433594, "learning_rate": 0.00017120005195319195, "loss": 0.6646, "step": 8520 }, { "epoch": 0.27129197242606184, "grad_norm": 0.8279704451560974, "learning_rate": 0.00017105540925611737, "loss": 0.6259, "step": 8540 }, { "epoch": 0.27192731662378095, "grad_norm": 1.0609900951385498, "learning_rate": 0.00017091046568180996, "loss": 0.6561, "step": 8560 }, { "epoch": 0.27256266082150005, "grad_norm": 0.890514612197876, "learning_rate": 0.0001707652218440193, "loss": 0.6324, "step": 8580 }, { "epoch": 0.27319800501921915, "grad_norm": 0.9357948303222656, "learning_rate": 0.0001706196783577663, "loss": 0.6116, "step": 8600 }, { "epoch": 0.27383334921693825, "grad_norm": 0.9577456116676331, "learning_rate": 0.0001704738358393407, "loss": 0.6764, "step": 8620 }, { "epoch": 0.2744686934146574, "grad_norm": 0.834900438785553, "learning_rate": 0.0001703276949062985, "loss": 0.6324, "step": 8640 }, { "epoch": 0.2751040376123765, "grad_norm": 0.8283354043960571, "learning_rate": 0.00017018125617745933, "loss": 0.6187, "step": 8660 }, { "epoch": 0.2757393818100956, "grad_norm": 0.854200541973114, "learning_rate": 0.00017003452027290373, "loss": 0.6294, "step": 8680 }, { "epoch": 0.2763747260078147, "grad_norm": 0.8695046901702881, "learning_rate": 0.00016988748781397064, "loss": 0.6377, "step": 8700 }, { "epoch": 0.27701007020553386, "grad_norm": 0.7802212238311768, "learning_rate": 0.00016974015942325475, "loss": 0.6051, "step": 8720 }, { "epoch": 0.27764541440325297, "grad_norm": 1.0842890739440918, "learning_rate": 0.00016959253572460382, "loss": 0.6352, "step": 8740 }, { "epoch": 0.27828075860097207, "grad_norm": 0.8472367525100708, "learning_rate": 0.0001694446173431161, "loss": 0.5907, "step": 8760 }, { "epoch": 0.27891610279869117, "grad_norm": 0.8548029661178589, "learning_rate": 0.0001692964049051376, "loss": 0.6434, "step": 8780 }, { "epoch": 0.2795514469964103, "grad_norm": 0.9771581888198853, "learning_rate": 0.00016914789903825945, "loss": 0.6381, "step": 8800 }, { "epoch": 0.2801867911941294, "grad_norm": 0.9199798703193665, "learning_rate": 0.0001689991003713154, "loss": 0.6589, "step": 8820 }, { "epoch": 0.2808221353918485, "grad_norm": 1.0753369331359863, "learning_rate": 0.00016885000953437894, "loss": 0.6413, "step": 8840 }, { "epoch": 0.2814574795895676, "grad_norm": 1.0925753116607666, "learning_rate": 0.00016870062715876075, "loss": 0.6234, "step": 8860 }, { "epoch": 0.2820928237872868, "grad_norm": 1.0023586750030518, "learning_rate": 0.00016855095387700598, "loss": 0.6104, "step": 8880 }, { "epoch": 0.2827281679850059, "grad_norm": 0.9077417254447937, "learning_rate": 0.00016840099032289162, "loss": 0.602, "step": 8900 }, { "epoch": 0.283363512182725, "grad_norm": 0.8238940238952637, "learning_rate": 0.00016825073713142374, "loss": 0.6157, "step": 8920 }, { "epoch": 0.2839988563804441, "grad_norm": 1.111948847770691, "learning_rate": 0.000168100194938835, "loss": 0.6092, "step": 8940 }, { "epoch": 0.28463420057816324, "grad_norm": 1.0630967617034912, "learning_rate": 0.0001679493643825816, "loss": 0.5904, "step": 8960 }, { "epoch": 0.28526954477588234, "grad_norm": 0.8827186822891235, "learning_rate": 0.00016779824610134092, "loss": 0.6166, "step": 8980 }, { "epoch": 0.28590488897360145, "grad_norm": 0.9229192137718201, "learning_rate": 0.00016764684073500866, "loss": 0.6178, "step": 9000 }, { "epoch": 0.28590488897360145, "eval_loss": 0.5966877341270447, "eval_runtime": 44.6044, "eval_samples_per_second": 60.599, "eval_steps_per_second": 30.311, "step": 9000 }, { "epoch": 0.28654023317132055, "grad_norm": 0.8136707544326782, "learning_rate": 0.00016749514892469615, "loss": 0.6366, "step": 9020 }, { "epoch": 0.2871755773690397, "grad_norm": 0.8175415992736816, "learning_rate": 0.00016734317131272762, "loss": 0.6177, "step": 9040 }, { "epoch": 0.2878109215667588, "grad_norm": 0.929182767868042, "learning_rate": 0.00016719090854263753, "loss": 0.646, "step": 9060 }, { "epoch": 0.2884462657644779, "grad_norm": 0.9779849052429199, "learning_rate": 0.0001670383612591678, "loss": 0.6362, "step": 9080 }, { "epoch": 0.289081609962197, "grad_norm": 0.8542407751083374, "learning_rate": 0.00016688553010826506, "loss": 0.6076, "step": 9100 }, { "epoch": 0.2897169541599161, "grad_norm": 0.8885607719421387, "learning_rate": 0.00016673241573707804, "loss": 0.6055, "step": 9120 }, { "epoch": 0.29035229835763526, "grad_norm": 0.876097559928894, "learning_rate": 0.0001665790187939546, "loss": 0.6196, "step": 9140 }, { "epoch": 0.29098764255535436, "grad_norm": 1.0198227167129517, "learning_rate": 0.0001664253399284393, "loss": 0.6374, "step": 9160 }, { "epoch": 0.29162298675307347, "grad_norm": 0.8938513994216919, "learning_rate": 0.00016627137979127033, "loss": 0.6254, "step": 9180 }, { "epoch": 0.29225833095079257, "grad_norm": 0.7427443861961365, "learning_rate": 0.00016611713903437692, "loss": 0.6099, "step": 9200 }, { "epoch": 0.2928936751485117, "grad_norm": 0.9959378242492676, "learning_rate": 0.00016596261831087661, "loss": 0.648, "step": 9220 }, { "epoch": 0.2935290193462308, "grad_norm": 1.048519253730774, "learning_rate": 0.00016580781827507242, "loss": 0.6292, "step": 9240 }, { "epoch": 0.2941643635439499, "grad_norm": 0.858858585357666, "learning_rate": 0.00016565273958245002, "loss": 0.6252, "step": 9260 }, { "epoch": 0.294799707741669, "grad_norm": 0.8437022566795349, "learning_rate": 0.00016549738288967514, "loss": 0.6188, "step": 9280 }, { "epoch": 0.2954350519393882, "grad_norm": 0.8608834743499756, "learning_rate": 0.00016534174885459056, "loss": 0.6509, "step": 9300 }, { "epoch": 0.2960703961371073, "grad_norm": 1.083897590637207, "learning_rate": 0.00016518583813621357, "loss": 0.6193, "step": 9320 }, { "epoch": 0.2967057403348264, "grad_norm": 0.9606235027313232, "learning_rate": 0.0001650296513947329, "loss": 0.6287, "step": 9340 }, { "epoch": 0.2973410845325455, "grad_norm": 1.0519804954528809, "learning_rate": 0.00016487318929150617, "loss": 0.6097, "step": 9360 }, { "epoch": 0.29797642873026464, "grad_norm": 1.3490453958511353, "learning_rate": 0.000164716452489057, "loss": 0.6043, "step": 9380 }, { "epoch": 0.29861177292798374, "grad_norm": 1.1292142868041992, "learning_rate": 0.00016455944165107207, "loss": 0.5896, "step": 9400 }, { "epoch": 0.29924711712570284, "grad_norm": 0.9570278525352478, "learning_rate": 0.00016440215744239865, "loss": 0.6087, "step": 9420 }, { "epoch": 0.29988246132342194, "grad_norm": 0.8570756316184998, "learning_rate": 0.00016424460052904137, "loss": 0.6036, "step": 9440 }, { "epoch": 0.3005178055211411, "grad_norm": 0.9214951395988464, "learning_rate": 0.00016408677157815974, "loss": 0.6519, "step": 9460 }, { "epoch": 0.3011531497188602, "grad_norm": 1.1580623388290405, "learning_rate": 0.00016392867125806504, "loss": 0.5991, "step": 9480 }, { "epoch": 0.3017884939165793, "grad_norm": 1.1025846004486084, "learning_rate": 0.00016377030023821782, "loss": 0.6416, "step": 9500 }, { "epoch": 0.3024238381142984, "grad_norm": 0.8918984532356262, "learning_rate": 0.00016361165918922477, "loss": 0.6165, "step": 9520 }, { "epoch": 0.30305918231201756, "grad_norm": 0.8747968673706055, "learning_rate": 0.000163452748782836, "loss": 0.6094, "step": 9540 }, { "epoch": 0.30369452650973666, "grad_norm": 0.7480270862579346, "learning_rate": 0.0001632935696919422, "loss": 0.5987, "step": 9560 }, { "epoch": 0.30432987070745576, "grad_norm": 0.8854328393936157, "learning_rate": 0.00016313412259057178, "loss": 0.6514, "step": 9580 }, { "epoch": 0.30496521490517486, "grad_norm": 1.0659030675888062, "learning_rate": 0.00016297440815388802, "loss": 0.5796, "step": 9600 }, { "epoch": 0.305600559102894, "grad_norm": 0.9668769240379333, "learning_rate": 0.00016281442705818618, "loss": 0.6147, "step": 9620 }, { "epoch": 0.3062359033006131, "grad_norm": 0.939028263092041, "learning_rate": 0.00016265417998089068, "loss": 0.6241, "step": 9640 }, { "epoch": 0.3068712474983322, "grad_norm": 0.8955005407333374, "learning_rate": 0.00016249366760055222, "loss": 0.5832, "step": 9660 }, { "epoch": 0.3075065916960513, "grad_norm": 0.7991370558738708, "learning_rate": 0.00016233289059684492, "loss": 0.5799, "step": 9680 }, { "epoch": 0.3081419358937704, "grad_norm": 0.8115846514701843, "learning_rate": 0.00016217184965056336, "loss": 0.6109, "step": 9700 }, { "epoch": 0.3087772800914896, "grad_norm": 0.7488042712211609, "learning_rate": 0.00016201054544361977, "loss": 0.6166, "step": 9720 }, { "epoch": 0.3094126242892087, "grad_norm": 0.8463062644004822, "learning_rate": 0.00016184897865904123, "loss": 0.5779, "step": 9740 }, { "epoch": 0.3100479684869278, "grad_norm": 1.083001732826233, "learning_rate": 0.00016168714998096654, "loss": 0.6175, "step": 9760 }, { "epoch": 0.3106833126846469, "grad_norm": 0.8545092940330505, "learning_rate": 0.00016152506009464357, "loss": 0.6104, "step": 9780 }, { "epoch": 0.31131865688236604, "grad_norm": 0.9297589063644409, "learning_rate": 0.00016136270968642618, "loss": 0.5831, "step": 9800 }, { "epoch": 0.31195400108008514, "grad_norm": 0.7775977253913879, "learning_rate": 0.0001612000994437714, "loss": 0.6001, "step": 9820 }, { "epoch": 0.31258934527780424, "grad_norm": 0.943267822265625, "learning_rate": 0.0001610372300552366, "loss": 0.6089, "step": 9840 }, { "epoch": 0.31322468947552334, "grad_norm": 0.8398995399475098, "learning_rate": 0.0001608741022104763, "loss": 0.5929, "step": 9860 }, { "epoch": 0.3138600336732425, "grad_norm": 1.0078269243240356, "learning_rate": 0.00016071071660023954, "loss": 0.6215, "step": 9880 }, { "epoch": 0.3144953778709616, "grad_norm": 0.9710105657577515, "learning_rate": 0.0001605470739163669, "loss": 0.5983, "step": 9900 }, { "epoch": 0.3151307220686807, "grad_norm": 0.8864800333976746, "learning_rate": 0.00016038317485178734, "loss": 0.5812, "step": 9920 }, { "epoch": 0.3157660662663998, "grad_norm": 0.9775105118751526, "learning_rate": 0.0001602190201005156, "loss": 0.5899, "step": 9940 }, { "epoch": 0.31640141046411896, "grad_norm": 0.8554601669311523, "learning_rate": 0.00016005461035764902, "loss": 0.5989, "step": 9960 }, { "epoch": 0.31703675466183806, "grad_norm": 0.8149896264076233, "learning_rate": 0.0001598899463193647, "loss": 0.6383, "step": 9980 }, { "epoch": 0.31767209885955716, "grad_norm": 1.1985602378845215, "learning_rate": 0.00015972502868291652, "loss": 0.604, "step": 10000 }, { "epoch": 0.31767209885955716, "eval_loss": 0.5633410811424255, "eval_runtime": 44.2566, "eval_samples_per_second": 61.076, "eval_steps_per_second": 30.549, "step": 10000 }, { "epoch": 0.31830744305727626, "grad_norm": 0.9848890900611877, "learning_rate": 0.0001595598581466322, "loss": 0.5741, "step": 10020 }, { "epoch": 0.3189427872549954, "grad_norm": 1.0653225183486938, "learning_rate": 0.00015939443540991034, "loss": 0.6154, "step": 10040 }, { "epoch": 0.3195781314527145, "grad_norm": 0.8440039157867432, "learning_rate": 0.0001592287611732175, "loss": 0.6077, "step": 10060 }, { "epoch": 0.3202134756504336, "grad_norm": 0.8706631660461426, "learning_rate": 0.00015906283613808508, "loss": 0.6143, "step": 10080 }, { "epoch": 0.3208488198481527, "grad_norm": 1.0338808298110962, "learning_rate": 0.00015889666100710659, "loss": 0.5697, "step": 10100 }, { "epoch": 0.3214841640458719, "grad_norm": 0.8499680757522583, "learning_rate": 0.00015873023648393448, "loss": 0.5968, "step": 10120 }, { "epoch": 0.322119508243591, "grad_norm": 1.0106873512268066, "learning_rate": 0.00015856356327327724, "loss": 0.5657, "step": 10140 }, { "epoch": 0.3227548524413101, "grad_norm": 0.9771645665168762, "learning_rate": 0.00015839664208089634, "loss": 0.5989, "step": 10160 }, { "epoch": 0.3233901966390292, "grad_norm": 0.9425153136253357, "learning_rate": 0.0001582294736136035, "loss": 0.6314, "step": 10180 }, { "epoch": 0.3240255408367483, "grad_norm": 1.1419885158538818, "learning_rate": 0.0001580620585792572, "loss": 0.6137, "step": 10200 }, { "epoch": 0.32466088503446744, "grad_norm": 0.8356417417526245, "learning_rate": 0.00015789439768676032, "loss": 0.6189, "step": 10220 }, { "epoch": 0.32529622923218654, "grad_norm": 0.9876666069030762, "learning_rate": 0.00015772649164605648, "loss": 0.6069, "step": 10240 }, { "epoch": 0.32593157342990564, "grad_norm": 1.0510075092315674, "learning_rate": 0.0001575583411681276, "loss": 0.5996, "step": 10260 }, { "epoch": 0.32656691762762474, "grad_norm": 0.91109299659729, "learning_rate": 0.00015738994696499055, "loss": 0.5996, "step": 10280 }, { "epoch": 0.3272022618253439, "grad_norm": 0.8995181322097778, "learning_rate": 0.00015722130974969421, "loss": 0.5798, "step": 10300 }, { "epoch": 0.327837606023063, "grad_norm": 1.1067475080490112, "learning_rate": 0.00015705243023631652, "loss": 0.5983, "step": 10320 }, { "epoch": 0.3284729502207821, "grad_norm": 1.0324633121490479, "learning_rate": 0.00015688330913996135, "loss": 0.6011, "step": 10340 }, { "epoch": 0.3291082944185012, "grad_norm": 1.0662481784820557, "learning_rate": 0.0001567139471767556, "loss": 0.6254, "step": 10360 }, { "epoch": 0.32974363861622036, "grad_norm": 0.9539555907249451, "learning_rate": 0.00015654434506384607, "loss": 0.6176, "step": 10380 }, { "epoch": 0.33037898281393946, "grad_norm": 0.7341588139533997, "learning_rate": 0.00015637450351939637, "loss": 0.5852, "step": 10400 }, { "epoch": 0.33101432701165856, "grad_norm": 0.9077139496803284, "learning_rate": 0.00015620442326258414, "loss": 0.609, "step": 10420 }, { "epoch": 0.33164967120937766, "grad_norm": 1.083999752998352, "learning_rate": 0.00015603410501359766, "loss": 0.5768, "step": 10440 }, { "epoch": 0.3322850154070968, "grad_norm": 0.9190422296524048, "learning_rate": 0.000155863549493633, "loss": 0.5845, "step": 10460 }, { "epoch": 0.3329203596048159, "grad_norm": 1.0731889009475708, "learning_rate": 0.000155692757424891, "loss": 0.5988, "step": 10480 }, { "epoch": 0.333555703802535, "grad_norm": 0.9898316264152527, "learning_rate": 0.00015552172953057407, "loss": 0.5918, "step": 10500 }, { "epoch": 0.3341910480002541, "grad_norm": 1.135695219039917, "learning_rate": 0.00015535046653488322, "loss": 0.5882, "step": 10520 }, { "epoch": 0.3348263921979733, "grad_norm": 1.0453022718429565, "learning_rate": 0.000155178969163015, "loss": 0.609, "step": 10540 }, { "epoch": 0.3354617363956924, "grad_norm": 0.9859703183174133, "learning_rate": 0.00015500723814115835, "loss": 0.5899, "step": 10560 }, { "epoch": 0.3360970805934115, "grad_norm": 1.031168818473816, "learning_rate": 0.00015483527419649163, "loss": 0.5987, "step": 10580 }, { "epoch": 0.3367324247911306, "grad_norm": 1.1591908931732178, "learning_rate": 0.00015466307805717951, "loss": 0.6191, "step": 10600 }, { "epoch": 0.33736776898884974, "grad_norm": 0.8246921896934509, "learning_rate": 0.00015449065045236977, "loss": 0.6098, "step": 10620 }, { "epoch": 0.33800311318656884, "grad_norm": 0.8392571210861206, "learning_rate": 0.0001543179921121904, "loss": 0.5675, "step": 10640 }, { "epoch": 0.33863845738428794, "grad_norm": 0.8678343892097473, "learning_rate": 0.00015414510376774633, "loss": 0.5721, "step": 10660 }, { "epoch": 0.33927380158200704, "grad_norm": 0.8436061143875122, "learning_rate": 0.00015397198615111653, "loss": 0.5703, "step": 10680 }, { "epoch": 0.33990914577972614, "grad_norm": 0.9926438927650452, "learning_rate": 0.00015379863999535074, "loss": 0.6049, "step": 10700 }, { "epoch": 0.3405444899774453, "grad_norm": 1.098764419555664, "learning_rate": 0.00015362506603446637, "loss": 0.6007, "step": 10720 }, { "epoch": 0.3411798341751644, "grad_norm": 1.052038311958313, "learning_rate": 0.00015345126500344554, "loss": 0.5865, "step": 10740 }, { "epoch": 0.3418151783728835, "grad_norm": 0.8772541880607605, "learning_rate": 0.00015327723763823188, "loss": 0.6066, "step": 10760 }, { "epoch": 0.3424505225706026, "grad_norm": 0.7938296794891357, "learning_rate": 0.00015310298467572733, "loss": 0.5467, "step": 10780 }, { "epoch": 0.34308586676832176, "grad_norm": 1.0938440561294556, "learning_rate": 0.00015292850685378915, "loss": 0.5916, "step": 10800 }, { "epoch": 0.34372121096604086, "grad_norm": 0.8460657000541687, "learning_rate": 0.00015275380491122672, "loss": 0.603, "step": 10820 }, { "epoch": 0.34435655516375996, "grad_norm": 0.8238389492034912, "learning_rate": 0.00015257887958779854, "loss": 0.5808, "step": 10840 }, { "epoch": 0.34499189936147906, "grad_norm": 0.8064368367195129, "learning_rate": 0.0001524037316242088, "loss": 0.5862, "step": 10860 }, { "epoch": 0.3456272435591982, "grad_norm": 1.2068203687667847, "learning_rate": 0.00015222836176210467, "loss": 0.5694, "step": 10880 }, { "epoch": 0.3462625877569173, "grad_norm": 0.9752914309501648, "learning_rate": 0.00015205277074407266, "loss": 0.5367, "step": 10900 }, { "epoch": 0.3468979319546364, "grad_norm": 0.9989959597587585, "learning_rate": 0.00015187695931363602, "loss": 0.5712, "step": 10920 }, { "epoch": 0.3475332761523555, "grad_norm": 0.8734492659568787, "learning_rate": 0.00015170092821525114, "loss": 0.6029, "step": 10940 }, { "epoch": 0.3481686203500747, "grad_norm": 0.8759735822677612, "learning_rate": 0.00015152467819430458, "loss": 0.5676, "step": 10960 }, { "epoch": 0.3488039645477938, "grad_norm": 0.8554444909095764, "learning_rate": 0.00015134820999711, "loss": 0.5664, "step": 10980 }, { "epoch": 0.3494393087455129, "grad_norm": 0.730451762676239, "learning_rate": 0.00015117152437090482, "loss": 0.5735, "step": 11000 }, { "epoch": 0.3494393087455129, "eval_loss": 0.5449489951133728, "eval_runtime": 44.9152, "eval_samples_per_second": 60.18, "eval_steps_per_second": 30.101, "step": 11000 }, { "epoch": 0.350074652943232, "grad_norm": 0.7964712381362915, "learning_rate": 0.00015099462206384718, "loss": 0.5943, "step": 11020 }, { "epoch": 0.35070999714095114, "grad_norm": 0.809177577495575, "learning_rate": 0.00015081750382501277, "loss": 0.5986, "step": 11040 }, { "epoch": 0.35134534133867024, "grad_norm": 0.9207815527915955, "learning_rate": 0.00015064017040439148, "loss": 0.559, "step": 11060 }, { "epoch": 0.35198068553638934, "grad_norm": 0.9813947677612305, "learning_rate": 0.0001504626225528845, "loss": 0.5529, "step": 11080 }, { "epoch": 0.35261602973410844, "grad_norm": 0.9409967660903931, "learning_rate": 0.00015028486102230105, "loss": 0.5725, "step": 11100 }, { "epoch": 0.3532513739318276, "grad_norm": 0.9317089319229126, "learning_rate": 0.000150106886565355, "loss": 0.5568, "step": 11120 }, { "epoch": 0.3538867181295467, "grad_norm": 1.025341510772705, "learning_rate": 0.00014992869993566194, "loss": 0.5555, "step": 11140 }, { "epoch": 0.3545220623272658, "grad_norm": 1.0014809370040894, "learning_rate": 0.00014975030188773585, "loss": 0.5922, "step": 11160 }, { "epoch": 0.3551574065249849, "grad_norm": 0.9769735336303711, "learning_rate": 0.00014957169317698593, "loss": 0.583, "step": 11180 }, { "epoch": 0.355792750722704, "grad_norm": 0.8555041551589966, "learning_rate": 0.0001493928745597134, "loss": 0.5609, "step": 11200 }, { "epoch": 0.35642809492042316, "grad_norm": 0.9463367462158203, "learning_rate": 0.0001492138467931084, "loss": 0.5783, "step": 11220 }, { "epoch": 0.35706343911814226, "grad_norm": 0.9429970979690552, "learning_rate": 0.00014903461063524661, "loss": 0.5934, "step": 11240 }, { "epoch": 0.35769878331586136, "grad_norm": 1.4683854579925537, "learning_rate": 0.00014885516684508612, "loss": 0.5939, "step": 11260 }, { "epoch": 0.35833412751358046, "grad_norm": 0.825720489025116, "learning_rate": 0.00014867551618246428, "loss": 0.5685, "step": 11280 }, { "epoch": 0.3589694717112996, "grad_norm": 1.001832127571106, "learning_rate": 0.00014849565940809432, "loss": 0.5837, "step": 11300 }, { "epoch": 0.3596048159090187, "grad_norm": 0.9406988024711609, "learning_rate": 0.00014831559728356234, "loss": 0.5864, "step": 11320 }, { "epoch": 0.3602401601067378, "grad_norm": 0.7483388185501099, "learning_rate": 0.00014813533057132393, "loss": 0.5991, "step": 11340 }, { "epoch": 0.3608755043044569, "grad_norm": 0.8849460482597351, "learning_rate": 0.00014795486003470093, "loss": 0.5821, "step": 11360 }, { "epoch": 0.3615108485021761, "grad_norm": 0.7930045127868652, "learning_rate": 0.00014777418643787836, "loss": 0.5395, "step": 11380 }, { "epoch": 0.3621461926998952, "grad_norm": 0.9285226464271545, "learning_rate": 0.000147593310545901, "loss": 0.5713, "step": 11400 }, { "epoch": 0.3627815368976143, "grad_norm": 1.0233609676361084, "learning_rate": 0.00014741223312467026, "loss": 0.5875, "step": 11420 }, { "epoch": 0.3634168810953334, "grad_norm": 1.033948302268982, "learning_rate": 0.00014723095494094092, "loss": 0.5993, "step": 11440 }, { "epoch": 0.36405222529305253, "grad_norm": 0.9479451179504395, "learning_rate": 0.00014704947676231784, "loss": 0.571, "step": 11460 }, { "epoch": 0.36468756949077163, "grad_norm": 0.7781844735145569, "learning_rate": 0.0001468677993572528, "loss": 0.5503, "step": 11480 }, { "epoch": 0.36532291368849074, "grad_norm": 0.9249241352081299, "learning_rate": 0.00014668592349504101, "loss": 0.574, "step": 11500 }, { "epoch": 0.36595825788620984, "grad_norm": 0.9108446836471558, "learning_rate": 0.00014650384994581824, "loss": 0.557, "step": 11520 }, { "epoch": 0.366593602083929, "grad_norm": 1.0099608898162842, "learning_rate": 0.0001463215794805573, "loss": 0.5605, "step": 11540 }, { "epoch": 0.3672289462816481, "grad_norm": 0.8376953601837158, "learning_rate": 0.00014613911287106467, "loss": 0.538, "step": 11560 }, { "epoch": 0.3678642904793672, "grad_norm": 0.8893873691558838, "learning_rate": 0.00014595645088997757, "loss": 0.5606, "step": 11580 }, { "epoch": 0.3684996346770863, "grad_norm": 1.1310006380081177, "learning_rate": 0.00014577359431076046, "loss": 0.5612, "step": 11600 }, { "epoch": 0.36913497887480545, "grad_norm": 0.8577033281326294, "learning_rate": 0.00014559054390770167, "loss": 0.5688, "step": 11620 }, { "epoch": 0.36977032307252455, "grad_norm": 0.9386855959892273, "learning_rate": 0.00014540730045591044, "loss": 0.5614, "step": 11640 }, { "epoch": 0.37040566727024365, "grad_norm": 0.9492216110229492, "learning_rate": 0.00014522386473131332, "loss": 0.5878, "step": 11660 }, { "epoch": 0.37104101146796276, "grad_norm": 0.853327751159668, "learning_rate": 0.00014504023751065115, "loss": 0.5568, "step": 11680 }, { "epoch": 0.37167635566568186, "grad_norm": 0.7977784872055054, "learning_rate": 0.00014485641957147553, "loss": 0.5428, "step": 11700 }, { "epoch": 0.372311699863401, "grad_norm": 1.1006829738616943, "learning_rate": 0.00014467241169214567, "loss": 0.559, "step": 11720 }, { "epoch": 0.3729470440611201, "grad_norm": 1.08724045753479, "learning_rate": 0.0001444882146518251, "loss": 0.5642, "step": 11740 }, { "epoch": 0.3735823882588392, "grad_norm": 1.0295459032058716, "learning_rate": 0.00014430382923047831, "loss": 0.5969, "step": 11760 }, { "epoch": 0.3742177324565583, "grad_norm": 1.1096023321151733, "learning_rate": 0.00014411925620886742, "loss": 0.5678, "step": 11780 }, { "epoch": 0.3748530766542775, "grad_norm": 0.9315259456634521, "learning_rate": 0.000143934496368549, "loss": 0.5728, "step": 11800 }, { "epoch": 0.3754884208519966, "grad_norm": 0.9581449031829834, "learning_rate": 0.00014374955049187066, "loss": 0.5485, "step": 11820 }, { "epoch": 0.3761237650497157, "grad_norm": 1.472161054611206, "learning_rate": 0.00014356441936196776, "loss": 0.5931, "step": 11840 }, { "epoch": 0.3767591092474348, "grad_norm": 1.0234733819961548, "learning_rate": 0.00014337910376276011, "loss": 0.5635, "step": 11860 }, { "epoch": 0.37739445344515393, "grad_norm": 0.9299212694168091, "learning_rate": 0.00014319360447894862, "loss": 0.5802, "step": 11880 }, { "epoch": 0.37802979764287303, "grad_norm": 0.853388786315918, "learning_rate": 0.00014300792229601198, "loss": 0.5645, "step": 11900 }, { "epoch": 0.37866514184059213, "grad_norm": 0.9909472465515137, "learning_rate": 0.0001428220580002034, "loss": 0.5451, "step": 11920 }, { "epoch": 0.37930048603831124, "grad_norm": 0.8121063113212585, "learning_rate": 0.00014263601237854716, "loss": 0.5514, "step": 11940 }, { "epoch": 0.3799358302360304, "grad_norm": 0.9053930044174194, "learning_rate": 0.00014244978621883543, "loss": 0.5371, "step": 11960 }, { "epoch": 0.3805711744337495, "grad_norm": 1.0551111698150635, "learning_rate": 0.00014226338030962475, "loss": 0.5862, "step": 11980 }, { "epoch": 0.3812065186314686, "grad_norm": 0.8897386193275452, "learning_rate": 0.0001420767954402329, "loss": 0.5439, "step": 12000 }, { "epoch": 0.3812065186314686, "eval_loss": 0.5259391665458679, "eval_runtime": 45.0289, "eval_samples_per_second": 60.028, "eval_steps_per_second": 30.025, "step": 12000 }, { "epoch": 0.3818418628291877, "grad_norm": 0.8436812162399292, "learning_rate": 0.00014189003240073535, "loss": 0.5684, "step": 12020 }, { "epoch": 0.38247720702690685, "grad_norm": 1.2769359350204468, "learning_rate": 0.0001417030919819621, "loss": 0.5483, "step": 12040 }, { "epoch": 0.38311255122462595, "grad_norm": 0.8915470838546753, "learning_rate": 0.0001415159749754942, "loss": 0.5674, "step": 12060 }, { "epoch": 0.38374789542234505, "grad_norm": 1.1026362180709839, "learning_rate": 0.00014132868217366044, "loss": 0.5868, "step": 12080 }, { "epoch": 0.38438323962006415, "grad_norm": 0.92413729429245, "learning_rate": 0.00014114121436953402, "loss": 0.5602, "step": 12100 }, { "epoch": 0.3850185838177833, "grad_norm": 0.8880215287208557, "learning_rate": 0.0001409535723569291, "loss": 0.563, "step": 12120 }, { "epoch": 0.3856539280155024, "grad_norm": 0.7865646481513977, "learning_rate": 0.00014076575693039767, "loss": 0.5731, "step": 12140 }, { "epoch": 0.3862892722132215, "grad_norm": 0.8817760348320007, "learning_rate": 0.00014057776888522583, "loss": 0.5205, "step": 12160 }, { "epoch": 0.3869246164109406, "grad_norm": 0.7473212480545044, "learning_rate": 0.0001403896090174307, "loss": 0.5494, "step": 12180 }, { "epoch": 0.3875599606086597, "grad_norm": 0.9429736137390137, "learning_rate": 0.0001402012781237571, "loss": 0.551, "step": 12200 }, { "epoch": 0.38819530480637887, "grad_norm": 0.9144492149353027, "learning_rate": 0.00014001277700167382, "loss": 0.529, "step": 12220 }, { "epoch": 0.388830649004098, "grad_norm": 0.8465405702590942, "learning_rate": 0.00013982410644937057, "loss": 0.566, "step": 12240 }, { "epoch": 0.3894659932018171, "grad_norm": 0.8520842790603638, "learning_rate": 0.00013963526726575446, "loss": 0.61, "step": 12260 }, { "epoch": 0.3901013373995362, "grad_norm": 0.8384197354316711, "learning_rate": 0.00013944626025044673, "loss": 0.563, "step": 12280 }, { "epoch": 0.39073668159725533, "grad_norm": 0.9083155989646912, "learning_rate": 0.00013925708620377927, "loss": 0.5433, "step": 12300 }, { "epoch": 0.39137202579497443, "grad_norm": 1.0582692623138428, "learning_rate": 0.00013906774592679116, "loss": 0.5368, "step": 12320 }, { "epoch": 0.39200736999269353, "grad_norm": 0.8538171648979187, "learning_rate": 0.00013887824022122537, "loss": 0.5217, "step": 12340 }, { "epoch": 0.39264271419041263, "grad_norm": 0.8264597058296204, "learning_rate": 0.00013868856988952556, "loss": 0.5564, "step": 12360 }, { "epoch": 0.3932780583881318, "grad_norm": 0.8192921280860901, "learning_rate": 0.00013849873573483222, "loss": 0.6058, "step": 12380 }, { "epoch": 0.3939134025858509, "grad_norm": 0.8523415923118591, "learning_rate": 0.00013830873856097964, "loss": 0.5565, "step": 12400 }, { "epoch": 0.39454874678357, "grad_norm": 1.0821831226348877, "learning_rate": 0.00013811857917249253, "loss": 0.5617, "step": 12420 }, { "epoch": 0.3951840909812891, "grad_norm": 0.8053098917007446, "learning_rate": 0.00013792825837458225, "loss": 0.579, "step": 12440 }, { "epoch": 0.39581943517900825, "grad_norm": 0.9511120319366455, "learning_rate": 0.00013773777697314378, "loss": 0.5417, "step": 12460 }, { "epoch": 0.39645477937672735, "grad_norm": 1.0273131132125854, "learning_rate": 0.00013754713577475213, "loss": 0.582, "step": 12480 }, { "epoch": 0.39709012357444645, "grad_norm": 1.0347099304199219, "learning_rate": 0.00013735633558665893, "loss": 0.5679, "step": 12500 }, { "epoch": 0.39772546777216555, "grad_norm": 1.0762611627578735, "learning_rate": 0.00013716537721678907, "loss": 0.5483, "step": 12520 }, { "epoch": 0.3983608119698847, "grad_norm": 1.4243688583374023, "learning_rate": 0.00013697426147373721, "loss": 0.5558, "step": 12540 }, { "epoch": 0.3989961561676038, "grad_norm": 0.7539466023445129, "learning_rate": 0.00013678298916676445, "loss": 0.5404, "step": 12560 }, { "epoch": 0.3996315003653229, "grad_norm": 0.7736854553222656, "learning_rate": 0.00013659156110579476, "loss": 0.5578, "step": 12580 }, { "epoch": 0.400266844563042, "grad_norm": 0.9489171504974365, "learning_rate": 0.0001363999781014117, "loss": 0.5668, "step": 12600 }, { "epoch": 0.40090218876076117, "grad_norm": 0.9692643880844116, "learning_rate": 0.00013621783146979094, "loss": 0.5663, "step": 12620 }, { "epoch": 0.40153753295848027, "grad_norm": 1.0705336332321167, "learning_rate": 0.00013602594865967435, "loss": 0.5293, "step": 12640 }, { "epoch": 0.40217287715619937, "grad_norm": 1.0149205923080444, "learning_rate": 0.00013583391330117533, "loss": 0.5348, "step": 12660 }, { "epoch": 0.40280822135391847, "grad_norm": 0.9088581204414368, "learning_rate": 0.00013564172620744906, "loss": 0.5677, "step": 12680 }, { "epoch": 0.4034435655516376, "grad_norm": 1.1513986587524414, "learning_rate": 0.00013544938819229306, "loss": 0.569, "step": 12700 }, { "epoch": 0.40407890974935673, "grad_norm": 0.8725998401641846, "learning_rate": 0.00013525690007014406, "loss": 0.5692, "step": 12720 }, { "epoch": 0.40471425394707583, "grad_norm": 1.0663046836853027, "learning_rate": 0.00013506426265607425, "loss": 0.567, "step": 12740 }, { "epoch": 0.40534959814479493, "grad_norm": 0.9139559864997864, "learning_rate": 0.00013487147676578812, "loss": 0.5465, "step": 12760 }, { "epoch": 0.40598494234251403, "grad_norm": 1.3140777349472046, "learning_rate": 0.00013467854321561878, "loss": 0.5407, "step": 12780 }, { "epoch": 0.4066202865402332, "grad_norm": 0.8671903610229492, "learning_rate": 0.00013448546282252458, "loss": 0.5303, "step": 12800 }, { "epoch": 0.4072556307379523, "grad_norm": 0.692545473575592, "learning_rate": 0.00013429223640408578, "loss": 0.5333, "step": 12820 }, { "epoch": 0.4078909749356714, "grad_norm": 1.1087654829025269, "learning_rate": 0.00013409886477850087, "loss": 0.5493, "step": 12840 }, { "epoch": 0.4085263191333905, "grad_norm": 0.9659181833267212, "learning_rate": 0.00013390534876458319, "loss": 0.5902, "step": 12860 }, { "epoch": 0.40916166333110965, "grad_norm": 0.7794270515441895, "learning_rate": 0.00013371168918175754, "loss": 0.5647, "step": 12880 }, { "epoch": 0.40979700752882875, "grad_norm": 0.910505473613739, "learning_rate": 0.00013351788685005662, "loss": 0.5752, "step": 12900 }, { "epoch": 0.41043235172654785, "grad_norm": 0.9549837112426758, "learning_rate": 0.00013332394259011758, "loss": 0.5424, "step": 12920 }, { "epoch": 0.41106769592426695, "grad_norm": 1.2679826021194458, "learning_rate": 0.00013312985722317862, "loss": 0.5285, "step": 12940 }, { "epoch": 0.4117030401219861, "grad_norm": 0.8822807669639587, "learning_rate": 0.0001329356315710753, "loss": 0.5662, "step": 12960 }, { "epoch": 0.4123383843197052, "grad_norm": 0.8247064352035522, "learning_rate": 0.0001327412664562373, "loss": 0.5338, "step": 12980 }, { "epoch": 0.4129737285174243, "grad_norm": 0.8655696511268616, "learning_rate": 0.0001325467627016849, "loss": 0.5563, "step": 13000 }, { "epoch": 0.4129737285174243, "eval_loss": 0.5103311538696289, "eval_runtime": 44.4811, "eval_samples_per_second": 60.767, "eval_steps_per_second": 30.395, "step": 13000 }, { "epoch": 0.4136090727151434, "grad_norm": 1.1745620965957642, "learning_rate": 0.00013235212113102532, "loss": 0.5432, "step": 13020 }, { "epoch": 0.41424441691286257, "grad_norm": 1.375957727432251, "learning_rate": 0.0001321573425684494, "loss": 0.5518, "step": 13040 }, { "epoch": 0.41487976111058167, "grad_norm": 1.2425376176834106, "learning_rate": 0.00013196242783872805, "loss": 0.5667, "step": 13060 }, { "epoch": 0.41551510530830077, "grad_norm": 0.9375765919685364, "learning_rate": 0.00013176737776720876, "loss": 0.5629, "step": 13080 }, { "epoch": 0.41615044950601987, "grad_norm": 0.9392895698547363, "learning_rate": 0.00013157219317981217, "loss": 0.5577, "step": 13100 }, { "epoch": 0.416785793703739, "grad_norm": 0.9028527140617371, "learning_rate": 0.00013137687490302844, "loss": 0.5358, "step": 13120 }, { "epoch": 0.41742113790145813, "grad_norm": 0.9373983144760132, "learning_rate": 0.00013118142376391381, "loss": 0.5517, "step": 13140 }, { "epoch": 0.41805648209917723, "grad_norm": 1.3339825868606567, "learning_rate": 0.00013098584059008725, "loss": 0.5512, "step": 13160 }, { "epoch": 0.41869182629689633, "grad_norm": 0.7137243747711182, "learning_rate": 0.00013079012620972663, "loss": 0.5464, "step": 13180 }, { "epoch": 0.41932717049461543, "grad_norm": 1.1450612545013428, "learning_rate": 0.00013059428145156555, "loss": 0.564, "step": 13200 }, { "epoch": 0.4199625146923346, "grad_norm": 1.2148438692092896, "learning_rate": 0.00013039830714488965, "loss": 0.5555, "step": 13220 }, { "epoch": 0.4205978588900537, "grad_norm": 1.277346134185791, "learning_rate": 0.00013020220411953304, "loss": 0.5898, "step": 13240 }, { "epoch": 0.4212332030877728, "grad_norm": 1.0933984518051147, "learning_rate": 0.00013000597320587492, "loss": 0.553, "step": 13260 }, { "epoch": 0.4218685472854919, "grad_norm": 0.7297493815422058, "learning_rate": 0.00012980961523483616, "loss": 0.5626, "step": 13280 }, { "epoch": 0.42250389148321105, "grad_norm": 0.8859849572181702, "learning_rate": 0.00012961313103787548, "loss": 0.5455, "step": 13300 }, { "epoch": 0.42313923568093015, "grad_norm": 0.9647216200828552, "learning_rate": 0.00012941652144698608, "loss": 0.5157, "step": 13320 }, { "epoch": 0.42377457987864925, "grad_norm": 0.9097155332565308, "learning_rate": 0.00012921978729469222, "loss": 0.542, "step": 13340 }, { "epoch": 0.42440992407636835, "grad_norm": 1.0074721574783325, "learning_rate": 0.0001290229294140456, "loss": 0.5319, "step": 13360 }, { "epoch": 0.4250452682740875, "grad_norm": 0.7759230732917786, "learning_rate": 0.0001288259486386218, "loss": 0.4939, "step": 13380 }, { "epoch": 0.4256806124718066, "grad_norm": 0.8912795782089233, "learning_rate": 0.00012862884580251675, "loss": 0.5276, "step": 13400 }, { "epoch": 0.4263159566695257, "grad_norm": 1.090395450592041, "learning_rate": 0.00012843162174034332, "loss": 0.5227, "step": 13420 }, { "epoch": 0.4269513008672448, "grad_norm": 0.8524248003959656, "learning_rate": 0.00012823427728722762, "loss": 0.5438, "step": 13440 }, { "epoch": 0.42758664506496397, "grad_norm": 1.209073543548584, "learning_rate": 0.0001280368132788056, "loss": 0.5495, "step": 13460 }, { "epoch": 0.42822198926268307, "grad_norm": 0.9301733374595642, "learning_rate": 0.00012783923055121945, "loss": 0.5411, "step": 13480 }, { "epoch": 0.42885733346040217, "grad_norm": 0.916028618812561, "learning_rate": 0.000127641529941114, "loss": 0.5674, "step": 13500 }, { "epoch": 0.42949267765812127, "grad_norm": 0.9181066751480103, "learning_rate": 0.00012744371228563334, "loss": 0.5522, "step": 13520 }, { "epoch": 0.4301280218558404, "grad_norm": 1.2208302021026611, "learning_rate": 0.0001272457784224171, "loss": 0.5428, "step": 13540 }, { "epoch": 0.4307633660535595, "grad_norm": 0.8382121920585632, "learning_rate": 0.00012704772918959706, "loss": 0.5347, "step": 13560 }, { "epoch": 0.4313987102512786, "grad_norm": 0.7942314147949219, "learning_rate": 0.0001268495654257934, "loss": 0.5455, "step": 13580 }, { "epoch": 0.43203405444899773, "grad_norm": 1.0586442947387695, "learning_rate": 0.00012665128797011138, "loss": 0.5588, "step": 13600 }, { "epoch": 0.4326693986467169, "grad_norm": 0.9026583433151245, "learning_rate": 0.00012645289766213764, "loss": 0.5448, "step": 13620 }, { "epoch": 0.433304742844436, "grad_norm": 1.107459545135498, "learning_rate": 0.0001262643231052632, "loss": 0.5226, "step": 13640 }, { "epoch": 0.4339400870421551, "grad_norm": 0.7181698679924011, "learning_rate": 0.00012606571515198816, "loss": 0.5587, "step": 13660 }, { "epoch": 0.4345754312398742, "grad_norm": 0.850642740726471, "learning_rate": 0.0001258669968259726, "loss": 0.5514, "step": 13680 }, { "epoch": 0.4352107754375933, "grad_norm": 0.9803110957145691, "learning_rate": 0.00012567811294990802, "loss": 0.5612, "step": 13700 }, { "epoch": 0.43584611963531245, "grad_norm": 0.8320556282997131, "learning_rate": 0.00012547918181770158, "loss": 0.5464, "step": 13720 }, { "epoch": 0.43648146383303155, "grad_norm": 0.9645776152610779, "learning_rate": 0.0001252801427963731, "loss": 0.5394, "step": 13740 }, { "epoch": 0.43711680803075065, "grad_norm": 0.981066107749939, "learning_rate": 0.00012508099672873401, "loss": 0.5518, "step": 13760 }, { "epoch": 0.43775215222846975, "grad_norm": 0.950231671333313, "learning_rate": 0.00012488174445804905, "loss": 0.5628, "step": 13780 }, { "epoch": 0.4383874964261889, "grad_norm": 0.7942489981651306, "learning_rate": 0.00012468238682803256, "loss": 0.5682, "step": 13800 }, { "epoch": 0.439022840623908, "grad_norm": 0.9598709940910339, "learning_rate": 0.0001244829246828451, "loss": 0.5398, "step": 13820 }, { "epoch": 0.4396581848216271, "grad_norm": 0.9328323602676392, "learning_rate": 0.0001242833588670898, "loss": 0.5465, "step": 13840 }, { "epoch": 0.4402935290193462, "grad_norm": 0.9036662578582764, "learning_rate": 0.00012408369022580865, "loss": 0.5307, "step": 13860 }, { "epoch": 0.44092887321706536, "grad_norm": 1.1593483686447144, "learning_rate": 0.0001238839196044792, "loss": 0.5838, "step": 13880 }, { "epoch": 0.44156421741478447, "grad_norm": 0.9283963441848755, "learning_rate": 0.0001236840478490107, "loss": 0.5112, "step": 13900 }, { "epoch": 0.44219956161250357, "grad_norm": 1.1374804973602295, "learning_rate": 0.00012348407580574068, "loss": 0.5616, "step": 13920 }, { "epoch": 0.44283490581022267, "grad_norm": 0.8757379055023193, "learning_rate": 0.00012328400432143143, "loss": 0.5409, "step": 13940 }, { "epoch": 0.4434702500079418, "grad_norm": 0.9971847534179688, "learning_rate": 0.00012308383424326617, "loss": 0.5573, "step": 13960 }, { "epoch": 0.4441055942056609, "grad_norm": 0.8985651135444641, "learning_rate": 0.00012288356641884567, "loss": 0.5602, "step": 13980 }, { "epoch": 0.44474093840338, "grad_norm": 0.8877219557762146, "learning_rate": 0.0001226832016961846, "loss": 0.5418, "step": 14000 }, { "epoch": 0.44474093840338, "eval_loss": 0.49767744541168213, "eval_runtime": 45.8378, "eval_samples_per_second": 58.969, "eval_steps_per_second": 29.495, "step": 14000 }, { "epoch": 0.4453762826010991, "grad_norm": 0.9760685563087463, "learning_rate": 0.00012248274092370795, "loss": 0.5386, "step": 14020 }, { "epoch": 0.4460116267988183, "grad_norm": 0.9159601330757141, "learning_rate": 0.00012228218495024734, "loss": 0.5658, "step": 14040 }, { "epoch": 0.4466469709965374, "grad_norm": 0.9726976752281189, "learning_rate": 0.00012208153462503764, "loss": 0.5619, "step": 14060 }, { "epoch": 0.4472823151942565, "grad_norm": 0.8647946715354919, "learning_rate": 0.00012188079079771311, "loss": 0.5312, "step": 14080 }, { "epoch": 0.4479176593919756, "grad_norm": 0.8291323781013489, "learning_rate": 0.00012167995431830404, "loss": 0.5555, "step": 14100 }, { "epoch": 0.44855300358969474, "grad_norm": 1.1393893957138062, "learning_rate": 0.00012147902603723302, "loss": 0.5368, "step": 14120 }, { "epoch": 0.44918834778741384, "grad_norm": 0.9214714169502258, "learning_rate": 0.00012127800680531129, "loss": 0.5312, "step": 14140 }, { "epoch": 0.44982369198513295, "grad_norm": 0.7314972877502441, "learning_rate": 0.00012107689747373533, "loss": 0.5306, "step": 14160 }, { "epoch": 0.45045903618285205, "grad_norm": 0.9739118218421936, "learning_rate": 0.00012087569889408308, "loss": 0.5474, "step": 14180 }, { "epoch": 0.45109438038057115, "grad_norm": 1.1331558227539062, "learning_rate": 0.00012067441191831035, "loss": 0.5251, "step": 14200 }, { "epoch": 0.4517297245782903, "grad_norm": 0.9672099947929382, "learning_rate": 0.00012047303739874733, "loss": 0.5638, "step": 14220 }, { "epoch": 0.4523650687760094, "grad_norm": 0.9430161118507385, "learning_rate": 0.00012027157618809488, "loss": 0.5473, "step": 14240 }, { "epoch": 0.4530004129737285, "grad_norm": 0.9385126233100891, "learning_rate": 0.00012007002913942092, "loss": 0.5305, "step": 14260 }, { "epoch": 0.4536357571714476, "grad_norm": 1.2930362224578857, "learning_rate": 0.00011986839710615689, "loss": 0.5264, "step": 14280 }, { "epoch": 0.45427110136916676, "grad_norm": 1.098981499671936, "learning_rate": 0.00011966668094209401, "loss": 0.5945, "step": 14300 }, { "epoch": 0.45490644556688586, "grad_norm": 1.016724944114685, "learning_rate": 0.00011946488150137987, "loss": 0.5423, "step": 14320 }, { "epoch": 0.45554178976460497, "grad_norm": 1.3441358804702759, "learning_rate": 0.00011926299963851455, "loss": 0.5311, "step": 14340 }, { "epoch": 0.45617713396232407, "grad_norm": 0.8672164678573608, "learning_rate": 0.00011906103620834721, "loss": 0.5377, "step": 14360 }, { "epoch": 0.4568124781600432, "grad_norm": 0.8844342231750488, "learning_rate": 0.00011885899206607243, "loss": 0.5539, "step": 14380 }, { "epoch": 0.4574478223577623, "grad_norm": 1.0755807161331177, "learning_rate": 0.00011865686806722647, "loss": 0.5489, "step": 14400 }, { "epoch": 0.4580831665554814, "grad_norm": 0.8909132480621338, "learning_rate": 0.00011845466506768379, "loss": 0.5492, "step": 14420 }, { "epoch": 0.4587185107532005, "grad_norm": 0.7222205996513367, "learning_rate": 0.00011826249982356501, "loss": 0.5452, "step": 14440 }, { "epoch": 0.4593538549509197, "grad_norm": 0.8589527606964111, "learning_rate": 0.00011806014523563623, "loss": 0.5553, "step": 14460 }, { "epoch": 0.4599891991486388, "grad_norm": 0.8546582460403442, "learning_rate": 0.00011785771417377567, "loss": 0.518, "step": 14480 }, { "epoch": 0.4606245433463579, "grad_norm": 0.7938315272331238, "learning_rate": 0.00011765520749515795, "loss": 0.5732, "step": 14500 }, { "epoch": 0.461259887544077, "grad_norm": 1.030897617340088, "learning_rate": 0.000117452626057278, "loss": 0.5293, "step": 14520 }, { "epoch": 0.46189523174179614, "grad_norm": 0.9275230765342712, "learning_rate": 0.00011724997071794722, "loss": 0.5453, "step": 14540 }, { "epoch": 0.46253057593951524, "grad_norm": 0.8049765825271606, "learning_rate": 0.00011704724233528997, "loss": 0.5237, "step": 14560 }, { "epoch": 0.46316592013723434, "grad_norm": 0.9411914348602295, "learning_rate": 0.00011684444176773994, "loss": 0.5529, "step": 14580 }, { "epoch": 0.46380126433495344, "grad_norm": 1.0553874969482422, "learning_rate": 0.0001166415698740364, "loss": 0.5107, "step": 14600 }, { "epoch": 0.4644366085326726, "grad_norm": 1.1203105449676514, "learning_rate": 0.00011643862751322072, "loss": 0.5503, "step": 14620 }, { "epoch": 0.4650719527303917, "grad_norm": 0.9356998801231384, "learning_rate": 0.00011623561554463263, "loss": 0.5388, "step": 14640 }, { "epoch": 0.4657072969281108, "grad_norm": 1.0603325366973877, "learning_rate": 0.00011603253482790657, "loss": 0.5379, "step": 14660 }, { "epoch": 0.4663426411258299, "grad_norm": 0.7650070786476135, "learning_rate": 0.00011582938622296818, "loss": 0.5175, "step": 14680 }, { "epoch": 0.466977985323549, "grad_norm": 1.1926647424697876, "learning_rate": 0.00011562617059003044, "loss": 0.5558, "step": 14700 }, { "epoch": 0.46761332952126816, "grad_norm": 0.9466400742530823, "learning_rate": 0.00011542288878959025, "loss": 0.5288, "step": 14720 }, { "epoch": 0.46824867371898726, "grad_norm": 1.036163091659546, "learning_rate": 0.0001152195416824247, "loss": 0.5322, "step": 14740 }, { "epoch": 0.46888401791670636, "grad_norm": 0.8458572626113892, "learning_rate": 0.00011501613012958729, "loss": 0.5358, "step": 14760 }, { "epoch": 0.46951936211442546, "grad_norm": 0.789557695388794, "learning_rate": 0.00011481265499240455, "loss": 0.5067, "step": 14780 }, { "epoch": 0.4701547063121446, "grad_norm": 0.845371425151825, "learning_rate": 0.00011460911713247222, "loss": 0.5433, "step": 14800 }, { "epoch": 0.4707900505098637, "grad_norm": 0.8561549782752991, "learning_rate": 0.00011440551741165156, "loss": 0.5362, "step": 14820 }, { "epoch": 0.4714253947075828, "grad_norm": 0.921575665473938, "learning_rate": 0.00011420185669206582, "loss": 0.5093, "step": 14840 }, { "epoch": 0.4720607389053019, "grad_norm": 0.9392147660255432, "learning_rate": 0.0001139981358360966, "loss": 0.5419, "step": 14860 }, { "epoch": 0.4726960831030211, "grad_norm": 0.859464168548584, "learning_rate": 0.00011379435570638002, "loss": 0.5329, "step": 14880 }, { "epoch": 0.4733314273007402, "grad_norm": 0.9370890259742737, "learning_rate": 0.00011359051716580331, "loss": 0.516, "step": 14900 }, { "epoch": 0.4739667714984593, "grad_norm": 0.8993077278137207, "learning_rate": 0.00011338662107750098, "loss": 0.4785, "step": 14920 }, { "epoch": 0.4746021156961784, "grad_norm": 0.7652683854103088, "learning_rate": 0.00011318266830485119, "loss": 0.5348, "step": 14940 }, { "epoch": 0.47523745989389754, "grad_norm": 1.0513384342193604, "learning_rate": 0.00011297865971147217, "loss": 0.5181, "step": 14960 }, { "epoch": 0.47587280409161664, "grad_norm": 0.8159809112548828, "learning_rate": 0.00011277459616121851, "loss": 0.5368, "step": 14980 }, { "epoch": 0.47650814828933574, "grad_norm": 1.0844529867172241, "learning_rate": 0.00011257047851817748, "loss": 0.5497, "step": 15000 }, { "epoch": 0.47650814828933574, "eval_loss": 0.4893677234649658, "eval_runtime": 45.7511, "eval_samples_per_second": 59.081, "eval_steps_per_second": 29.551, "step": 15000 }, { "epoch": 0.47714349248705484, "grad_norm": 0.7700105309486389, "learning_rate": 0.0001123663076466655, "loss": 0.5354, "step": 15020 }, { "epoch": 0.477778836684774, "grad_norm": 0.872631847858429, "learning_rate": 0.0001121620844112242, "loss": 0.5243, "step": 15040 }, { "epoch": 0.4784141808824931, "grad_norm": 1.1037932634353638, "learning_rate": 0.0001119578096766171, "loss": 0.5412, "step": 15060 }, { "epoch": 0.4790495250802122, "grad_norm": 0.9620169997215271, "learning_rate": 0.00011175348430782579, "loss": 0.5137, "step": 15080 }, { "epoch": 0.4796848692779313, "grad_norm": 0.7465859055519104, "learning_rate": 0.0001115491091700461, "loss": 0.5213, "step": 15100 }, { "epoch": 0.48032021347565046, "grad_norm": 0.7287941575050354, "learning_rate": 0.00011134468512868479, "loss": 0.5184, "step": 15120 }, { "epoch": 0.48095555767336956, "grad_norm": 0.9596436023712158, "learning_rate": 0.00011114021304935558, "loss": 0.5471, "step": 15140 }, { "epoch": 0.48159090187108866, "grad_norm": 0.869172215461731, "learning_rate": 0.00011093569379787563, "loss": 0.5074, "step": 15160 }, { "epoch": 0.48222624606880776, "grad_norm": 1.0704097747802734, "learning_rate": 0.00011073112824026191, "loss": 0.544, "step": 15180 }, { "epoch": 0.48286159026652686, "grad_norm": 0.896312415599823, "learning_rate": 0.00011052651724272736, "loss": 0.5261, "step": 15200 }, { "epoch": 0.483496934464246, "grad_norm": 1.010606288909912, "learning_rate": 0.00011032186167167741, "loss": 0.5112, "step": 15220 }, { "epoch": 0.4841322786619651, "grad_norm": 0.980171263217926, "learning_rate": 0.00011011716239370625, "loss": 0.5414, "step": 15240 }, { "epoch": 0.4847676228596842, "grad_norm": 0.7417489290237427, "learning_rate": 0.00010991242027559301, "loss": 0.5019, "step": 15260 }, { "epoch": 0.4854029670574033, "grad_norm": 0.9232955574989319, "learning_rate": 0.0001097076361842984, "loss": 0.5293, "step": 15280 }, { "epoch": 0.4860383112551225, "grad_norm": 0.8391673564910889, "learning_rate": 0.00010950281098696072, "loss": 0.5397, "step": 15300 }, { "epoch": 0.4866736554528416, "grad_norm": 1.0795869827270508, "learning_rate": 0.00010929794555089239, "loss": 0.5293, "step": 15320 }, { "epoch": 0.4873089996505607, "grad_norm": 0.9179370403289795, "learning_rate": 0.00010909304074357627, "loss": 0.5089, "step": 15340 }, { "epoch": 0.4879443438482798, "grad_norm": 0.9346722960472107, "learning_rate": 0.0001088880974326618, "loss": 0.4981, "step": 15360 }, { "epoch": 0.48857968804599894, "grad_norm": 0.9835326075553894, "learning_rate": 0.00010868311648596157, "loss": 0.52, "step": 15380 }, { "epoch": 0.48921503224371804, "grad_norm": 0.8709509968757629, "learning_rate": 0.0001084780987714475, "loss": 0.5507, "step": 15400 }, { "epoch": 0.48985037644143714, "grad_norm": 1.0125563144683838, "learning_rate": 0.00010827304515724719, "loss": 0.5522, "step": 15420 }, { "epoch": 0.49048572063915624, "grad_norm": 0.9726683497428894, "learning_rate": 0.00010806795651164026, "loss": 0.5195, "step": 15440 }, { "epoch": 0.4911210648368754, "grad_norm": 0.9348143935203552, "learning_rate": 0.0001078628337030547, "loss": 0.5376, "step": 15460 }, { "epoch": 0.4917564090345945, "grad_norm": 1.247452735900879, "learning_rate": 0.00010765767760006308, "loss": 0.5238, "step": 15480 }, { "epoch": 0.4923917532323136, "grad_norm": 1.2584036588668823, "learning_rate": 0.00010745248907137906, "loss": 0.539, "step": 15500 }, { "epoch": 0.4930270974300327, "grad_norm": 0.9565659165382385, "learning_rate": 0.00010724726898585353, "loss": 0.546, "step": 15520 }, { "epoch": 0.49366244162775186, "grad_norm": 0.9646620750427246, "learning_rate": 0.000107042018212471, "loss": 0.5094, "step": 15540 }, { "epoch": 0.49429778582547096, "grad_norm": 0.7045026421546936, "learning_rate": 0.00010683673762034594, "loss": 0.5708, "step": 15560 }, { "epoch": 0.49493313002319006, "grad_norm": 1.1588184833526611, "learning_rate": 0.00010663142807871911, "loss": 0.5681, "step": 15580 }, { "epoch": 0.49556847422090916, "grad_norm": 0.8272905349731445, "learning_rate": 0.00010642609045695382, "loss": 0.5239, "step": 15600 }, { "epoch": 0.4962038184186283, "grad_norm": 0.9670738577842712, "learning_rate": 0.00010622072562453234, "loss": 0.486, "step": 15620 }, { "epoch": 0.4968391626163474, "grad_norm": 0.8635004162788391, "learning_rate": 0.00010601533445105205, "loss": 0.5419, "step": 15640 }, { "epoch": 0.4974745068140665, "grad_norm": 1.0769212245941162, "learning_rate": 0.00010580991780622196, "loss": 0.5252, "step": 15660 }, { "epoch": 0.4981098510117856, "grad_norm": 0.9688665270805359, "learning_rate": 0.00010560447655985894, "loss": 0.5559, "step": 15680 }, { "epoch": 0.4987451952095048, "grad_norm": 0.9587375521659851, "learning_rate": 0.00010539901158188398, "loss": 0.5136, "step": 15700 }, { "epoch": 0.4993805394072239, "grad_norm": 0.870891273021698, "learning_rate": 0.0001051935237423186, "loss": 0.5274, "step": 15720 }, { "epoch": 0.500015883604943, "grad_norm": 1.1741816997528076, "learning_rate": 0.00010498801391128108, "loss": 0.5274, "step": 15740 }, { "epoch": 0.5006512278026621, "grad_norm": 1.074429988861084, "learning_rate": 0.00010478248295898285, "loss": 0.5049, "step": 15760 }, { "epoch": 0.5012865720003812, "grad_norm": 0.7894431352615356, "learning_rate": 0.00010457693175572483, "loss": 0.5141, "step": 15780 }, { "epoch": 0.5019219161981003, "grad_norm": 0.8638029098510742, "learning_rate": 0.00010437136117189356, "loss": 0.5053, "step": 15800 }, { "epoch": 0.5025572603958194, "grad_norm": 0.9749894142150879, "learning_rate": 0.00010416577207795776, "loss": 0.5319, "step": 15820 }, { "epoch": 0.5031926045935385, "grad_norm": 0.9491709470748901, "learning_rate": 0.00010396016534446451, "loss": 0.4968, "step": 15840 }, { "epoch": 0.5038279487912577, "grad_norm": 0.880732536315918, "learning_rate": 0.00010375454184203555, "loss": 0.5292, "step": 15860 }, { "epoch": 0.5044632929889767, "grad_norm": 1.22807776927948, "learning_rate": 0.00010354890244136361, "loss": 0.5228, "step": 15880 }, { "epoch": 0.5050986371866959, "grad_norm": 0.8567366003990173, "learning_rate": 0.00010334324801320881, "loss": 0.558, "step": 15900 }, { "epoch": 0.505733981384415, "grad_norm": 0.8203198909759521, "learning_rate": 0.00010313757942839482, "loss": 0.5061, "step": 15920 }, { "epoch": 0.5063693255821341, "grad_norm": 0.9894897937774658, "learning_rate": 0.00010293189755780535, "loss": 0.5322, "step": 15940 }, { "epoch": 0.5070046697798533, "grad_norm": 1.0645695924758911, "learning_rate": 0.0001027262032723803, "loss": 0.536, "step": 15960 }, { "epoch": 0.5076400139775723, "grad_norm": 0.9940254092216492, "learning_rate": 0.0001025204974431121, "loss": 0.5211, "step": 15980 }, { "epoch": 0.5082753581752915, "grad_norm": 0.7856065630912781, "learning_rate": 0.00010231478094104216, "loss": 0.5137, "step": 16000 }, { "epoch": 0.5082753581752915, "eval_loss": 0.48191481828689575, "eval_runtime": 44.2211, "eval_samples_per_second": 61.125, "eval_steps_per_second": 30.574, "step": 16000 }, { "epoch": 0.5089107023730106, "grad_norm": 0.9363443851470947, "learning_rate": 0.00010210905463725703, "loss": 0.5426, "step": 16020 }, { "epoch": 0.5095460465707297, "grad_norm": 0.8720065355300903, "learning_rate": 0.0001019033194028848, "loss": 0.525, "step": 16040 }, { "epoch": 0.5101813907684488, "grad_norm": 0.9192999005317688, "learning_rate": 0.00010169757610909131, "loss": 0.5265, "step": 16060 }, { "epoch": 0.510816734966168, "grad_norm": 1.089529037475586, "learning_rate": 0.00010149182562707657, "loss": 0.5148, "step": 16080 }, { "epoch": 0.511452079163887, "grad_norm": 0.8161883354187012, "learning_rate": 0.00010128606882807106, "loss": 0.5441, "step": 16100 }, { "epoch": 0.5120874233616062, "grad_norm": 0.8635348081588745, "learning_rate": 0.00010108030658333192, "loss": 0.4981, "step": 16120 }, { "epoch": 0.5127227675593252, "grad_norm": 0.9366866946220398, "learning_rate": 0.00010087453976413943, "loss": 0.5155, "step": 16140 }, { "epoch": 0.5133581117570444, "grad_norm": 0.8161008954048157, "learning_rate": 0.00010066876924179321, "loss": 0.5178, "step": 16160 }, { "epoch": 0.5139934559547635, "grad_norm": 1.2926280498504639, "learning_rate": 0.00010046299588760855, "loss": 0.5409, "step": 16180 }, { "epoch": 0.5146288001524826, "grad_norm": 0.9963902235031128, "learning_rate": 0.00010025722057291273, "loss": 0.514, "step": 16200 }, { "epoch": 0.5152641443502017, "grad_norm": 0.7572094202041626, "learning_rate": 0.0001000514441690414, "loss": 0.5142, "step": 16220 }, { "epoch": 0.5158994885479208, "grad_norm": 0.7842695713043213, "learning_rate": 9.984566754733471e-05, "loss": 0.5419, "step": 16240 }, { "epoch": 0.5165348327456399, "grad_norm": 0.8259790539741516, "learning_rate": 9.96398915791338e-05, "loss": 0.5053, "step": 16260 }, { "epoch": 0.5171701769433591, "grad_norm": 0.7848758697509766, "learning_rate": 9.943411713577707e-05, "loss": 0.5129, "step": 16280 }, { "epoch": 0.5178055211410781, "grad_norm": 0.9001737236976624, "learning_rate": 9.922834508859636e-05, "loss": 0.5095, "step": 16300 }, { "epoch": 0.5184408653387973, "grad_norm": 1.2547895908355713, "learning_rate": 9.90225763089135e-05, "loss": 0.5402, "step": 16320 }, { "epoch": 0.5190762095365165, "grad_norm": 1.0412747859954834, "learning_rate": 9.881681166803634e-05, "loss": 0.5039, "step": 16340 }, { "epoch": 0.5197115537342355, "grad_norm": 0.8408613204956055, "learning_rate": 9.861105203725533e-05, "loss": 0.5256, "step": 16360 }, { "epoch": 0.5203468979319547, "grad_norm": 0.7325016856193542, "learning_rate": 9.840529828783965e-05, "loss": 0.5055, "step": 16380 }, { "epoch": 0.5209822421296737, "grad_norm": 1.3417218923568726, "learning_rate": 9.819955129103355e-05, "loss": 0.5336, "step": 16400 }, { "epoch": 0.5216175863273929, "grad_norm": 0.8016658425331116, "learning_rate": 9.799381191805272e-05, "loss": 0.5285, "step": 16420 }, { "epoch": 0.522252930525112, "grad_norm": 0.7678484916687012, "learning_rate": 9.778808104008059e-05, "loss": 0.5243, "step": 16440 }, { "epoch": 0.5228882747228311, "grad_norm": 1.0348572731018066, "learning_rate": 9.760293123314227e-05, "loss": 0.5305, "step": 16460 }, { "epoch": 0.5235236189205502, "grad_norm": 0.891635537147522, "learning_rate": 9.739721889566509e-05, "loss": 0.5258, "step": 16480 }, { "epoch": 0.5241589631182694, "grad_norm": 0.9525818824768066, "learning_rate": 9.719151757941184e-05, "loss": 0.5405, "step": 16500 }, { "epoch": 0.5247943073159884, "grad_norm": 0.8067079186439514, "learning_rate": 9.698582815540476e-05, "loss": 0.5058, "step": 16520 }, { "epoch": 0.5254296515137076, "grad_norm": 0.8525674939155579, "learning_rate": 9.678015149461577e-05, "loss": 0.5429, "step": 16540 }, { "epoch": 0.5260649957114266, "grad_norm": 0.9794461727142334, "learning_rate": 9.65744884679627e-05, "loss": 0.5106, "step": 16560 }, { "epoch": 0.5267003399091458, "grad_norm": 0.8107161521911621, "learning_rate": 9.636883994630567e-05, "loss": 0.5124, "step": 16580 }, { "epoch": 0.5273356841068649, "grad_norm": 0.8728024959564209, "learning_rate": 9.61632068004434e-05, "loss": 0.5483, "step": 16600 }, { "epoch": 0.527971028304584, "grad_norm": 1.0132850408554077, "learning_rate": 9.595758990110948e-05, "loss": 0.55, "step": 16620 }, { "epoch": 0.5286063725023031, "grad_norm": 1.0854065418243408, "learning_rate": 9.575199011896869e-05, "loss": 0.5022, "step": 16640 }, { "epoch": 0.5292417167000223, "grad_norm": 1.06479012966156, "learning_rate": 9.555668697368233e-05, "loss": 0.4932, "step": 16660 }, { "epoch": 0.5298770608977413, "grad_norm": 1.1619220972061157, "learning_rate": 9.535112307403999e-05, "loss": 0.5377, "step": 16680 }, { "epoch": 0.5305124050954605, "grad_norm": 1.1277661323547363, "learning_rate": 9.514557885961573e-05, "loss": 0.5267, "step": 16700 }, { "epoch": 0.5311477492931795, "grad_norm": 1.0196537971496582, "learning_rate": 9.494005520076655e-05, "loss": 0.5203, "step": 16720 }, { "epoch": 0.5317830934908987, "grad_norm": 0.9534218907356262, "learning_rate": 9.473455296776239e-05, "loss": 0.5177, "step": 16740 }, { "epoch": 0.5324184376886179, "grad_norm": 0.9330717325210571, "learning_rate": 9.45290730307826e-05, "loss": 0.55, "step": 16760 }, { "epoch": 0.5330537818863369, "grad_norm": 0.9290218949317932, "learning_rate": 9.43236162599119e-05, "loss": 0.5301, "step": 16780 }, { "epoch": 0.533689126084056, "grad_norm": 0.9842971563339233, "learning_rate": 9.411818352513715e-05, "loss": 0.4928, "step": 16800 }, { "epoch": 0.5343244702817751, "grad_norm": 0.9267326593399048, "learning_rate": 9.391277569634329e-05, "loss": 0.5443, "step": 16820 }, { "epoch": 0.5349598144794943, "grad_norm": 0.9270855784416199, "learning_rate": 9.370739364330982e-05, "loss": 0.5132, "step": 16840 }, { "epoch": 0.5355951586772134, "grad_norm": 0.9786942601203918, "learning_rate": 9.35020382357071e-05, "loss": 0.5229, "step": 16860 }, { "epoch": 0.5362305028749325, "grad_norm": 0.8397322297096252, "learning_rate": 9.329671034309269e-05, "loss": 0.5248, "step": 16880 }, { "epoch": 0.5368658470726516, "grad_norm": 0.9696868062019348, "learning_rate": 9.30914108349076e-05, "loss": 0.5635, "step": 16900 }, { "epoch": 0.5375011912703708, "grad_norm": 1.1376127004623413, "learning_rate": 9.28861405804727e-05, "loss": 0.548, "step": 16920 }, { "epoch": 0.5381365354680898, "grad_norm": 0.9028751254081726, "learning_rate": 9.268090044898489e-05, "loss": 0.5253, "step": 16940 }, { "epoch": 0.538771879665809, "grad_norm": 0.7549586296081543, "learning_rate": 9.247569130951365e-05, "loss": 0.5119, "step": 16960 }, { "epoch": 0.539407223863528, "grad_norm": 1.002920150756836, "learning_rate": 9.227051403099715e-05, "loss": 0.5383, "step": 16980 }, { "epoch": 0.5400425680612472, "grad_norm": 0.7857794761657715, "learning_rate": 9.206536948223862e-05, "loss": 0.4943, "step": 17000 }, { "epoch": 0.5400425680612472, "eval_loss": 0.47516322135925293, "eval_runtime": 44.9681, "eval_samples_per_second": 60.109, "eval_steps_per_second": 30.066, "step": 17000 }, { "epoch": 0.5406779122589663, "grad_norm": 0.8384699821472168, "learning_rate": 9.186025853190276e-05, "loss": 0.5005, "step": 17020 }, { "epoch": 0.5413132564566854, "grad_norm": 0.859467089176178, "learning_rate": 9.1655182048512e-05, "loss": 0.486, "step": 17040 }, { "epoch": 0.5419486006544045, "grad_norm": 0.9178836345672607, "learning_rate": 9.145014090044276e-05, "loss": 0.4866, "step": 17060 }, { "epoch": 0.5425839448521237, "grad_norm": 1.5116227865219116, "learning_rate": 9.12451359559219e-05, "loss": 0.5103, "step": 17080 }, { "epoch": 0.5432192890498427, "grad_norm": 0.8251123428344727, "learning_rate": 9.104016808302297e-05, "loss": 0.5403, "step": 17100 }, { "epoch": 0.5438546332475619, "grad_norm": 0.8845348358154297, "learning_rate": 9.08352381496625e-05, "loss": 0.5295, "step": 17120 }, { "epoch": 0.5444899774452809, "grad_norm": 0.8761606812477112, "learning_rate": 9.063034702359643e-05, "loss": 0.5175, "step": 17140 }, { "epoch": 0.5451253216430001, "grad_norm": 0.8992062211036682, "learning_rate": 9.042549557241629e-05, "loss": 0.5211, "step": 17160 }, { "epoch": 0.5457606658407193, "grad_norm": 1.0609464645385742, "learning_rate": 9.022068466354573e-05, "loss": 0.5231, "step": 17180 }, { "epoch": 0.5463960100384383, "grad_norm": 1.1660939455032349, "learning_rate": 9.001591516423664e-05, "loss": 0.5097, "step": 17200 }, { "epoch": 0.5470313542361575, "grad_norm": 0.8982824683189392, "learning_rate": 8.981118794156556e-05, "loss": 0.499, "step": 17220 }, { "epoch": 0.5476666984338765, "grad_norm": 0.9423658847808838, "learning_rate": 8.960650386243009e-05, "loss": 0.5023, "step": 17240 }, { "epoch": 0.5483020426315957, "grad_norm": 0.781741738319397, "learning_rate": 8.940186379354505e-05, "loss": 0.5098, "step": 17260 }, { "epoch": 0.5489373868293148, "grad_norm": 0.9678505063056946, "learning_rate": 8.919726860143895e-05, "loss": 0.5005, "step": 17280 }, { "epoch": 0.5495727310270339, "grad_norm": 0.9400302171707153, "learning_rate": 8.899271915245028e-05, "loss": 0.537, "step": 17300 }, { "epoch": 0.550208075224753, "grad_norm": 0.8072425127029419, "learning_rate": 8.878821631272384e-05, "loss": 0.5073, "step": 17320 }, { "epoch": 0.5508434194224722, "grad_norm": 0.9000498652458191, "learning_rate": 8.858376094820701e-05, "loss": 0.5014, "step": 17340 }, { "epoch": 0.5514787636201912, "grad_norm": 0.9222893118858337, "learning_rate": 8.837935392464621e-05, "loss": 0.5216, "step": 17360 }, { "epoch": 0.5521141078179104, "grad_norm": 0.8468360304832458, "learning_rate": 8.817499610758316e-05, "loss": 0.5282, "step": 17380 }, { "epoch": 0.5527494520156294, "grad_norm": 0.7120311260223389, "learning_rate": 8.797068836235116e-05, "loss": 0.5277, "step": 17400 }, { "epoch": 0.5533847962133486, "grad_norm": 0.880155622959137, "learning_rate": 8.776643155407154e-05, "loss": 0.523, "step": 17420 }, { "epoch": 0.5540201404110677, "grad_norm": 1.023587703704834, "learning_rate": 8.756222654764996e-05, "loss": 0.508, "step": 17440 }, { "epoch": 0.5546554846087868, "grad_norm": 0.8903362154960632, "learning_rate": 8.735807420777262e-05, "loss": 0.5165, "step": 17460 }, { "epoch": 0.5552908288065059, "grad_norm": 0.7317694425582886, "learning_rate": 8.715397539890287e-05, "loss": 0.4672, "step": 17480 }, { "epoch": 0.5559261730042251, "grad_norm": 1.0228464603424072, "learning_rate": 8.694993098527723e-05, "loss": 0.5112, "step": 17500 }, { "epoch": 0.5565615172019441, "grad_norm": 0.7797629237174988, "learning_rate": 8.674594183090199e-05, "loss": 0.477, "step": 17520 }, { "epoch": 0.5571968613996633, "grad_norm": 0.8488342761993408, "learning_rate": 8.654200879954945e-05, "loss": 0.4993, "step": 17540 }, { "epoch": 0.5578322055973823, "grad_norm": 0.8529194593429565, "learning_rate": 8.63381327547542e-05, "loss": 0.5293, "step": 17560 }, { "epoch": 0.5584675497951015, "grad_norm": 0.9537157416343689, "learning_rate": 8.613431455980955e-05, "loss": 0.5047, "step": 17580 }, { "epoch": 0.5591028939928206, "grad_norm": 0.8697558045387268, "learning_rate": 8.593055507776393e-05, "loss": 0.5293, "step": 17600 }, { "epoch": 0.5597382381905397, "grad_norm": 0.8306463360786438, "learning_rate": 8.5726855171417e-05, "loss": 0.5075, "step": 17620 }, { "epoch": 0.5603735823882589, "grad_norm": 0.8880159258842468, "learning_rate": 8.55232157033163e-05, "loss": 0.5149, "step": 17640 }, { "epoch": 0.561008926585978, "grad_norm": 0.9390746355056763, "learning_rate": 8.531963753575334e-05, "loss": 0.5196, "step": 17660 }, { "epoch": 0.561644270783697, "grad_norm": 0.968285322189331, "learning_rate": 8.511612153076015e-05, "loss": 0.5229, "step": 17680 }, { "epoch": 0.5622796149814162, "grad_norm": 0.9114767909049988, "learning_rate": 8.491266855010548e-05, "loss": 0.5008, "step": 17700 }, { "epoch": 0.5629149591791353, "grad_norm": 0.9089644551277161, "learning_rate": 8.470927945529123e-05, "loss": 0.4848, "step": 17720 }, { "epoch": 0.5635503033768544, "grad_norm": 0.7264979481697083, "learning_rate": 8.450595510754877e-05, "loss": 0.5155, "step": 17740 }, { "epoch": 0.5641856475745736, "grad_norm": 0.9070448875427246, "learning_rate": 8.430269636783534e-05, "loss": 0.524, "step": 17760 }, { "epoch": 0.5648209917722926, "grad_norm": 0.9725968241691589, "learning_rate": 8.40995040968303e-05, "loss": 0.4925, "step": 17780 }, { "epoch": 0.5654563359700118, "grad_norm": 0.8976007103919983, "learning_rate": 8.389637915493162e-05, "loss": 0.4937, "step": 17800 }, { "epoch": 0.5660916801677308, "grad_norm": 0.9926420450210571, "learning_rate": 8.369332240225214e-05, "loss": 0.5181, "step": 17820 }, { "epoch": 0.56672702436545, "grad_norm": 0.852676272392273, "learning_rate": 8.349033469861598e-05, "loss": 0.5175, "step": 17840 }, { "epoch": 0.5673623685631691, "grad_norm": 0.8739320635795593, "learning_rate": 8.328741690355487e-05, "loss": 0.4805, "step": 17860 }, { "epoch": 0.5679977127608882, "grad_norm": 0.9660511016845703, "learning_rate": 8.308456987630449e-05, "loss": 0.5063, "step": 17880 }, { "epoch": 0.5686330569586073, "grad_norm": 0.9321526288986206, "learning_rate": 8.288179447580088e-05, "loss": 0.4994, "step": 17900 }, { "epoch": 0.5692684011563265, "grad_norm": 1.0359587669372559, "learning_rate": 8.267909156067685e-05, "loss": 0.5279, "step": 17920 }, { "epoch": 0.5699037453540455, "grad_norm": 0.9722701907157898, "learning_rate": 8.247646198925813e-05, "loss": 0.5061, "step": 17940 }, { "epoch": 0.5705390895517647, "grad_norm": 0.854860782623291, "learning_rate": 8.227390661956006e-05, "loss": 0.4827, "step": 17960 }, { "epoch": 0.5711744337494837, "grad_norm": 0.8997724652290344, "learning_rate": 8.207142630928362e-05, "loss": 0.4978, "step": 17980 }, { "epoch": 0.5718097779472029, "grad_norm": 0.9234896898269653, "learning_rate": 8.186902191581205e-05, "loss": 0.4982, "step": 18000 }, { "epoch": 0.5718097779472029, "eval_loss": 0.469827800989151, "eval_runtime": 44.8258, "eval_samples_per_second": 60.3, "eval_steps_per_second": 30.161, "step": 18000 }, { "epoch": 0.572445122144922, "grad_norm": 0.8457797169685364, "learning_rate": 8.166669429620712e-05, "loss": 0.5263, "step": 18020 }, { "epoch": 0.5730804663426411, "grad_norm": 0.8909218907356262, "learning_rate": 8.146444430720545e-05, "loss": 0.5045, "step": 18040 }, { "epoch": 0.5737158105403602, "grad_norm": 0.950072705745697, "learning_rate": 8.126227280521503e-05, "loss": 0.5247, "step": 18060 }, { "epoch": 0.5743511547380794, "grad_norm": 0.9507225751876831, "learning_rate": 8.106018064631148e-05, "loss": 0.4851, "step": 18080 }, { "epoch": 0.5749864989357985, "grad_norm": 1.0232789516448975, "learning_rate": 8.085816868623436e-05, "loss": 0.5457, "step": 18100 }, { "epoch": 0.5756218431335176, "grad_norm": 1.0967813730239868, "learning_rate": 8.065623778038377e-05, "loss": 0.52, "step": 18120 }, { "epoch": 0.5762571873312367, "grad_norm": 0.7866876125335693, "learning_rate": 8.045438878381649e-05, "loss": 0.5117, "step": 18140 }, { "epoch": 0.5768925315289558, "grad_norm": 0.9325518012046814, "learning_rate": 8.025262255124248e-05, "loss": 0.5415, "step": 18160 }, { "epoch": 0.577527875726675, "grad_norm": 0.8899424076080322, "learning_rate": 8.005093993702133e-05, "loss": 0.4947, "step": 18180 }, { "epoch": 0.578163219924394, "grad_norm": 1.0050842761993408, "learning_rate": 7.984934179515843e-05, "loss": 0.4863, "step": 18200 }, { "epoch": 0.5787985641221132, "grad_norm": 0.836564302444458, "learning_rate": 7.964782897930158e-05, "loss": 0.5055, "step": 18220 }, { "epoch": 0.5794339083198322, "grad_norm": 1.032029628753662, "learning_rate": 7.944640234273724e-05, "loss": 0.4919, "step": 18240 }, { "epoch": 0.5800692525175514, "grad_norm": 0.854015588760376, "learning_rate": 7.92450627383869e-05, "loss": 0.5108, "step": 18260 }, { "epoch": 0.5807045967152705, "grad_norm": 1.0629216432571411, "learning_rate": 7.904381101880364e-05, "loss": 0.5312, "step": 18280 }, { "epoch": 0.5813399409129896, "grad_norm": 0.8146398067474365, "learning_rate": 7.884264803616827e-05, "loss": 0.5203, "step": 18300 }, { "epoch": 0.5819752851107087, "grad_norm": 1.1307437419891357, "learning_rate": 7.864157464228593e-05, "loss": 0.5325, "step": 18320 }, { "epoch": 0.5826106293084279, "grad_norm": 0.9609930515289307, "learning_rate": 7.844059168858241e-05, "loss": 0.5034, "step": 18340 }, { "epoch": 0.5832459735061469, "grad_norm": 0.8615232110023499, "learning_rate": 7.823970002610048e-05, "loss": 0.522, "step": 18360 }, { "epoch": 0.5838813177038661, "grad_norm": 1.014160394668579, "learning_rate": 7.803890050549641e-05, "loss": 0.5104, "step": 18380 }, { "epoch": 0.5845166619015851, "grad_norm": 1.015424370765686, "learning_rate": 7.78381939770363e-05, "loss": 0.4887, "step": 18400 }, { "epoch": 0.5851520060993043, "grad_norm": 1.0072382688522339, "learning_rate": 7.763758129059243e-05, "loss": 0.5242, "step": 18420 }, { "epoch": 0.5857873502970234, "grad_norm": 1.122096300125122, "learning_rate": 7.743706329563971e-05, "loss": 0.5408, "step": 18440 }, { "epoch": 0.5864226944947425, "grad_norm": 0.8347269296646118, "learning_rate": 7.723664084125218e-05, "loss": 0.5112, "step": 18460 }, { "epoch": 0.5870580386924616, "grad_norm": 0.9214980006217957, "learning_rate": 7.703631477609926e-05, "loss": 0.5111, "step": 18480 }, { "epoch": 0.5876933828901808, "grad_norm": 0.8427157402038574, "learning_rate": 7.683608594844218e-05, "loss": 0.5199, "step": 18500 }, { "epoch": 0.5883287270878998, "grad_norm": 0.8485844731330872, "learning_rate": 7.663595520613054e-05, "loss": 0.5193, "step": 18520 }, { "epoch": 0.588964071285619, "grad_norm": 0.8761444687843323, "learning_rate": 7.643592339659848e-05, "loss": 0.5044, "step": 18540 }, { "epoch": 0.589599415483338, "grad_norm": 0.9373889565467834, "learning_rate": 7.623599136686133e-05, "loss": 0.493, "step": 18560 }, { "epoch": 0.5902347596810572, "grad_norm": 0.9052358269691467, "learning_rate": 7.603615996351184e-05, "loss": 0.516, "step": 18580 }, { "epoch": 0.5908701038787764, "grad_norm": 0.7757846117019653, "learning_rate": 7.583643003271668e-05, "loss": 0.5043, "step": 18600 }, { "epoch": 0.5915054480764954, "grad_norm": 0.7769386172294617, "learning_rate": 7.563680242021285e-05, "loss": 0.5005, "step": 18620 }, { "epoch": 0.5921407922742146, "grad_norm": 0.7892422080039978, "learning_rate": 7.543727797130413e-05, "loss": 0.4982, "step": 18640 }, { "epoch": 0.5927761364719337, "grad_norm": 1.0471646785736084, "learning_rate": 7.524782606964114e-05, "loss": 0.5139, "step": 18660 }, { "epoch": 0.5934114806696528, "grad_norm": 0.7995429039001465, "learning_rate": 7.504850521939017e-05, "loss": 0.4736, "step": 18680 }, { "epoch": 0.5940468248673719, "grad_norm": 0.9799679517745972, "learning_rate": 7.484929002382169e-05, "loss": 0.5033, "step": 18700 }, { "epoch": 0.594682169065091, "grad_norm": 0.8607106804847717, "learning_rate": 7.465018132649311e-05, "loss": 0.498, "step": 18720 }, { "epoch": 0.5953175132628101, "grad_norm": 0.9690695405006409, "learning_rate": 7.445117997051085e-05, "loss": 0.4898, "step": 18740 }, { "epoch": 0.5959528574605293, "grad_norm": 1.331871747970581, "learning_rate": 7.425228679852684e-05, "loss": 0.5044, "step": 18760 }, { "epoch": 0.5965882016582483, "grad_norm": 0.9347879886627197, "learning_rate": 7.405350265273492e-05, "loss": 0.5088, "step": 18780 }, { "epoch": 0.5972235458559675, "grad_norm": 0.8495462536811829, "learning_rate": 7.385482837486725e-05, "loss": 0.5078, "step": 18800 }, { "epoch": 0.5978588900536865, "grad_norm": 1.318202257156372, "learning_rate": 7.365626480619081e-05, "loss": 0.5014, "step": 18820 }, { "epoch": 0.5984942342514057, "grad_norm": 1.0349724292755127, "learning_rate": 7.345781278750368e-05, "loss": 0.531, "step": 18840 }, { "epoch": 0.5991295784491248, "grad_norm": 1.047760248184204, "learning_rate": 7.326938745831322e-05, "loss": 0.4925, "step": 18860 }, { "epoch": 0.5997649226468439, "grad_norm": 0.874220073223114, "learning_rate": 7.307115537865903e-05, "loss": 0.5056, "step": 18880 }, { "epoch": 0.600400266844563, "grad_norm": 0.738158106803894, "learning_rate": 7.287303732658328e-05, "loss": 0.4938, "step": 18900 }, { "epoch": 0.6010356110422822, "grad_norm": 0.8721213936805725, "learning_rate": 7.267503414099758e-05, "loss": 0.5074, "step": 18920 }, { "epoch": 0.6016709552400012, "grad_norm": 0.7241856455802917, "learning_rate": 7.247714666032724e-05, "loss": 0.5045, "step": 18940 }, { "epoch": 0.6023062994377204, "grad_norm": 1.0385938882827759, "learning_rate": 7.227937572250761e-05, "loss": 0.5313, "step": 18960 }, { "epoch": 0.6029416436354395, "grad_norm": 1.8555858135223389, "learning_rate": 7.208172216498046e-05, "loss": 0.4989, "step": 18980 }, { "epoch": 0.6035769878331586, "grad_norm": 0.9453182816505432, "learning_rate": 7.188418682469064e-05, "loss": 0.5146, "step": 19000 }, { "epoch": 0.6035769878331586, "eval_loss": 0.46334323287010193, "eval_runtime": 44.8428, "eval_samples_per_second": 60.277, "eval_steps_per_second": 30.15, "step": 19000 }, { "epoch": 0.6042123320308778, "grad_norm": 0.9362254738807678, "learning_rate": 7.168677053808237e-05, "loss": 0.5148, "step": 19020 }, { "epoch": 0.6048476762285968, "grad_norm": 1.19162917137146, "learning_rate": 7.148947414109572e-05, "loss": 0.4954, "step": 19040 }, { "epoch": 0.605483020426316, "grad_norm": 0.9854863286018372, "learning_rate": 7.129229846916318e-05, "loss": 0.5173, "step": 19060 }, { "epoch": 0.6061183646240351, "grad_norm": 0.8435449600219727, "learning_rate": 7.109524435720597e-05, "loss": 0.5154, "step": 19080 }, { "epoch": 0.6067537088217542, "grad_norm": 0.920364260673523, "learning_rate": 7.08983126396306e-05, "loss": 0.5092, "step": 19100 }, { "epoch": 0.6073890530194733, "grad_norm": 1.2439565658569336, "learning_rate": 7.070150415032527e-05, "loss": 0.511, "step": 19120 }, { "epoch": 0.6080243972171924, "grad_norm": 0.7429732084274292, "learning_rate": 7.050481972265648e-05, "loss": 0.4787, "step": 19140 }, { "epoch": 0.6086597414149115, "grad_norm": 0.6966003179550171, "learning_rate": 7.03082601894653e-05, "loss": 0.5237, "step": 19160 }, { "epoch": 0.6092950856126307, "grad_norm": 0.8211964964866638, "learning_rate": 7.011182638306402e-05, "loss": 0.5349, "step": 19180 }, { "epoch": 0.6099304298103497, "grad_norm": 0.9803711771965027, "learning_rate": 6.991551913523253e-05, "loss": 0.5369, "step": 19200 }, { "epoch": 0.6105657740080689, "grad_norm": 0.9161061644554138, "learning_rate": 6.971933927721479e-05, "loss": 0.4993, "step": 19220 }, { "epoch": 0.611201118205788, "grad_norm": 0.9608227014541626, "learning_rate": 6.952328763971537e-05, "loss": 0.4837, "step": 19240 }, { "epoch": 0.6118364624035071, "grad_norm": 0.9438381195068359, "learning_rate": 6.932736505289592e-05, "loss": 0.479, "step": 19260 }, { "epoch": 0.6124718066012262, "grad_norm": 1.571315884590149, "learning_rate": 6.91315723463716e-05, "loss": 0.5417, "step": 19280 }, { "epoch": 0.6131071507989453, "grad_norm": 0.8187804818153381, "learning_rate": 6.893591034920763e-05, "loss": 0.5189, "step": 19300 }, { "epoch": 0.6137424949966644, "grad_norm": 0.7617794871330261, "learning_rate": 6.87403798899157e-05, "loss": 0.468, "step": 19320 }, { "epoch": 0.6143778391943836, "grad_norm": 0.8723959922790527, "learning_rate": 6.85449817964506e-05, "loss": 0.5044, "step": 19340 }, { "epoch": 0.6150131833921026, "grad_norm": 0.7760429382324219, "learning_rate": 6.834971689620659e-05, "loss": 0.4922, "step": 19360 }, { "epoch": 0.6156485275898218, "grad_norm": 0.925581693649292, "learning_rate": 6.815458601601392e-05, "loss": 0.5079, "step": 19380 }, { "epoch": 0.6162838717875408, "grad_norm": 0.8069369792938232, "learning_rate": 6.795958998213535e-05, "loss": 0.4995, "step": 19400 }, { "epoch": 0.61691921598526, "grad_norm": 1.3501884937286377, "learning_rate": 6.77647296202627e-05, "loss": 0.4906, "step": 19420 }, { "epoch": 0.6175545601829792, "grad_norm": 0.9078099131584167, "learning_rate": 6.75700057555132e-05, "loss": 0.4983, "step": 19440 }, { "epoch": 0.6181899043806982, "grad_norm": 0.7792625427246094, "learning_rate": 6.737541921242619e-05, "loss": 0.4869, "step": 19460 }, { "epoch": 0.6188252485784174, "grad_norm": 0.8952593803405762, "learning_rate": 6.718097081495947e-05, "loss": 0.4975, "step": 19480 }, { "epoch": 0.6194605927761365, "grad_norm": 0.9192362427711487, "learning_rate": 6.698666138648593e-05, "loss": 0.5059, "step": 19500 }, { "epoch": 0.6200959369738556, "grad_norm": 0.8911659121513367, "learning_rate": 6.679249174978997e-05, "loss": 0.5014, "step": 19520 }, { "epoch": 0.6207312811715747, "grad_norm": 0.9853730201721191, "learning_rate": 6.659846272706406e-05, "loss": 0.4935, "step": 19540 }, { "epoch": 0.6213666253692938, "grad_norm": 1.3485686779022217, "learning_rate": 6.640457513990527e-05, "loss": 0.5061, "step": 19560 }, { "epoch": 0.6220019695670129, "grad_norm": 0.8757696747779846, "learning_rate": 6.621082980931179e-05, "loss": 0.4869, "step": 19580 }, { "epoch": 0.6226373137647321, "grad_norm": 1.0088223218917847, "learning_rate": 6.601722755567937e-05, "loss": 0.5138, "step": 19600 }, { "epoch": 0.6232726579624511, "grad_norm": 0.94034343957901, "learning_rate": 6.582376919879798e-05, "loss": 0.5159, "step": 19620 }, { "epoch": 0.6239080021601703, "grad_norm": 0.834994375705719, "learning_rate": 6.563045555784826e-05, "loss": 0.4862, "step": 19640 }, { "epoch": 0.6245433463578894, "grad_norm": 1.2617956399917603, "learning_rate": 6.543728745139802e-05, "loss": 0.5112, "step": 19660 }, { "epoch": 0.6251786905556085, "grad_norm": 0.8542491793632507, "learning_rate": 6.524426569739892e-05, "loss": 0.5234, "step": 19680 }, { "epoch": 0.6258140347533276, "grad_norm": 1.162606120109558, "learning_rate": 6.505139111318277e-05, "loss": 0.4772, "step": 19700 }, { "epoch": 0.6264493789510467, "grad_norm": 1.0025289058685303, "learning_rate": 6.48586645154583e-05, "loss": 0.5212, "step": 19720 }, { "epoch": 0.6270847231487658, "grad_norm": 1.0566537380218506, "learning_rate": 6.466608672030763e-05, "loss": 0.5556, "step": 19740 }, { "epoch": 0.627720067346485, "grad_norm": 1.0380536317825317, "learning_rate": 6.447365854318266e-05, "loss": 0.4827, "step": 19760 }, { "epoch": 0.628355411544204, "grad_norm": 1.0499038696289062, "learning_rate": 6.42813807989019e-05, "loss": 0.5316, "step": 19780 }, { "epoch": 0.6289907557419232, "grad_norm": 0.7457720637321472, "learning_rate": 6.408925430164669e-05, "loss": 0.5055, "step": 19800 }, { "epoch": 0.6296260999396422, "grad_norm": 1.2990676164627075, "learning_rate": 6.389727986495813e-05, "loss": 0.5068, "step": 19820 }, { "epoch": 0.6302614441373614, "grad_norm": 0.9500844478607178, "learning_rate": 6.370545830173332e-05, "loss": 0.4889, "step": 19840 }, { "epoch": 0.6308967883350806, "grad_norm": 0.7668824195861816, "learning_rate": 6.351379042422199e-05, "loss": 0.5314, "step": 19860 }, { "epoch": 0.6315321325327996, "grad_norm": 0.9457335472106934, "learning_rate": 6.332227704402321e-05, "loss": 0.4898, "step": 19880 }, { "epoch": 0.6321674767305188, "grad_norm": 0.8252271413803101, "learning_rate": 6.31309189720818e-05, "loss": 0.5045, "step": 19900 }, { "epoch": 0.6328028209282379, "grad_norm": 0.9943385720252991, "learning_rate": 6.29397170186849e-05, "loss": 0.5243, "step": 19920 }, { "epoch": 0.633438165125957, "grad_norm": 1.1582151651382446, "learning_rate": 6.27582205051849e-05, "loss": 0.5331, "step": 19940 }, { "epoch": 0.6340735093236761, "grad_norm": 0.9436770677566528, "learning_rate": 6.256732531103176e-05, "loss": 0.4903, "step": 19960 }, { "epoch": 0.6347088535213952, "grad_norm": 0.8253883123397827, "learning_rate": 6.237658862190583e-05, "loss": 0.4934, "step": 19980 }, { "epoch": 0.6353441977191143, "grad_norm": 0.8770557641983032, "learning_rate": 6.21860112454631e-05, "loss": 0.5202, "step": 20000 }, { "epoch": 0.6353441977191143, "eval_loss": 0.45828375220298767, "eval_runtime": 44.5614, "eval_samples_per_second": 60.658, "eval_steps_per_second": 30.34, "step": 20000 }, { "epoch": 0.6359795419168335, "grad_norm": 1.2218546867370605, "learning_rate": 6.19955939886849e-05, "loss": 0.5171, "step": 20020 }, { "epoch": 0.6366148861145525, "grad_norm": 0.8330618143081665, "learning_rate": 6.180533765787468e-05, "loss": 0.4863, "step": 20040 }, { "epoch": 0.6372502303122717, "grad_norm": 1.0419652462005615, "learning_rate": 6.162474393506114e-05, "loss": 0.5427, "step": 20060 }, { "epoch": 0.6378855745099908, "grad_norm": 0.9472757577896118, "learning_rate": 6.143480372643493e-05, "loss": 0.5245, "step": 20080 }, { "epoch": 0.6385209187077099, "grad_norm": 0.7603405117988586, "learning_rate": 6.12450268183886e-05, "loss": 0.4964, "step": 20100 }, { "epoch": 0.639156262905429, "grad_norm": 0.8776742219924927, "learning_rate": 6.105541401451404e-05, "loss": 0.4966, "step": 20120 }, { "epoch": 0.6397916071031481, "grad_norm": 0.8271143436431885, "learning_rate": 6.086596611770831e-05, "loss": 0.5119, "step": 20140 }, { "epoch": 0.6404269513008672, "grad_norm": 1.1509547233581543, "learning_rate": 6.067668393017007e-05, "loss": 0.5031, "step": 20160 }, { "epoch": 0.6410622954985864, "grad_norm": 0.8693366050720215, "learning_rate": 6.048756825339643e-05, "loss": 0.4986, "step": 20180 }, { "epoch": 0.6416976396963054, "grad_norm": 0.949834942817688, "learning_rate": 6.029861988817935e-05, "loss": 0.4921, "step": 20200 }, { "epoch": 0.6423329838940246, "grad_norm": 0.9004225730895996, "learning_rate": 6.010983963460233e-05, "loss": 0.5023, "step": 20220 }, { "epoch": 0.6429683280917438, "grad_norm": 0.7829142808914185, "learning_rate": 5.9921228292037026e-05, "loss": 0.507, "step": 20240 }, { "epoch": 0.6436036722894628, "grad_norm": 1.1816707849502563, "learning_rate": 5.973278665913985e-05, "loss": 0.4926, "step": 20260 }, { "epoch": 0.644239016487182, "grad_norm": 0.881648063659668, "learning_rate": 5.9544515533848614e-05, "loss": 0.4885, "step": 20280 }, { "epoch": 0.644874360684901, "grad_norm": 0.9568135738372803, "learning_rate": 5.9356415713379145e-05, "loss": 0.515, "step": 20300 }, { "epoch": 0.6455097048826202, "grad_norm": 0.9377472400665283, "learning_rate": 5.9168487994221834e-05, "loss": 0.4886, "step": 20320 }, { "epoch": 0.6461450490803393, "grad_norm": 0.9032811522483826, "learning_rate": 5.898073317213837e-05, "loss": 0.5064, "step": 20340 }, { "epoch": 0.6467803932780584, "grad_norm": 0.9788734316825867, "learning_rate": 5.879315204215836e-05, "loss": 0.4698, "step": 20360 }, { "epoch": 0.6474157374757775, "grad_norm": 1.0353432893753052, "learning_rate": 5.860574539857584e-05, "loss": 0.5227, "step": 20380 }, { "epoch": 0.6480510816734966, "grad_norm": 0.8998845815658569, "learning_rate": 5.84185140349461e-05, "loss": 0.5132, "step": 20400 }, { "epoch": 0.6486864258712157, "grad_norm": 0.8317026495933533, "learning_rate": 5.82314587440821e-05, "loss": 0.468, "step": 20420 }, { "epoch": 0.6493217700689349, "grad_norm": 0.7740748524665833, "learning_rate": 5.80445803180514e-05, "loss": 0.5119, "step": 20440 }, { "epoch": 0.6499571142666539, "grad_norm": 1.0922515392303467, "learning_rate": 5.78578795481725e-05, "loss": 0.5284, "step": 20460 }, { "epoch": 0.6505924584643731, "grad_norm": 0.8265649676322937, "learning_rate": 5.76713572250117e-05, "loss": 0.5095, "step": 20480 }, { "epoch": 0.6512278026620922, "grad_norm": 1.0644861459732056, "learning_rate": 5.748501413837963e-05, "loss": 0.5028, "step": 20500 }, { "epoch": 0.6518631468598113, "grad_norm": 0.9139828681945801, "learning_rate": 5.729885107732808e-05, "loss": 0.4814, "step": 20520 }, { "epoch": 0.6524984910575304, "grad_norm": 0.7917624115943909, "learning_rate": 5.7112868830146416e-05, "loss": 0.4772, "step": 20540 }, { "epoch": 0.6531338352552495, "grad_norm": 0.7677121162414551, "learning_rate": 5.692706818435836e-05, "loss": 0.519, "step": 20560 }, { "epoch": 0.6537691794529686, "grad_norm": 0.8412395715713501, "learning_rate": 5.674144992671882e-05, "loss": 0.501, "step": 20580 }, { "epoch": 0.6544045236506878, "grad_norm": 1.014061689376831, "learning_rate": 5.655601484321022e-05, "loss": 0.5122, "step": 20600 }, { "epoch": 0.6550398678484068, "grad_norm": 1.0746990442276, "learning_rate": 5.6370763719039375e-05, "loss": 0.4969, "step": 20620 }, { "epoch": 0.655675212046126, "grad_norm": 0.9021841883659363, "learning_rate": 5.6185697338634304e-05, "loss": 0.4771, "step": 20640 }, { "epoch": 0.6563105562438452, "grad_norm": 0.8193987607955933, "learning_rate": 5.600081648564056e-05, "loss": 0.5143, "step": 20660 }, { "epoch": 0.6569459004415642, "grad_norm": 1.152421474456787, "learning_rate": 5.581612194291814e-05, "loss": 0.4873, "step": 20680 }, { "epoch": 0.6575812446392834, "grad_norm": 0.8709347248077393, "learning_rate": 5.5631614492538217e-05, "loss": 0.5199, "step": 20700 }, { "epoch": 0.6582165888370024, "grad_norm": 0.827723503112793, "learning_rate": 5.544729491577967e-05, "loss": 0.4917, "step": 20720 }, { "epoch": 0.6588519330347216, "grad_norm": 1.5408345460891724, "learning_rate": 5.526316399312579e-05, "loss": 0.5562, "step": 20740 }, { "epoch": 0.6594872772324407, "grad_norm": 0.731490433216095, "learning_rate": 5.507922250426118e-05, "loss": 0.4927, "step": 20760 }, { "epoch": 0.6601226214301598, "grad_norm": 0.950702428817749, "learning_rate": 5.4895471228068185e-05, "loss": 0.5115, "step": 20780 }, { "epoch": 0.6607579656278789, "grad_norm": 0.8342424631118774, "learning_rate": 5.471191094262369e-05, "loss": 0.4856, "step": 20800 }, { "epoch": 0.661393309825598, "grad_norm": 0.9297844767570496, "learning_rate": 5.4528542425196004e-05, "loss": 0.4896, "step": 20820 }, { "epoch": 0.6620286540233171, "grad_norm": 0.7558259963989258, "learning_rate": 5.434536645224126e-05, "loss": 0.4895, "step": 20840 }, { "epoch": 0.6626639982210363, "grad_norm": 1.2116395235061646, "learning_rate": 5.416238379940035e-05, "loss": 0.507, "step": 20860 }, { "epoch": 0.6632993424187553, "grad_norm": 0.913467526435852, "learning_rate": 5.39795952414955e-05, "loss": 0.5137, "step": 20880 }, { "epoch": 0.6639346866164745, "grad_norm": 0.868238627910614, "learning_rate": 5.3797001552527184e-05, "loss": 0.5185, "step": 20900 }, { "epoch": 0.6645700308141936, "grad_norm": 1.0668286085128784, "learning_rate": 5.361460350567062e-05, "loss": 0.5158, "step": 20920 }, { "epoch": 0.6652053750119127, "grad_norm": 0.795097291469574, "learning_rate": 5.3432401873272655e-05, "loss": 0.4985, "step": 20940 }, { "epoch": 0.6658407192096318, "grad_norm": 0.6949301958084106, "learning_rate": 5.325039742684839e-05, "loss": 0.4722, "step": 20960 }, { "epoch": 0.6664760634073509, "grad_norm": 0.7859952449798584, "learning_rate": 5.3068590937077945e-05, "loss": 0.4933, "step": 20980 }, { "epoch": 0.66711140760507, "grad_norm": 0.8529000282287598, "learning_rate": 5.288698317380334e-05, "loss": 0.5098, "step": 21000 }, { "epoch": 0.66711140760507, "eval_loss": 0.45643100142478943, "eval_runtime": 44.6378, "eval_samples_per_second": 60.554, "eval_steps_per_second": 30.288, "step": 21000 }, { "epoch": 0.6677467518027892, "grad_norm": 0.9853639602661133, "learning_rate": 5.270557490602499e-05, "loss": 0.4715, "step": 21020 }, { "epoch": 0.6683820960005082, "grad_norm": 0.8387131690979004, "learning_rate": 5.2524366901898566e-05, "loss": 0.5128, "step": 21040 }, { "epoch": 0.6690174401982274, "grad_norm": 0.8610044717788696, "learning_rate": 5.234335992873176e-05, "loss": 0.5424, "step": 21060 }, { "epoch": 0.6696527843959466, "grad_norm": 0.8878015279769897, "learning_rate": 5.216255475298109e-05, "loss": 0.4734, "step": 21080 }, { "epoch": 0.6702881285936656, "grad_norm": 1.0038951635360718, "learning_rate": 5.198195214024848e-05, "loss": 0.4879, "step": 21100 }, { "epoch": 0.6709234727913848, "grad_norm": 0.9256641864776611, "learning_rate": 5.1801552855278126e-05, "loss": 0.527, "step": 21120 }, { "epoch": 0.6715588169891038, "grad_norm": 0.7668296098709106, "learning_rate": 5.162135766195337e-05, "loss": 0.5161, "step": 21140 }, { "epoch": 0.672194161186823, "grad_norm": 0.7756738066673279, "learning_rate": 5.144136732329323e-05, "loss": 0.5265, "step": 21160 }, { "epoch": 0.6728295053845421, "grad_norm": 0.9279829859733582, "learning_rate": 5.1261582601449285e-05, "loss": 0.4814, "step": 21180 }, { "epoch": 0.6734648495822612, "grad_norm": 1.1274375915527344, "learning_rate": 5.108200425770255e-05, "loss": 0.5061, "step": 21200 }, { "epoch": 0.6741001937799803, "grad_norm": 1.082535982131958, "learning_rate": 5.090263305246006e-05, "loss": 0.5081, "step": 21220 }, { "epoch": 0.6747355379776995, "grad_norm": 1.0355536937713623, "learning_rate": 5.0723469745251725e-05, "loss": 0.5044, "step": 21240 }, { "epoch": 0.6753708821754185, "grad_norm": 0.9309506416320801, "learning_rate": 5.054451509472728e-05, "loss": 0.5241, "step": 21260 }, { "epoch": 0.6760062263731377, "grad_norm": 0.818247377872467, "learning_rate": 5.0365769858652735e-05, "loss": 0.5034, "step": 21280 }, { "epoch": 0.6766415705708567, "grad_norm": 0.8921930193901062, "learning_rate": 5.0187234793907447e-05, "loss": 0.5089, "step": 21300 }, { "epoch": 0.6772769147685759, "grad_norm": 0.9915839433670044, "learning_rate": 5.000891065648087e-05, "loss": 0.5049, "step": 21320 }, { "epoch": 0.677912258966295, "grad_norm": 0.8783996105194092, "learning_rate": 4.983079820146922e-05, "loss": 0.5314, "step": 21340 }, { "epoch": 0.6785476031640141, "grad_norm": 0.8735405802726746, "learning_rate": 4.96528981830724e-05, "loss": 0.5036, "step": 21360 }, { "epoch": 0.6791829473617332, "grad_norm": 0.9674988389015198, "learning_rate": 4.947521135459072e-05, "loss": 0.5269, "step": 21380 }, { "epoch": 0.6798182915594523, "grad_norm": 0.9271227717399597, "learning_rate": 4.9297738468421896e-05, "loss": 0.5061, "step": 21400 }, { "epoch": 0.6804536357571714, "grad_norm": 0.7828012704849243, "learning_rate": 4.912048027605759e-05, "loss": 0.4978, "step": 21420 }, { "epoch": 0.6810889799548906, "grad_norm": 1.3417547941207886, "learning_rate": 4.8943437528080385e-05, "loss": 0.5326, "step": 21440 }, { "epoch": 0.6817243241526096, "grad_norm": 0.8963372707366943, "learning_rate": 4.876661097416066e-05, "loss": 0.4989, "step": 21460 }, { "epoch": 0.6823596683503288, "grad_norm": 0.893553614616394, "learning_rate": 4.859000136305329e-05, "loss": 0.4859, "step": 21480 }, { "epoch": 0.682995012548048, "grad_norm": 1.2325243949890137, "learning_rate": 4.8413609442594445e-05, "loss": 0.5037, "step": 21500 }, { "epoch": 0.683630356745767, "grad_norm": 0.8049502372741699, "learning_rate": 4.8237435959698706e-05, "loss": 0.509, "step": 21520 }, { "epoch": 0.6842657009434862, "grad_norm": 1.2289927005767822, "learning_rate": 4.8061481660355534e-05, "loss": 0.5128, "step": 21540 }, { "epoch": 0.6849010451412052, "grad_norm": 0.8123481869697571, "learning_rate": 4.7885747289626284e-05, "loss": 0.5031, "step": 21560 }, { "epoch": 0.6855363893389244, "grad_norm": 0.8852875232696533, "learning_rate": 4.771023359164116e-05, "loss": 0.4875, "step": 21580 }, { "epoch": 0.6861717335366435, "grad_norm": 0.8462742567062378, "learning_rate": 4.753494130959586e-05, "loss": 0.4787, "step": 21600 }, { "epoch": 0.6868070777343626, "grad_norm": 0.99876868724823, "learning_rate": 4.7359871185748485e-05, "loss": 0.5116, "step": 21620 }, { "epoch": 0.6874424219320817, "grad_norm": 0.9393181204795837, "learning_rate": 4.718502396141656e-05, "loss": 0.4878, "step": 21640 }, { "epoch": 0.6880777661298009, "grad_norm": 0.8426542282104492, "learning_rate": 4.701040037697364e-05, "loss": 0.4897, "step": 21660 }, { "epoch": 0.6887131103275199, "grad_norm": 0.938210666179657, "learning_rate": 4.683600117184631e-05, "loss": 0.492, "step": 21680 }, { "epoch": 0.6893484545252391, "grad_norm": 0.8325148820877075, "learning_rate": 4.666182708451114e-05, "loss": 0.4842, "step": 21700 }, { "epoch": 0.6899837987229581, "grad_norm": 0.8813055753707886, "learning_rate": 4.648787885249136e-05, "loss": 0.491, "step": 21720 }, { "epoch": 0.6906191429206773, "grad_norm": 1.0838825702667236, "learning_rate": 4.631415721235389e-05, "loss": 0.4732, "step": 21740 }, { "epoch": 0.6912544871183964, "grad_norm": 0.7203667163848877, "learning_rate": 4.614066289970609e-05, "loss": 0.4692, "step": 21760 }, { "epoch": 0.6918898313161155, "grad_norm": 1.181038737297058, "learning_rate": 4.596739664919287e-05, "loss": 0.5177, "step": 21780 }, { "epoch": 0.6925251755138346, "grad_norm": 0.9107904434204102, "learning_rate": 4.579435919449332e-05, "loss": 0.5186, "step": 21800 }, { "epoch": 0.6931605197115537, "grad_norm": 0.8281117081642151, "learning_rate": 4.5621551268317686e-05, "loss": 0.4848, "step": 21820 }, { "epoch": 0.6937958639092728, "grad_norm": 0.9180241227149963, "learning_rate": 4.545759700573378e-05, "loss": 0.4979, "step": 21840 }, { "epoch": 0.694431208106992, "grad_norm": 0.912675678730011, "learning_rate": 4.5285238763954426e-05, "loss": 0.5124, "step": 21860 }, { "epoch": 0.695066552304711, "grad_norm": 0.8163600564002991, "learning_rate": 4.5113112206520056e-05, "loss": 0.5205, "step": 21880 }, { "epoch": 0.6957018965024302, "grad_norm": 0.7308365702629089, "learning_rate": 4.494121806228392e-05, "loss": 0.5208, "step": 21900 }, { "epoch": 0.6963372407001494, "grad_norm": 0.7426006197929382, "learning_rate": 4.476955705911504e-05, "loss": 0.48, "step": 21920 }, { "epoch": 0.6969725848978684, "grad_norm": 0.9886866807937622, "learning_rate": 4.459812992389526e-05, "loss": 0.5483, "step": 21940 }, { "epoch": 0.6976079290955876, "grad_norm": 0.9653937816619873, "learning_rate": 4.44269373825162e-05, "loss": 0.4613, "step": 21960 }, { "epoch": 0.6982432732933066, "grad_norm": 0.8184491991996765, "learning_rate": 4.425598015987602e-05, "loss": 0.5212, "step": 21980 }, { "epoch": 0.6988786174910258, "grad_norm": 0.9365077614784241, "learning_rate": 4.408525897987645e-05, "loss": 0.4868, "step": 22000 }, { "epoch": 0.6988786174910258, "eval_loss": 0.45187339186668396, "eval_runtime": 44.7631, "eval_samples_per_second": 60.385, "eval_steps_per_second": 30.203, "step": 22000 }, { "epoch": 0.6995139616887449, "grad_norm": 0.9188706874847412, "learning_rate": 4.391477456541983e-05, "loss": 0.4991, "step": 22020 }, { "epoch": 0.700149305886464, "grad_norm": 0.8599129319190979, "learning_rate": 4.374452763840584e-05, "loss": 0.5184, "step": 22040 }, { "epoch": 0.7007846500841831, "grad_norm": 0.8643587827682495, "learning_rate": 4.357451891972854e-05, "loss": 0.4966, "step": 22060 }, { "epoch": 0.7014199942819023, "grad_norm": 0.9123074412345886, "learning_rate": 4.340474912927332e-05, "loss": 0.5068, "step": 22080 }, { "epoch": 0.7020553384796213, "grad_norm": 0.8422294855117798, "learning_rate": 4.323521898591394e-05, "loss": 0.4753, "step": 22100 }, { "epoch": 0.7026906826773405, "grad_norm": 0.8830937743186951, "learning_rate": 4.306592920750931e-05, "loss": 0.4837, "step": 22120 }, { "epoch": 0.7033260268750595, "grad_norm": 0.8540763854980469, "learning_rate": 4.289688051090054e-05, "loss": 0.4733, "step": 22140 }, { "epoch": 0.7039613710727787, "grad_norm": 0.8622573614120483, "learning_rate": 4.272807361190797e-05, "loss": 0.5003, "step": 22160 }, { "epoch": 0.7045967152704978, "grad_norm": 0.9827342629432678, "learning_rate": 4.2559509225328e-05, "loss": 0.5333, "step": 22180 }, { "epoch": 0.7052320594682169, "grad_norm": 0.8439646363258362, "learning_rate": 4.239118806493013e-05, "loss": 0.4778, "step": 22200 }, { "epoch": 0.705867403665936, "grad_norm": 0.9348493814468384, "learning_rate": 4.222311084345405e-05, "loss": 0.4806, "step": 22220 }, { "epoch": 0.7065027478636552, "grad_norm": 1.0671905279159546, "learning_rate": 4.2055278272606404e-05, "loss": 0.4978, "step": 22240 }, { "epoch": 0.7071380920613742, "grad_norm": 1.2363934516906738, "learning_rate": 4.188769106305787e-05, "loss": 0.5089, "step": 22260 }, { "epoch": 0.7077734362590934, "grad_norm": 0.9339464902877808, "learning_rate": 4.1720349924440295e-05, "loss": 0.4796, "step": 22280 }, { "epoch": 0.7084087804568124, "grad_norm": 0.873092770576477, "learning_rate": 4.155325556534345e-05, "loss": 0.4931, "step": 22300 }, { "epoch": 0.7090441246545316, "grad_norm": 0.7866622805595398, "learning_rate": 4.138640869331215e-05, "loss": 0.501, "step": 22320 }, { "epoch": 0.7096794688522507, "grad_norm": 1.0133357048034668, "learning_rate": 4.121981001484334e-05, "loss": 0.481, "step": 22340 }, { "epoch": 0.7103148130499698, "grad_norm": 0.9386391043663025, "learning_rate": 4.105346023538292e-05, "loss": 0.5303, "step": 22360 }, { "epoch": 0.710950157247689, "grad_norm": 0.7917353510856628, "learning_rate": 4.088736005932289e-05, "loss": 0.4993, "step": 22380 }, { "epoch": 0.711585501445408, "grad_norm": 0.9757121801376343, "learning_rate": 4.0721510189998266e-05, "loss": 0.5102, "step": 22400 }, { "epoch": 0.7122208456431272, "grad_norm": 1.2196959257125854, "learning_rate": 4.055591132968432e-05, "loss": 0.5045, "step": 22420 }, { "epoch": 0.7128561898408463, "grad_norm": 1.0833863019943237, "learning_rate": 4.039056417959328e-05, "loss": 0.5136, "step": 22440 }, { "epoch": 0.7134915340385654, "grad_norm": 0.7548487186431885, "learning_rate": 4.02254694398716e-05, "loss": 0.4864, "step": 22460 }, { "epoch": 0.7141268782362845, "grad_norm": 1.0435632467269897, "learning_rate": 4.006062780959697e-05, "loss": 0.4866, "step": 22480 }, { "epoch": 0.7147622224340037, "grad_norm": 0.7469571828842163, "learning_rate": 3.9896039986775256e-05, "loss": 0.4825, "step": 22500 }, { "epoch": 0.7153975666317227, "grad_norm": 0.8732174634933472, "learning_rate": 3.9731706668337585e-05, "loss": 0.4905, "step": 22520 }, { "epoch": 0.7160329108294419, "grad_norm": 0.8761599063873291, "learning_rate": 3.956762855013749e-05, "loss": 0.4831, "step": 22540 }, { "epoch": 0.7166682550271609, "grad_norm": 0.9746137261390686, "learning_rate": 3.940380632694781e-05, "loss": 0.5111, "step": 22560 }, { "epoch": 0.7173035992248801, "grad_norm": 0.9219092726707458, "learning_rate": 3.924024069245782e-05, "loss": 0.4908, "step": 22580 }, { "epoch": 0.7179389434225992, "grad_norm": 1.0305086374282837, "learning_rate": 3.907693233927038e-05, "loss": 0.5215, "step": 22600 }, { "epoch": 0.7185742876203183, "grad_norm": 0.7786363363265991, "learning_rate": 3.891388195889882e-05, "loss": 0.4792, "step": 22620 }, { "epoch": 0.7192096318180374, "grad_norm": 0.8930706977844238, "learning_rate": 3.875109024176413e-05, "loss": 0.4908, "step": 22640 }, { "epoch": 0.7198449760157566, "grad_norm": 1.0214048624038696, "learning_rate": 3.858855787719209e-05, "loss": 0.5102, "step": 22660 }, { "epoch": 0.7204803202134756, "grad_norm": 0.9279896020889282, "learning_rate": 3.842628555341018e-05, "loss": 0.4772, "step": 22680 }, { "epoch": 0.7211156644111948, "grad_norm": 1.6357091665267944, "learning_rate": 3.826427395754482e-05, "loss": 0.5041, "step": 22700 }, { "epoch": 0.7217510086089138, "grad_norm": 0.8421345949172974, "learning_rate": 3.8102523775618325e-05, "loss": 0.5082, "step": 22720 }, { "epoch": 0.722386352806633, "grad_norm": 0.9193027019500732, "learning_rate": 3.794103569254624e-05, "loss": 0.485, "step": 22740 }, { "epoch": 0.7230216970043521, "grad_norm": 0.8045080304145813, "learning_rate": 3.777981039213411e-05, "loss": 0.5182, "step": 22760 }, { "epoch": 0.7236570412020712, "grad_norm": 0.8535903692245483, "learning_rate": 3.7618848557074804e-05, "loss": 0.4796, "step": 22780 }, { "epoch": 0.7242923853997904, "grad_norm": 0.8225564360618591, "learning_rate": 3.745815086894565e-05, "loss": 0.4812, "step": 22800 }, { "epoch": 0.7249277295975094, "grad_norm": 0.8030312657356262, "learning_rate": 3.729771800820539e-05, "loss": 0.481, "step": 22820 }, { "epoch": 0.7255630737952286, "grad_norm": 0.992080569267273, "learning_rate": 3.713755065419133e-05, "loss": 0.4768, "step": 22840 }, { "epoch": 0.7261984179929477, "grad_norm": 0.9184660911560059, "learning_rate": 3.698563821122103e-05, "loss": 0.5044, "step": 22860 }, { "epoch": 0.7268337621906668, "grad_norm": 0.8250758647918701, "learning_rate": 3.6825990545007096e-05, "loss": 0.5095, "step": 22880 }, { "epoch": 0.7274691063883859, "grad_norm": 1.0519983768463135, "learning_rate": 3.666661038300353e-05, "loss": 0.4944, "step": 22900 }, { "epoch": 0.7281044505861051, "grad_norm": 0.789730966091156, "learning_rate": 3.650749840009022e-05, "loss": 0.4574, "step": 22920 }, { "epoch": 0.7287397947838241, "grad_norm": 0.8896093368530273, "learning_rate": 3.6356591030872534e-05, "loss": 0.5, "step": 22940 }, { "epoch": 0.7293751389815433, "grad_norm": 0.7810101509094238, "learning_rate": 3.6198003934005195e-05, "loss": 0.5053, "step": 22960 }, { "epoch": 0.7300104831792623, "grad_norm": 0.883144199848175, "learning_rate": 3.603968700049657e-05, "loss": 0.514, "step": 22980 }, { "epoch": 0.7306458273769815, "grad_norm": 0.7069016695022583, "learning_rate": 3.588164090072441e-05, "loss": 0.522, "step": 23000 }, { "epoch": 0.7306458273769815, "eval_loss": 0.4499790668487549, "eval_runtime": 45.0673, "eval_samples_per_second": 59.977, "eval_steps_per_second": 30.0, "step": 23000 }, { "epoch": 0.7312811715747006, "grad_norm": 1.0385907888412476, "learning_rate": 3.5723866303919554e-05, "loss": 0.489, "step": 23020 }, { "epoch": 0.7319165157724197, "grad_norm": 0.8796695470809937, "learning_rate": 3.556636387816317e-05, "loss": 0.4963, "step": 23040 }, { "epoch": 0.7325518599701388, "grad_norm": 0.9427993893623352, "learning_rate": 3.540913429038407e-05, "loss": 0.4601, "step": 23060 }, { "epoch": 0.733187204167858, "grad_norm": 0.8525741100311279, "learning_rate": 3.525217820635564e-05, "loss": 0.5034, "step": 23080 }, { "epoch": 0.733822548365577, "grad_norm": 0.8755898475646973, "learning_rate": 3.5095496290693155e-05, "loss": 0.509, "step": 23100 }, { "epoch": 0.7344578925632962, "grad_norm": 1.0328361988067627, "learning_rate": 3.4939089206851025e-05, "loss": 0.4994, "step": 23120 }, { "epoch": 0.7350932367610152, "grad_norm": 1.130226969718933, "learning_rate": 3.478295761711986e-05, "loss": 0.4848, "step": 23140 }, { "epoch": 0.7357285809587344, "grad_norm": 0.733567476272583, "learning_rate": 3.4627102182623696e-05, "loss": 0.5123, "step": 23160 }, { "epoch": 0.7363639251564535, "grad_norm": 1.1062750816345215, "learning_rate": 3.447152356331721e-05, "loss": 0.4767, "step": 23180 }, { "epoch": 0.7369992693541726, "grad_norm": 0.9558404684066772, "learning_rate": 3.431622241798305e-05, "loss": 0.4832, "step": 23200 }, { "epoch": 0.7376346135518917, "grad_norm": 0.8974496722221375, "learning_rate": 3.416119940422877e-05, "loss": 0.4818, "step": 23220 }, { "epoch": 0.7382699577496109, "grad_norm": 1.2721449136734009, "learning_rate": 3.400645517848427e-05, "loss": 0.5102, "step": 23240 }, { "epoch": 0.73890530194733, "grad_norm": 1.0408607721328735, "learning_rate": 3.385199039599902e-05, "loss": 0.4784, "step": 23260 }, { "epoch": 0.7395406461450491, "grad_norm": 0.9826887845993042, "learning_rate": 3.369780571083909e-05, "loss": 0.5039, "step": 23280 }, { "epoch": 0.7401759903427682, "grad_norm": 0.8110315799713135, "learning_rate": 3.354390177588454e-05, "loss": 0.5034, "step": 23300 }, { "epoch": 0.7408113345404873, "grad_norm": 0.8513306975364685, "learning_rate": 3.339027924282673e-05, "loss": 0.509, "step": 23320 }, { "epoch": 0.7414466787382065, "grad_norm": 0.8255580067634583, "learning_rate": 3.323693876216529e-05, "loss": 0.4678, "step": 23340 }, { "epoch": 0.7420820229359255, "grad_norm": 1.1336640119552612, "learning_rate": 3.30838809832056e-05, "loss": 0.4848, "step": 23360 }, { "epoch": 0.7427173671336447, "grad_norm": 0.8720375895500183, "learning_rate": 3.2931106554056005e-05, "loss": 0.4929, "step": 23380 }, { "epoch": 0.7433527113313637, "grad_norm": 1.0169090032577515, "learning_rate": 3.277861612162498e-05, "loss": 0.5066, "step": 23400 }, { "epoch": 0.7439880555290829, "grad_norm": 1.2800534963607788, "learning_rate": 3.262641033161843e-05, "loss": 0.4964, "step": 23420 }, { "epoch": 0.744623399726802, "grad_norm": 0.819925844669342, "learning_rate": 3.2474489828537046e-05, "loss": 0.509, "step": 23440 }, { "epoch": 0.7452587439245211, "grad_norm": 0.8024299144744873, "learning_rate": 3.232285525567343e-05, "loss": 0.4922, "step": 23460 }, { "epoch": 0.7458940881222402, "grad_norm": 1.1049789190292358, "learning_rate": 3.217150725510946e-05, "loss": 0.4907, "step": 23480 }, { "epoch": 0.7465294323199594, "grad_norm": 1.0818272829055786, "learning_rate": 3.2020446467713516e-05, "loss": 0.4806, "step": 23500 }, { "epoch": 0.7471647765176784, "grad_norm": 0.6681995391845703, "learning_rate": 3.18696735331379e-05, "loss": 0.4504, "step": 23520 }, { "epoch": 0.7478001207153976, "grad_norm": 0.8827902674674988, "learning_rate": 3.171918908981595e-05, "loss": 0.5081, "step": 23540 }, { "epoch": 0.7484354649131166, "grad_norm": 1.0249037742614746, "learning_rate": 3.156899377495938e-05, "loss": 0.5297, "step": 23560 }, { "epoch": 0.7490708091108358, "grad_norm": 1.0797147750854492, "learning_rate": 3.141908822455574e-05, "loss": 0.4701, "step": 23580 }, { "epoch": 0.749706153308555, "grad_norm": 0.724281907081604, "learning_rate": 3.126947307336551e-05, "loss": 0.4608, "step": 23600 }, { "epoch": 0.750341497506274, "grad_norm": 0.7410632967948914, "learning_rate": 3.1120148954919485e-05, "loss": 0.4747, "step": 23620 }, { "epoch": 0.7509768417039931, "grad_norm": 1.0309559106826782, "learning_rate": 3.09711165015162e-05, "loss": 0.534, "step": 23640 }, { "epoch": 0.7516121859017123, "grad_norm": 0.9060602784156799, "learning_rate": 3.0822376344219105e-05, "loss": 0.4709, "step": 23660 }, { "epoch": 0.7522475300994313, "grad_norm": 0.9018211364746094, "learning_rate": 3.067392911285395e-05, "loss": 0.5084, "step": 23680 }, { "epoch": 0.7528828742971505, "grad_norm": 1.1375420093536377, "learning_rate": 3.0525775436006107e-05, "loss": 0.5023, "step": 23700 }, { "epoch": 0.7535182184948696, "grad_norm": 0.8034165501594543, "learning_rate": 3.0377915941017955e-05, "loss": 0.4947, "step": 23720 }, { "epoch": 0.7541535626925887, "grad_norm": 1.0958040952682495, "learning_rate": 3.0230351253986143e-05, "loss": 0.5009, "step": 23740 }, { "epoch": 0.7547889068903079, "grad_norm": 0.8740959763526917, "learning_rate": 3.0083081999759067e-05, "loss": 0.4942, "step": 23760 }, { "epoch": 0.7554242510880269, "grad_norm": 0.8798695206642151, "learning_rate": 2.993610880193406e-05, "loss": 0.4676, "step": 23780 }, { "epoch": 0.7560595952857461, "grad_norm": 0.9538172483444214, "learning_rate": 2.9789432282854822e-05, "loss": 0.4441, "step": 23800 }, { "epoch": 0.7566949394834651, "grad_norm": 0.9560829401016235, "learning_rate": 2.9643053063608917e-05, "loss": 0.4995, "step": 23820 }, { "epoch": 0.7573302836811843, "grad_norm": 1.0306763648986816, "learning_rate": 2.9496971764024884e-05, "loss": 0.5042, "step": 23840 }, { "epoch": 0.7579656278789034, "grad_norm": 0.9823128581047058, "learning_rate": 2.9351189002669788e-05, "loss": 0.5274, "step": 23860 }, { "epoch": 0.7586009720766225, "grad_norm": 0.8448672890663147, "learning_rate": 2.920570539684665e-05, "loss": 0.4713, "step": 23880 }, { "epoch": 0.7592363162743416, "grad_norm": 0.8830504417419434, "learning_rate": 2.9060521562591624e-05, "loss": 0.5069, "step": 23900 }, { "epoch": 0.7598716604720608, "grad_norm": 0.9051734805107117, "learning_rate": 2.891563811467154e-05, "loss": 0.48, "step": 23920 }, { "epoch": 0.7605070046697798, "grad_norm": 0.8309674859046936, "learning_rate": 2.877105566658136e-05, "loss": 0.5141, "step": 23940 }, { "epoch": 0.761142348867499, "grad_norm": 0.8684896230697632, "learning_rate": 2.863398169962057e-05, "loss": 0.4518, "step": 23960 }, { "epoch": 0.761777693065218, "grad_norm": 0.959536075592041, "learning_rate": 2.8489987960934184e-05, "loss": 0.483, "step": 23980 }, { "epoch": 0.7624130372629372, "grad_norm": 1.3519070148468018, "learning_rate": 2.8353474370325594e-05, "loss": 0.5062, "step": 24000 }, { "epoch": 0.7624130372629372, "eval_loss": 0.4479082524776459, "eval_runtime": 44.6533, "eval_samples_per_second": 60.533, "eval_steps_per_second": 30.278, "step": 24000 }, { "epoch": 0.7630483814606563, "grad_norm": 0.8832095861434937, "learning_rate": 2.8210071659529526e-05, "loss": 0.5204, "step": 24020 }, { "epoch": 0.7636837256583754, "grad_norm": 0.793205738067627, "learning_rate": 2.8066972936216017e-05, "loss": 0.5037, "step": 24040 }, { "epoch": 0.7643190698560945, "grad_norm": 0.8483644127845764, "learning_rate": 2.79241788063227e-05, "loss": 0.4812, "step": 24060 }, { "epoch": 0.7649544140538137, "grad_norm": 1.50220787525177, "learning_rate": 2.7781689874497406e-05, "loss": 0.501, "step": 24080 }, { "epoch": 0.7655897582515327, "grad_norm": 0.8091638684272766, "learning_rate": 2.7639506744095766e-05, "loss": 0.4932, "step": 24100 }, { "epoch": 0.7662251024492519, "grad_norm": 0.9171321392059326, "learning_rate": 2.74976300171784e-05, "loss": 0.5, "step": 24120 }, { "epoch": 0.766860446646971, "grad_norm": 0.9392116069793701, "learning_rate": 2.7356060294508502e-05, "loss": 0.5075, "step": 24140 }, { "epoch": 0.7674957908446901, "grad_norm": 0.9384047389030457, "learning_rate": 2.7214798175549395e-05, "loss": 0.4893, "step": 24160 }, { "epoch": 0.7681311350424093, "grad_norm": 0.7760775685310364, "learning_rate": 2.707384425846178e-05, "loss": 0.5267, "step": 24180 }, { "epoch": 0.7687664792401283, "grad_norm": 0.8666489720344543, "learning_rate": 2.6933199140101285e-05, "loss": 0.5201, "step": 24200 }, { "epoch": 0.7694018234378475, "grad_norm": 0.9711599946022034, "learning_rate": 2.679286341601609e-05, "loss": 0.4923, "step": 24220 }, { "epoch": 0.7700371676355666, "grad_norm": 0.9399335980415344, "learning_rate": 2.6652837680444153e-05, "loss": 0.5281, "step": 24240 }, { "epoch": 0.7706725118332857, "grad_norm": 0.8116670250892639, "learning_rate": 2.651312252631083e-05, "loss": 0.5111, "step": 24260 }, { "epoch": 0.7713078560310048, "grad_norm": 0.873943030834198, "learning_rate": 2.6373718545226445e-05, "loss": 0.471, "step": 24280 }, { "epoch": 0.7719432002287239, "grad_norm": 0.9560205340385437, "learning_rate": 2.623462632748359e-05, "loss": 0.5101, "step": 24300 }, { "epoch": 0.772578544426443, "grad_norm": 1.011898159980774, "learning_rate": 2.6095846462054763e-05, "loss": 0.4906, "step": 24320 }, { "epoch": 0.7732138886241622, "grad_norm": 1.0334892272949219, "learning_rate": 2.595737953658982e-05, "loss": 0.4905, "step": 24340 }, { "epoch": 0.7738492328218812, "grad_norm": 0.6994766592979431, "learning_rate": 2.581922613741352e-05, "loss": 0.4794, "step": 24360 }, { "epoch": 0.7744845770196004, "grad_norm": 0.9781257510185242, "learning_rate": 2.5681386849523003e-05, "loss": 0.4871, "step": 24380 }, { "epoch": 0.7751199212173194, "grad_norm": 1.0443729162216187, "learning_rate": 2.5543862256585393e-05, "loss": 0.5133, "step": 24400 }, { "epoch": 0.7757552654150386, "grad_norm": 0.8841618299484253, "learning_rate": 2.5406652940935217e-05, "loss": 0.4865, "step": 24420 }, { "epoch": 0.7763906096127577, "grad_norm": 0.8439558148384094, "learning_rate": 2.5269759483571954e-05, "loss": 0.4908, "step": 24440 }, { "epoch": 0.7770259538104768, "grad_norm": 0.9146759510040283, "learning_rate": 2.5133182464157734e-05, "loss": 0.4934, "step": 24460 }, { "epoch": 0.777661298008196, "grad_norm": 0.7785593867301941, "learning_rate": 2.499692246101466e-05, "loss": 0.4857, "step": 24480 }, { "epoch": 0.7782966422059151, "grad_norm": 0.9240188002586365, "learning_rate": 2.4860980051122474e-05, "loss": 0.4958, "step": 24500 }, { "epoch": 0.7789319864036341, "grad_norm": 1.0593191385269165, "learning_rate": 2.4725355810116103e-05, "loss": 0.5077, "step": 24520 }, { "epoch": 0.7795673306013533, "grad_norm": 0.8705240488052368, "learning_rate": 2.4590050312283263e-05, "loss": 0.4792, "step": 24540 }, { "epoch": 0.7802026747990723, "grad_norm": 0.8610863089561462, "learning_rate": 2.4455064130561944e-05, "loss": 0.4949, "step": 24560 }, { "epoch": 0.7808380189967915, "grad_norm": 1.152521014213562, "learning_rate": 2.432039783653799e-05, "loss": 0.5076, "step": 24580 }, { "epoch": 0.7814733631945107, "grad_norm": 0.8608033657073975, "learning_rate": 2.4186052000442806e-05, "loss": 0.4759, "step": 24600 }, { "epoch": 0.7821087073922297, "grad_norm": 1.1664726734161377, "learning_rate": 2.4052027191150762e-05, "loss": 0.4941, "step": 24620 }, { "epoch": 0.7827440515899489, "grad_norm": 0.8805221915245056, "learning_rate": 2.3918323976176883e-05, "loss": 0.4797, "step": 24640 }, { "epoch": 0.783379395787668, "grad_norm": 0.7699743509292603, "learning_rate": 2.3784942921674512e-05, "loss": 0.4903, "step": 24660 }, { "epoch": 0.7840147399853871, "grad_norm": 0.9498074650764465, "learning_rate": 2.365188459243274e-05, "loss": 0.4679, "step": 24680 }, { "epoch": 0.7846500841831062, "grad_norm": 0.815447986125946, "learning_rate": 2.351914955187412e-05, "loss": 0.5114, "step": 24700 }, { "epoch": 0.7852854283808253, "grad_norm": 0.984866738319397, "learning_rate": 2.3386738362052353e-05, "loss": 0.4725, "step": 24720 }, { "epoch": 0.7859207725785444, "grad_norm": 1.0802818536758423, "learning_rate": 2.3254651583649735e-05, "loss": 0.4684, "step": 24740 }, { "epoch": 0.7865561167762636, "grad_norm": 0.8058573007583618, "learning_rate": 2.3122889775974887e-05, "loss": 0.4847, "step": 24760 }, { "epoch": 0.7871914609739826, "grad_norm": 0.8836669921875, "learning_rate": 2.2991453496960447e-05, "loss": 0.4859, "step": 24780 }, { "epoch": 0.7878268051717018, "grad_norm": 0.7214009165763855, "learning_rate": 2.2860343303160535e-05, "loss": 0.4816, "step": 24800 }, { "epoch": 0.7884621493694208, "grad_norm": 0.8268193006515503, "learning_rate": 2.2729559749748575e-05, "loss": 0.4674, "step": 24820 }, { "epoch": 0.78909749356714, "grad_norm": 0.7158612608909607, "learning_rate": 2.2599103390514766e-05, "loss": 0.465, "step": 24840 }, { "epoch": 0.7897328377648591, "grad_norm": 0.8904339671134949, "learning_rate": 2.246897477786396e-05, "loss": 0.5024, "step": 24860 }, { "epoch": 0.7903681819625782, "grad_norm": 0.8315703272819519, "learning_rate": 2.2339174462813127e-05, "loss": 0.4609, "step": 24880 }, { "epoch": 0.7910035261602973, "grad_norm": 0.8962224721908569, "learning_rate": 2.2209702994989045e-05, "loss": 0.4906, "step": 24900 }, { "epoch": 0.7916388703580165, "grad_norm": 0.9301977753639221, "learning_rate": 2.208056092262616e-05, "loss": 0.5216, "step": 24920 }, { "epoch": 0.7922742145557355, "grad_norm": 0.8634437918663025, "learning_rate": 2.1951748792563985e-05, "loss": 0.5031, "step": 24940 }, { "epoch": 0.7929095587534547, "grad_norm": 0.8985020518302917, "learning_rate": 2.1823267150244964e-05, "loss": 0.4709, "step": 24960 }, { "epoch": 0.7935449029511737, "grad_norm": 1.1470792293548584, "learning_rate": 2.16951165397122e-05, "loss": 0.5224, "step": 24980 }, { "epoch": 0.7941802471488929, "grad_norm": 0.919326663017273, "learning_rate": 2.1567297503606987e-05, "loss": 0.5004, "step": 25000 }, { "epoch": 0.7941802471488929, "eval_loss": 0.44602036476135254, "eval_runtime": 44.8391, "eval_samples_per_second": 60.282, "eval_steps_per_second": 30.152, "step": 25000 }, { "epoch": 0.7948155913466121, "grad_norm": 1.1010879278182983, "learning_rate": 2.1439810583166587e-05, "loss": 0.5077, "step": 25020 }, { "epoch": 0.7954509355443311, "grad_norm": 0.8573036789894104, "learning_rate": 2.131900612258364e-05, "loss": 0.4973, "step": 25040 }, { "epoch": 0.7960862797420503, "grad_norm": 0.8931069374084473, "learning_rate": 2.1198502345256165e-05, "loss": 0.4972, "step": 25060 }, { "epoch": 0.7967216239397694, "grad_norm": 1.239161491394043, "learning_rate": 2.107198160794136e-05, "loss": 0.4981, "step": 25080 }, { "epoch": 0.7973569681374885, "grad_norm": 0.9950107336044312, "learning_rate": 2.0945795083658447e-05, "loss": 0.506, "step": 25100 }, { "epoch": 0.7979923123352076, "grad_norm": 0.7783673405647278, "learning_rate": 2.0819943306732082e-05, "loss": 0.4763, "step": 25120 }, { "epoch": 0.7986276565329267, "grad_norm": 0.912331223487854, "learning_rate": 2.0694426810069345e-05, "loss": 0.4622, "step": 25140 }, { "epoch": 0.7992630007306458, "grad_norm": 0.8284201622009277, "learning_rate": 2.0569246125157658e-05, "loss": 0.513, "step": 25160 }, { "epoch": 0.799898344928365, "grad_norm": 1.1468638181686401, "learning_rate": 2.0444401782062518e-05, "loss": 0.4719, "step": 25180 }, { "epoch": 0.800533689126084, "grad_norm": 1.0985773801803589, "learning_rate": 2.0319894309425146e-05, "loss": 0.4871, "step": 25200 }, { "epoch": 0.8011690333238032, "grad_norm": 1.1010768413543701, "learning_rate": 2.0195724234460322e-05, "loss": 0.5459, "step": 25220 }, { "epoch": 0.8018043775215223, "grad_norm": 0.9938257336616516, "learning_rate": 2.0071892082954248e-05, "loss": 0.5127, "step": 25240 }, { "epoch": 0.8024397217192414, "grad_norm": 1.1338539123535156, "learning_rate": 1.9954565018232684e-05, "loss": 0.4838, "step": 25260 }, { "epoch": 0.8030750659169605, "grad_norm": 0.7955858111381531, "learning_rate": 1.9831393324342518e-05, "loss": 0.4865, "step": 25280 }, { "epoch": 0.8037104101146796, "grad_norm": 1.0443702936172485, "learning_rate": 1.9708561096634902e-05, "loss": 0.4749, "step": 25300 }, { "epoch": 0.8043457543123987, "grad_norm": 1.0816038846969604, "learning_rate": 1.958606885523103e-05, "loss": 0.5142, "step": 25320 }, { "epoch": 0.8049810985101179, "grad_norm": 1.2127019166946411, "learning_rate": 1.946391711881239e-05, "loss": 0.4831, "step": 25340 }, { "epoch": 0.8056164427078369, "grad_norm": 0.8780348300933838, "learning_rate": 1.9342106404618632e-05, "loss": 0.5113, "step": 25360 }, { "epoch": 0.8062517869055561, "grad_norm": 0.7795581221580505, "learning_rate": 1.9220637228445438e-05, "loss": 0.4721, "step": 25380 }, { "epoch": 0.8068871311032751, "grad_norm": 0.9518604874610901, "learning_rate": 1.9099510104642216e-05, "loss": 0.4754, "step": 25400 }, { "epoch": 0.8075224753009943, "grad_norm": 1.0051589012145996, "learning_rate": 1.8978725546110022e-05, "loss": 0.4936, "step": 25420 }, { "epoch": 0.8081578194987135, "grad_norm": 0.8047780394554138, "learning_rate": 1.8858284064299326e-05, "loss": 0.4901, "step": 25440 }, { "epoch": 0.8087931636964325, "grad_norm": 1.1246352195739746, "learning_rate": 1.8738186169207917e-05, "loss": 0.5117, "step": 25460 }, { "epoch": 0.8094285078941517, "grad_norm": 0.8150719404220581, "learning_rate": 1.861843236937867e-05, "loss": 0.4685, "step": 25480 }, { "epoch": 0.8100638520918708, "grad_norm": 2.195882558822632, "learning_rate": 1.8499023171897388e-05, "loss": 0.471, "step": 25500 }, { "epoch": 0.8106991962895899, "grad_norm": 0.8962704539299011, "learning_rate": 1.8379959082390798e-05, "loss": 0.481, "step": 25520 }, { "epoch": 0.811334540487309, "grad_norm": 0.8531712889671326, "learning_rate": 1.8261240605024165e-05, "loss": 0.4881, "step": 25540 }, { "epoch": 0.8119698846850281, "grad_norm": 0.9354826807975769, "learning_rate": 1.8142868242499368e-05, "loss": 0.4761, "step": 25560 }, { "epoch": 0.8126052288827472, "grad_norm": 1.0048118829727173, "learning_rate": 1.8024842496052708e-05, "loss": 0.4968, "step": 25580 }, { "epoch": 0.8132405730804664, "grad_norm": 0.8254916071891785, "learning_rate": 1.790716386545275e-05, "loss": 0.5076, "step": 25600 }, { "epoch": 0.8138759172781854, "grad_norm": 0.9708372950553894, "learning_rate": 1.778983284899819e-05, "loss": 0.5197, "step": 25620 }, { "epoch": 0.8145112614759046, "grad_norm": 0.9034101366996765, "learning_rate": 1.767284994351588e-05, "loss": 0.4954, "step": 25640 }, { "epoch": 0.8151466056736237, "grad_norm": 1.3567668199539185, "learning_rate": 1.7556215644358564e-05, "loss": 0.5133, "step": 25660 }, { "epoch": 0.8157819498713428, "grad_norm": 0.9000421166419983, "learning_rate": 1.743993044540282e-05, "loss": 0.524, "step": 25680 }, { "epoch": 0.8164172940690619, "grad_norm": 0.7230278849601746, "learning_rate": 1.7323994839047086e-05, "loss": 0.4831, "step": 25700 }, { "epoch": 0.817052638266781, "grad_norm": 0.8648797273635864, "learning_rate": 1.7208409316209407e-05, "loss": 0.4932, "step": 25720 }, { "epoch": 0.8176879824645001, "grad_norm": 0.9017996788024902, "learning_rate": 1.709317436632547e-05, "loss": 0.4787, "step": 25740 }, { "epoch": 0.8183233266622193, "grad_norm": 0.9122520685195923, "learning_rate": 1.697829047734646e-05, "loss": 0.4721, "step": 25760 }, { "epoch": 0.8189586708599383, "grad_norm": 0.9448441863059998, "learning_rate": 1.6863758135737085e-05, "loss": 0.4772, "step": 25780 }, { "epoch": 0.8195940150576575, "grad_norm": 1.052437424659729, "learning_rate": 1.6749577826473405e-05, "loss": 0.5252, "step": 25800 }, { "epoch": 0.8202293592553767, "grad_norm": 0.9826536774635315, "learning_rate": 1.6635750033040842e-05, "loss": 0.5187, "step": 25820 }, { "epoch": 0.8208647034530957, "grad_norm": 0.8498765826225281, "learning_rate": 1.6522275237432193e-05, "loss": 0.4792, "step": 25840 }, { "epoch": 0.8215000476508149, "grad_norm": 0.9139013886451721, "learning_rate": 1.6409153920145416e-05, "loss": 0.5006, "step": 25860 }, { "epoch": 0.8221353918485339, "grad_norm": 0.9082590937614441, "learning_rate": 1.6296386560181744e-05, "loss": 0.4801, "step": 25880 }, { "epoch": 0.8227707360462531, "grad_norm": 0.8360690474510193, "learning_rate": 1.618397363504366e-05, "loss": 0.491, "step": 25900 }, { "epoch": 0.8234060802439722, "grad_norm": 0.8585413098335266, "learning_rate": 1.6071915620732746e-05, "loss": 0.4952, "step": 25920 }, { "epoch": 0.8240414244416913, "grad_norm": 0.9051182866096497, "learning_rate": 1.5960212991747804e-05, "loss": 0.5021, "step": 25940 }, { "epoch": 0.8246767686394104, "grad_norm": 1.1850552558898926, "learning_rate": 1.584886622108276e-05, "loss": 0.5194, "step": 25960 }, { "epoch": 0.8253121128371295, "grad_norm": 0.8449670672416687, "learning_rate": 1.57378757802247e-05, "loss": 0.4988, "step": 25980 }, { "epoch": 0.8259474570348486, "grad_norm": 0.9663527607917786, "learning_rate": 1.5627242139151867e-05, "loss": 0.4782, "step": 26000 }, { "epoch": 0.8259474570348486, "eval_loss": 0.44560423493385315, "eval_runtime": 45.0247, "eval_samples_per_second": 60.034, "eval_steps_per_second": 30.028, "step": 26000 }, { "epoch": 0.8265828012325678, "grad_norm": 1.0954176187515259, "learning_rate": 1.5516965766331715e-05, "loss": 0.4992, "step": 26020 }, { "epoch": 0.8272181454302868, "grad_norm": 0.9752370119094849, "learning_rate": 1.540704712871881e-05, "loss": 0.5109, "step": 26040 }, { "epoch": 0.827853489628006, "grad_norm": 0.7089188098907471, "learning_rate": 1.5297486691752928e-05, "loss": 0.4669, "step": 26060 }, { "epoch": 0.8284888338257251, "grad_norm": 0.8641648292541504, "learning_rate": 1.5188284919357155e-05, "loss": 0.4905, "step": 26080 }, { "epoch": 0.8291241780234442, "grad_norm": 0.8167259097099304, "learning_rate": 1.5079442273935773e-05, "loss": 0.4776, "step": 26100 }, { "epoch": 0.8297595222211633, "grad_norm": 0.9287614226341248, "learning_rate": 1.4970959216372372e-05, "loss": 0.4803, "step": 26120 }, { "epoch": 0.8303948664188824, "grad_norm": 0.8652564883232117, "learning_rate": 1.4862836206027975e-05, "loss": 0.4623, "step": 26140 }, { "epoch": 0.8310302106166015, "grad_norm": 0.9141151309013367, "learning_rate": 1.4755073700738953e-05, "loss": 0.507, "step": 26160 }, { "epoch": 0.8316655548143207, "grad_norm": 0.9454159736633301, "learning_rate": 1.464767215681515e-05, "loss": 0.5218, "step": 26180 }, { "epoch": 0.8323008990120397, "grad_norm": 0.7766212821006775, "learning_rate": 1.4540632029038026e-05, "loss": 0.5294, "step": 26200 }, { "epoch": 0.8329362432097589, "grad_norm": 0.8662501573562622, "learning_rate": 1.443395377065858e-05, "loss": 0.4931, "step": 26220 }, { "epoch": 0.833571587407478, "grad_norm": 1.0195443630218506, "learning_rate": 1.4327637833395525e-05, "loss": 0.5165, "step": 26240 }, { "epoch": 0.8342069316051971, "grad_norm": 0.9022318124771118, "learning_rate": 1.422168466743341e-05, "loss": 0.4732, "step": 26260 }, { "epoch": 0.8348422758029163, "grad_norm": 0.9162563681602478, "learning_rate": 1.4116094721420625e-05, "loss": 0.496, "step": 26280 }, { "epoch": 0.8354776200006353, "grad_norm": 1.129158854484558, "learning_rate": 1.401086844246755e-05, "loss": 0.4764, "step": 26300 }, { "epoch": 0.8361129641983545, "grad_norm": 0.8695496320724487, "learning_rate": 1.3906006276144601e-05, "loss": 0.4852, "step": 26320 }, { "epoch": 0.8367483083960736, "grad_norm": 1.7362381219863892, "learning_rate": 1.3801508666480512e-05, "loss": 0.4642, "step": 26340 }, { "epoch": 0.8373836525937927, "grad_norm": 0.7645226716995239, "learning_rate": 1.369737605596022e-05, "loss": 0.503, "step": 26360 }, { "epoch": 0.8380189967915118, "grad_norm": 0.8403562903404236, "learning_rate": 1.3593608885523158e-05, "loss": 0.4766, "step": 26380 }, { "epoch": 0.8386543409892309, "grad_norm": 0.7841979265213013, "learning_rate": 1.3490207594561366e-05, "loss": 0.4917, "step": 26400 }, { "epoch": 0.83928968518695, "grad_norm": 0.8631531000137329, "learning_rate": 1.3392315662821897e-05, "loss": 0.4972, "step": 26420 }, { "epoch": 0.8399250293846692, "grad_norm": 1.0436699390411377, "learning_rate": 1.3289629094769217e-05, "loss": 0.4847, "step": 26440 }, { "epoch": 0.8405603735823882, "grad_norm": 0.9521028399467468, "learning_rate": 1.318730969336468e-05, "loss": 0.4972, "step": 26460 }, { "epoch": 0.8411957177801074, "grad_norm": 0.9861098527908325, "learning_rate": 1.3085357891869909e-05, "loss": 0.5114, "step": 26480 }, { "epoch": 0.8418310619778265, "grad_norm": 1.3008265495300293, "learning_rate": 1.2983774121989888e-05, "loss": 0.5071, "step": 26500 }, { "epoch": 0.8424664061755456, "grad_norm": 0.7970487475395203, "learning_rate": 1.2882558813871204e-05, "loss": 0.4945, "step": 26520 }, { "epoch": 0.8431017503732647, "grad_norm": 0.7304345369338989, "learning_rate": 1.2781712396100287e-05, "loss": 0.4902, "step": 26540 }, { "epoch": 0.8437370945709838, "grad_norm": 0.9716693162918091, "learning_rate": 1.2681235295701488e-05, "loss": 0.4857, "step": 26560 }, { "epoch": 0.8443724387687029, "grad_norm": 0.9461120963096619, "learning_rate": 1.2581127938135328e-05, "loss": 0.5139, "step": 26580 }, { "epoch": 0.8450077829664221, "grad_norm": 0.8130011558532715, "learning_rate": 1.2481390747296717e-05, "loss": 0.4788, "step": 26600 }, { "epoch": 0.8456431271641411, "grad_norm": 0.959818959236145, "learning_rate": 1.2382024145513094e-05, "loss": 0.4808, "step": 26620 }, { "epoch": 0.8462784713618603, "grad_norm": 1.2069573402404785, "learning_rate": 1.2283028553542674e-05, "loss": 0.4692, "step": 26640 }, { "epoch": 0.8469138155595795, "grad_norm": 1.0251085758209229, "learning_rate": 1.2184404390572712e-05, "loss": 0.5106, "step": 26660 }, { "epoch": 0.8475491597572985, "grad_norm": 0.9423872828483582, "learning_rate": 1.2086152074217638e-05, "loss": 0.4881, "step": 26680 }, { "epoch": 0.8481845039550177, "grad_norm": 0.8245638608932495, "learning_rate": 1.1988272020517322e-05, "loss": 0.4606, "step": 26700 }, { "epoch": 0.8488198481527367, "grad_norm": 1.0099587440490723, "learning_rate": 1.1890764643935393e-05, "loss": 0.4976, "step": 26720 }, { "epoch": 0.8494551923504559, "grad_norm": 0.8285634517669678, "learning_rate": 1.1793630357357355e-05, "loss": 0.5057, "step": 26740 }, { "epoch": 0.850090536548175, "grad_norm": 0.9125322699546814, "learning_rate": 1.169686957208892e-05, "loss": 0.4856, "step": 26760 }, { "epoch": 0.8507258807458941, "grad_norm": 1.1413007974624634, "learning_rate": 1.1600482697854198e-05, "loss": 0.4916, "step": 26780 }, { "epoch": 0.8513612249436132, "grad_norm": 0.9246459603309631, "learning_rate": 1.1504470142794121e-05, "loss": 0.4807, "step": 26800 }, { "epoch": 0.8519965691413324, "grad_norm": 0.9050401449203491, "learning_rate": 1.140883231346449e-05, "loss": 0.4844, "step": 26820 }, { "epoch": 0.8526319133390514, "grad_norm": 0.8217797875404358, "learning_rate": 1.1313569614834408e-05, "loss": 0.4751, "step": 26840 }, { "epoch": 0.8532672575367706, "grad_norm": 1.0189076662063599, "learning_rate": 1.1218682450284545e-05, "loss": 0.4949, "step": 26860 }, { "epoch": 0.8539026017344896, "grad_norm": 0.7574889659881592, "learning_rate": 1.112417122160535e-05, "loss": 0.4738, "step": 26880 }, { "epoch": 0.8545379459322088, "grad_norm": 0.6649676561355591, "learning_rate": 1.1030036328995497e-05, "loss": 0.4859, "step": 26900 }, { "epoch": 0.8551732901299279, "grad_norm": 0.7144981622695923, "learning_rate": 1.0936278171060032e-05, "loss": 0.4799, "step": 26920 }, { "epoch": 0.855808634327647, "grad_norm": 0.9074038863182068, "learning_rate": 1.0842897144808762e-05, "loss": 0.4951, "step": 26940 }, { "epoch": 0.8564439785253661, "grad_norm": 0.9271389842033386, "learning_rate": 1.0749893645654551e-05, "loss": 0.4692, "step": 26960 }, { "epoch": 0.8570793227230852, "grad_norm": 0.9277658462524414, "learning_rate": 1.0657268067411752e-05, "loss": 0.4711, "step": 26980 }, { "epoch": 0.8577146669208043, "grad_norm": 1.5766148567199707, "learning_rate": 1.0565020802294357e-05, "loss": 0.5081, "step": 27000 }, { "epoch": 0.8577146669208043, "eval_loss": 0.4444785416126251, "eval_runtime": 45.2678, "eval_samples_per_second": 59.711, "eval_steps_per_second": 29.867, "step": 27000 }, { "epoch": 0.8583500111185235, "grad_norm": 0.7567349076271057, "learning_rate": 1.0473152240914419e-05, "loss": 0.4671, "step": 27020 }, { "epoch": 0.8589853553162425, "grad_norm": 1.0230178833007812, "learning_rate": 1.0381662772280498e-05, "loss": 0.4874, "step": 27040 }, { "epoch": 0.8596206995139617, "grad_norm": 0.7454288005828857, "learning_rate": 1.0290552783795849e-05, "loss": 0.4825, "step": 27060 }, { "epoch": 0.8602560437116809, "grad_norm": 0.9813241958618164, "learning_rate": 1.0199822661256852e-05, "loss": 0.4785, "step": 27080 }, { "epoch": 0.8608913879093999, "grad_norm": 0.8269158005714417, "learning_rate": 1.0109472788851427e-05, "loss": 0.4797, "step": 27100 }, { "epoch": 0.861526732107119, "grad_norm": 0.8101191520690918, "learning_rate": 1.001950354915734e-05, "loss": 0.4735, "step": 27120 }, { "epoch": 0.8621620763048381, "grad_norm": 0.903421938419342, "learning_rate": 9.929915323140571e-06, "loss": 0.5, "step": 27140 }, { "epoch": 0.8627974205025573, "grad_norm": 0.7358487248420715, "learning_rate": 9.840708490153817e-06, "loss": 0.4799, "step": 27160 }, { "epoch": 0.8634327647002764, "grad_norm": 0.9838561415672302, "learning_rate": 9.751883427934717e-06, "loss": 0.506, "step": 27180 }, { "epoch": 0.8640681088979955, "grad_norm": 0.9448813796043396, "learning_rate": 9.66344051260436e-06, "loss": 0.4966, "step": 27200 }, { "epoch": 0.8647034530957146, "grad_norm": 1.111055612564087, "learning_rate": 9.575380118665733e-06, "loss": 0.5118, "step": 27220 }, { "epoch": 0.8653387972934338, "grad_norm": 0.968305230140686, "learning_rate": 9.487702619001992e-06, "loss": 0.5002, "step": 27240 }, { "epoch": 0.8659741414911528, "grad_norm": 0.8771995902061462, "learning_rate": 9.400408384874992e-06, "loss": 0.497, "step": 27260 }, { "epoch": 0.866609485688872, "grad_norm": 1.0422018766403198, "learning_rate": 9.31349778592373e-06, "loss": 0.5081, "step": 27280 }, { "epoch": 0.867244829886591, "grad_norm": 0.8950514197349548, "learning_rate": 9.22697119016267e-06, "loss": 0.4957, "step": 27300 }, { "epoch": 0.8678801740843102, "grad_norm": 0.8093190789222717, "learning_rate": 9.140828963980297e-06, "loss": 0.4667, "step": 27320 }, { "epoch": 0.8685155182820293, "grad_norm": 0.8465502262115479, "learning_rate": 9.055071472137466e-06, "loss": 0.4913, "step": 27340 }, { "epoch": 0.8691508624797484, "grad_norm": 0.8349893093109131, "learning_rate": 8.969699077766014e-06, "loss": 0.4738, "step": 27360 }, { "epoch": 0.8697862066774675, "grad_norm": 0.831910252571106, "learning_rate": 8.884712142367024e-06, "loss": 0.4923, "step": 27380 }, { "epoch": 0.8704215508751866, "grad_norm": 0.9581566452980042, "learning_rate": 8.80011102580941e-06, "loss": 0.4856, "step": 27400 }, { "epoch": 0.8710568950729057, "grad_norm": 0.823250412940979, "learning_rate": 8.720097656085246e-06, "loss": 0.4886, "step": 27420 }, { "epoch": 0.8716922392706249, "grad_norm": 0.988389253616333, "learning_rate": 8.636249915153039e-06, "loss": 0.4946, "step": 27440 }, { "epoch": 0.8723275834683439, "grad_norm": 0.85055011510849, "learning_rate": 8.55695289500451e-06, "loss": 0.4885, "step": 27460 }, { "epoch": 0.8729629276660631, "grad_norm": 0.9092792272567749, "learning_rate": 8.473859879755397e-06, "loss": 0.4631, "step": 27480 }, { "epoch": 0.8735982718637822, "grad_norm": 0.930949330329895, "learning_rate": 8.39115442306171e-06, "loss": 0.4955, "step": 27500 }, { "epoch": 0.8742336160615013, "grad_norm": 0.7822802066802979, "learning_rate": 8.308836875131665e-06, "loss": 0.4842, "step": 27520 }, { "epoch": 0.8748689602592205, "grad_norm": 0.7877179384231567, "learning_rate": 8.22690758453094e-06, "loss": 0.5006, "step": 27540 }, { "epoch": 0.8755043044569395, "grad_norm": 0.9965065717697144, "learning_rate": 8.145366898181139e-06, "loss": 0.4866, "step": 27560 }, { "epoch": 0.8761396486546587, "grad_norm": 1.1015229225158691, "learning_rate": 8.064215161358402e-06, "loss": 0.5203, "step": 27580 }, { "epoch": 0.8767749928523778, "grad_norm": 0.7929244637489319, "learning_rate": 7.983452717691852e-06, "loss": 0.477, "step": 27600 }, { "epoch": 0.8774103370500969, "grad_norm": 1.0685256719589233, "learning_rate": 7.903079909162258e-06, "loss": 0.5385, "step": 27620 }, { "epoch": 0.878045681247816, "grad_norm": 1.0020925998687744, "learning_rate": 7.82309707610046e-06, "loss": 0.5061, "step": 27640 }, { "epoch": 0.8786810254455352, "grad_norm": 0.8348806500434875, "learning_rate": 7.743504557185976e-06, "loss": 0.505, "step": 27660 }, { "epoch": 0.8793163696432542, "grad_norm": 0.8327703475952148, "learning_rate": 7.664302689445635e-06, "loss": 0.4633, "step": 27680 }, { "epoch": 0.8799517138409734, "grad_norm": 0.9524950385093689, "learning_rate": 7.5854918082520435e-06, "loss": 0.4859, "step": 27700 }, { "epoch": 0.8805870580386924, "grad_norm": 0.8677568435668945, "learning_rate": 7.507072247322211e-06, "loss": 0.4832, "step": 27720 }, { "epoch": 0.8812224022364116, "grad_norm": 0.9326565265655518, "learning_rate": 7.429044338716196e-06, "loss": 0.493, "step": 27740 }, { "epoch": 0.8818577464341307, "grad_norm": 0.7510032057762146, "learning_rate": 7.35140841283557e-06, "loss": 0.489, "step": 27760 }, { "epoch": 0.8824930906318498, "grad_norm": 0.7510486841201782, "learning_rate": 7.274164798422134e-06, "loss": 0.4741, "step": 27780 }, { "epoch": 0.8831284348295689, "grad_norm": 0.8744218945503235, "learning_rate": 7.197313822556462e-06, "loss": 0.4698, "step": 27800 }, { "epoch": 0.8837637790272881, "grad_norm": 0.7554096579551697, "learning_rate": 7.12085581065658e-06, "loss": 0.4561, "step": 27820 }, { "epoch": 0.8843991232250071, "grad_norm": 1.0702250003814697, "learning_rate": 7.044791086476499e-06, "loss": 0.5074, "step": 27840 }, { "epoch": 0.8850344674227263, "grad_norm": 1.2190712690353394, "learning_rate": 6.969119972104898e-06, "loss": 0.4873, "step": 27860 }, { "epoch": 0.8856698116204453, "grad_norm": 0.8235007524490356, "learning_rate": 6.893842787963789e-06, "loss": 0.4884, "step": 27880 }, { "epoch": 0.8863051558181645, "grad_norm": 0.8809916973114014, "learning_rate": 6.818959852807083e-06, "loss": 0.4746, "step": 27900 }, { "epoch": 0.8869405000158836, "grad_norm": 0.8362717628479004, "learning_rate": 6.744471483719306e-06, "loss": 0.5139, "step": 27920 }, { "epoch": 0.8875758442136027, "grad_norm": 0.9398446083068848, "learning_rate": 6.67037799611423e-06, "loss": 0.5002, "step": 27940 }, { "epoch": 0.8882111884113219, "grad_norm": 0.750577449798584, "learning_rate": 6.596679703733544e-06, "loss": 0.4965, "step": 27960 }, { "epoch": 0.8888465326090409, "grad_norm": 1.0199640989303589, "learning_rate": 6.523376918645474e-06, "loss": 0.5101, "step": 27980 }, { "epoch": 0.88948187680676, "grad_norm": 0.8302307724952698, "learning_rate": 6.4504699512435985e-06, "loss": 0.4608, "step": 28000 }, { "epoch": 0.88948187680676, "eval_loss": 0.4442509412765503, "eval_runtime": 44.8835, "eval_samples_per_second": 60.223, "eval_steps_per_second": 30.122, "step": 28000 }, { "epoch": 0.8901172210044792, "grad_norm": 0.7648799419403076, "learning_rate": 6.377959110245357e-06, "loss": 0.4704, "step": 28020 }, { "epoch": 0.8907525652021983, "grad_norm": 0.8950293064117432, "learning_rate": 6.305844702690878e-06, "loss": 0.4906, "step": 28040 }, { "epoch": 0.8913879093999174, "grad_norm": 0.9124616384506226, "learning_rate": 6.234127033941628e-06, "loss": 0.4939, "step": 28060 }, { "epoch": 0.8920232535976366, "grad_norm": 0.8970253467559814, "learning_rate": 6.1628064076791e-06, "loss": 0.5088, "step": 28080 }, { "epoch": 0.8926585977953556, "grad_norm": 0.9791019558906555, "learning_rate": 6.091883125903575e-06, "loss": 0.4613, "step": 28100 }, { "epoch": 0.8932939419930748, "grad_norm": 1.3384908437728882, "learning_rate": 6.021357488932789e-06, "loss": 0.4737, "step": 28120 }, { "epoch": 0.8939292861907938, "grad_norm": 1.076692819595337, "learning_rate": 5.951229795400726e-06, "loss": 0.5094, "step": 28140 }, { "epoch": 0.894564630388513, "grad_norm": 0.9772495031356812, "learning_rate": 5.881500342256285e-06, "loss": 0.4791, "step": 28160 }, { "epoch": 0.8951999745862321, "grad_norm": 0.946626603603363, "learning_rate": 5.8121694247620485e-06, "loss": 0.4843, "step": 28180 }, { "epoch": 0.8958353187839512, "grad_norm": 0.9328265190124512, "learning_rate": 5.74323733649309e-06, "loss": 0.4822, "step": 28200 }, { "epoch": 0.8964706629816703, "grad_norm": 0.7450932264328003, "learning_rate": 5.674704369335637e-06, "loss": 0.4746, "step": 28220 }, { "epoch": 0.8971060071793895, "grad_norm": 1.0023432970046997, "learning_rate": 5.606570813485856e-06, "loss": 0.4941, "step": 28240 }, { "epoch": 0.8977413513771085, "grad_norm": 0.8717949986457825, "learning_rate": 5.538836957448712e-06, "loss": 0.4801, "step": 28260 }, { "epoch": 0.8983766955748277, "grad_norm": 0.8665459156036377, "learning_rate": 5.474860277416504e-06, "loss": 0.4782, "step": 28280 }, { "epoch": 0.8990120397725467, "grad_norm": 0.8660995364189148, "learning_rate": 5.407906659415618e-06, "loss": 0.4788, "step": 28300 }, { "epoch": 0.8996473839702659, "grad_norm": 0.9390355944633484, "learning_rate": 5.341353582451425e-06, "loss": 0.478, "step": 28320 }, { "epoch": 0.900282728167985, "grad_norm": 0.8287180662155151, "learning_rate": 5.275201328336477e-06, "loss": 0.4846, "step": 28340 }, { "epoch": 0.9009180723657041, "grad_norm": 0.8496334552764893, "learning_rate": 5.209450177186081e-06, "loss": 0.4838, "step": 28360 }, { "epoch": 0.9015534165634232, "grad_norm": 0.9892422556877136, "learning_rate": 5.144100407417063e-06, "loss": 0.4854, "step": 28380 }, { "epoch": 0.9021887607611423, "grad_norm": 0.9813452363014221, "learning_rate": 5.0791522957467365e-06, "loss": 0.4916, "step": 28400 }, { "epoch": 0.9028241049588615, "grad_norm": 0.9126195907592773, "learning_rate": 5.014606117191545e-06, "loss": 0.4949, "step": 28420 }, { "epoch": 0.9034594491565806, "grad_norm": 0.8669445514678955, "learning_rate": 4.950462145066015e-06, "loss": 0.482, "step": 28440 }, { "epoch": 0.9040947933542997, "grad_norm": 0.9803065657615662, "learning_rate": 4.886720650981569e-06, "loss": 0.5025, "step": 28460 }, { "epoch": 0.9047301375520188, "grad_norm": 0.9414586424827576, "learning_rate": 4.823381904845392e-06, "loss": 0.4856, "step": 28480 }, { "epoch": 0.905365481749738, "grad_norm": 0.9295367002487183, "learning_rate": 4.760446174859224e-06, "loss": 0.4876, "step": 28500 }, { "epoch": 0.906000825947457, "grad_norm": 0.8859279751777649, "learning_rate": 4.697913727518332e-06, "loss": 0.5152, "step": 28520 }, { "epoch": 0.9066361701451762, "grad_norm": 0.7441398501396179, "learning_rate": 4.63578482761029e-06, "loss": 0.4787, "step": 28540 }, { "epoch": 0.9072715143428952, "grad_norm": 1.459954023361206, "learning_rate": 4.574059738213876e-06, "loss": 0.4813, "step": 28560 }, { "epoch": 0.9079068585406144, "grad_norm": 0.9451243281364441, "learning_rate": 4.512738720698018e-06, "loss": 0.4835, "step": 28580 }, { "epoch": 0.9085422027383335, "grad_norm": 0.8990492820739746, "learning_rate": 4.451822034720587e-06, "loss": 0.4811, "step": 28600 }, { "epoch": 0.9091775469360526, "grad_norm": 0.7530508637428284, "learning_rate": 4.3913099382273835e-06, "loss": 0.5, "step": 28620 }, { "epoch": 0.9098128911337717, "grad_norm": 0.8113830089569092, "learning_rate": 4.331202687451019e-06, "loss": 0.5075, "step": 28640 }, { "epoch": 0.9104482353314909, "grad_norm": 0.8615418672561646, "learning_rate": 4.2715005369097895e-06, "loss": 0.5152, "step": 28660 }, { "epoch": 0.9110835795292099, "grad_norm": 0.8459773659706116, "learning_rate": 4.212203739406673e-06, "loss": 0.4804, "step": 28680 }, { "epoch": 0.9117189237269291, "grad_norm": 0.8821284770965576, "learning_rate": 4.153312546028199e-06, "loss": 0.5311, "step": 28700 }, { "epoch": 0.9123542679246481, "grad_norm": 1.0187216997146606, "learning_rate": 4.0948272061434035e-06, "loss": 0.4632, "step": 28720 }, { "epoch": 0.9129896121223673, "grad_norm": 0.9274182915687561, "learning_rate": 4.036747967402788e-06, "loss": 0.4832, "step": 28740 }, { "epoch": 0.9136249563200864, "grad_norm": 0.7573745846748352, "learning_rate": 3.979075075737226e-06, "loss": 0.4905, "step": 28760 }, { "epoch": 0.9142603005178055, "grad_norm": 0.9005789160728455, "learning_rate": 3.921808775357027e-06, "loss": 0.5114, "step": 28780 }, { "epoch": 0.9148956447155246, "grad_norm": 0.9073104858398438, "learning_rate": 3.864949308750743e-06, "loss": 0.5018, "step": 28800 }, { "epoch": 0.9155309889132438, "grad_norm": 0.7230907678604126, "learning_rate": 3.808496916684268e-06, "loss": 0.4954, "step": 28820 }, { "epoch": 0.9161663331109628, "grad_norm": 0.7139384746551514, "learning_rate": 3.7524518381997885e-06, "loss": 0.464, "step": 28840 }, { "epoch": 0.916801677308682, "grad_norm": 0.8710399866104126, "learning_rate": 3.696814310614749e-06, "loss": 0.5048, "step": 28860 }, { "epoch": 0.917437021506401, "grad_norm": 0.87566739320755, "learning_rate": 3.6415845695208505e-06, "loss": 0.484, "step": 28880 }, { "epoch": 0.9180723657041202, "grad_norm": 0.9447526335716248, "learning_rate": 3.586762848783076e-06, "loss": 0.5032, "step": 28900 }, { "epoch": 0.9187077099018394, "grad_norm": 0.7784162759780884, "learning_rate": 3.53234938053868e-06, "loss": 0.4451, "step": 28920 }, { "epoch": 0.9193430540995584, "grad_norm": 0.9225743412971497, "learning_rate": 3.478344395196198e-06, "loss": 0.4745, "step": 28940 }, { "epoch": 0.9199783982972776, "grad_norm": 0.9712013602256775, "learning_rate": 3.4247481214345177e-06, "loss": 0.4956, "step": 28960 }, { "epoch": 0.9206137424949966, "grad_norm": 1.2805237770080566, "learning_rate": 3.371560786201855e-06, "loss": 0.4971, "step": 28980 }, { "epoch": 0.9212490866927158, "grad_norm": 0.7866525053977966, "learning_rate": 3.3187826147147994e-06, "loss": 0.497, "step": 29000 }, { "epoch": 0.9212490866927158, "eval_loss": 0.44399821758270264, "eval_runtime": 45.0357, "eval_samples_per_second": 60.019, "eval_steps_per_second": 30.021, "step": 29000 }, { "epoch": 0.9218844308904349, "grad_norm": 0.7901077270507812, "learning_rate": 3.2664138304574153e-06, "loss": 0.514, "step": 29020 }, { "epoch": 0.922519775088154, "grad_norm": 1.0464386940002441, "learning_rate": 3.2144546551802323e-06, "loss": 0.5042, "step": 29040 }, { "epoch": 0.9231551192858731, "grad_norm": 0.8520443439483643, "learning_rate": 3.162905308899322e-06, "loss": 0.4858, "step": 29060 }, { "epoch": 0.9237904634835923, "grad_norm": 0.92030268907547, "learning_rate": 3.1117660098953895e-06, "loss": 0.4766, "step": 29080 }, { "epoch": 0.9244258076813113, "grad_norm": 0.7019485235214233, "learning_rate": 3.06103697471285e-06, "loss": 0.4903, "step": 29100 }, { "epoch": 0.9250611518790305, "grad_norm": 1.3560097217559814, "learning_rate": 3.0107184181588643e-06, "loss": 0.5125, "step": 29120 }, { "epoch": 0.9256964960767495, "grad_norm": 0.9616526365280151, "learning_rate": 2.960810553302462e-06, "loss": 0.512, "step": 29140 }, { "epoch": 0.9263318402744687, "grad_norm": 1.1742409467697144, "learning_rate": 2.9113135914736856e-06, "loss": 0.5007, "step": 29160 }, { "epoch": 0.9269671844721878, "grad_norm": 0.8712571263313293, "learning_rate": 2.8622277422625907e-06, "loss": 0.4717, "step": 29180 }, { "epoch": 0.9276025286699069, "grad_norm": 0.8578605055809021, "learning_rate": 2.8135532135184384e-06, "loss": 0.4989, "step": 29200 }, { "epoch": 0.928237872867626, "grad_norm": 0.8551231026649475, "learning_rate": 2.7652902113488143e-06, "loss": 0.4825, "step": 29220 }, { "epoch": 0.9288732170653452, "grad_norm": 0.82204669713974, "learning_rate": 2.7174389401186996e-06, "loss": 0.4702, "step": 29240 }, { "epoch": 0.9295085612630642, "grad_norm": 0.9263904690742493, "learning_rate": 2.6699996024496575e-06, "loss": 0.4996, "step": 29260 }, { "epoch": 0.9301439054607834, "grad_norm": 1.037817120552063, "learning_rate": 2.6229723992189704e-06, "loss": 0.4986, "step": 29280 }, { "epoch": 0.9307792496585024, "grad_norm": 1.0528874397277832, "learning_rate": 2.5763575295587593e-06, "loss": 0.4794, "step": 29300 }, { "epoch": 0.9314145938562216, "grad_norm": 0.8765133619308472, "learning_rate": 2.5301551908551545e-06, "loss": 0.4878, "step": 29320 }, { "epoch": 0.9320499380539408, "grad_norm": 0.8322685956954956, "learning_rate": 2.484365578747494e-06, "loss": 0.4945, "step": 29340 }, { "epoch": 0.9326852822516598, "grad_norm": 0.8344667553901672, "learning_rate": 2.438988887127436e-06, "loss": 0.4981, "step": 29360 }, { "epoch": 0.933320626449379, "grad_norm": 0.8750690817832947, "learning_rate": 2.3940253081381703e-06, "loss": 0.4969, "step": 29380 }, { "epoch": 0.933955970647098, "grad_norm": 0.808814287185669, "learning_rate": 2.3494750321736093e-06, "loss": 0.4623, "step": 29400 }, { "epoch": 0.9345913148448172, "grad_norm": 0.9626306891441345, "learning_rate": 2.3053382478775754e-06, "loss": 0.5028, "step": 29420 }, { "epoch": 0.9352266590425363, "grad_norm": 0.9727978706359863, "learning_rate": 2.261615142143003e-06, "loss": 0.5059, "step": 29440 }, { "epoch": 0.9358620032402554, "grad_norm": 0.8926533460617065, "learning_rate": 2.2183059001111174e-06, "loss": 0.4764, "step": 29460 }, { "epoch": 0.9364973474379745, "grad_norm": 1.0506230592727661, "learning_rate": 2.1754107051707218e-06, "loss": 0.5069, "step": 29480 }, { "epoch": 0.9371326916356937, "grad_norm": 0.7190736532211304, "learning_rate": 2.1329297389573565e-06, "loss": 0.49, "step": 29500 }, { "epoch": 0.9377680358334127, "grad_norm": 0.7786980867385864, "learning_rate": 2.09086318135252e-06, "loss": 0.4766, "step": 29520 }, { "epoch": 0.9384033800311319, "grad_norm": 0.8696832060813904, "learning_rate": 2.049211210483004e-06, "loss": 0.4959, "step": 29540 }, { "epoch": 0.9390387242288509, "grad_norm": 0.7167271375656128, "learning_rate": 2.0079740027200144e-06, "loss": 0.4927, "step": 29560 }, { "epoch": 0.9396740684265701, "grad_norm": 0.868259072303772, "learning_rate": 1.967151732678518e-06, "loss": 0.4788, "step": 29580 }, { "epoch": 0.9403094126242892, "grad_norm": 0.8658266663551331, "learning_rate": 1.9267445732164325e-06, "loss": 0.4919, "step": 29600 }, { "epoch": 0.9409447568220083, "grad_norm": 1.010276436805725, "learning_rate": 1.8867526954339688e-06, "loss": 0.4811, "step": 29620 }, { "epoch": 0.9415801010197274, "grad_norm": 0.9376817941665649, "learning_rate": 1.8471762686728344e-06, "loss": 0.4723, "step": 29640 }, { "epoch": 0.9422154452174466, "grad_norm": 1.520297646522522, "learning_rate": 1.8080154605155996e-06, "loss": 0.5146, "step": 29660 }, { "epoch": 0.9428507894151656, "grad_norm": 0.8532717227935791, "learning_rate": 1.7692704367848756e-06, "loss": 0.4556, "step": 29680 }, { "epoch": 0.9434861336128848, "grad_norm": 1.069378137588501, "learning_rate": 1.730941361542704e-06, "loss": 0.4789, "step": 29700 }, { "epoch": 0.9441214778106038, "grad_norm": 0.8771205544471741, "learning_rate": 1.6930283970898574e-06, "loss": 0.4819, "step": 29720 }, { "epoch": 0.944756822008323, "grad_norm": 0.8729512095451355, "learning_rate": 1.6555317039650852e-06, "loss": 0.4792, "step": 29740 }, { "epoch": 0.9453921662060422, "grad_norm": 0.8724381923675537, "learning_rate": 1.6184514409444795e-06, "loss": 0.4726, "step": 29760 }, { "epoch": 0.9460275104037612, "grad_norm": 0.9022035598754883, "learning_rate": 1.5817877650408541e-06, "loss": 0.4891, "step": 29780 }, { "epoch": 0.9466628546014804, "grad_norm": 1.003596544265747, "learning_rate": 1.5455408315029562e-06, "loss": 0.4974, "step": 29800 }, { "epoch": 0.9472981987991995, "grad_norm": 0.8569382429122925, "learning_rate": 1.5097107938149113e-06, "loss": 0.4781, "step": 29820 }, { "epoch": 0.9479335429969186, "grad_norm": 0.9094131588935852, "learning_rate": 1.4742978036955457e-06, "loss": 0.5155, "step": 29840 }, { "epoch": 0.9485688871946377, "grad_norm": 1.0451712608337402, "learning_rate": 1.4393020110977206e-06, "loss": 0.4895, "step": 29860 }, { "epoch": 0.9492042313923568, "grad_norm": 1.2386709451675415, "learning_rate": 1.4047235642077217e-06, "loss": 0.4702, "step": 29880 }, { "epoch": 0.9498395755900759, "grad_norm": 0.966143786907196, "learning_rate": 1.3705626094446256e-06, "loss": 0.4962, "step": 29900 }, { "epoch": 0.9504749197877951, "grad_norm": 0.9544230103492737, "learning_rate": 1.33681929145969e-06, "loss": 0.4788, "step": 29920 }, { "epoch": 0.9511102639855141, "grad_norm": 0.8583151698112488, "learning_rate": 1.3034937531357095e-06, "loss": 0.477, "step": 29940 }, { "epoch": 0.9517456081832333, "grad_norm": 0.8361521363258362, "learning_rate": 1.270586135586427e-06, "loss": 0.5162, "step": 29960 }, { "epoch": 0.9523809523809523, "grad_norm": 1.0520914793014526, "learning_rate": 1.2380965781559783e-06, "loss": 0.4762, "step": 29980 }, { "epoch": 0.9530162965786715, "grad_norm": 0.8727782964706421, "learning_rate": 1.2060252184182386e-06, "loss": 0.4929, "step": 30000 }, { "epoch": 0.9530162965786715, "eval_loss": 0.443807452917099, "eval_runtime": 44.5933, "eval_samples_per_second": 60.614, "eval_steps_per_second": 30.318, "step": 30000 }, { "epoch": 0.9536516407763906, "grad_norm": 0.7989442944526672, "learning_rate": 1.174372192176254e-06, "loss": 0.4932, "step": 30020 }, { "epoch": 0.9542869849741097, "grad_norm": 0.7544863224029541, "learning_rate": 1.1431376334616994e-06, "loss": 0.482, "step": 30040 }, { "epoch": 0.9549223291718288, "grad_norm": 0.8897516131401062, "learning_rate": 1.1123216745342779e-06, "loss": 0.4898, "step": 30060 }, { "epoch": 0.955557673369548, "grad_norm": 0.8291769027709961, "learning_rate": 1.0819244458811773e-06, "loss": 0.5021, "step": 30080 }, { "epoch": 0.956193017567267, "grad_norm": 0.8413028717041016, "learning_rate": 1.0519460762165144e-06, "loss": 0.4762, "step": 30100 }, { "epoch": 0.9568283617649862, "grad_norm": 0.9216207265853882, "learning_rate": 1.0223866924807924e-06, "loss": 0.4869, "step": 30120 }, { "epoch": 0.9574637059627052, "grad_norm": 0.8935249447822571, "learning_rate": 9.932464198403325e-07, "loss": 0.4928, "step": 30140 }, { "epoch": 0.9580990501604244, "grad_norm": 0.7496423721313477, "learning_rate": 9.645253816867983e-07, "loss": 0.5266, "step": 30160 }, { "epoch": 0.9587343943581436, "grad_norm": 0.9738262295722961, "learning_rate": 9.362236996366514e-07, "loss": 0.4735, "step": 30180 }, { "epoch": 0.9593697385558626, "grad_norm": 0.9249958395957947, "learning_rate": 9.083414935305956e-07, "loss": 0.4706, "step": 30200 }, { "epoch": 0.9600050827535818, "grad_norm": 1.0667359828948975, "learning_rate": 8.808788814331448e-07, "loss": 0.4721, "step": 30220 }, { "epoch": 0.9606404269513009, "grad_norm": 0.8088135123252869, "learning_rate": 8.53835979632056e-07, "loss": 0.4884, "step": 30240 }, { "epoch": 0.96127577114902, "grad_norm": 0.9164936542510986, "learning_rate": 8.272129026378639e-07, "loss": 0.5022, "step": 30260 }, { "epoch": 0.9619111153467391, "grad_norm": 0.7835588455200195, "learning_rate": 8.010097631834245e-07, "loss": 0.4707, "step": 30280 }, { "epoch": 0.9625464595444582, "grad_norm": 1.2730233669281006, "learning_rate": 7.752266722233614e-07, "loss": 0.4795, "step": 30300 }, { "epoch": 0.9631818037421773, "grad_norm": 0.9977156519889832, "learning_rate": 7.511219051883567e-07, "loss": 0.5209, "step": 30320 }, { "epoch": 0.9638171479398965, "grad_norm": 0.941656231880188, "learning_rate": 7.26158221189377e-07, "loss": 0.4747, "step": 30340 }, { "epoch": 0.9644524921376155, "grad_norm": 0.7258419990539551, "learning_rate": 7.028320832731084e-07, "loss": 0.4961, "step": 30360 }, { "epoch": 0.9650878363353347, "grad_norm": 0.974557638168335, "learning_rate": 6.786882081830093e-07, "loss": 0.4559, "step": 30380 }, { "epoch": 0.9657231805330537, "grad_norm": 0.973461925983429, "learning_rate": 6.549648995460511e-07, "loss": 0.4931, "step": 30400 }, { "epoch": 0.9663585247307729, "grad_norm": 1.0066043138504028, "learning_rate": 6.31662257816279e-07, "loss": 0.4901, "step": 30420 }, { "epoch": 0.966993868928492, "grad_norm": 0.9339585900306702, "learning_rate": 6.087803816664628e-07, "loss": 0.4697, "step": 30440 }, { "epoch": 0.9676292131262111, "grad_norm": 0.8802968859672546, "learning_rate": 5.863193679877088e-07, "loss": 0.4943, "step": 30460 }, { "epoch": 0.9682645573239302, "grad_norm": 0.7557999491691589, "learning_rate": 5.6427931188896e-07, "loss": 0.4761, "step": 30480 }, { "epoch": 0.9688999015216494, "grad_norm": 0.9139352440834045, "learning_rate": 5.426603066967295e-07, "loss": 0.476, "step": 30500 }, { "epoch": 0.9695352457193684, "grad_norm": 0.9125082492828369, "learning_rate": 5.21462443954579e-07, "loss": 0.4792, "step": 30520 }, { "epoch": 0.9701705899170876, "grad_norm": 0.9351817965507507, "learning_rate": 5.006858134228076e-07, "loss": 0.4976, "step": 30540 }, { "epoch": 0.9708059341148066, "grad_norm": 0.743870735168457, "learning_rate": 4.803305030780302e-07, "loss": 0.4695, "step": 30560 }, { "epoch": 0.9714412783125258, "grad_norm": 0.9468183517456055, "learning_rate": 4.603965991128445e-07, "loss": 0.5027, "step": 30580 }, { "epoch": 0.972076622510245, "grad_norm": 1.1194064617156982, "learning_rate": 4.408841859354307e-07, "loss": 0.5146, "step": 30600 }, { "epoch": 0.972711966707964, "grad_norm": 0.7916650176048279, "learning_rate": 4.21793346169197e-07, "loss": 0.4689, "step": 30620 }, { "epoch": 0.9733473109056832, "grad_norm": 0.9158383011817932, "learning_rate": 4.0312416065245717e-07, "loss": 0.5272, "step": 30640 }, { "epoch": 0.9739826551034023, "grad_norm": 0.8861019015312195, "learning_rate": 3.8487670843807555e-07, "loss": 0.4981, "step": 30660 }, { "epoch": 0.9746179993011214, "grad_norm": 1.01827871799469, "learning_rate": 3.670510667931004e-07, "loss": 0.5386, "step": 30680 }, { "epoch": 0.9752533434988405, "grad_norm": 0.9622276425361633, "learning_rate": 3.496473111984866e-07, "loss": 0.5135, "step": 30700 }, { "epoch": 0.9758886876965596, "grad_norm": 1.0768787860870361, "learning_rate": 3.326655153487512e-07, "loss": 0.4943, "step": 30720 }, { "epoch": 0.9765240318942787, "grad_norm": 1.2705291509628296, "learning_rate": 3.16105751151663e-07, "loss": 0.4924, "step": 30740 }, { "epoch": 0.9771593760919979, "grad_norm": 0.9354774951934814, "learning_rate": 2.99968088727931e-07, "loss": 0.4811, "step": 30760 }, { "epoch": 0.9777947202897169, "grad_norm": 0.8442774415016174, "learning_rate": 2.842525964109166e-07, "loss": 0.4652, "step": 30780 }, { "epoch": 0.9784300644874361, "grad_norm": 0.9658933281898499, "learning_rate": 2.6895934074635533e-07, "loss": 0.4767, "step": 30800 }, { "epoch": 0.9790654086851552, "grad_norm": 0.9930063486099243, "learning_rate": 2.5408838649204625e-07, "loss": 0.4791, "step": 30820 }, { "epoch": 0.9797007528828743, "grad_norm": 0.9439179301261902, "learning_rate": 2.396397966176078e-07, "loss": 0.4833, "step": 30840 }, { "epoch": 0.9803360970805934, "grad_norm": 0.8499469757080078, "learning_rate": 2.25613632304178e-07, "loss": 0.4969, "step": 30860 }, { "epoch": 0.9809714412783125, "grad_norm": 1.0228259563446045, "learning_rate": 2.1200995294420323e-07, "loss": 0.4709, "step": 30880 }, { "epoch": 0.9816067854760316, "grad_norm": 1.1045747995376587, "learning_rate": 1.988288161411389e-07, "loss": 0.4964, "step": 30900 }, { "epoch": 0.9822421296737508, "grad_norm": 0.8404049277305603, "learning_rate": 1.8607027770921602e-07, "loss": 0.5289, "step": 30920 }, { "epoch": 0.9828774738714698, "grad_norm": 0.8583685755729675, "learning_rate": 1.7373439167325257e-07, "loss": 0.4824, "step": 30940 }, { "epoch": 0.983512818069189, "grad_norm": 0.8340322375297546, "learning_rate": 1.6240682931759622e-07, "loss": 0.5276, "step": 30960 }, { "epoch": 0.984148162266908, "grad_norm": 0.717254638671875, "learning_rate": 1.508952640646988e-07, "loss": 0.4837, "step": 30980 }, { "epoch": 0.9847835064646272, "grad_norm": 0.7109520435333252, "learning_rate": 1.3980650015292806e-07, "loss": 0.4805, "step": 31000 }, { "epoch": 0.9847835064646272, "eval_loss": 0.4438159465789795, "eval_runtime": 44.826, "eval_samples_per_second": 60.3, "eval_steps_per_second": 30.161, "step": 31000 }, { "epoch": 0.9854188506623464, "grad_norm": 0.8632842302322388, "learning_rate": 1.2914058453658008e-07, "loss": 0.4787, "step": 31020 }, { "epoch": 0.9860541948600654, "grad_norm": 0.9302808046340942, "learning_rate": 1.1889756237943861e-07, "loss": 0.4733, "step": 31040 }, { "epoch": 0.9866895390577846, "grad_norm": 1.0309478044509888, "learning_rate": 1.090774770545755e-07, "loss": 0.498, "step": 31060 }, { "epoch": 0.9873248832555037, "grad_norm": 0.7432119250297546, "learning_rate": 9.968037014420616e-08, "loss": 0.4909, "step": 31080 }, { "epoch": 0.9879602274532228, "grad_norm": 1.0406357049942017, "learning_rate": 9.070628143946768e-08, "loss": 0.4913, "step": 31100 }, { "epoch": 0.9885955716509419, "grad_norm": 0.8807629346847534, "learning_rate": 8.215524894024107e-08, "loss": 0.4843, "step": 31120 }, { "epoch": 0.989230915848661, "grad_norm": 0.815077006816864, "learning_rate": 7.402730885507359e-08, "loss": 0.4877, "step": 31140 }, { "epoch": 0.9898662600463801, "grad_norm": 0.8051480054855347, "learning_rate": 6.632249560092341e-08, "loss": 0.489, "step": 31160 }, { "epoch": 0.9905016042440993, "grad_norm": 0.8251180648803711, "learning_rate": 5.9040841803081895e-08, "loss": 0.4763, "step": 31180 }, { "epoch": 0.9911369484418183, "grad_norm": 0.8782890439033508, "learning_rate": 5.218237829499595e-08, "loss": 0.5012, "step": 31200 }, { "epoch": 0.9917722926395375, "grad_norm": 0.9451269507408142, "learning_rate": 4.574713411816811e-08, "loss": 0.4765, "step": 31220 }, { "epoch": 0.9924076368372566, "grad_norm": 1.2340540885925293, "learning_rate": 3.973513652202332e-08, "loss": 0.4999, "step": 31240 }, { "epoch": 0.9930429810349757, "grad_norm": 1.0101948976516724, "learning_rate": 3.414641096376459e-08, "loss": 0.5118, "step": 31260 }, { "epoch": 0.9936783252326948, "grad_norm": 0.7806993722915649, "learning_rate": 2.8980981108317485e-08, "loss": 0.5068, "step": 31280 }, { "epoch": 0.9943136694304139, "grad_norm": 1.1223636865615845, "learning_rate": 2.4238868828196927e-08, "loss": 0.5182, "step": 31300 }, { "epoch": 0.994949013628133, "grad_norm": 0.8514977693557739, "learning_rate": 1.9920094203418336e-08, "loss": 0.5072, "step": 31320 }, { "epoch": 0.9955843578258522, "grad_norm": 1.1318073272705078, "learning_rate": 1.6024675521397747e-08, "loss": 0.4819, "step": 31340 }, { "epoch": 0.9962197020235712, "grad_norm": 0.9314286708831787, "learning_rate": 1.2552629276929573e-08, "loss": 0.4957, "step": 31360 }, { "epoch": 0.9968550462212904, "grad_norm": 0.7769533395767212, "learning_rate": 9.503970172031196e-09, "loss": 0.5149, "step": 31380 }, { "epoch": 0.9974903904190096, "grad_norm": 0.7601432800292969, "learning_rate": 6.878711115976266e-09, "loss": 0.4933, "step": 31400 }, { "epoch": 0.9981257346167286, "grad_norm": 0.987147331237793, "learning_rate": 4.6768632251614765e-09, "loss": 0.4693, "step": 31420 }, { "epoch": 0.9987610788144478, "grad_norm": 0.8807405829429626, "learning_rate": 2.8984358230954577e-09, "loss": 0.474, "step": 31440 }, { "epoch": 0.9993964230121668, "grad_norm": 0.7518433332443237, "learning_rate": 1.5434364403543733e-09, "loss": 0.5076, "step": 31460 }, { "epoch": 1.0, "step": 31479, "total_flos": 0.0, "train_loss": 0.3508217529017671, "train_runtime": 14676.7422, "train_samples_per_second": 68.633, "train_steps_per_second": 2.145 } ], "logging_steps": 20, "max_steps": 31479, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }