| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 362.58514285714284, | |
| "global_step": 1088, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.29, | |
| "learning_rate": 0.0001, | |
| "loss": 3.7248, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "learning_rate": 0.0001, | |
| "loss": 2.1406, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0705, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6039, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 5.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4378, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 6.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3218, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 7.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2751, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 9.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2423, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 10.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1763, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 11.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1574, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 13.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.137, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 14.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1274, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 15.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.141, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 17.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1345, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 18.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0848, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 19.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0719, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 21.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0814, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 21.29, | |
| "eval_exact_match": 0.5145067698259188, | |
| "eval_loss": 0.15547168254852295, | |
| "eval_runtime": 2276.8146, | |
| "eval_samples_per_second": 0.454, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 22.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0665, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 23.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0593, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 25.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0652, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 26.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0896, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 27.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0483, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 29.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0497, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 30.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0479, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 31.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0376, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 33.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0374, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 34.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0342, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 35.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0354, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 37.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.029, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 38.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0274, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 39.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.024, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 41.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0331, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 42.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0204, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 42.59, | |
| "eval_exact_match": 0.6392649903288201, | |
| "eval_loss": 0.19595815241336823, | |
| "eval_runtime": 1825.3052, | |
| "eval_samples_per_second": 0.566, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 43.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0191, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 45.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.019, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 46.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0203, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 47.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0182, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 49.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0158, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 50.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0118, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 51.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0139, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 53.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.016, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 54.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.022, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 55.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0184, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 57.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0102, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 58.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0096, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 59.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0093, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 61.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0096, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 62.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0075, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 63.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0072, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 63.88, | |
| "eval_exact_match": 0.6460348162475822, | |
| "eval_loss": 0.24530905485153198, | |
| "eval_runtime": 1845.9477, | |
| "eval_samples_per_second": 0.56, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 65.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.007, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 66.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0091, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 67.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0082, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 69.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0063, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 70.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0447, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 71.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0114, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 73.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0065, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 74.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0046, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 75.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0054, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 77.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0048, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 78.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0051, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 79.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0041, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 81.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0044, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 82.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0222, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 83.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0049, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 85.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0632, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 85.29, | |
| "eval_exact_match": 0.648936170212766, | |
| "eval_loss": 0.21414929628372192, | |
| "eval_runtime": 1740.439, | |
| "eval_samples_per_second": 0.594, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 86.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0277, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 87.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0074, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 89.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0041, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 90.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0027, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 91.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0026, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 93.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0024, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 94.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0021, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 95.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.002, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 97.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.002, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 98.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0018, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 99.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0017, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 101.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0019, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 102.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0017, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 103.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0016, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 105.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0018, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 106.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0015, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 106.59, | |
| "eval_exact_match": 0.6808510638297872, | |
| "eval_loss": 0.2944816052913666, | |
| "eval_runtime": 1854.2635, | |
| "eval_samples_per_second": 0.558, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 107.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0013, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 109.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0014, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 110.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0017, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 111.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0013, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 113.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0013, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 114.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0011, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 115.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0015, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 117.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0021, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 118.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0013, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 119.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.001, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 121.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0012, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 122.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.001, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 123.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 125.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0013, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 126.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0012, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 127.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 127.88, | |
| "eval_exact_match": 0.6731141199226306, | |
| "eval_loss": 0.31063422560691833, | |
| "eval_runtime": 1905.6919, | |
| "eval_samples_per_second": 0.543, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 129.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0012, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 130.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.001, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 131.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0008, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 133.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0011, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 134.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 135.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 137.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0011, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 138.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 139.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.001, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 141.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.001, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 142.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 143.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0008, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 145.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.001, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 146.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 147.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 149.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.001, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 149.29, | |
| "eval_exact_match": 0.6847195357833655, | |
| "eval_loss": 0.32885223627090454, | |
| "eval_runtime": 1904.5416, | |
| "eval_samples_per_second": 0.543, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 150.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0012, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 151.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0013, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 153.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.001, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 154.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0008, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 155.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0006, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 157.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0008, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 158.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0096, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 159.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.002, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 161.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 162.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 163.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 165.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 166.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 167.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 169.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0148, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 170.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0127, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 170.59, | |
| "eval_exact_match": 0.6740812379110251, | |
| "eval_loss": 0.22413159906864166, | |
| "eval_runtime": 1746.478, | |
| "eval_samples_per_second": 0.592, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 171.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0023, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 173.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.001, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 174.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 175.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 177.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.001, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 178.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0006, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 179.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0044, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 181.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0013, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 182.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 183.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0006, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 185.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 186.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 187.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 189.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 190.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 191.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0013, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 191.88, | |
| "eval_exact_match": 0.6876208897485493, | |
| "eval_loss": 0.3054460287094116, | |
| "eval_runtime": 1863.2506, | |
| "eval_samples_per_second": 0.555, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 193.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0006, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 194.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 195.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 197.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0069, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 198.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0008, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 199.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 201.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0023, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 202.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0024, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 203.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0253, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 205.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0083, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 206.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 207.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 209.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0006, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 210.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 211.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 213.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0019, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 213.29, | |
| "eval_exact_match": 0.6382978723404256, | |
| "eval_loss": 0.3269137442111969, | |
| "eval_runtime": 2353.6231, | |
| "eval_samples_per_second": 0.439, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 214.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0095, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 215.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 217.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 218.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0006, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 219.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 221.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 222.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 223.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.001, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 225.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 226.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 227.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 229.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 230.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 231.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 233.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 234.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0101, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 234.59, | |
| "eval_exact_match": 0.6769825918762089, | |
| "eval_loss": 0.28256723284721375, | |
| "eval_runtime": 1983.1098, | |
| "eval_samples_per_second": 0.521, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 235.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.005, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 237.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0025, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 238.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0009, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 239.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 241.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 242.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 243.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 245.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0111, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 246.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0112, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 247.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0026, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 249.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0006, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 250.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 251.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 253.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 254.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 255.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 255.88, | |
| "eval_exact_match": 0.6982591876208898, | |
| "eval_loss": 0.3340039849281311, | |
| "eval_runtime": 1918.3137, | |
| "eval_samples_per_second": 0.539, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 257.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 258.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 259.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 261.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 262.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 263.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 265.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 266.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 267.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 269.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 808 | |
| }, | |
| { | |
| "epoch": 270.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 271.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 273.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 274.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 824 | |
| }, | |
| { | |
| "epoch": 275.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0011, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 277.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0012, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 277.29, | |
| "eval_exact_match": 0.6924564796905223, | |
| "eval_loss": 0.3206212818622589, | |
| "eval_runtime": 1930.7912, | |
| "eval_samples_per_second": 0.536, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 278.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 836 | |
| }, | |
| { | |
| "epoch": 279.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 281.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 844 | |
| }, | |
| { | |
| "epoch": 282.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0013, | |
| "step": 848 | |
| }, | |
| { | |
| "epoch": 283.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 285.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 856 | |
| }, | |
| { | |
| "epoch": 286.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 287.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0058, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 289.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0153, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 290.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.004, | |
| "step": 872 | |
| }, | |
| { | |
| "epoch": 291.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 293.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 294.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 884 | |
| }, | |
| { | |
| "epoch": 295.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 297.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 892 | |
| }, | |
| { | |
| "epoch": 298.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 298.59, | |
| "eval_exact_match": 0.7030947775628626, | |
| "eval_loss": 0.3318493366241455, | |
| "eval_runtime": 1959.3529, | |
| "eval_samples_per_second": 0.528, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 299.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 301.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 904 | |
| }, | |
| { | |
| "epoch": 302.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 908 | |
| }, | |
| { | |
| "epoch": 303.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 305.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 916 | |
| }, | |
| { | |
| "epoch": 306.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 307.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 309.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 310.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 932 | |
| }, | |
| { | |
| "epoch": 311.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 313.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 314.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "step": 944 | |
| }, | |
| { | |
| "epoch": 315.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 317.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 318.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 956 | |
| }, | |
| { | |
| "epoch": 319.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 319.88, | |
| "eval_exact_match": 0.6972920696324951, | |
| "eval_loss": 0.34125566482543945, | |
| "eval_runtime": 1929.7453, | |
| "eval_samples_per_second": 0.536, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 321.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 964 | |
| }, | |
| { | |
| "epoch": 322.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 323.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 325.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "step": 976 | |
| }, | |
| { | |
| "epoch": 326.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 327.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 329.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 988 | |
| }, | |
| { | |
| "epoch": 330.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 992 | |
| }, | |
| { | |
| "epoch": 331.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0006, | |
| "step": 996 | |
| }, | |
| { | |
| "epoch": 333.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.015, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 334.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0006, | |
| "step": 1004 | |
| }, | |
| { | |
| "epoch": 335.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 337.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "step": 1012 | |
| }, | |
| { | |
| "epoch": 338.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 1016 | |
| }, | |
| { | |
| "epoch": 339.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 341.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 341.29, | |
| "eval_exact_match": 0.6963249516441006, | |
| "eval_loss": 0.3414294123649597, | |
| "eval_runtime": 1925.945, | |
| "eval_samples_per_second": 0.537, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 342.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "step": 1028 | |
| }, | |
| { | |
| "epoch": 343.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 1032 | |
| }, | |
| { | |
| "epoch": 345.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 1036 | |
| }, | |
| { | |
| "epoch": 346.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 347.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 1044 | |
| }, | |
| { | |
| "epoch": 349.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "step": 1048 | |
| }, | |
| { | |
| "epoch": 350.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "step": 1052 | |
| }, | |
| { | |
| "epoch": 351.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "step": 1056 | |
| }, | |
| { | |
| "epoch": 353.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 354.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "step": 1064 | |
| }, | |
| { | |
| "epoch": 355.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 1068 | |
| }, | |
| { | |
| "epoch": 357.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "step": 1072 | |
| }, | |
| { | |
| "epoch": 358.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "step": 1076 | |
| }, | |
| { | |
| "epoch": 359.88, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 361.29, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "step": 1084 | |
| }, | |
| { | |
| "epoch": 362.59, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "step": 1088 | |
| }, | |
| { | |
| "epoch": 362.59, | |
| "eval_exact_match": 0.7127659574468085, | |
| "eval_loss": 0.3638148605823517, | |
| "eval_runtime": 1954.8096, | |
| "eval_samples_per_second": 0.529, | |
| "step": 1088 | |
| } | |
| ], | |
| "max_steps": 9216, | |
| "num_train_epochs": 3072, | |
| "total_flos": 6.244604651315036e+17, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |