nadahlberg committed
Commit: 1553183
Parent(s): 1098d3f

Training in progress, step 102000, checkpoint

Browse files:
- last-checkpoint/model.safetensors +1 -1
- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/rng_state_4.pth +1 -1
- last-checkpoint/rng_state_5.pth +1 -1
- last-checkpoint/rng_state_6.pth +1 -1
- last-checkpoint/rng_state_7.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +1411 -3
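The directory touched by this commit follows the standard Hugging Face Trainer checkpoint layout: the weights in model.safetensors, optimizer and scheduler states, trainer_state.json, and one rng_state_{rank}.pth per process (eight here, which suggests an 8-process run). A minimal sketch that checks such a directory is complete before resuming; the `last-checkpoint` path and the count of 8 RNG files come from the file list above, everything else is an assumption:

```python
import os

# Expected checkpoint contents, inferred from the file list in this commit;
# a real checkpoint may hold more (e.g. config.json, tokenizer files).
CKPT = "last-checkpoint"   # directory name used by this repo
WORLD_SIZE = 8             # eight rng_state_*.pth files => 8 processes

expected = [
    "model.safetensors",
    "optimizer.pt",
    "scheduler.pt",
    "trainer_state.json",
] + [f"rng_state_{rank}.pth" for rank in range(WORLD_SIZE)]

missing = [n for n in expected if not os.path.isfile(os.path.join(CKPT, n))]
print("checkpoint complete" if not missing else f"missing files: {missing}")
```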
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7821e784728fa25fcfdc9d782f516d1e1f9b338613feccdf922f1d690b910a85
 size 2997015224
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9f7fc10eaccc24db892ba90a3c868629f6f74d56d4af8eec371960b9460a18fd
 size 5994123294
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2cf990d87a7056e9e4b0a252156f6b30795db2a686df2fb1c99f6e3a11d9838f
 size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:76db20779ac73e63d7d686b30d532d98ecbc31c9d6b323e5308a241a5f958a43
 size 15920
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ce61d981d04468713b611e91a2f0d4a82eab3612f8080a73371b4286cb63626d
 size 15920
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f071c6bab394ba16aa2e81723b39d293147108a946add4b7cb9e8995f7a2eda6
 size 15920
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7365d7bb904d56209b9f093837e31cc34422778fa119f818fd18ab35415e1bdc
 size 15920
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9adc39521416cf27c3bd1677c4843d13bc25278518dfb7775d844fc53cf3a78f
 size 15920
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:4647715470bd8450f77c0b9ab529748ba4914076bb6ec38c8c5c5b31cf8666dd
 size 15920
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e7904bd2bef78f87b42aca6356cfdbb66809fe1ebc3b4a680d13a9faa13505e0
 size 15920
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:a01d2518bf51c6d331e2a2fd21d75544661ef5a07c1d1563fa31a405709da4a5
 size 1064
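Every binary above is stored through Git LFS, so each diff only touches the three-line pointer file (version, oid, size): training on to step 102000 rewrote each blob, changing its sha256 oid while the byte sizes stayed identical. A minimal sketch of parsing that pointer format; the sample text is the scheduler.pt pointer from the diff above, and nothing here is an official git-lfs API:

```python
def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS pointer file: one 'key value' pair per line."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:a01d2518bf51c6d331e2a2fd21d75544661ef5a07c1d1563fa31a405709da4a5
size 1064
"""
info = parse_lfs_pointer(pointer)
print(info["oid"], info["size"])  # sha256:a01d2518... 1064
```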
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.5,
+  "epoch": 0.51,
   "eval_steps": 2000,
-  "global_step": 100000,
+  "global_step": 102000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -70407,6 +70407,1414 @@
       "eval_samples_per_second": 27.748,
       "eval_steps_per_second": 0.444,
       "step": 100000
+    },
+    {
+      "epoch": 0.50005,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.00015073869346733667,
+      "loss": 2.0577,
+      "step": 100010
+    },
+    {
+      "epoch": 0.5001,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.00015072361809045226,
+      "loss": 2.04,
+      "step": 100020
+    },
+    {
+      "epoch": 0.50015,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.00015070854271356782,
+      "loss": 2.0467,
+      "step": 100030
+    },
+    {
+      "epoch": 0.5002,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.0001506934673366834,
+      "loss": 2.0113,
+      "step": 100040
+    },
+    {
+      "epoch": 0.50025,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.000150678391959799,
+      "loss": 2.0461,
+      "step": 100050
+    },
+    {
+      "epoch": 0.5003,
+      "grad_norm": 1.859375,
+      "learning_rate": 0.00015066331658291455,
+      "loss": 1.9994,
+      "step": 100060
+    },
+    {
+      "epoch": 0.50035,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.00015064824120603016,
+      "loss": 2.0283,
+      "step": 100070
+    },
+    {
+      "epoch": 0.5004,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.00015063316582914572,
+      "loss": 2.0629,
+      "step": 100080
+    },
+    {
+      "epoch": 0.50045,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.00015061809045226128,
+      "loss": 2.0144,
+      "step": 100090
+    },
+    {
+      "epoch": 0.5005,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.0001506030150753769,
+      "loss": 2.082,
+      "step": 100100
+    },
+    {
+      "epoch": 0.50055,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.00015058793969849245,
+      "loss": 2.0549,
+      "step": 100110
+    },
+    {
+      "epoch": 0.5006,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.000150572864321608,
+      "loss": 2.0591,
+      "step": 100120
+    },
+    {
+      "epoch": 0.50065,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.0001505577889447236,
+      "loss": 1.9906,
+      "step": 100130
+    },
+    {
+      "epoch": 0.5007,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.00015054271356783919,
+      "loss": 2.0959,
+      "step": 100140
+    },
+    {
+      "epoch": 0.50075,
+      "grad_norm": 1.875,
+      "learning_rate": 0.00015052763819095477,
+      "loss": 2.0214,
+      "step": 100150
+    },
+    {
+      "epoch": 0.5008,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.00015051256281407033,
+      "loss": 2.0448,
+      "step": 100160
+    },
+    {
+      "epoch": 0.50085,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.00015049748743718592,
+      "loss": 2.05,
+      "step": 100170
+    },
+    {
+      "epoch": 0.5009,
+      "grad_norm": 2.21875,
+      "learning_rate": 0.0001504824120603015,
+      "loss": 2.014,
+      "step": 100180
+    },
+    {
+      "epoch": 0.50095,
+      "grad_norm": 1.7734375,
+      "learning_rate": 0.00015046733668341706,
+      "loss": 1.9987,
+      "step": 100190
+    },
+    {
+      "epoch": 0.501,
+      "grad_norm": 2.0,
+      "learning_rate": 0.00015045226130653267,
+      "loss": 2.0231,
+      "step": 100200
+    },
+    {
+      "epoch": 0.50105,
+      "grad_norm": 1.859375,
+      "learning_rate": 0.00015043718592964823,
+      "loss": 2.0519,
+      "step": 100210
+    },
+    {
+      "epoch": 0.5011,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.0001504221105527638,
+      "loss": 2.0005,
+      "step": 100220
+    },
+    {
+      "epoch": 0.50115,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.0001504070351758794,
+      "loss": 2.0114,
+      "step": 100230
+    },
+    {
+      "epoch": 0.5012,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.00015039195979899496,
+      "loss": 2.0219,
+      "step": 100240
+    },
+    {
+      "epoch": 0.50125,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.00015037688442211052,
+      "loss": 2.0079,
+      "step": 100250
+    },
+    {
+      "epoch": 0.5013,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.00015036180904522614,
+      "loss": 2.0236,
+      "step": 100260
+    },
+    {
+      "epoch": 0.50135,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0001503467336683417,
+      "loss": 2.0025,
+      "step": 100270
+    },
+    {
+      "epoch": 0.5014,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.00015033165829145726,
+      "loss": 2.0407,
+      "step": 100280
+    },
+    {
+      "epoch": 0.50145,
+      "grad_norm": 2.296875,
+      "learning_rate": 0.00015031658291457284,
+      "loss": 2.0013,
+      "step": 100290
+    },
+    {
+      "epoch": 0.5015,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.00015030150753768843,
+      "loss": 1.9921,
+      "step": 100300
+    },
+    {
+      "epoch": 0.50155,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.000150286432160804,
+      "loss": 2.0336,
+      "step": 100310
+    },
+    {
+      "epoch": 0.5016,
+      "grad_norm": 2.171875,
+      "learning_rate": 0.00015027135678391957,
+      "loss": 2.0329,
+      "step": 100320
+    },
+    {
+      "epoch": 0.50165,
+      "grad_norm": 1.9921875,
+      "learning_rate": 0.00015025628140703519,
+      "loss": 2.0327,
+      "step": 100330
+    },
+    {
+      "epoch": 0.5017,
+      "grad_norm": 1.78125,
+      "learning_rate": 0.00015024120603015074,
+      "loss": 2.0726,
+      "step": 100340
+    },
+    {
+      "epoch": 0.50175,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.0001502261306532663,
+      "loss": 2.0025,
+      "step": 100350
+    },
+    {
+      "epoch": 0.5018,
+      "grad_norm": 1.9140625,
+      "learning_rate": 0.00015021105527638192,
+      "loss": 2.0309,
+      "step": 100360
+    },
+    {
+      "epoch": 0.50185,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.00015019597989949748,
+      "loss": 2.0078,
+      "step": 100370
+    },
+    {
+      "epoch": 0.5019,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.00015018090452261303,
+      "loss": 2.0121,
+      "step": 100380
+    },
+    {
+      "epoch": 0.50195,
+      "grad_norm": 1.8984375,
+      "learning_rate": 0.00015016582914572865,
+      "loss": 2.0177,
+      "step": 100390
+    },
+    {
+      "epoch": 0.502,
+      "grad_norm": 1.8515625,
+      "learning_rate": 0.0001501507537688442,
+      "loss": 2.012,
+      "step": 100400
+    },
+    {
+      "epoch": 0.50205,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.00015013567839195977,
+      "loss": 2.0336,
+      "step": 100410
+    },
+    {
+      "epoch": 0.5021,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.00015012060301507538,
+      "loss": 2.0294,
+      "step": 100420
+    },
+    {
+      "epoch": 0.50215,
+      "grad_norm": 1.875,
+      "learning_rate": 0.00015010552763819094,
+      "loss": 2.0701,
+      "step": 100430
+    },
+    {
+      "epoch": 0.5022,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.00015009045226130652,
+      "loss": 2.042,
+      "step": 100440
+    },
+    {
+      "epoch": 0.50225,
+      "grad_norm": 2.15625,
+      "learning_rate": 0.00015007537688442208,
+      "loss": 2.0562,
+      "step": 100450
+    },
+    {
+      "epoch": 0.5023,
+      "grad_norm": 1.71875,
+      "learning_rate": 0.0001500603015075377,
+      "loss": 1.9848,
+      "step": 100460
+    },
+    {
+      "epoch": 0.50235,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.00015004522613065326,
+      "loss": 2.0385,
+      "step": 100470
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 1.8515625,
+      "learning_rate": 0.00015003015075376881,
+      "loss": 2.0441,
+      "step": 100480
+    },
+    {
+      "epoch": 0.50245,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.00015001507537688443,
+      "loss": 1.9888,
+      "step": 100490
+    },
+    {
+      "epoch": 0.5025,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.00015,
+      "loss": 2.0933,
+      "step": 100500
+    },
+    {
+      "epoch": 0.50255,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.00014998492462311557,
+      "loss": 1.9895,
+      "step": 100510
+    },
+    {
+      "epoch": 0.5026,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.00014996984924623113,
+      "loss": 2.0241,
+      "step": 100520
+    },
+    {
+      "epoch": 0.50265,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.00014995477386934672,
+      "loss": 2.0868,
+      "step": 100530
+    },
+    {
+      "epoch": 0.5027,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.00014993969849246228,
+      "loss": 1.9649,
+      "step": 100540
+    },
+    {
+      "epoch": 0.50275,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.00014992462311557786,
+      "loss": 2.0709,
+      "step": 100550
+    },
+    {
+      "epoch": 0.5028,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.00014990954773869345,
+      "loss": 2.0065,
+      "step": 100560
+    },
+    {
+      "epoch": 0.50285,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.00014989447236180904,
+      "loss": 2.0105,
+      "step": 100570
+    },
+    {
+      "epoch": 0.5029,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.00014987939698492462,
+      "loss": 2.004,
+      "step": 100580
+    },
+    {
+      "epoch": 0.50295,
+      "grad_norm": 1.9921875,
+      "learning_rate": 0.0001498643216080402,
+      "loss": 2.0418,
+      "step": 100590
+    },
+    {
+      "epoch": 0.503,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.00014984924623115577,
+      "loss": 1.9932,
+      "step": 100600
+    },
+    {
+      "epoch": 0.50305,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.00014983417085427135,
+      "loss": 2.0319,
+      "step": 100610
+    },
+    {
+      "epoch": 0.5031,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0001498190954773869,
+      "loss": 2.0594,
+      "step": 100620
+    },
+    {
+      "epoch": 0.50315,
+      "grad_norm": 1.875,
+      "learning_rate": 0.0001498040201005025,
+      "loss": 2.0186,
+      "step": 100630
+    },
+    {
+      "epoch": 0.5032,
+      "grad_norm": 1.875,
+      "learning_rate": 0.00014978894472361808,
+      "loss": 2.0526,
+      "step": 100640
+    },
+    {
+      "epoch": 0.50325,
+      "grad_norm": 2.265625,
+      "learning_rate": 0.00014977386934673364,
+      "loss": 1.9934,
+      "step": 100650
+    },
+    {
+      "epoch": 0.5033,
+      "grad_norm": 1.9765625,
+      "learning_rate": 0.00014975879396984923,
+      "loss": 1.9944,
+      "step": 100660
+    },
+    {
+      "epoch": 0.50335,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.00014974371859296482,
+      "loss": 2.0696,
+      "step": 100670
+    },
+    {
+      "epoch": 0.5034,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.00014972864321608037,
+      "loss": 2.0373,
+      "step": 100680
+    },
+    {
+      "epoch": 0.50345,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.00014971356783919596,
+      "loss": 2.0697,
+      "step": 100690
+    },
+    {
+      "epoch": 0.5035,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.00014969849246231155,
+      "loss": 2.015,
+      "step": 100700
+    },
+    {
+      "epoch": 0.50355,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.00014968341708542713,
+      "loss": 2.026,
+      "step": 100710
+    },
+    {
+      "epoch": 0.5036,
+      "grad_norm": 1.859375,
+      "learning_rate": 0.00014966834170854272,
+      "loss": 1.9736,
+      "step": 100720
+    },
+    {
+      "epoch": 0.50365,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.00014965326633165828,
+      "loss": 1.9758,
+      "step": 100730
+    },
+    {
+      "epoch": 0.5037,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.00014963819095477386,
+      "loss": 2.0381,
+      "step": 100740
+    },
+    {
+      "epoch": 0.50375,
+      "grad_norm": 1.984375,
+      "learning_rate": 0.00014962311557788945,
+      "loss": 2.005,
+      "step": 100750
+    },
+    {
+      "epoch": 0.5038,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.000149608040201005,
+      "loss": 2.0392,
+      "step": 100760
+    },
+    {
+      "epoch": 0.50385,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.0001495929648241206,
+      "loss": 1.9888,
+      "step": 100770
+    },
+    {
+      "epoch": 0.5039,
+      "grad_norm": 2.0,
+      "learning_rate": 0.00014957788944723615,
+      "loss": 2.0717,
+      "step": 100780
+    },
+    {
+      "epoch": 0.50395,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.00014956281407035174,
+      "loss": 2.0205,
+      "step": 100790
+    },
+    {
+      "epoch": 0.504,
+      "grad_norm": 2.421875,
+      "learning_rate": 0.00014954773869346733,
+      "loss": 2.0285,
+      "step": 100800
+    },
+    {
+      "epoch": 0.50405,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.00014953266331658289,
+      "loss": 2.086,
+      "step": 100810
+    },
+    {
+      "epoch": 0.5041,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.00014951758793969847,
+      "loss": 1.9678,
+      "step": 100820
+    },
+    {
+      "epoch": 0.50415,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.00014950251256281406,
+      "loss": 2.1181,
+      "step": 100830
+    },
+    {
+      "epoch": 0.5042,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.00014948743718592964,
+      "loss": 2.0051,
+      "step": 100840
+    },
+    {
+      "epoch": 0.50425,
+      "grad_norm": 1.875,
+      "learning_rate": 0.0001494723618090452,
+      "loss": 2.0193,
+      "step": 100850
+    },
+    {
+      "epoch": 0.5043,
+      "grad_norm": 1.875,
+      "learning_rate": 0.0001494572864321608,
+      "loss": 2.0467,
+      "step": 100860
+    },
+    {
+      "epoch": 0.50435,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.00014944221105527637,
+      "loss": 2.0017,
+      "step": 100870
+    },
+    {
+      "epoch": 0.5044,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.00014942713567839196,
+      "loss": 2.0517,
+      "step": 100880
+    },
+    {
+      "epoch": 0.50445,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.00014941206030150752,
+      "loss": 1.9835,
+      "step": 100890
+    },
+    {
+      "epoch": 0.5045,
+      "grad_norm": 1.8203125,
+      "learning_rate": 0.0001493969849246231,
+      "loss": 2.0614,
+      "step": 100900
+    },
+    {
+      "epoch": 0.50455,
+      "grad_norm": 1.953125,
+      "learning_rate": 0.0001493819095477387,
+      "loss": 2.0655,
+      "step": 100910
+    },
+    {
+      "epoch": 0.5046,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.00014936683417085425,
+      "loss": 1.9778,
+      "step": 100920
+    },
+    {
+      "epoch": 0.50465,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.00014935175879396984,
+      "loss": 2.0143,
+      "step": 100930
+    },
+    {
+      "epoch": 0.5047,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0001493366834170854,
+      "loss": 1.9989,
+      "step": 100940
+    },
+    {
+      "epoch": 0.50475,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.00014932160804020098,
+      "loss": 2.0735,
+      "step": 100950
+    },
+    {
+      "epoch": 0.5048,
+      "grad_norm": 1.984375,
+      "learning_rate": 0.00014930653266331657,
+      "loss": 2.0113,
+      "step": 100960
+    },
+    {
+      "epoch": 0.50485,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.00014929145728643215,
+      "loss": 2.046,
+      "step": 100970
+    },
+    {
+      "epoch": 0.5049,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.0001492763819095477,
+      "loss": 1.9569,
+      "step": 100980
+    },
+    {
+      "epoch": 0.50495,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0001492613065326633,
+      "loss": 2.0329,
+      "step": 100990
+    },
+    {
+      "epoch": 0.505,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.00014924623115577889,
+      "loss": 1.9891,
+      "step": 101000
+    },
+    {
+      "epoch": 0.50505,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.00014923115577889447,
+      "loss": 1.9908,
+      "step": 101010
+    },
+    {
+      "epoch": 0.5051,
+      "grad_norm": 1.7421875,
+      "learning_rate": 0.00014921608040201003,
+      "loss": 2.0593,
+      "step": 101020
+    },
+    {
+      "epoch": 0.50515,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.00014920100502512562,
+      "loss": 1.9822,
+      "step": 101030
+    },
+    {
+      "epoch": 0.5052,
+      "grad_norm": 1.7734375,
+      "learning_rate": 0.0001491859296482412,
+      "loss": 2.0216,
+      "step": 101040
+    },
+    {
+      "epoch": 0.50525,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.00014917085427135676,
+      "loss": 1.9841,
+      "step": 101050
+    },
+    {
+      "epoch": 0.5053,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.00014915577889447235,
+      "loss": 1.9997,
+      "step": 101060
+    },
+    {
+      "epoch": 0.50535,
+      "grad_norm": 1.984375,
+      "learning_rate": 0.00014914070351758793,
+      "loss": 2.044,
+      "step": 101070
+    },
+    {
+      "epoch": 0.5054,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.0001491256281407035,
+      "loss": 2.0443,
+      "step": 101080
+    },
+    {
+      "epoch": 0.50545,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.00014911055276381908,
+      "loss": 2.0243,
+      "step": 101090
+    },
+    {
+      "epoch": 0.5055,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.00014909547738693467,
+      "loss": 2.0287,
+      "step": 101100
+    },
+    {
+      "epoch": 0.50555,
+      "grad_norm": 1.8046875,
+      "learning_rate": 0.00014908040201005022,
+      "loss": 2.036,
+      "step": 101110
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.0001490653266331658,
+      "loss": 2.0173,
+      "step": 101120
+    },
+    {
+      "epoch": 0.50565,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.0001490502512562814,
+      "loss": 1.9824,
+      "step": 101130
+    },
+    {
+      "epoch": 0.5057,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.00014903517587939698,
+      "loss": 2.0331,
+      "step": 101140
+    },
+    {
+      "epoch": 0.50575,
+      "grad_norm": 2.21875,
+      "learning_rate": 0.00014902010050251257,
+      "loss": 2.0081,
+      "step": 101150
+    },
+    {
+      "epoch": 0.5058,
+      "grad_norm": 1.9921875,
+      "learning_rate": 0.00014900502512562813,
+      "loss": 2.0302,
+      "step": 101160
+    },
+    {
+      "epoch": 0.50585,
+      "grad_norm": 1.8203125,
+      "learning_rate": 0.00014898994974874371,
+      "loss": 2.0202,
+      "step": 101170
+    },
+    {
+      "epoch": 0.5059,
+      "grad_norm": 2.125,
+      "learning_rate": 0.00014897487437185927,
+      "loss": 2.0086,
+      "step": 101180
+    },
+    {
+      "epoch": 0.50595,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.00014895979899497486,
+      "loss": 2.0516,
+      "step": 101190
+    },
+    {
+      "epoch": 0.506,
+      "grad_norm": 1.9453125,
+      "learning_rate": 0.00014894472361809044,
+      "loss": 2.0198,
+      "step": 101200
+    },
+    {
+      "epoch": 0.50605,
+      "grad_norm": 1.8984375,
+      "learning_rate": 0.000148929648241206,
+      "loss": 2.068,
+      "step": 101210
+    },
+    {
+      "epoch": 0.5061,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.0001489145728643216,
+      "loss": 1.9807,
+      "step": 101220
+    },
+    {
+      "epoch": 0.50615,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.00014889949748743718,
+      "loss": 2.0408,
+      "step": 101230
+    },
+    {
+      "epoch": 0.5062,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.00014888442211055274,
+      "loss": 2.0004,
+      "step": 101240
+    },
+    {
+      "epoch": 0.50625,
+      "grad_norm": 1.8515625,
+      "learning_rate": 0.00014886934673366832,
+      "loss": 2.0351,
+      "step": 101250
+    },
+    {
+      "epoch": 0.5063,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.0001488542713567839,
+      "loss": 2.0074,
+      "step": 101260
+    },
+    {
+      "epoch": 0.50635,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.0001488391959798995,
+      "loss": 2.0361,
+      "step": 101270
+    },
+    {
+      "epoch": 0.5064,
+      "grad_norm": 1.8984375,
+      "learning_rate": 0.00014882412060301508,
+      "loss": 2.0219,
+      "step": 101280
+    },
+    {
+      "epoch": 0.50645,
+      "grad_norm": 2.0,
+      "learning_rate": 0.00014880904522613064,
+      "loss": 2.0133,
+      "step": 101290
+    },
+    {
+      "epoch": 0.5065,
+      "grad_norm": 1.796875,
+      "learning_rate": 0.00014879396984924622,
+      "loss": 1.9861,
+      "step": 101300
+    },
+    {
+      "epoch": 0.50655,
+      "grad_norm": 1.9375,
+      "learning_rate": 0.0001487788944723618,
+      "loss": 2.0125,
+      "step": 101310
+    },
+    {
+      "epoch": 0.5066,
+      "grad_norm": 1.7265625,
+      "learning_rate": 0.00014876381909547737,
+      "loss": 2.0377,
+      "step": 101320
+    },
+    {
+      "epoch": 0.50665,
+      "grad_norm": 2.0,
+      "learning_rate": 0.00014874874371859296,
+      "loss": 2.0377,
+      "step": 101330
+    },
+    {
+      "epoch": 0.5067,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.00014873366834170851,
+      "loss": 2.0038,
+      "step": 101340
+    },
+    {
+      "epoch": 0.50675,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.0001487185929648241,
+      "loss": 2.0242,
+      "step": 101350
+    },
+    {
+      "epoch": 0.5068,
+      "grad_norm": 1.8984375,
+      "learning_rate": 0.0001487035175879397,
+      "loss": 2.0106,
+      "step": 101360
+    },
+    {
+      "epoch": 0.50685,
+      "grad_norm": 1.8125,
+      "learning_rate": 0.00014868844221105525,
+      "loss": 2.0362,
+      "step": 101370
+    },
+    {
+      "epoch": 0.5069,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.00014867336683417083,
+      "loss": 2.0324,
+      "step": 101380
+    },
+    {
+      "epoch": 0.50695,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.00014865829145728642,
+      "loss": 2.0529,
+      "step": 101390
+    },
+    {
+      "epoch": 0.507,
+      "grad_norm": 1.9765625,
+      "learning_rate": 0.000148643216080402,
+      "loss": 2.0232,
+      "step": 101400
+    },
+    {
+      "epoch": 0.50705,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.0001486281407035176,
+      "loss": 2.0135,
+      "step": 101410
+    },
+    {
+      "epoch": 0.5071,
+      "grad_norm": 1.875,
+      "learning_rate": 0.00014861306532663315,
+      "loss": 1.9847,
+      "step": 101420
+    },
+    {
+      "epoch": 0.50715,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.00014859798994974874,
+      "loss": 2.0022,
+      "step": 101430
+    },
+    {
+      "epoch": 0.5072,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.00014858291457286432,
+      "loss": 2.0359,
+      "step": 101440
+    },
+    {
+      "epoch": 0.50725,
+      "grad_norm": 2.0,
+      "learning_rate": 0.00014856783919597988,
+      "loss": 1.9862,
+      "step": 101450
+    },
+    {
+      "epoch": 0.5073,
+      "grad_norm": 2.15625,
+      "learning_rate": 0.00014855276381909547,
+      "loss": 2.0993,
+      "step": 101460
+    },
+    {
+      "epoch": 0.50735,
+      "grad_norm": 1.84375,
+      "learning_rate": 0.00014853768844221105,
+      "loss": 1.9999,
+      "step": 101470
+    },
+    {
+      "epoch": 0.5074,
+      "grad_norm": 2.21875,
+      "learning_rate": 0.0001485226130653266,
+      "loss": 2.0576,
+      "step": 101480
+    },
+    {
+      "epoch": 0.50745,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.0001485075376884422,
+      "loss": 2.0463,
+      "step": 101490
+    },
+    {
+      "epoch": 0.5075,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.00014849246231155776,
+      "loss": 1.9991,
+      "step": 101500
+    },
+    {
+      "epoch": 0.50755,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.00014847738693467334,
+      "loss": 2.0225,
+      "step": 101510
+    },
+    {
+      "epoch": 0.5076,
+      "grad_norm": 2.0,
+      "learning_rate": 0.00014846231155778893,
+      "loss": 1.9933,
+      "step": 101520
+    },
+    {
+      "epoch": 0.50765,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.00014844723618090452,
+      "loss": 2.068,
+      "step": 101530
+    },
+    {
+      "epoch": 0.5077,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.0001484321608040201,
+      "loss": 2.0123,
+      "step": 101540
+    },
+    {
+      "epoch": 0.50775,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.00014841708542713566,
+      "loss": 2.0615,
+      "step": 101550
+    },
+    {
+      "epoch": 0.5078,
+      "grad_norm": 1.78125,
+      "learning_rate": 0.00014840201005025125,
+      "loss": 2.0239,
+      "step": 101560
+    },
+    {
+      "epoch": 0.50785,
+      "grad_norm": 1.9140625,
+      "learning_rate": 0.00014838693467336683,
+      "loss": 2.0751,
+      "step": 101570
+    },
+    {
+      "epoch": 0.5079,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0001483718592964824,
+      "loss": 2.0723,
+      "step": 101580
+    },
+    {
+      "epoch": 0.50795,
+      "grad_norm": 2.265625,
+      "learning_rate": 0.00014835678391959798,
+      "loss": 1.9995,
+      "step": 101590
+    },
+    {
+      "epoch": 0.508,
+      "grad_norm": 1.9765625,
+      "learning_rate": 0.00014834170854271356,
+      "loss": 1.9753,
+      "step": 101600
+    },
+    {
+      "epoch": 0.50805,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.00014832663316582912,
+      "loss": 1.977,
+      "step": 101610
+    },
+    {
+      "epoch": 0.5081,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0001483115577889447,
+      "loss": 2.0894,
+      "step": 101620
+    },
+    {
+      "epoch": 0.50815,
+      "grad_norm": 1.9140625,
+      "learning_rate": 0.0001482964824120603,
+      "loss": 1.9872,
+      "step": 101630
+    },
+    {
+      "epoch": 0.5082,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.00014828140703517585,
+      "loss": 2.0307,
+      "step": 101640
+    },
+    {
+      "epoch": 0.50825,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.00014826633165829144,
+      "loss": 2.0058,
+      "step": 101650
+    },
+    {
+      "epoch": 0.5083,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.00014825125628140703,
+      "loss": 2.0163,
+      "step": 101660
+    },
+    {
+      "epoch": 0.50835,
+      "grad_norm": 1.7734375,
+      "learning_rate": 0.0001482361809045226,
+      "loss": 2.0325,
+      "step": 101670
+    },
+    {
+      "epoch": 0.5084,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.00014822110552763817,
+      "loss": 2.0308,
+      "step": 101680
+    },
+    {
+      "epoch": 0.50845,
+      "grad_norm": 1.9921875,
+      "learning_rate": 0.00014820603015075376,
+      "loss": 2.0286,
+      "step": 101690
+    },
+    {
+      "epoch": 0.5085,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.00014819095477386934,
+      "loss": 2.0159,
+      "step": 101700
+    },
+    {
+      "epoch": 0.50855,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.0001481758793969849,
+      "loss": 2.04,
+      "step": 101710
+    },
+    {
+      "epoch": 0.5086,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.0001481608040201005,
+      "loss": 2.0119,
+      "step": 101720
+    },
+    {
+      "epoch": 0.50865,
+      "grad_norm": 2.0,
+      "learning_rate": 0.00014814572864321607,
+      "loss": 1.9724,
+      "step": 101730
+    },
+    {
+      "epoch": 0.5087,
+      "grad_norm": 2.078125,
+      "learning_rate": 0.00014813065326633163,
+      "loss": 2.0856,
+      "step": 101740
+    },
+    {
+      "epoch": 0.50875,
+      "grad_norm": 1.9375,
+      "learning_rate": 0.00014811557788944722,
+      "loss": 2.027,
+      "step": 101750
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.0001481005025125628,
+      "loss": 2.0297,
+      "step": 101760
+    },
+    {
+      "epoch": 0.50885,
+      "grad_norm": 1.9375,
+      "learning_rate": 0.00014808542713567836,
+      "loss": 2.0425,
+      "step": 101770
+    },
+    {
+      "epoch": 0.5089,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.00014807035175879395,
+      "loss": 2.0579,
+      "step": 101780
+    },
+    {
+      "epoch": 0.50895,
+      "grad_norm": 1.9140625,
+      "learning_rate": 0.00014805527638190954,
+      "loss": 1.9863,
+      "step": 101790
+    },
+    {
+      "epoch": 0.509,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.00014804020100502512,
+      "loss": 1.9923,
+      "step": 101800
+    },
+    {
+      "epoch": 0.50905,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.00014802512562814068,
+      "loss": 2.0216,
+      "step": 101810
+    },
+    {
+      "epoch": 0.5091,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.00014801005025125627,
+      "loss": 2.0225,
+      "step": 101820
+    },
+    {
+      "epoch": 0.50915,
+      "grad_norm": 1.921875,
+      "learning_rate": 0.00014799497487437185,
+      "loss": 1.9862,
+      "step": 101830
+    },
+    {
+      "epoch": 0.5092,
+      "grad_norm": 1.984375,
+      "learning_rate": 0.00014797989949748744,
+      "loss": 1.9726,
+      "step": 101840
+    },
+    {
+      "epoch": 0.50925,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.000147964824120603,
+      "loss": 2.0245,
+      "step": 101850
+    },
+    {
+      "epoch": 0.5093,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.00014794974874371859,
+      "loss": 2.0225,
+      "step": 101860
+    },
+    {
+      "epoch": 0.50935,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.00014793467336683414,
+      "loss": 2.0797,
+      "step": 101870
+    },
+    {
+      "epoch": 0.5094,
+      "grad_norm": 1.9765625,
+      "learning_rate": 0.00014791959798994973,
+      "loss": 2.0775,
+      "step": 101880
+    },
+    {
+      "epoch": 0.50945,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.00014790452261306532,
+      "loss": 2.0884,
+      "step": 101890
+    },
+    {
+      "epoch": 0.5095,
+      "grad_norm": 1.71875,
+      "learning_rate": 0.00014788944723618088,
+      "loss": 2.0254,
+      "step": 101900
+    },
+    {
+      "epoch": 0.50955,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.00014787437185929646,
+      "loss": 1.9893,
+      "step": 101910
+    },
+    {
+      "epoch": 0.5096,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.00014785929648241205,
+      "loss": 2.0477,
+      "step": 101920
+    },
+    {
+      "epoch": 0.50965,
+      "grad_norm": 2.125,
+      "learning_rate": 0.00014784422110552763,
+      "loss": 2.0333,
+      "step": 101930
+    },
+    {
+      "epoch": 0.5097,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.0001478291457286432,
+      "loss": 2.009,
+      "step": 101940
+    },
+    {
+      "epoch": 0.50975,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.00014781407035175878,
+      "loss": 2.0322,
+      "step": 101950
+    },
+    {
+      "epoch": 0.5098,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.00014779899497487437,
+      "loss": 1.9683,
+      "step": 101960
+    },
+    {
+      "epoch": 0.50985,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.00014778391959798995,
+      "loss": 2.0255,
+      "step": 101970
+    },
+    {
+      "epoch": 0.5099,
+      "grad_norm": 2.09375,
+      "learning_rate": 0.0001477688442211055,
+      "loss": 2.016,
+      "step": 101980
+    },
+    {
+      "epoch": 0.50995,
+      "grad_norm": 1.8515625,
+      "learning_rate": 0.0001477537688442211,
+      "loss": 2.0768,
+      "step": 101990
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.00014773869346733668,
+      "loss": 1.9938,
+      "step": 102000
+    },
+    {
+      "epoch": 0.51,
+      "eval_loss": 2.0225799083709717,
+      "eval_runtime": 90.2278,
+      "eval_samples_per_second": 27.708,
+      "eval_steps_per_second": 0.443,
+      "step": 102000
     }
   ],
   "logging_steps": 10,
@@ -70426,7 +71834,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.
+  "total_flos": 2.471196266234852e+19,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null
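The bulk of this diff is the `log_history` list in trainer_state.json: one training record every 10 optimizer steps (`"logging_steps": 10`) plus an eval record every 2000 steps (`"eval_steps": 2000`). A minimal sketch of summarizing the newly appended window; the file path is this commit's layout, and having the LFS file pulled locally is an assumption:

```python
import json

# Load the trainer state written by this checkpoint.
with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

# Training records carry "loss"; eval records carry "eval_loss" instead.
train_logs = [r for r in state["log_history"] if "loss" in r]
window = [r for r in train_logs if 100000 < r["step"] <= 102000]

avg_loss = sum(r["loss"] for r in window) / len(window)
print(f"steps 100010-102000: mean train loss {avg_loss:.4f}, "
      f"lr {window[-1]['learning_rate']:.6g} at step {window[-1]['step']}")
```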