zlucia committed
Commit 6c19ed1
Parent: 7777e6c

End of training

README.md CHANGED
@@ -15,8 +15,6 @@ should probably proofread and complete it, then remove this comment. -->
 # Mistral-7B-v0.1_case-briefs
 
 This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
-It achieves the following results on the evaluation set:
-- Loss: 1.1290
 
 ## Model description
 
@@ -35,26 +33,19 @@ More information needed
 ### Training hyperparameters
 
 The following hyperparameters were used during training:
-- learning_rate: 3e-05
+- learning_rate: 0.0002
-- train_batch_size: 4
+- train_batch_size: 1
-- eval_batch_size: 4
+- eval_batch_size: 8
 - seed: 42
-- gradient_accumulation_steps: 4
+- gradient_accumulation_steps: 16
 - total_train_batch_size: 16
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: constant
 - lr_scheduler_warmup_ratio: 0.03
-- num_epochs: 2.0
+- training_steps: 1
 
 ### Training results
 
-| Training Loss | Epoch | Step | Validation Loss |
-|:-------------:|:-----:|:----:|:---------------:|
-| 1.1008        | 0.34  | 50   | 1.1499          |
-| 1.0663        | 0.68  | 100  | 1.1314          |
-| 1.04          | 1.02  | 150  | 1.1263          |
-| 1.0182        | 1.36  | 200  | 1.1319          |
-| 1.0291        | 1.7   | 250  | 1.1290          |
 
 
 ### Framework versions
@@ -62,5 +53,5 @@ The following hyperparameters were used during training:
 - PEFT 0.7.1
 - Transformers 4.37.2
 - Pytorch 2.1.2+cu121
-- Datasets 2.16.1
+- Datasets 2.17.1
 - Tokenizers 0.15.1
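
The hyperparameter changes are internally consistent: the effective batch stays at 16 (previously 4 × 4, now 1 × 16 via gradient accumulation), and with `training_steps: 1` replacing `num_epochs: 2.0` the new run performs a single optimizer step, which is why the evaluation loss and the training-results table disappear from the card. A minimal sketch of the equivalent `TrainingArguments`, assuming a single GPU; the `output_dir` and the surrounding training script are assumptions, not part of this commit:

```python
# Sketch only: reconstructs the card's updated hyperparameters.
# output_dir is hypothetical; logging/save steps are taken from trainer_state.json.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="Mistral-7B-v0.1_case-briefs",  # assumed name, not from the diff
    learning_rate=2e-4,                 # was 3e-5 before this commit
    per_device_train_batch_size=1,      # was 4
    per_device_eval_batch_size=8,       # was 4
    gradient_accumulation_steps=16,     # was 4; 1 x 16 keeps the total batch at 16
    max_steps=1,                        # replaces num_epochs: 2.0
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    seed=42,
    logging_steps=10,
    save_steps=250,
)
```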
adapter_config.json CHANGED
@@ -10,7 +10,7 @@
   "layers_to_transform": null,
   "loftq_config": {},
   "lora_alpha": 16,
-  "lora_dropout": 0.1,
+  "lora_dropout": 0.0,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
@@ -19,13 +19,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
     "down_proj",
+    "o_proj",
+    "gate_proj",
     "q_proj",
-    "up_proj",
     "v_proj",
-    "gate_proj",
-    "o_proj"
+    "up_proj",
+    "k_proj"
   ],
   "task_type": "CAUSAL_LM"
 }
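
The adapter diff lowers LoRA dropout from 0.1 to 0.0; the `target_modules` list is merely reordered, still covering all seven attention and MLP projections of the Mistral block (the order of this list is not significant). A sketch of the matching `peft` configuration; the rank `r` is not visible in this hunk, so its value below is an assumption:

```python
# Sketch reconstructing the new adapter settings; r is assumed (not in this hunk).
from peft import LoraConfig

config = LoraConfig(
    r=16,               # assumption: only lora_alpha=16 is visible in the diff
    lora_alpha=16,
    lora_dropout=0.0,   # changed from 0.1 in this commit
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
```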
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fbd8e69323bbf994e6f8255a1b0e552b1ffe36a2838fc0d7ddc47be401beb79e
+oid sha256:1c65d4d58e8101f2ce059a1c109f69e013107064461a40e5c947d8e82529537f
 size 335605144
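
The binary files in this commit are stored as Git LFS pointers, so the diff shows only a new sha256 object ID; the adapter payload stays byte-for-byte the same size (335,605,144 bytes). A sketch, assuming the real weights have been pulled locally, of checking a download against the pointer's oid:

```python
# Sketch: verify a pulled file against the oid recorded in its Git LFS pointer.
import hashlib

expected = "1c65d4d58e8101f2ce059a1c109f69e013107064461a40e5c947d8e82529537f"

h = hashlib.sha256()
with open("adapter_model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        h.update(chunk)

assert h.hexdigest() == expected, "checksum mismatch"
```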
all_results.json CHANGED
@@ -1,11 +1,7 @@
 {
-    "epoch": 2.0,
-    "eval_loss": 1.182191252708435,
-    "eval_runtime": 106.7178,
-    "eval_samples_per_second": 2.455,
-    "eval_steps_per_second": 0.618,
-    "train_loss": 1.0449795049874961,
-    "train_runtime": 831.3165,
-    "train_samples_per_second": 5.668,
-    "train_steps_per_second": 0.354
+    "epoch": 0.01,
+    "train_loss": 1.3776695728302002,
+    "train_runtime": 13.1306,
+    "train_samples_per_second": 1.219,
+    "train_steps_per_second": 0.076
 }
train_results.json CHANGED
@@ -1,7 +1,7 @@
 {
-    "epoch": 2.0,
-    "train_loss": 1.0449795049874961,
-    "train_runtime": 831.3165,
-    "train_samples_per_second": 5.668,
-    "train_steps_per_second": 0.354
+    "epoch": 0.01,
+    "train_loss": 1.3776695728302002,
+    "train_runtime": 13.1306,
+    "train_samples_per_second": 1.219,
+    "train_steps_per_second": 0.076
 }
trainer_state.json CHANGED
@@ -1,244 +1,30 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.99660441426146,
+  "epoch": 0.006791171477079796,
-  "eval_steps": 50,
+  "eval_steps": 500,
-  "global_step": 294,
+  "global_step": 1,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.07,
-      "learning_rate": 3e-05,
-      "loss": 1.2025,
-      "step": 10
-    },
-    {
-      "epoch": 0.14,
-      "learning_rate": 3e-05,
-      "loss": 1.1367,
-      "step": 20
-    },
-    {
-      "epoch": 0.2,
-      "learning_rate": 3e-05,
-      "loss": 1.1325,
-      "step": 30
-    },
-    {
-      "epoch": 0.27,
-      "learning_rate": 3e-05,
-      "loss": 1.1111,
-      "step": 40
-    },
-    {
-      "epoch": 0.34,
-      "learning_rate": 3e-05,
-      "loss": 1.1008,
-      "step": 50
-    },
-    {
-      "epoch": 0.34,
-      "eval_loss": 1.1499062776565552,
-      "eval_runtime": 15.0698,
-      "eval_samples_per_second": 17.386,
-      "eval_steps_per_second": 4.38,
-      "step": 50
-    },
-    {
-      "epoch": 0.41,
-      "learning_rate": 3e-05,
-      "loss": 1.0935,
-      "step": 60
-    },
-    {
-      "epoch": 0.48,
-      "learning_rate": 3e-05,
-      "loss": 1.0925,
-      "step": 70
-    },
-    {
-      "epoch": 0.54,
-      "learning_rate": 3e-05,
-      "loss": 1.1389,
-      "step": 80
-    },
-    {
-      "epoch": 0.61,
-      "learning_rate": 3e-05,
-      "loss": 1.1186,
-      "step": 90
-    },
-    {
-      "epoch": 0.68,
-      "learning_rate": 3e-05,
-      "loss": 1.0663,
-      "step": 100
-    },
-    {
-      "epoch": 0.68,
-      "eval_loss": 1.1313854455947876,
-      "eval_runtime": 15.0724,
-      "eval_samples_per_second": 17.383,
-      "eval_steps_per_second": 4.379,
-      "step": 100
-    },
-    {
-      "epoch": 0.75,
-      "learning_rate": 3e-05,
-      "loss": 1.0572,
-      "step": 110
-    },
-    {
-      "epoch": 0.81,
-      "learning_rate": 3e-05,
-      "loss": 1.1099,
-      "step": 120
-    },
-    {
-      "epoch": 0.88,
-      "learning_rate": 3e-05,
-      "loss": 1.094,
-      "step": 130
-    },
-    {
-      "epoch": 0.95,
-      "learning_rate": 3e-05,
-      "loss": 1.074,
-      "step": 140
-    },
-    {
-      "epoch": 1.02,
-      "learning_rate": 3e-05,
-      "loss": 1.04,
-      "step": 150
-    },
-    {
-      "epoch": 1.02,
-      "eval_loss": 1.1262978315353394,
-      "eval_runtime": 15.1147,
-      "eval_samples_per_second": 17.334,
-      "eval_steps_per_second": 4.367,
-      "step": 150
-    },
-    {
-      "epoch": 1.09,
-      "learning_rate": 3e-05,
-      "loss": 0.9901,
-      "step": 160
-    },
-    {
-      "epoch": 1.15,
-      "learning_rate": 3e-05,
-      "loss": 1.0634,
-      "step": 170
-    },
-    {
-      "epoch": 1.22,
-      "learning_rate": 3e-05,
-      "loss": 0.9784,
-      "step": 180
-    },
-    {
-      "epoch": 1.29,
-      "learning_rate": 3e-05,
-      "loss": 0.9506,
-      "step": 190
-    },
-    {
-      "epoch": 1.36,
-      "learning_rate": 3e-05,
-      "loss": 1.0182,
-      "step": 200
-    },
-    {
-      "epoch": 1.36,
-      "eval_loss": 1.1319481134414673,
-      "eval_runtime": 15.0625,
-      "eval_samples_per_second": 17.394,
-      "eval_steps_per_second": 4.382,
-      "step": 200
-    },
-    {
-      "epoch": 1.43,
-      "learning_rate": 3e-05,
-      "loss": 0.9876,
-      "step": 210
-    },
-    {
-      "epoch": 1.49,
-      "learning_rate": 3e-05,
-      "loss": 0.9059,
-      "step": 220
-    },
-    {
-      "epoch": 1.56,
-      "learning_rate": 3e-05,
-      "loss": 0.997,
-      "step": 230
-    },
-    {
-      "epoch": 1.63,
-      "learning_rate": 3e-05,
-      "loss": 0.9893,
-      "step": 240
-    },
-    {
-      "epoch": 1.7,
-      "learning_rate": 3e-05,
-      "loss": 1.0291,
-      "step": 250
-    },
-    {
-      "epoch": 1.7,
-      "eval_loss": 1.1290254592895508,
-      "eval_runtime": 15.0498,
-      "eval_samples_per_second": 17.409,
-      "eval_steps_per_second": 4.385,
-      "step": 250
-    },
-    {
-      "epoch": 1.77,
-      "learning_rate": 3e-05,
-      "loss": 0.966,
-      "step": 260
-    },
-    {
-      "epoch": 1.83,
-      "learning_rate": 3e-05,
-      "loss": 1.0419,
-      "step": 270
-    },
-    {
-      "epoch": 1.9,
-      "learning_rate": 3e-05,
-      "loss": 0.9625,
-      "step": 280
-    },
-    {
-      "epoch": 1.97,
-      "learning_rate": 3e-05,
-      "loss": 0.9313,
-      "step": 290
-    },
-    {
-      "epoch": 2.0,
-      "step": 294,
-      "total_flos": 7.119009482145792e+16,
-      "train_loss": 1.0449795049874961,
-      "train_runtime": 831.3165,
-      "train_samples_per_second": 5.668,
-      "train_steps_per_second": 0.354
+      "epoch": 0.01,
+      "step": 1,
+      "total_flos": 477581879672832.0,
+      "train_loss": 1.3776695728302002,
+      "train_runtime": 13.1306,
+      "train_samples_per_second": 1.219,
+      "train_steps_per_second": 0.076
     }
   ],
   "logging_steps": 10,
-  "max_steps": 294,
+  "max_steps": 1,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 2,
+  "num_train_epochs": 1,
   "save_steps": 250,
-  "total_flos": 7.119009482145792e+16,
+  "total_flos": 477581879672832.0,
-  "train_batch_size": 4,
+  "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null
 }
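
The new state is arithmetically consistent with the card: at an effective batch of 16, one step advancing `epoch` by 0.006791171477079796 implies a training set of about 2,356 examples, and the superseded run's 294 steps then land at 294 × 16 / 2356 ≈ 1.9966 epochs, matching its recorded final `epoch`. Likewise, 13.1306 s × 1.219 samples/s ≈ 16 samples, exactly one effective batch. A quick check, assuming `epoch = global_step × effective_batch / num_examples`:

```python
# Back-of-envelope check of the epoch bookkeeping in trainer_state.json.
# Assumption: epoch = global_step * effective_batch / num_train_examples.
effective_batch = 1 * 16                      # per-device batch x grad accumulation
epoch_after_one_step = 0.006791171477079796   # new run, global_step = 1

num_examples = round(effective_batch / epoch_after_one_step)
print(num_examples)                           # -> 2356

# The superseded run used the same effective batch (4 x 4 = 16) for 294 steps.
print(294 * effective_batch / num_examples)   # -> ~1.9966, matches the old "epoch"
```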
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d9515e50a85ffdadfcbe8222eb903c1e1790911eadb7489ccbbcd0e8696cafb3
+oid sha256:b820acf2b0d501ce1b35e31a5661398a52cf98d95dbd0b403f87e2dc33c54bb1
 size 6648