vikp committed
Commit 6222c18
1 Parent(s): 35faf1a

Upload model

Files changed (2):
  1. config.json +140 -33
  2. model.safetensors +2 -2
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "vikp/text_recognizer_5",
+  "_name_or_path": "vikp/text_recognizer_sar4",
   "architectures": [
     "OCREncoderDecoderModel"
   ],
@@ -17,6 +17,7 @@
       "attention"
     ],
     "bos_token_id": 1,
+    "causal": true,
     "chunk_size_feed_forward": 0,
     "conv1d_width": 4,
     "cross_attention_hidden_size": null,
@@ -24,45 +25,37 @@
       0,
       1,
       2,
-      3,
-      4,
-      5,
-      6,
-      7,
-      8,
-      9,
-      10,
-      11
+      3
     ],
     "decoder_start_token_id": null,
     "diversity_penalty": 0.0,
     "do_sample": false,
     "early_stopping": false,
+    "encoder_cross_attn_layers": [
+      2
+    ],
+    "encoder_hidden_size": 1280,
     "encoder_no_repeat_ngram_size": 0,
     "eos_token_id": 1,
     "exponential_decay_length_penalty": null,
-    "final_w_init_variance_scale": 0.2,
+    "final_w_init_variance_scale": 0.5,
     "finetuning_task": null,
     "forced_bos_token_id": null,
     "forced_eos_token_id": null,
     "global_attn_layers": [
       0,
       1,
-      3,
-      5,
-      7,
-      9,
-      11
+      3
     ],
     "head_dim": 64,
     "hidden_activation": "gelu_pytorch_tanh",
-    "hidden_size": 1024,
+    "hidden_size": 512,
     "id2label": {
       "0": "LABEL_0",
       "1": "LABEL_1"
     },
     "init_std": 0.02,
-    "intermediate_size": 4096,
+    "intermediate_size": 2048,
     "is_decoder": false,
     "is_encoder_decoder": false,
     "label2id": {
@@ -71,15 +64,15 @@
     },
     "length_penalty": 1.0,
     "logits_soft_cap": 30.0,
-    "lru_width": 1024,
+    "lru_width": 512,
     "max_length": 20,
     "min_length": 0,
     "model_type": "surya_ocr",
     "no_repeat_ngram_size": 0,
-    "num_attention_heads": 16,
+    "num_attention_heads": 8,
     "num_beam_groups": 1,
     "num_beams": 1,
-    "num_hidden_layers": 10,
+    "num_hidden_layers": 4,
     "num_key_value_heads": 2,
     "num_return_sequences": 1,
     "output_attentions": false,
@@ -96,13 +89,8 @@
     "rms_norm_eps": 1e-06,
     "rope_theta": 10000.0,
     "self_attn_layers": [
-      0,
       1,
-      3,
-      5,
-      7,
-      9,
-      11
+      3
     ],
     "sep_token_id": null,
     "suppress_tokens": null,
@@ -137,7 +125,7 @@
     "depths": [
       2,
       2,
-      14,
+      18,
       2
     ],
     "diversity_penalty": 0.0,
@@ -160,7 +148,7 @@
       "1": "LABEL_1"
     },
     "image_size": [
-      196,
+      256,
       896
     ],
     "initializer_range": 0.02,
@@ -187,10 +175,10 @@
       32
     ],
     "num_kv_heads": [
-      1,
-      2,
       4,
-      8
+      8,
+      16,
+      32
     ],
     "num_layers": 4,
     "num_return_sequences": 1,
@@ -222,12 +210,131 @@
     "typical_p": 1.0,
     "use_absolute_embeddings": true,
     "use_bfloat16": false,
-    "window_size": 7
+    "window_size": 8
   },
   "eos_token_id": 1,
   "is_encoder_decoder": true,
   "model_type": "vision-encoder-decoder",
   "pad_token_id": 0,
+  "text_encoder": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attention_window_size": 16,
+    "aux_heads": 0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "block_types": [
+      "attention"
+    ],
+    "bos_token_id": 1,
+    "causal": false,
+    "chunk_size_feed_forward": 0,
+    "conv1d_width": 4,
+    "cross_attention_hidden_size": null,
+    "cross_attn_layers": [
+      0,
+      1,
+      2,
+      3,
+      4,
+      6,
+      8,
+      10,
+      11
+    ],
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_hidden_size": 1024,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 1,
+    "exponential_decay_length_penalty": null,
+    "final_w_init_variance_scale": 0.16666666666666666,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "global_attn_layers": [
+      0,
+      1,
+      3,
+      5,
+      7,
+      9,
+      11
+    ],
+    "head_dim": 80,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 1280,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "init_std": 0.02,
+    "intermediate_size": 5120,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "iteration_count": 1,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "logits_soft_cap": 30.0,
+    "lru_width": 1280,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "surya_ocr",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_key_value_heads": 4,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 0,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "query_token_count": 128,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rms_norm_eps": 1e-06,
+    "rope_theta": 10000.0,
+    "self_attn_layers": [
+      1,
+      3,
+      5,
+      7,
+      9,
+      11
+    ],
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 256,
+    "w_init_variance_scale": 0.01
+  },
   "tie_word_embeddings": false,
   "torch_dtype": "float16",
   "transformers_version": "4.43.3"
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e13019d80ebd1d81d91c8a5f3e6baa7394362ce9ef68010a9fa4cb069609538b
-size 730939880
+oid sha256:ea4b103038b00ee69b5b4e029aafd7d101708f05d453d8568aa8a9c6bb730a27
+size 940533088
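
The model.safetensors entry above is a git-LFS pointer: the repository tracks only a sha256 oid and a byte size, not the weights themselves. After downloading the actual file, the pointer can be verified by recomputing both fields. A minimal sketch in Python; the local path is an assumption:

import hashlib
import os

def verify_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    # Recompute the sha256 digest and byte size recorded in the LFS pointer.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid and os.path.getsize(path) == expected_size

# Values from the new pointer in this commit:
print(verify_lfs_pointer(
    "model.safetensors",  # assumed local path after download
    "ea4b103038b00ee69b5b4e029aafd7d101708f05d453d8568aa8a9c6bb730a27",
    940533088,
))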