Erin committed
Commit: 42df499
1 parent: 49bac78

Upload 7 files

config.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "_name_or_path": "/iyunwen/nlpdata/PublicPretrainedModel/bge-base-zh/",
+   "architectures": [
+     "BertModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float16",
+   "transformers_version": "4.33.0",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 21128
+ }
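The config above describes a standard 12-layer, 768-hidden BERT encoder (the bge-base-zh base model) whose weights are saved in float16. As a minimal sketch of putting it to use, the snippet below loads the checkpoint with Hugging Face transformers and produces a BGE-style sentence embedding (CLS pooling plus L2 normalization); "path/to/checkpoint" is a placeholder for a local clone of this repo.

```python
# Minimal sketch (not this repo's training code): load the checkpoint that
# config.json describes and produce a BGE-style sentence embedding.
import torch
import torch.nn.functional as F
from transformers import AutoConfig, AutoModel, AutoTokenizer

config = AutoConfig.from_pretrained("path/to/checkpoint")  # placeholder path
assert config.model_type == "bert" and config.hidden_size == 768

tokenizer = AutoTokenizer.from_pretrained("path/to/checkpoint")
model = AutoModel.from_pretrained("path/to/checkpoint").eval()

inputs = tokenizer("一个测试句子", return_tensors="pt")
with torch.no_grad():
    hidden = model(**inputs).last_hidden_state   # (1, seq_len, 768)
embedding = F.normalize(hidden[:, 0], dim=-1)    # CLS pooling + L2 norm -> (1, 768)
```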
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2522c4ebc80aed7a198da5f9a4295775dd2c025981dd2c848ad35e38e37a2456
+ size 204602025
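The three lines above are a Git LFS pointer, not the weights themselves: the actual ~195 MiB pytorch_model.bin is fetched by `git lfs pull`. That size is consistent with a BERT-base checkpoint (~102 M parameters) stored in float16, matching torch_dtype in config.json. A stdlib-only sketch for checking a downloaded file against the pointer's oid and size:

```python
# Sketch: verify a downloaded pytorch_model.bin against the LFS pointer above
# (the expected oid and size are copied from the pointer).
import hashlib
import os

EXPECTED_OID = "2522c4ebc80aed7a198da5f9a4295775dd2c025981dd2c848ad35e38e37a2456"
EXPECTED_SIZE = 204602025

path = "pytorch_model.bin"
assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)
assert sha.hexdigest() == EXPECTED_OID, "hash mismatch"
print("pytorch_model.bin matches the LFS pointer")
```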
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
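These are the five standard BERT special tokens; their ids are resolved against vocab.txt at load time. A quick check (the path is again a placeholder):

```python
# Sketch: confirm the special tokens resolve to ids in vocab.txt.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/checkpoint")  # placeholder path
for name in ("pad_token", "unk_token", "cls_token", "sep_token", "mask_token"):
    token = getattr(tok, name)
    print(f"{name}: {token} -> id {tok.convert_tokens_to_ids(token)}")
# In bert-base-chinese-style vocabs these are typically
# [PAD]=0, [UNK]=100, [CLS]=101, [SEP]=102, [MASK]=103;
# pad_token_id 0 matches config.json above.
```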
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
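One field worth noting: model_max_length of 1000000000000000019884624838656 is just int(1e30), the sentinel transformers writes when no maximum length was recorded in the tokenizer config. The effective limit here is max_position_embeddings: 512 from config.json, so truncate explicitly; a short sketch:

```python
# Sketch: the huge model_max_length above is transformers' "unset" sentinel,
# so truncate explicitly to the 512-position limit from config.json.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/checkpoint")  # placeholder path
enc = tokenizer(
    "一段可能很长的中文文本",
    max_length=512,      # matches max_position_embeddings in config.json
    truncation=True,
    return_tensors="pt",
)
print(enc["input_ids"].shape)  # at most (1, 512)
```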
train_config.yml ADDED
@@ -0,0 +1,91 @@
+ task_name: general
+ model_name: bge
+ model_dir: /iyunwen/nlpdata/PublicPretrainedModel/bge-base-zh/
+ use_deepspeed: true
+ desc: "piccolo"
+ train_method: "ewc"
+ ewc_ratio: 10.0
+ cosent_ratio: 20.0
+ in_batch_ratio: 30.0
+ save_steps: 50
+ hard_neg_ratio: 0.2
+ in_batch_train_paths:
+   # the q/p embeddings in synthetic_qp are still the bge vectors
+   synthetic_qp:
+     - /iyunwen/nlpdata/work/LP/Data/VecData/v2/wudao_synthetic_alpaca2_hfl_0_100000_vec_neg.jsonl
+     - /iyunwen/nlpdata/work/LP/Data/VecData/v2/m3e_synthetic_alpaca2_hfl_0_100000_vec_neg.jsonl
+   # hard negatives in normal default to bm25
+   normal:
+     - /iyunwen/nlpdata/work/LP/Data/VecData/v2/m3e_long_length_hard_neg.jsonl
+     - /iyunwen/nlpdata/work/LP/Data/VecData/v2/wudao_long_length_hard_neg.jsonl
+     - /iyunwen/nlpdata/work/LP/Data/VecData/stella/mrc_data.jsonl
+     - /iyunwen/nlpdata/work/LP/Data/VecData/stella/guowang_data.jsonl
+
+
+ pair_train_paths:
+   binclf:
+     - /iyunwen/nlpdata/work/LP/Data/VecData/v2/binclf_data.jsonl
+   nli:
+     - /iyunwen/nlpdata/work/LP/Data/VecData/v2/nli_data.jsonl
+
+ loader_idxs: null
+ in_batch_bsz: 128
+ pair_bsz: 128
+ max_length: 512
+
+ auto_ouput_dir: false
+ train_args:
+   seed: 666
+   output_dir: /iyunwen/nlpdata/work/LP/model_path/vec_embedding/stella/s4/
+   evaluation_strategy: "no"
+   num_train_epochs: 4
+   logging_steps: 9999999
+   eval_steps: 9999999
+   per_device_train_batch_size: 128
+   gradient_accumulation_steps: 1
+   per_device_eval_batch_size: 32
+   learning_rate: 5.0e-06
+   weight_decay: 0.00001
+   warmup_ratio: 0.05
+   lr_scheduler_type: "linear"
+   dataloader_drop_last: false
+
+   fp16: true
+   gradient_checkpointing: true
+   deepspeed:
+     fp16:
+       enabled: true
+       hysteresis: 2
+       initial_scale_power: 16
+       loss_scale: 0
+       loss_scale_window: 1000
+       min_loss_scale: 1
+     train_micro_batch_size_per_gpu: 128
+     train_batch_size: "auto"
+     gradient_accumulation_steps: 1
+     gradient_clipping: auto
+     optimizer:
+       params:
+         adam_w_mode: true
+         lr: 1e-6
+         torch_adam: true
+         weight_decay: auto
+       type: AdamW
+     scheduler:
+       params:
+         total_num_steps: auto
+         warmup_max_lr: auto
+         warmup_min_lr: auto
+         warmup_num_steps: auto
+       type: WarmupDecayLR
+     steps_per_print: 4
+     wall_clock_breakdown: false
+     zero_optimization:
+       allgather_bucket_size: 200000000.0
+       allgather_partitions: true
+       contiguous_gradients: true
+       overlap_comm: true
+       reduce_bucket_size: auto
+       reduce_scatter: true
+       stage: 0
+
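The YAML names three training signals: an in-batch contrastive loss over (query, passage) pairs with hard negatives, a CoSENT-style pairwise ranking loss, and an EWC penalty (train_method: "ewc") that discourages drift from the bge-base-zh initialization, with relative weights ewc_ratio, cosent_ratio, and in_batch_ratio. The sketch below shows one common formulation of each term; it is an illustration under those assumptions, not this repo's actual training code.

```python
# Hedged sketch of the three loss terms named in train_config.yml.
# Function shapes and the way the terms combine are assumptions.
import torch
import torch.nn.functional as F

def in_batch_loss(q, p, scale=30.0):
    """InfoNCE over in-batch negatives. q, p: (B, D) L2-normalized embeddings;
    the i-th passage is the positive for the i-th query."""
    logits = scale * q @ p.T                          # (B, B) scaled cosine sims
    labels = torch.arange(q.size(0), device=q.device)
    return F.cross_entropy(logits, labels)

def cosent_loss(cos_sim, labels, scale=20.0):
    """CoSENT ranking loss: every positive pair should score above every
    negative pair. cos_sim: (N,) pair cosines; labels: (N,) 1=pos, 0=neg."""
    diff = scale * (cos_sim[None, :] - cos_sim[:, None])  # diff[i, j] = s_j - s_i
    diff = diff[labels[:, None] > labels[None, :]]        # keep (pos_i, neg_j) pairs
    return torch.logsumexp(torch.cat([diff.new_zeros(1), diff]), dim=0)

def ewc_penalty(model, ref_params, fisher):
    """EWC: quadratic pull toward the reference (here, bge-base-zh) weights,
    scaled per parameter by a Fisher-information estimate."""
    return sum((fisher[n] * (p - ref_params[n]).pow(2)).sum()
               for n, p in model.named_parameters() if n in fisher)

# One plausible reading of the YAML weights:
# total = 10.0 * ewc + 20.0 * cosent + 30.0 * in_batch
```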
vocab.txt ADDED
The diff for this file is too large to render. See raw diff