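# JoeyNMT (v2.3.0) configuration: multilingual de/en/fr translation on IWSLT14
# with language-tag prompting (see the special_symbols section below).
# The train/dev entries are commented out, so as given this file only decodes
# the de-en test split, e.g. (config path assumed):
#   python -m joeynmt test iwslt14_deenfr_prompt.yaml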
name: "iwslt14_deenfr_prompt"
joeynmt_version: "2.3.0"
model_dir: "iwslt14_prompt"
use_cuda: True
fp16: True
random_seed: 42

data:
    #train: "iwslt14_prompt/train"        # cf. https://wit3.fbk.eu/2014-01
    #dev: "iwslt14_prompt/dev"
    test: "iwslt14_prompt/test.ref.de-en" # ['TED.dev2010', 'TEDX.dev2012', 'TED.tst2010', 'TED.tst2011', 'TED.tst2012']
    dataset_type: "tsv"
    sample_dev_subset: 500
    src:
        lang: "src"
        max_length: 512
        lowercase: False
        normalize: False
        level: "bpe"
        voc_limit: 32000
        voc_min_freq: 1
        voc_file: "iwslt14_prompt/src_vocab.txt"
        tokenizer_type: "sentencepiece"
        tokenizer_cfg:
            model_file: "iwslt14_prompt/sp.model"
            model_type: "unigram"
            character_coverage: 1.0
    trg:
        lang: "trg"
        max_length: 512
        lowercase: False
        normalize: False
        level: "bpe"
        voc_limit: 32000
        voc_min_freq: 1
        voc_file: "iwslt14_prompt/trg_vocab.txt"
        tokenizer_type: "sentencepiece"
        tokenizer_cfg:
            model_file: "iwslt14_prompt/sp.model"
            model_type: "unigram"
            character_coverage: 1.0
    special_symbols:
        unk_token: "<unk>"
        unk_id: 0
        pad_token: "<pad>"
        pad_id: 1
        bos_token: "<s>"
        bos_id: 2
        eos_token: "</s>"
        eos_id: 3
        sep_token: "<sep>"
        sep_id: 4
        lang_tags: ["<de>", "<en>", "<fr>"]
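    # Source and target share one sentencepiece model (sp.model). The <sep> token
    # and the <de>/<en>/<fr> language tags extend the vocabulary so that each
    # source line can carry a target-language tag (and, optionally, a prompt
    # separated by <sep>); the ids given here are expected to line up with the
    # vocabulary files. The exact prompt format depends on how the TSV data was
    # prepared.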

testing:
    load_model: "iwslt14_prompt/avg5.ckpt"
    n_best: 1
    beam_size: 5
    beam_alpha: 1.0
    batch_size: 32
    batch_type: "sentence"
    max_output_length: 512
    eval_metrics: ["bleu"]
    sacrebleu_cfg:
        tokenize: "13a"
        lowercase: True
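    # Beam search (size 5, length penalty alpha 1.0) over an averaged checkpoint;
    # the name avg5.ckpt suggests an average of the 5 best checkpoints kept during
    # training (keep_best_ckpts: 5 below). BLEU is scored case-insensitively with
    # sacrebleu's 13a tokenizer.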

training:
    #load_model: "iwslt14_prompt/latest.ckpt"
    #reset_best_ckpt: True
    #reset_scheduler: True
    #reset_optimizer: True
    #reset_iter_state: True
    optimizer: "adamw"
    normalization: "tokens"
    adam_betas: [0.9, 0.98]
    scheduling: "warmupinversesquareroot"
    learning_rate_warmup: 10000
    learning_rate: 0.0002
    learning_rate_min: 0.0000001
    weight_decay: 0.001
    label_smoothing: 0.1
    loss: "crossentropy"
    batch_size: 32
    batch_type: "sentence"
    batch_multiplier: 4
    early_stopping_metric: "bleu"
    epochs: 50
    validation_freq: 1000
    logging_freq: 100
    overwrite: False
    shuffle: True
    print_valid_sents: [0, 1, 2, 3]
    keep_best_ckpts: 5
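    # Effective batch size per update is batch_size * batch_multiplier =
    # 32 * 4 = 128 sentences. To continue training from an existing run,
    # uncomment load_model (and the reset_* flags as needed) at the top of this
    # section and restore the train/dev paths in the data section.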

model:
    initializer: "xavier_uniform"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier_uniform"
    embed_init_gain: 1.0
    tied_embeddings: True
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 6
        num_heads: 8
        embeddings:
            embedding_dim: 1024
            scale: True
            dropout: 0.1
        # typically ff_size = 4 x hidden_size
        hidden_size: 1024
        ff_size: 4096
        dropout: 0.1
        layer_norm: "pre"
        activation: "relu"
    decoder:
        type: "transformer"
        num_layers: 6
        num_heads: 8
        embeddings:
            embedding_dim: 1024
            scale: True
            dropout: 0.1
        # typically ff_size = 4 x hidden_size
        hidden_size: 1024
        ff_size: 4096
        dropout: 0.1
        layer_norm: "pre"
        activation: "relu"
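    # Encoder and decoder are 6-layer, 8-head Transformers with model width 1024
    # and feed-forward size 4096 (the usual 4x ratio). layer_norm: "pre" applies
    # LayerNorm before each sub-layer, which is generally more stable to train
    # than post-norm at this depth.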