# The frequency (in steps) at which to save trained models during training.
save_step: 10000
# The frequency (in steps) at which to fetch and print outputs during training.
print_step: 100
# Path of the checkpoint used to resume previous training.
init_from_checkpoint: ""
# Path of the pretrained model used to initialize training for the current task.
init_from_pretrain_model: ""
# Path of the trained parameters used for prediction.
init_from_params: "./trained_models/step_final/"
# The directory for saving trained models.
save_model: "trained_models"
# The directory for saving inference model.
inference_model_dir: "infer_model"
# Random seed for CE (continuous evaluation) or debugging; None means no fixed seed.
random_seed: None
# The pattern to match training data files.
training_file: "gen_data/wmt14_ende_data_bpe/train.tok.clean.bpe.33708.en-de"
# The pattern to match validation data files.
validation_file: "gen_data/wmt14_ende_data_bpe/newstest2013.tok.bpe.33708.en-de"
# The pattern to match test data files.
predict_file: "gen_data/wmt14_ende_data_bpe/newstest2014.tok.bpe.33708.en-de"
# The file to which the translation results of predict_file are written.
output_file: "predict.txt"
# The path of vocabulary file of source language.
src_vocab_fpath: "gen_data/wmt14_ende_data_bpe/vocab_all.bpe.33708"
# The path of vocabulary file of target language.
trg_vocab_fpath: "gen_data/wmt14_ende_data_bpe/vocab_all.bpe.33708"
# The <bos>, <eos> and <unk> tokens in the dictionary.
special_token: ["<s>", "<e>", "<unk>"]

# Whether to use CUDA (GPU).
use_gpu: True

# Args for reader, see reader.py for details
token_delimiter: " "
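# Whether batches are sized by token count instead of sentence count
# (an assumption based on the accompanying reader.py).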
use_token_batch: True
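# Size of the pool used to buffer and length-sort samples before batching,
# so that sentences of similar length are grouped to reduce padding
# (an assumption based on sort_type: "pool" and the accompanying reader.py).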
pool_size: 200000
sort_type: "pool"
shuffle: True
shuffle_batch: True
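# With use_token_batch: True, batch_size is interpreted as the (approximate)
# number of tokens per batch rather than the number of sentences
# (an assumption based on the reader settings above).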
batch_size: 4096
infer_batch_size: 16

# Hyperparameters for training:
# The number of epochs for training.
epoch: 30

# The hyperparameters for the Adam optimizer.
# This static learning_rate will be applied to the LearningRateScheduler-derived
# learning rate to get the final learning rate.
learning_rate: 2.0
beta1: 0.9
beta2: 0.997
eps: 1e-9
# The parameters for learning rate scheduling.
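# With the Noam schedule from the original Transformer paper, the effective
# learning rate is typically
#   learning_rate * d_model^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5))
# (an assumption about the scheduler used here; see "Attention Is All You Need").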
warmup_steps: 8000
# The weight used to mix the ground-truth distribution with a fixed uniform
# distribution for label smoothing during training.
# Set this to zero to disable label smoothing.
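# With eps = label_smooth_eps, the training target is usually the mixture
#   (1 - eps) * one_hot_label + eps * uniform_over_vocab
# (the standard label-smoothing formulation; the exact denominator is an
# implementation detail).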
label_smooth_eps: 0.1

# Hyperparameters for generation:
# The parameters for beam search.
beam_size: 5
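# The maximum length of sequences generated during decoding.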
max_out_len: 1024
# The number of decoded sentences to output.
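# (Typically expected to be no larger than beam_size.)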
n_best: 1

# Hyperparameters for the model:
# The following five vocabulary-related configurations will be set
# automatically according to the given vocabulary path and special tokens.
# Size of source word dictionary.
src_vocab_size: 10000
# Size of the target word dictionary.
trg_vocab_size: 10000
# Index for <bos> token
bos_idx: 0
# Index for <eos> token
eos_idx: 1
# Index for <unk> token
unk_idx: 2
# Max length of sequences, which determines the size of the position encoding table.
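# Depending on the reader implementation, training samples longer than this
# may be filtered out or truncated (an assumption; see reader.py).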
max_length: 1024
# The dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model: 1024
# Size of the hidden layer in position-wise feed-forward networks.
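# In the standard formulation, FFN(x) = activation(x * W1 + b1) * W2 + b2,
# where W1 maps d_model -> d_inner_hid and W2 maps back to d_model
# (stated for reference; the activation is assumed to be ReLU as in the paper).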
d_inner_hid: 4096
# Number of heads used in multi-head attention.
n_head: 16
# Number of layers stacked in the encoder and in the decoder.
n_layer: 6
# The dropout rate used in the model.
dropout: 0.1
# The flag indicating whether to share embedding and softmax weights.
# The source and target vocabularies must be the same for weight sharing.
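# When enabled, this typically ties the source embedding, the target embedding
# and the pre-softmax output projection (an assumption based on the usual
# Transformer weight-tying setup).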
weight_sharing: True