# The frequency (in steps) to save the model during training.
save_step: 10000
# The frequency (in steps) to fetch and print outputs during training.
print_step: 10
# Path of the checkpoint, used to resume previous training.
init_from_checkpoint: ""
# Path of the pretrained model, used to initialize training for the current task.
init_from_pretrain_model: ""
# Path of the trained parameters, used for prediction.
init_from_params: "./trained_models/step_final/"
# The directory in which to save the model.
save_model: "trained_models"
# Random seed, set for CE (continuous evaluation) or debugging.
random_seed: None
# The file to output the translation results of predict_file to.
output_file: "predict.txt"
# The <bos>, <eos> and <unk> tokens in the dictionary.
special_token: ["<s>", "<e>", "<unk>"]
# The directory to store data.
root: None

# Whether to use CUDA (GPU).
use_gpu: True

# Args for the reader; see reader.py for details.
pool_size: 200000
sort_type: "global"
batch_size: 4096
infer_batch_size: 16
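# As a rough note on the reader args above (assuming the usual reader.py
# behavior): sort_type "global" sorts the whole dataset by length before
# batching, "pool" sorts within pools of pool_size examples, and batch_size
# counts tokens rather than sentences.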
# Whether to shuffle the order of the generated batches.
shuffle_batch: True
# Data shuffling only works when sort_type is pool or none.
shuffle: True
# shuffle_seed must be set when shuffle is True and training uses multiple
# cards, so that every card shuffles identically; otherwise the number of
# batches cannot be guaranteed to match across cards.
shuffle_seed: 128

# Hyperparams for training:
# The number of epochs for training.
epoch: 30

# The hyperparameters for the Adam optimizer.
# This static learning_rate is applied to the LearningRateScheduler-derived
# learning rate to get the final learning rate (see the sketch after
# warmup_steps below).
learning_rate: 2.0
beta1: 0.9
beta2: 0.997
eps: 1e-9
# The parameters for learning rate scheduling.
warmup_steps: 4000
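# A rough sketch of the resulting schedule, assuming the standard
# Noam/Transformer decay (check the training script for the exact form):
#   lr(step) = learning_rate * d_model^(-0.5)
#              * min(step^(-0.5), step * warmup_steps^(-1.5))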
# The weight used to mix the ground-truth distribution with a fixed
# uniform distribution in label smoothing during training.
# Set this to zero to disable label smoothing.
label_smooth_eps: 0.1
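# As a sketch (assuming the usual uniform-prior smoothing), the training
# target becomes roughly:
#   (1 - label_smooth_eps) * one_hot + label_smooth_eps / trg_vocab_size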

# Hyperparams for generation:
# The parameters for beam search.
beam_size: 5
max_out_len: 1024
# The number of decoded sentences to output.
n_best: 1
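# (Assuming the usual beam-search convention, n_best cannot exceed beam_size.)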

# Hyperparams for the model:
# The following five vocabulary-related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# Size of source word dictionary.
src_vocab_size: 10000
# Size of target word dictionary.
trg_vocab_size: 10000
# Used to pad the vocab size to a multiple of pad_factor.
pad_factor: 8
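# For example, with pad_factor 8 a raw vocab of 10001 entries would be
# padded up to 10008, the next multiple of 8.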
# Used to pad the sequence length to a multiple of pad_seq.
pad_seq: 1
# Used to make the batch size a multiple of bsz_multi.
bsz_multi: 8
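# (Rounding sizes up to multiples of 8, as pad_factor and bsz_multi do here,
# is a common way to keep FP16 Tensor Core kernels efficient.)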
# Index for <bos> token
bos_idx: 0
# Index for <eos> token
eos_idx: 1
# Index for <unk> token
unk_idx: 2
# Max length of sequences, which decides the size of the position encoding table.
max_length: 1024
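# (Positions beyond this table are unavailable, so max_out_len at generation
# time presumably should not exceed max_length.)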
# The dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model: 1024
# Size of the hidden layer in position-wise feed-forward networks.
d_inner_hid: 4096
# Number of heads used in multi-head attention.
n_head: 16
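# (With d_model 1024 and 16 heads, each head works in 1024 / 16 = 64 dimensions.)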
# Number of sub-layers to be stacked in the encoder and decoder.
n_layer: 6
# Dropout rate.
dropout: 0.1
# The flag indicating whether to share embedding and softmax weights.
# Source and target vocabularies must be the same for weight sharing.
weight_sharing: True

# Whether to use automatic mixed precision (AMP) training.
use_amp: False
# Whether to run in pure FP16 rather than mixed precision.
use_pure_fp16: False
# Loss scaling factor for FP16/AMP training.
scale_loss: 128.0
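# As a rough sketch of the usual loss-scaling scheme: the loss is multiplied
# by scale_loss before backward, and the gradients are divided by the same
# factor before the optimizer step, so small FP16 gradients do not underflow.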

# Whether to use multi-card/multi-node distributed training.
# Only works with the static graph for now.
is_distributed: True

# Max number of training iterations; presumably None leaves training bounded
# by epoch alone.
max_iter: None