---
# Training / inference configuration (checkpointing, data, reader and
# optimizer settings). One key per line: text following a `#` on the same
# line is a YAML comment, so keys must never share a line with a comment.

# The frequency to save trained models when training.
save_step: 10000
# The frequency to fetch and print output when training.
print_step: 100
# Path of the checkpoint, to resume the previous training.
init_from_checkpoint: ""
# Path of the pretrained model, to better solve the current task.
init_from_pretrain_model: ""
# Path of the trained parameters, to make predictions.
init_from_params: "./trained_models/step_final/"
# The directory for saving the model.
save_model: "trained_models"
# The directory for saving the inference model.
inference_model_dir: "infer_model"
# Set seed for CE or debug.
# NOTE(review): YAML resolves a plain `None` as the string "None", not as
# null. The value is left unchanged because the consumer may rely on the
# string form -- confirm before switching to `null`.
random_seed: None

# The pattern to match training data files.
training_file: "gen_data/wmt14_ende_data_bpe/train.tok.clean.bpe.33708.en-de"
# The pattern to match validation data files.
validation_file: "gen_data/wmt14_ende_data_bpe/newstest2013.tok.bpe.33708.en-de"
# The pattern to match test data files.
predict_file: "gen_data/wmt14_ende_data_bpe/newstest2014.tok.bpe.33708.en-de"
# The file to output the translation results of predict_file to.
output_file: "predict.txt"
# The path of the vocabulary file of the source language.
src_vocab_fpath: "gen_data/wmt14_ende_data_bpe/vocab_all.bpe.33708"
# The path of the vocabulary file of the target language.
trg_vocab_fpath: "gen_data/wmt14_ende_data_bpe/vocab_all.bpe.33708"
# The <bos>, <eos> and <unk> tokens in the dictionary.
# NOTE(review): all three entries are identical empty strings, so the three
# special tokens cannot be told apart. The token spellings appear to have
# been stripped from this file -- restore them from the vocabulary file
# (TODO confirm the exact spellings).
special_token: ["", "", ""]

# Whether to use CUDA.
use_gpu: true

# Args for reader, see reader.py for details.
token_delimiter: " "
use_token_batch: true
pool_size: 200000
sort_type: "pool"
shuffle: true
shuffle_batch: true
batch_size: 4096
infer_batch_size: 16

# Hyperparameters for training:
# The number of epochs for training.
epoch: 30

# The hyperparameters for the Adam optimizer.
# This static learning_rate will be applied to the LearningRateScheduler
# derived learning rate to get the final learning rate.
learning_rate: 2.0
beta1: 0.9
beta2: 0.997
# NOTE(review): YAML 1.1 resolvers (e.g. PyYAML) load `1e-9` as a string
# because it has no decimal point. The value is left unchanged -- confirm
# the consumer casts it to float, or write it as `1.0e-9`.
eps: 1e-9
# The parameters for learning rate scheduling.
warmup_steps: 8000
# The weight used to mix up the ground-truth distribution and the fixed
# uniform distribution in label smoothing when training.
# Set this as zero if label smoothing is not wanted.
label_smooth_eps: 0.1

# Hyperparameters for generation:
# The parameters for beam search.
beam_size: 5
max_out_len: 1024
# The number of decoded sentences to output.
n_best: 1

# Hyperparameters for model:
# The following five vocabulary-related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# Size of source word dictionary.
src_vocab_size: 10000
# Size of target word dictionary.
trg_vocab_size: 10000
# Index for the <bos> token.
bos_idx: 0
# Index for the <eos> token.
eos_idx: 1
# Index for the <unk> token.
unk_idx: 2
# Max length of sequences deciding the size of position encoding table.
max_length: 1024
# The dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model: 1024
# Size of the hidden layer in position-wise feed-forward networks.
d_inner_hid: 4096
# Number of heads used in multi-head attention.
n_head: 16
# Number of sub-layers to be stacked in the encoder and decoder.
n_layer: 6
# Dropout rates.
dropout: 0.1
# The flag indicating whether to share embedding and softmax weights.
# Vocabularies in source and target should be the same for weight sharing.
weight_sharing: true