output_name="seq2seq"
init_model="./model_files/unimo_base_en"
data_path="./data/gigaword"

# hyper params
lr_scheduler="linear_warmup_decay"
use_fp16="False"
# Fuse gradient tensors to merge per-layer AllReduce calls
use_fuse="True"
use_hierarchical_allreduce="True"
loss_scaling=12800
skip_steps=100
save_steps=20000
validation_steps=20000
label_smooth=0.1
weight_decay=0.01
max_seq_len=512

# decoding params
do_decode="true"
max_src_len=128
max_tgt_len=32
max_out_len=32
min_out_len=5
beam_size=5
length_penalty=0.6
block_trigram="False"
use_multi_gpu_test="True"

# adam optimizer
beta1=0.9
beta2=0.98
epsilon=1e-06

# data
tokenized_input="True"
continuous_position="False"

# dataset
train_set="train.tsv"
dev_set="dev.20k.tsv"
pred_set="test.tsv"
test_set="test.tsv"
do_train="true"
do_val="false"
do_test="true"
do_pred="false"

# evaluate
eval_script="bash ./src/eval/tasks/gigaword/eval.sh"
eval_mertrics="rouge-1,rouge-2,rouge-l"

# tuning params
in_tokens="False"
pred_batch_size=32
epoch=10
BATCH_SIZE=("32")
LR_RATE=("3e-5")
DD_RAND_SEED=("1")
WARMUP_PROP=("0.06")

config_path="./model_files/config/unimo_base_en.json"
vocab_file="./model_files/dict/unimo_en.vocab.txt"
bpe_json="./model_files/dict/unimo_en.encoder.json"
bpe_file="./model_files/dict/unimo_en.vocab.bpe"
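
# Usage sketch (an assumption, not part of the original config): the array
# variables above (BATCH_SIZE, LR_RATE, DD_RAND_SEED, WARMUP_PROP) suggest the
# launcher grid-searches over them after sourcing this file. The config path
# "./script/gigaword/run.conf", the entry point "./src/run_seq2seq.py", and the
# flag names below are illustrative placeholders, so the loop is left commented
# out rather than executed when this file is sourced.
#
#   source ./script/gigaword/run.conf   # this file (hypothetical path)
#   for bs in "${BATCH_SIZE[@]}"; do
#     for lr in "${LR_RATE[@]}"; do
#       for seed in "${DD_RAND_SEED[@]}"; do
#         for wp in "${WARMUP_PROP[@]}"; do
#           python ./src/run_seq2seq.py \
#             --init_pretraining_params "${init_model}" \
#             --batch_size "${bs}" --learning_rate "${lr}" \
#             --random_seed "${seed}" --warmup_proportion "${wp}" \
#             --epoch "${epoch}"
#         done
#       done
#     done
#   done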