Commit 5191bf60 authored by Yibing Liu

Change default of some args & activate travis-ci

Parent 880a14e9
language: cpp
cache: ccache
sudo: required
dist: trusty
services:
- docker
os:
- linux
env:
- JOB=PRE_COMMIT
addons:
  apt:
    packages:
      - git
      - python
      - python-pip
      - python2.7-dev
  ssh_known_hosts: 13.229.163.131
before_install:
- sudo pip install -U virtualenv pre-commit pip
script:
- exit_code=0
- .travis/precommit.sh || exit_code=$(( exit_code | $? ))
notifications:
  email:
    on_success: change
    on_failure: always
#!/bin/bash
function abort(){
    echo "Your commit does not fit PaddlePaddle code style" 1>&2
    echo "Please use pre-commit scripts to auto-format your code" 1>&2
    exit 1
}
trap 'abort' 0
set -e
cd `dirname $0`
cd ..
export PATH=/usr/bin:$PATH
pre-commit install
if ! pre-commit run -a ; then
    ls -lh
    git diff --exit-code
    exit 1
fi
trap : 0
@@ -20,7 +20,7 @@ from __future__ import print_function
import numpy as np
import argparse
import collections
-from args import print_arguments
+from utils.args import print_arguments
import tensorflow as tf
import paddle.fluid as fluid
from tensorflow.python import pywrap_tensorflow
......
@@ -41,7 +41,7 @@ model_g.add_arg("use_fp16", bool, False, "Whether to resume
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options.")
data_g.add_arg("data_dir", str, None, "Directory to test data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("max_seq_len", int, 128, "Number of words of the longest seqence.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
@@ -51,7 +51,6 @@ data_g.add_arg("do_lower_case", bool, True,
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("task_name", str, None,
"The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc'}.")
run_type_g.add_arg("do_prediction", bool, True, "Whether to do prediction on test set.")
......
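The scripts above build their command lines through an ArgumentGroup helper rather than calling argparse directly; that helper is presumably defined alongside print_arguments in utils/args.py and is not part of this diff. As a rough sketch of the pattern, assuming add_arg(name, type, default, help) simply forwards to argparse (illustrative only, not the repository's actual implementation):

import argparse

# Assumed minimal ArgumentGroup-style helper, not the repo's code: it groups
# related options and forwards each add_arg call to argparse.
class ArgumentGroup(object):
    def __init__(self, parser, title, description):
        self._group = parser.add_argument_group(title, description)

    def add_arg(self, name, dtype, default, help_text, **kwargs):
        # bool flags need explicit string parsing, since bool("False") is True
        parse = (lambda s: s.lower() in ("true", "1")) if dtype == bool else dtype
        self._group.add_argument(
            "--" + name, type=parse, default=default,
            help=help_text + " Default: %(default)s.", **kwargs)

parser = argparse.ArgumentParser()
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options.")
data_g.add_arg("max_seq_len", int, 128, "Number of words of the longest sequence.")
args = parser.parse_args(["--max_seq_len", "256"])
print(args.max_seq_len)  # 256

Under this reading, the edits in this commit only change the default passed to add_arg, so existing command lines that set the flags explicitly are unaffected.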
@@ -44,7 +44,7 @@ model_g.add_arg("init_pretraining_params", str, None,
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 100, "Number of epoches for training.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
@@ -65,13 +65,13 @@ data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data process
data_g.add_arg("data_dir", str, None, "Path to training data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("random_seed", int, 0, "Random seed.")
data_g.add_arg("random_seed", int, 0, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
......
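Several of these files expose the same batch_size / in_tokens pair, and their help strings define two batching modes: with --in_tokens the batch_size budget counts tokens, otherwise it counts examples. A toy illustration of that rule (a made-up batcher, not the repository's reader code, which also handles padding, shuffling and multi-device sharding):

def batch_examples(examples, batch_size, in_tokens):
    # Toy sketch of the --in_tokens semantics described above.
    # examples: list of token-id lists.
    batch, cost = [], 0
    for ex in examples:
        size = len(ex) if in_tokens else 1  # count tokens vs. count examples
        if batch and cost + size > batch_size:
            yield batch
            batch, cost = [], 0
        batch.append(ex)
        cost += size
    if batch:
        yield batch

examples = [[1] * 60, [1] * 50, [1] * 20, [1] * 10]
print([len(b) for b in batch_examples(examples, batch_size=2, in_tokens=False)])   # [2, 2]
print([len(b) for b in batch_examples(examples, batch_size=100, in_tokens=True)])  # [1, 3]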
@@ -43,16 +43,15 @@ model_g.add_arg("init_pretraining_params", str, None,
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 100, "Number of epoches for training.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
@@ -67,9 +66,9 @@ data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("version_2_with_negative", bool, False,
"If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 64, "Max answer length.")
data_g.add_arg("batch_size", int, 12, "Total samples' number in batch for training. see also --in_tokens.")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 30, "Max answer length.")
data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
@@ -81,7 +80,7 @@ data_g.add_arg("n_best_size", int, 20,
"The total number of n-best predictions to generate in the nbest_predictions.json output file.")
data_g.add_arg("null_score_diff_threshold", float, 0.0,
"If null_score - best_non_null is greater than the threshold predict null.")
data_g.add_arg("random_seed", int, 0, "Random seed.")
data_g.add_arg("random_seed", int, 0, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
......
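For the SQuAD v2.0 arguments above, the help string spells out the decision rule behind null_score_diff_threshold: predict "no answer" when null_score minus the best non-null score exceeds the threshold. A minimal illustration of that comparison, with function and variable names invented for the example:

def predict_answer(best_non_null_text, best_non_null_score, null_score,
                   null_score_diff_threshold=0.0):
    # Illustrative only: apply the rule from the help string above.
    if null_score - best_non_null_score > null_score_diff_threshold:
        return ""  # SQuAD v2.0 convention: empty string means "no answer"
    return best_non_null_text

print(predict_answer("in 1999", best_non_null_score=1.2, null_score=3.0))  # ""
print(predict_answer("in 1999", best_non_null_score=2.5, null_score=2.4))  # "in 1999"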
@@ -65,7 +65,7 @@ data_g.add_arg("validation_set_dir", str, "./data/validation/", "Path to trai
data_g.add_arg("test_set_dir", str, None, "Path to training data.")
data_g.add_arg("vocab_path", str, "./config/vocab.txt", "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("batch_size", int, 8192, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("batch_size", int, 16, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
......