diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000000000000000000000000000000000000..94e00a168462c08c09847a18d4dcae319c7e2a6c
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,30 @@
+language: cpp
+cache: ccache
+sudo: required
+dist: trusty
+services:
+  - docker
+os:
+  - linux
+env:
+  - JOB=PRE_COMMIT
+
+addons:
+  apt:
+    packages:
+      - git
+      - python
+      - python-pip
+      - python2.7-dev
+  ssh_known_hosts: 13.229.163.131
+before_install:
+  - sudo pip install -U virtualenv pre-commit pip
+
+script:
+  - exit_code=0
+  - .travis/precommit.sh || exit_code=$(( exit_code | $? ))
+
+notifications:
+  email:
+    on_success: change
+    on_failure: always
diff --git a/.travis/precommit.sh b/.travis/precommit.sh
new file mode 100755
index 0000000000000000000000000000000000000000..369fa5101630431ca72bc630bb070c2e0084b7ca
--- /dev/null
+++ b/.travis/precommit.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+function abort(){
+    echo "Your commit does not fit PaddlePaddle code style" 1>&2
+    echo "Please use pre-commit scripts to auto-format your code" 1>&2
+    exit 1
+}
+
+trap 'abort' 0
+set -e
+cd `dirname $0`
+cd ..
+export PATH=/usr/bin:$PATH
+pre-commit install
+
+if ! pre-commit run -a ; then
+    ls -lh
+    git diff --exit-code
+    exit 1
+fi
+
+trap : 0
diff --git a/BERT/convert_params.py b/BERT/convert_params.py
index 760c7ac0179669d39eef502e3e06a64d58d3fedb..17cada954d4920dcb6a3c92d0371415655508a23 100644
--- a/BERT/convert_params.py
+++ b/BERT/convert_params.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 import numpy as np
 import argparse
 import collections
-from args import print_arguments
+from utils.args import print_arguments
 import tensorflow as tf
 import paddle.fluid as fluid
 from tensorflow.python import pywrap_tensorflow
diff --git a/BERT/predict_classifier.py b/BERT/predict_classifier.py
index fe813ffeffdd2beead9742e66507aac1b3d62db4..51e840db1113ec7c4c53c94a1f76000b87756460 100644
--- a/BERT/predict_classifier.py
+++ b/BERT/predict_classifier.py
@@ -41,7 +41,7 @@ model_g.add_arg("use_fp16", bool, False, "Whether to resume
 data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options.")
 data_g.add_arg("data_dir", str, None, "Directory to test data.")
 data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
-data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
+data_g.add_arg("max_seq_len", int, 128, "Number of words of the longest seqence.")
 data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
 data_g.add_arg("in_tokens", bool, False,
                "If set, the batch size will be the maximum number of tokens in one batch. "
" @@ -51,7 +51,6 @@ data_g.add_arg("do_lower_case", bool, True, run_type_g = ArgumentGroup(parser, "run_type", "running type options.") run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.") -run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).") run_type_g.add_arg("task_name", str, None, "The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc'}.") run_type_g.add_arg("do_prediction", bool, True, "Whether to do prediction on test set.") diff --git a/BERT/run_classifier.py b/BERT/run_classifier.py index 5ba2ca9224ac0b0710cc078a923c365be5fad847..1453ef74b43aa8771ab38f1ace7531361ceebebc 100644 --- a/BERT/run_classifier.py +++ b/BERT/run_classifier.py @@ -44,7 +44,7 @@ model_g.add_arg("init_pretraining_params", str, None, model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.") train_g = ArgumentGroup(parser, "training", "training options.") -train_g.add_arg("epoch", int, 100, "Number of epoches for training.") +train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.") train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.") train_g.add_arg("lr_scheduler", str, "linear_warmup_decay", "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay']) @@ -65,13 +65,13 @@ data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data process data_g.add_arg("data_dir", str, None, "Path to training data.") data_g.add_arg("vocab_path", str, None, "Vocabulary path.") data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.") -data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.") +data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.") data_g.add_arg("in_tokens", bool, False, "If set, the batch size will be the maximum number of tokens in one batch. " "Otherwise, it will be the maximum number of examples in one batch.") data_g.add_arg("do_lower_case", bool, True, "Whether to lower case the input text. 
                "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
-data_g.add_arg("random_seed", int, 0, "Random seed.")
+data_g.add_arg("random_seed", int, 0, "Random seed.")
 
 run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
 run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
diff --git a/BERT/run_squad.py b/BERT/run_squad.py
index 313f06e86f256d46c4b4296823f16ea32b3c0338..07ce1c18077566dcbf2e751d51cae092a0beec55 100644
--- a/BERT/run_squad.py
+++ b/BERT/run_squad.py
@@ -43,16 +43,15 @@ model_g.add_arg("init_pretraining_params", str, None,
 model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
 
 train_g = ArgumentGroup(parser, "training", "training options.")
-train_g.add_arg("epoch", int, 100, "Number of epoches for training.")
-train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
+train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
+train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
 train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
                 "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
-train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
+train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
 train_g.add_arg("warmup_proportion", float, 0.1,
                 "Proportion of training steps to perform linear learning rate warmup for.")
-train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
-train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
-train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
+train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
+train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
 train_g.add_arg("loss_scaling", float, 1.0,
                 "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
 
@@ -67,9 +66,9 @@ data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
 data_g.add_arg("version_2_with_negative", bool, False,
                "If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
 data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
-data_g.add_arg("max_query_length", int, 64, "Max query length.")
-data_g.add_arg("max_answer_length", int, 64, "Max answer length.")
-data_g.add_arg("batch_size", int, 12, "Total samples' number in batch for training. see also --in_tokens.")
+data_g.add_arg("max_query_length", int, 64, "Max query length.")
+data_g.add_arg("max_answer_length", int, 30, "Max answer length.")
+data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.")
 data_g.add_arg("in_tokens", bool, False,
                "If set, the batch size will be the maximum number of tokens in one batch. "
" "Otherwise, it will be the maximum number of examples in one batch.") @@ -81,7 +80,7 @@ data_g.add_arg("n_best_size", int, 20, "The total number of n-best predictions to generate in the nbest_predictions.json output file.") data_g.add_arg("null_score_diff_threshold", float, 0.0, "If null_score - best_non_null is greater than the threshold predict null.") -data_g.add_arg("random_seed", int, 0, "Random seed.") +data_g.add_arg("random_seed", int, 0, "Random seed.") run_type_g = ArgumentGroup(parser, "run_type", "running type options.") run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.") diff --git a/BERT/train.py b/BERT/train.py index 55d362b612f56bd4ff6596e4df875cb539805e69..51df8705d9adcd18f1a8ae0f9674158e776bd49e 100644 --- a/BERT/train.py +++ b/BERT/train.py @@ -65,7 +65,7 @@ data_g.add_arg("validation_set_dir", str, "./data/validation/", "Path to trai data_g.add_arg("test_set_dir", str, None, "Path to training data.") data_g.add_arg("vocab_path", str, "./config/vocab.txt", "Vocabulary path.") data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.") -data_g.add_arg("batch_size", int, 8192, "Total examples' number in batch for training. see also --in_tokens.") +data_g.add_arg("batch_size", int, 16, "Total examples' number in batch for training. see also --in_tokens.") data_g.add_arg("in_tokens", bool, False, "If set, the batch size will be the maximum number of tokens in one batch. " "Otherwise, it will be the maximum number of examples in one batch.")