From 5191bf60365d0215c96583b73ced05f6de50514d Mon Sep 17 00:00:00 2001
From: Yibing Liu
Date: Tue, 5 Mar 2019 13:44:46 +0000
Subject: [PATCH] Change default of some args & activate travis-ci

---
 .travis.yml                | 30 ++++++++++++++++++++++++++++++
 .travis/precommit.sh       | 21 +++++++++++++++++++++
 BERT/convert_params.py     |  2 +-
 BERT/predict_classifier.py |  3 +--
 BERT/run_classifier.py     |  6 +++---
 BERT/run_squad.py          | 19 +++++++++----------
 BERT/train.py              |  2 +-
 7 files changed, 66 insertions(+), 17 deletions(-)
 create mode 100644 .travis.yml
 create mode 100755 .travis/precommit.sh

diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..94e00a1
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,30 @@
+language: cpp
+cache: ccache
+sudo: required
+dist: trusty
+services:
+  - docker
+os:
+  - linux
+env:
+  - JOB=PRE_COMMIT
+
+addons:
+  apt:
+    packages:
+      - git
+      - python
+      - python-pip
+      - python2.7-dev
+  ssh_known_hosts: 13.229.163.131
+before_install:
+  - sudo pip install -U virtualenv pre-commit pip
+
+script:
+  - exit_code=0
+  - .travis/precommit.sh || exit_code=$(( exit_code | $? ))
+
+notifications:
+  email:
+    on_success: change
+    on_failure: always
diff --git a/.travis/precommit.sh b/.travis/precommit.sh
new file mode 100755
index 0000000..369fa51
--- /dev/null
+++ b/.travis/precommit.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+function abort(){
+    echo "Your commit does not fit PaddlePaddle code style" 1>&2
+    echo "Please use pre-commit scripts to auto-format your code" 1>&2
+    exit 1
+}
+
+trap 'abort' 0
+set -e
+cd `dirname $0`
+cd ..
+export PATH=/usr/bin:$PATH
+pre-commit install
+
+if ! pre-commit run -a ; then
+    ls -lh
+    git diff --exit-code
+    exit 1
+fi
+
+trap : 0
diff --git a/BERT/convert_params.py b/BERT/convert_params.py
index 760c7ac..17cada9 100644
--- a/BERT/convert_params.py
+++ b/BERT/convert_params.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 import numpy as np
 import argparse
 import collections
-from args import print_arguments
+from utils.args import print_arguments
 import tensorflow as tf
 import paddle.fluid as fluid
 from tensorflow.python import pywrap_tensorflow
diff --git a/BERT/predict_classifier.py b/BERT/predict_classifier.py
index fe813ff..51e840d 100644
--- a/BERT/predict_classifier.py
+++ b/BERT/predict_classifier.py
@@ -41,7 +41,7 @@ model_g.add_arg("use_fp16", bool, False, "Whether to resume
 data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options.")
 data_g.add_arg("data_dir", str, None, "Directory to test data.")
 data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
-data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
+data_g.add_arg("max_seq_len", int, 128, "Number of words of the longest seqence.")
 data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
 data_g.add_arg("in_tokens", bool, False,
                "If set, the batch size will be the maximum number of tokens in one batch. "
@@ -51,7 +51,6 @@ data_g.add_arg("do_lower_case", bool, True,
 
 run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
 run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
-run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
 run_type_g.add_arg("task_name", str, None,
                    "The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc'}.")
 run_type_g.add_arg("do_prediction", bool, True, "Whether to do prediction on test set.")
diff --git a/BERT/run_classifier.py b/BERT/run_classifier.py
index 5ba2ca9..1453ef7 100644
--- a/BERT/run_classifier.py
+++ b/BERT/run_classifier.py
@@ -44,7 +44,7 @@ model_g.add_arg("init_pretraining_params", str, None,
 model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
 
 train_g = ArgumentGroup(parser, "training", "training options.")
-train_g.add_arg("epoch", int, 100, "Number of epoches for training.")
+train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
 train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
 train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
                 "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
@@ -65,13 +65,13 @@ data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data process
 data_g.add_arg("data_dir", str, None, "Path to training data.")
 data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
 data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
-data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
+data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
 data_g.add_arg("in_tokens", bool, False,
                "If set, the batch size will be the maximum number of tokens in one batch. "
                "Otherwise, it will be the maximum number of examples in one batch.")
 data_g.add_arg("do_lower_case", bool, True,
                "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
-data_g.add_arg("random_seed", int, 0, "Random seed.")
+data_g.add_arg("random_seed", int, 0, "Random seed.")
 
 run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
 run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
diff --git a/BERT/run_squad.py b/BERT/run_squad.py
index 313f06e..07ce1c1 100644
--- a/BERT/run_squad.py
+++ b/BERT/run_squad.py
@@ -43,16 +43,15 @@ model_g.add_arg("init_pretraining_params", str, None,
 model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
 
 train_g = ArgumentGroup(parser, "training", "training options.")
-train_g.add_arg("epoch", int, 100, "Number of epoches for training.")
-train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
+train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
+train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
 train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
                 "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
-train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
+train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
 train_g.add_arg("warmup_proportion", float, 0.1,
                 "Proportion of training steps to perform linear learning rate warmup for.")
-train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
-train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
-train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
+train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
+train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
 train_g.add_arg("loss_scaling", float, 1.0,
                 "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
 
@@ -67,9 +66,9 @@ data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
 data_g.add_arg("version_2_with_negative", bool, False,
                "If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
 data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
-data_g.add_arg("max_query_length", int, 64, "Max query length.")
-data_g.add_arg("max_answer_length", int, 64, "Max answer length.")
-data_g.add_arg("batch_size", int, 12, "Total samples' number in batch for training. see also --in_tokens.")
+data_g.add_arg("max_query_length", int, 64, "Max query length.")
+data_g.add_arg("max_answer_length", int, 30, "Max answer length.")
+data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.")
 data_g.add_arg("in_tokens", bool, False,
                "If set, the batch size will be the maximum number of tokens in one batch. "
                "Otherwise, it will be the maximum number of examples in one batch.")
@@ -81,7 +80,7 @@ data_g.add_arg("n_best_size", int, 20,
                "The total number of n-best predictions to generate in the nbest_predictions.json output file.")
 data_g.add_arg("null_score_diff_threshold", float, 0.0,
                "If null_score - best_non_null is greater than the threshold predict null.")
-data_g.add_arg("random_seed", int, 0, "Random seed.")
+data_g.add_arg("random_seed", int, 0, "Random seed.")
 
 run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
 run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
diff --git a/BERT/train.py b/BERT/train.py
index 55d362b..51df870 100644
--- a/BERT/train.py
+++ b/BERT/train.py
@@ -65,7 +65,7 @@ data_g.add_arg("validation_set_dir", str, "./data/validation/", "Path to trai
 data_g.add_arg("test_set_dir", str, None, "Path to training data.")
 data_g.add_arg("vocab_path", str, "./config/vocab.txt", "Vocabulary path.")
 data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
-data_g.add_arg("batch_size", int, 8192, "Total examples' number in batch for training. see also --in_tokens.")
+data_g.add_arg("batch_size", int, 16, "Total examples' number in batch for training. see also --in_tokens.")
 data_g.add_arg("in_tokens", bool, False,
                "If set, the batch size will be the maximum number of tokens in one batch. "
                "Otherwise, it will be the maximum number of examples in one batch.")
--
GitLab