Merged commit includes the following changes: (#7141)

256204636 by hongkuny<hongkuny@google.com>:
    Internal
--
256079834 by hongkuny<hongkuny@google.com>:
    Clean up: move common flags together for further refactoring.
    Enable steps_per_loop option for all applications.
--
PiperOrigin-RevId: 256204636

Merged commit includes the following changes: (#7141)
256204636 by hongkuny<hongkuny@google.com>:
    Internal
--
256079834 by hongkuny<hongkuny@google.com>:
    Clean up: move common flags together for further refactoring.
    Enable steps_per_loop option for all applications.
--
PiperOrigin-RevId: 256204636
5175b7e6 · saberkun · GitHub · 8155eb9d · 5175b7e6 · 5175b7e6
5 changed files
--- a/official/bert/benchmark/bert_squad_benchmark.py
+++ b/official/bert/benchmark/bert_squad_benchmark.py
@@ -124,6 +124,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
    FLAGS.input_meta_data_path = SQUAD_SMALL_INPUT_META_DATA_PATH
    FLAGS.bert_config_file = MODEL_CONFIG_FILE_PATH
    FLAGS.num_train_epochs = 1
+    FLAGS.steps_per_loop = 1

  def _run_and_report_benchmark(self):
    """Runs the benchmark and reports various metrics."""
@@ -200,6 +201,7 @@ class BertSquadAccuracy(BertSquadBenchmarkBase):
    FLAGS.bert_config_file = MODEL_CONFIG_FILE_PATH
    FLAGS.init_checkpoint = PRETRAINED_CHECKPOINT_PATH
    FLAGS.num_train_epochs = 2
+    FLAGS.steps_per_loop = 1

  def _run_and_report_benchmark(self):
    """Runs the benchmark and reports various metrics."""

--- a/official/bert/common_flags.py
+++ b/official/bert/common_flags.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defining common flags used across all BERT models/applications."""
+
+from absl import flags
+
+
+def define_common_bert_flags():
+  """Define the absl flags shared by the BERT applications.
+
+  Registers, via `flags.DEFINE_*`, the flags: bert_config_file, model_dir,
+  tpu, init_checkpoint, strategy_type, num_train_epochs, steps_per_loop and
+  learning_rate. Takes no arguments and returns nothing.
+  """
+  flags.DEFINE_string('bert_config_file', None,
+                      'Bert configuration file to define core bert layers.')
+  flags.DEFINE_string('model_dir', None, (
+      'The directory where the model weights and training/evaluation summaries '
+      'are stored. If not specified, save to /tmp/bert20/.'))
+  flags.DEFINE_string('tpu', '', 'TPU address to connect to.')
+  flags.DEFINE_string(
+      'init_checkpoint', None,
+      'Initial checkpoint (usually from a pre-trained BERT model).')
+  flags.DEFINE_enum(
+      'strategy_type', 'mirror', ['tpu', 'mirror'],
+      'Distribution Strategy type to use for training. `tpu` uses '
+      'TPUStrategy for running on TPUs, `mirror` uses GPUs with '
+      'single host.')
+  flags.DEFINE_integer('num_train_epochs', 3,
+                       'Total number of training epochs to perform.')
+  flags.DEFINE_integer(
+      'steps_per_loop', 200,
+      'Number of steps per graph-mode loop. Only training step '
+      'happens inside the loop. Callbacks will not be called '
+      'inside.')
+  flags.DEFINE_float('learning_rate', 5e-5,
+                     'The initial learning rate for Adam.')
--- a/official/bert/run_classifier.py
+++ b/official/bert/run_classifier.py
@@ -29,6 +29,7 @@ import tensorflow as tf

 # Import BERT model libraries.
 from official.bert import bert_models
+from official.bert import common_flags
 from official.bert import input_pipeline
 from official.bert import model_saving_utils
 from official.bert import model_training_utils
@@ -42,29 +43,14 @@ flags.DEFINE_enum(
    'trains the model and evaluates in the meantime. '
    '`export_only`: will take the latest checkpoint inside '
    'model_dir and export a `SavedModel`.')
-flags.DEFINE_string('bert_config_file', None,
-                    'Bert configuration file to define core bert layers.')
-flags.DEFINE_string(
-    'model_dir', None,
-    ('The directory where the model weights and training/evaluation summaries '
-     'are stored. If not specified, save to /tmp/bert20/.'))
-flags.DEFINE_string('tpu', '', 'TPU address to connect to.')
 flags.DEFINE_string('train_data_path', None,
                    'Path to training data for BERT classifier.')
 flags.DEFINE_string('eval_data_path', None,
                    'Path to evaluation data for BERT classifier.')
-flags.DEFINE_string(
-    'init_checkpoint', None,
-    'Initial checkpoint (usually from a pre-trained BERT model).')
 flags.DEFINE_string(
    'model_export_path', None,
    'Path to the directory, where trainined model will be '
    'exported.')
-flags.DEFINE_enum(
-    'strategy_type', 'mirror', ['tpu', 'mirror'],
-    'Distribution Strategy type to use for training. `tpu` uses '
-    'TPUStrategy for running on TPUs, `mirror` uses GPUs with '
-    'single host.')
 # Model training specific flags.
 flags.DEFINE_string(
    'input_meta_data_path', None,
@@ -72,14 +58,8 @@ flags.DEFINE_string(
    'to be used for training and evaluation.')
 flags.DEFINE_integer('train_batch_size', 32, 'Batch size for training.')
 flags.DEFINE_integer('eval_batch_size', 32, 'Batch size for evaluation.')
-flags.DEFINE_integer('num_train_epochs', 3,
-                     'Total number of training epochs to perform.')
-flags.DEFINE_integer(
-    'steps_per_loop', 200,
-    'Number of steps per graph-mode loop. Only training step '
-    'happens inside the loop. Callbacks will not be called '
-    'inside.')
-flags.DEFINE_float('learning_rate', 5e-5, 'The initial learning rate for Adam.')
+
+common_flags.define_common_bert_flags()

 FLAGS = flags.FLAGS


--- a/official/bert/run_pretraining.py
+++ b/official/bert/run_pretraining.py
@@ -27,6 +27,7 @@ import tensorflow as tf

 # Import BERT model libraries.
 from official.bert import bert_models
+from official.bert import common_flags
 from official.bert import input_pipeline
 from official.bert import model_training_utils
 from official.bert import modeling
@@ -35,18 +36,6 @@ from official.bert import tpu_lib

 flags.DEFINE_string('input_files', None,
                    'File path to retrieve training data for pre-training.')
-flags.DEFINE_string('bert_config_file', None,
-                    'Bert configuration file to define core bert layers.')
-flags.DEFINE_string(
-    'model_dir', None,
-    ('The directory where the model weights and training/evaluation summaries '
-     'are stored. If not specified, save to /tmp/bert20/.'))
-flags.DEFINE_string('tpu', '', 'TPU address to connect to.')
-flags.DEFINE_enum(
-    'strategy_type', 'mirror', ['tpu', 'mirror'],
-    'Distribution Strategy type to use for training. `tpu` uses '
-    'TPUStrategy for running on TPUs, `mirror` uses GPUs with '
-    'single host.')
 # Model training specific flags.
 flags.DEFINE_integer(
    'max_seq_length', 128,
@@ -56,14 +45,13 @@ flags.DEFINE_integer(
 flags.DEFINE_integer('max_predictions_per_seq', 20,
                     'Maximum predictions per sequence_output.')
 flags.DEFINE_integer('train_batch_size', 32, 'Total batch size for training.')
-flags.DEFINE_integer('num_train_epochs', 3,
-                     'Total number of training epochs to perform.')
 flags.DEFINE_integer('num_steps_per_epoch', 1000,
                     'Total number of training steps to run per epoch.')
-flags.DEFINE_float('learning_rate', 5e-5, 'The initial learning rate for Adam.')
 flags.DEFINE_float('warmup_steps', 10000,
                   'Warmup steps for Adam weight decay optimizer.')

+common_flags.define_common_bert_flags()
+
 FLAGS = flags.FLAGS


@@ -116,6 +104,7 @@ def run_customized_training(strategy,
                            max_predictions_per_seq,
                            model_dir,
                            steps_per_epoch,
+                            steps_per_loop,
                            epochs,
                            initial_lr,
                            warmup_steps,
@@ -142,6 +131,7 @@ def run_customized_training(strategy,
      model_dir=model_dir,
      train_input_fn=train_input_fn,
      steps_per_epoch=steps_per_epoch,
+      steps_per_loop=steps_per_loop,
      epochs=epochs,
      use_remote_tpu=use_remote_tpu)

@@ -165,6 +155,7 @@ def run_bert_pretrain(strategy):
      FLAGS.max_predictions_per_seq,
      FLAGS.model_dir,
      FLAGS.num_steps_per_epoch,
+      FLAGS.steps_per_loop,
      FLAGS.num_train_epochs,
      FLAGS.learning_rate,
      FLAGS.warmup_steps,

--- a/official/bert/run_squad.py
+++ b/official/bert/run_squad.py
@@ -29,6 +29,7 @@ import tensorflow as tf

 # Import BERT model libraries.
 from official.bert import bert_models
+from official.bert import common_flags
 from official.bert import input_pipeline
 from official.bert import model_training_utils
 from official.bert import modeling
@@ -41,31 +42,12 @@ flags.DEFINE_bool('do_train', False, 'Whether to run training.')
 flags.DEFINE_bool('do_predict', False, 'Whether to run eval on the dev set.')
 flags.DEFINE_string('train_data_path', '',
                    'Training data path with train tfrecords.')
-flags.DEFINE_string('bert_config_file', None,
-                    'Bert configuration file to define core bert layers.')
-flags.DEFINE_string(
-    'model_dir', None,
-    ('The directory where the model weights and training/evaluation summaries '
-     'are stored.'))
 flags.DEFINE_string(
    'input_meta_data_path', None,
    'Path to file that contains meta data about input '
    'to be used for training and evaluation.')
-flags.DEFINE_string('tpu', '', 'TPU address to connect to.')
-flags.DEFINE_string(
-    'init_checkpoint', None,
-    'Initial checkpoint (usually from a pre-trained BERT model).')
-flags.DEFINE_enum(
-    'strategy_type', 'mirror', ['tpu', 'mirror'],
-    'Distribution Strategy type to use for training. `tpu` uses '
-    'TPUStrategy for running on TPUs, `mirror` uses GPUs with '
-    'single host.')
 # Model training specific flags.
 flags.DEFINE_integer('train_batch_size', 32, 'Total batch size for training.')
-flags.DEFINE_integer('num_train_epochs', 3,
-                     'Total number of training epochs to perform.')
-flags.DEFINE_float('learning_rate', 5e-5, 'The initial learning rate for Adam.')
-
 # Predict processing related.
 flags.DEFINE_string('predict_file', None,
                    'Prediction data path with train tfrecords.')
@@ -90,6 +72,8 @@ flags.DEFINE_integer(
    'The maximum length of an answer that can be generated. This is needed '
    'because the start and end predictions are not conditioned on one another.')

+common_flags.define_common_bert_flags()
+
 FLAGS = flags.FLAGS


@@ -230,6 +214,7 @@ def train_squad(strategy, input_meta_data, custom_callbacks=None):
      loss_fn=loss_fn,
      model_dir=FLAGS.model_dir,
      steps_per_epoch=steps_per_epoch,
+      steps_per_loop=FLAGS.steps_per_loop,
      epochs=epochs,
      train_input_fn=train_input_fn,
      init_checkpoint=FLAGS.init_checkpoint,