# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""TFM common training driver library."""
# pytype: disable=attribute-error
import os
from typing import Any, Mapping, Tuple

# Import libraries
from absl import logging
import orbit
import tensorflow as tf

from official.core import base_task
from official.core import config_definitions
from official.core import train_utils

# Module-level aliases of the `train_utils` implementations.
# `maybe_create_best_ckpt_exporter` is the name `run_experiment` calls when it
# builds the trainer. `BestCheckpointExporter` is not referenced in the visible
# part of this module; presumably it is kept so callers can reach it through
# this module -- TODO confirm before removing.
BestCheckpointExporter = train_utils.BestCheckpointExporter
maybe_create_best_ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter


# Modes that `run_experiment` knows how to dispatch.
_SUPPORTED_MODES = ('train', 'train_and_eval', 'eval', 'continuous_eval')


def run_experiment(
    distribution_strategy: tf.distribute.Strategy,
    task: base_task.Task,
    mode: str,
    params: config_definitions.ExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    save_summary: bool = True
) -> Tuple[tf.keras.Model, Mapping[str, Any]]:
  """Runs train/eval configured by the experiment params.

  Args:
    distribution_strategy: A `tf.distribute.Strategy`; the trainer is created
      and every train/eval call is issued inside its scope.
    task: A Task instance.
    mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval'
      or 'continuous_eval'.
    params: ExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run post eval once after training, metrics logs
      are returned.
    save_summary: Whether to save train and validation summary.

  Returns:
    A 2-tuple of (model, eval_logs).
      model: `tf.keras.Model` instance.
      eval_logs: returns eval metrics logs when run_post_eval is set to True,
        otherwise, returns {}.

  Raises:
    NotImplementedError: If `mode` is not one of the supported modes. The check
      happens before any trainer, checkpoint manager or controller is built.
  """
  # Reject unsupported modes up front so that no trainer or checkpoint state is
  # created for a run that cannot be dispatched.
  if mode not in _SUPPORTED_MODES:
    raise NotImplementedError('The mode is not implemented: %s' % mode)

  with distribution_strategy.scope():
    trainer = train_utils.create_trainer(
        params,
        task,
        train='train' in mode,
        # Evaluation is also enabled when only a post-training eval is asked.
        evaluate=('eval' in mode) or run_post_eval,
        checkpoint_exporter=maybe_create_best_ckpt_exporter(
            params, model_dir))

  if trainer.checkpoint:
    checkpoint_manager = tf.train.CheckpointManager(
        trainer.checkpoint,
        directory=model_dir,
        max_to_keep=params.trainer.max_to_keep,
        step_counter=trainer.global_step,
        checkpoint_interval=params.trainer.checkpoint_interval,
        init_fn=trainer.initialize)
    # Adds recovery handling.
    trainer.add_recovery(params.trainer, checkpoint_manager=checkpoint_manager)
  else:
    checkpoint_manager = None

  # All summary-related controller arguments are disabled together.
  if save_summary:
    summary_dir = os.path.join(model_dir, 'train')
    eval_summary_dir = os.path.join(model_dir, 'validation')
    summary_interval = params.trainer.summary_interval
  else:
    summary_dir = eval_summary_dir = summary_interval = None

  controller = orbit.Controller(
      strategy=distribution_strategy,
      trainer=trainer if 'train' in mode else None,
      evaluator=trainer,
      global_step=trainer.global_step,
      steps_per_loop=params.trainer.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=summary_dir,
      eval_summary_dir=eval_summary_dir,
      summary_interval=summary_interval)

  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=params.trainer.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=params.trainer.train_steps,
          eval_steps=params.trainer.validation_steps,
          eval_interval=params.trainer.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=params.trainer.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        # Reports True once the global step has reached the configured number
        # of train steps.
        return bool(
            trainer.global_step.numpy() >= params.trainer.train_steps)

      controller.evaluate_continuously(
          steps=params.trainer.validation_steps,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn)

  if hasattr(trainer.model, 'count_params'):
    # NOTE(review): the message says "trainable", but Keras documents
    # `count_params()` as counting all weights -- confirm the wording.
    logging.info('Number of trainable params in model: %f Millions.',
                 trainer.model.count_params() / 10.**6)

  if run_post_eval:
    with distribution_strategy.scope():
      return trainer.model, trainer.evaluate(
          tf.convert_to_tensor(params.trainer.validation_steps))
  return trainer.model, {}