提交 12890338 编写于 作者: R Russell Power 提交者: TensorFlower Gardener

Turn on worker watchdog in TPUEstimator.

PiperOrigin-RevId: 216618378
上级 9bad98c6
......@@ -35,6 +35,8 @@ from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import session_run_hook
from tensorflow.python.training import training_util
# Module-level handle to the single watchdog instance for this process. It
# stays None until start_worker_watchdog() creates and stores one.
_WATCHDOG = None
class CoordinatorShutdownException(Exception):
  """Raised when the coordinator needs to shut down."""
......@@ -256,6 +258,22 @@ class WatchdogManager(threading.Thread):
time.sleep(self.ping_interval)
def start_worker_watchdog(session,
                          devices=None,
                          ping_interval=60,
                          shutdown_timeout=3600):
  """Start the global worker watchdog that shuts down workers on coordinator exit.

  The watchdog is kept in the module-level `_WATCHDOG`. If one has already
  been created, this call does nothing.

  Args:
    session: Forwarded unchanged to `WatchdogManager`.
    devices: Forwarded unchanged to `WatchdogManager`; defaults to None.
    ping_interval: Requested number of seconds between pings. The value
      actually used is capped at one tenth of `shutdown_timeout`.
    shutdown_timeout: Timeout in seconds; logged and forwarded to
      `WatchdogManager`.
  """
  global _WATCHDOG
  if _WATCHDOG is not None:
    # A watchdog already exists; leave it untouched.
    return

  # Cap the interval so that several pings fit inside one timeout window.
  effective_interval = min(shutdown_timeout / 10., ping_interval)
  logging.info('Enabling watchdog timer with %d second timeout',
               shutdown_timeout)
  _WATCHDOG = WatchdogManager(session, devices, effective_interval,
                              shutdown_timeout)
  _WATCHDOG.configure_and_run()
class GracefulShutdownHook(session_run_hook.SessionRunHook):
"""Session hook that watches for shutdown events.
......
......@@ -480,6 +480,12 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
self._outfeed_controller = _OpQueueContext(
name='OutfeedController', target=self._run_outfeed, args=(session,))
# Enable the worker watchdog to terminate workers on coordinator exit.
watchdog_timeout = int(os.environ.get('TF_TPU_WATCHDOG_TIMEOUT', '0'))
if watchdog_timeout > 0:
session_support.start_worker_watchdog(session,
shutdown_timeout=watchdog_timeout)
def before_run(self, run_context):
self._feed_error = None
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册