From 90b95becee9b2d828fd98b5793296b6eb9ce0a4c Mon Sep 17 00:00:00 2001 From: kuizhiqing Date: Sat, 2 Apr 2022 19:22:57 +0800 Subject: [PATCH] [launch] fix log more stable; default to stdout (#41314) --- .../paddle/distributed/launch/context/node.py | 1 + .../launch/controllers/controller.py | 5 ++-- .../distributed/launch/job/container.py | 25 +++++++++++-------- python/paddle/distributed/launch/main.py | 2 +- 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/python/paddle/distributed/launch/context/node.py b/python/paddle/distributed/launch/context/node.py index 2fa8b892275..8082541ffe0 100644 --- a/python/paddle/distributed/launch/context/node.py +++ b/python/paddle/distributed/launch/context/node.py @@ -44,6 +44,7 @@ class Node(object): return self.free_ports def get_free_port(self): + # for loop to avoid port conflict for _ in range(100): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index fbe9df4c9a2..9527ae35c4b 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -75,8 +75,9 @@ class ControllerBase(object): while not self.ctx.status.is_done(): status = self.pod.watch(timeout=2) - if self.ctx.continous_log(): - self.pod.logs() + #if self.ctx.continous_log(): + # default to print log + self.pod.logs() # completed if status == self.ctx.status.COMPLETED: diff --git a/python/paddle/distributed/launch/job/container.py b/python/paddle/distributed/launch/job/container.py index 1f43b6ce04b..a1ad6dbe24e 100644 --- a/python/paddle/distributed/launch/job/container.py +++ b/python/paddle/distributed/launch/job/container.py @@ -145,31 +145,34 @@ class Container(object): self.errfile, self._env, ) - def logs(self, fn=None, offset=0, whence=1, lines=1000): + def logs(self, fn=None, offset=0, whence=1, limit=1000): if not self._log_handler: self._log_handler = open(self._out) if fn is None: fn = sys.stdout - self._log_handler.seek(offset, whence) - try: - idx = 0 - for line in self._log_handler: - fn.write(line) - idx += 1 - if idx > lines: + if offset != 0 or whence != 1: + self._log_handler.seek(offset, whence) + + for _ in range(limit): + line = self._log_handler.readline() + if not line: break - finally: + fn.write(line) + except: return def tail(self, length=3000): if not self._log_handler: self._log_handler = open(self._out) - self._log_handler.seek(0, 2) - ed = self._log_handler.tell() + try: + self._log_handler.seek(0, 2) + ed = self._log_handler.tell() + except: + pass if ed > length: self.logs(offset=ed - length, whence=0) diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index dd7edba35a4..400a4472602 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -40,7 +40,7 @@ def launch(): - ``--rank``: The rank of the node, can be auto assigned by master. Default ``--rank=-1``. - - ``--log_level``: The log level to set for logging.setLevel which can be CRITICAL/ERROR/WARNING/INFO/DEBUG/NOTSET, case insensitive. The rank 0 log will not print in the terminal by default, while you can enable it by adding --log_level=debug. Default ``--log_level=INFO``. + - ``--log_level``: The log level to set for logging.setLevel which can be CRITICAL/ERROR/WARNING/INFO/DEBUG/NOTSET, case insensitive. Default ``--log_level=INFO``. - ``--nnodes``: The number of nodes for a distributed job, it can be a range in elastic mode, e.g., ``--nnodes=2:3``. Default ``--nnodes=1``. -- GitLab