未验证 提交 cc6dcc7d 编写于 作者: C Chitsing KUI 提交者: GitHub

[LAUNCH] add log overwrite flag (#53608) (#53757)

* add log overwrite flag

* use strtobool
上级 adaa2510
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
import os import os
from argparse import REMAINDER, ArgumentParser from argparse import REMAINDER, ArgumentParser
from distutils.util import strtobool
env_args_mapping = { env_args_mapping = {
'POD_IP': 'host', 'POD_IP': 'host',
...@@ -22,6 +23,7 @@ env_args_mapping = { ...@@ -22,6 +23,7 @@ env_args_mapping = {
'PADDLE_NNODES': 'nnodes', 'PADDLE_NNODES': 'nnodes',
'PADDLE_RUN_MODE': 'run_mode', 'PADDLE_RUN_MODE': 'run_mode',
'PADDLE_LOG_LEVEL': 'log_level', 'PADDLE_LOG_LEVEL': 'log_level',
'PADDLE_LOG_OVERWRITE': 'log_overwrite',
'PADDLE_NPROC_PER_NODE': 'nproc_per_node', 'PADDLE_NPROC_PER_NODE': 'nproc_per_node',
'PADDLE_JOB_ID': 'job_id', 'PADDLE_JOB_ID': 'job_id',
'PADDLE_RANK': 'rank', 'PADDLE_RANK': 'rank',
...@@ -61,7 +63,7 @@ def parse_args(): ...@@ -61,7 +63,7 @@ def parse_args():
) )
base_group.add_argument( base_group.add_argument(
"--legacy", type=bool, default=False, help="use legacy launch" "--legacy", type=strtobool, default=False, help="use legacy launch"
) )
base_group.add_argument( base_group.add_argument(
...@@ -72,6 +74,13 @@ def parse_args(): ...@@ -72,6 +74,13 @@ def parse_args():
"--log_level", type=str, default="INFO", help="log level. Default INFO" "--log_level", type=str, default="INFO", help="log level. Default INFO"
) )
base_group.add_argument(
"--log_overwrite",
type=strtobool,
default=False,
help="overwrite exits logfiles. Default False",
)
base_group.add_argument( base_group.add_argument(
"--nnodes", "--nnodes",
type=str, type=str,
......
...@@ -205,6 +205,7 @@ class Controller(ControllerBase): ...@@ -205,6 +205,7 @@ class Controller(ControllerBase):
c = Container( c = Container(
entrypoint=(entrypoint or self._get_entrypoint()), entrypoint=(entrypoint or self._get_entrypoint()),
env=(self.ctx.get_envs() if use_ctx_env else {}), env=(self.ctx.get_envs() if use_ctx_env else {}),
overwrite_log=self.ctx.args.log_overwrite,
) )
c.outfile, c.errfile = self._get_out_err_file(out, err) c.outfile, c.errfile = self._get_out_err_file(out, err)
c.update_env(envs) c.update_env(envs)
...@@ -286,7 +287,7 @@ class Controller(ControllerBase): ...@@ -286,7 +287,7 @@ class Controller(ControllerBase):
) )
try: try:
os.makedirs(os.path.dirname(f), exist_ok=True) os.makedirs(os.path.dirname(f), exist_ok=True)
with open(f, 'w') as fd: with open(f, container.log_mode) as fd:
for k, v in sorted(container.env.items()): for k, v in sorted(container.env.items()):
fd.write(str(f"{k}={v}\n")) fd.write(str(f"{k}={v}\n"))
except Exception as e: except Exception as e:
......
...@@ -25,7 +25,7 @@ class Container: ...@@ -25,7 +25,7 @@ class Container:
TODO(kuizhiqing) A container can be run by process/thread or just a callable function TODO(kuizhiqing) A container can be run by process/thread or just a callable function
''' '''
def __init__(self, entrypoint=[], rank=-1, env={}): def __init__(self, entrypoint=[], rank=-1, env={}, overwrite_log=False):
self._entrypoint = entrypoint self._entrypoint = entrypoint
self._rank = rank self._rank = rank
self._out = None self._out = None
...@@ -39,6 +39,8 @@ class Container: ...@@ -39,6 +39,8 @@ class Container:
self._log_handler = None self._log_handler = None
self._shell = False self._shell = False
self.log_mode = 'w' if overwrite_log else 'a'
@property @property
def env(self): def env(self):
return self._env return self._env
...@@ -104,7 +106,7 @@ class Container: ...@@ -104,7 +106,7 @@ class Container:
d = os.path.dirname(pth) d = os.path.dirname(pth)
if not os.path.isdir(d): if not os.path.isdir(d):
os.makedirs(d, exist_ok=True) os.makedirs(d, exist_ok=True)
return open(pth, 'a') return open(pth, self.log_mode)
except: except:
return None return None
...@@ -120,7 +122,7 @@ class Container: ...@@ -120,7 +122,7 @@ class Container:
elif self._err: elif self._err:
self._stderr = self._get_fd(self._err) or sys.stderr self._stderr = self._get_fd(self._err) or sys.stderr
if not self._log_handler: if self._out and not self._log_handler:
self._log_handler = open(self._out) self._log_handler = open(self._out)
self._log_handler.seek(0, 2) self._log_handler.seek(0, 2)
self._log_start_offset = self._log_handler.tell() self._log_start_offset = self._log_handler.tell()
...@@ -179,7 +181,7 @@ class Container: ...@@ -179,7 +181,7 @@ class Container:
def logs(self, fn=None, offset=0, whence=1, limit=1000): def logs(self, fn=None, offset=0, whence=1, limit=1000):
if not self._log_handler: if not self._log_handler:
self._log_handler = open(self._out) return
if fn is None: if fn is None:
fn = sys.stdout fn = sys.stdout
...@@ -201,7 +203,7 @@ class Container: ...@@ -201,7 +203,7 @@ class Container:
def tail(self, length=3000): def tail(self, length=3000):
if not self._log_handler: if not self._log_handler:
self._log_handler = open(self._out) return
try: try:
self._log_handler.seek(0, 2) self._log_handler.seek(0, 2)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册