未验证 提交 3432c740 编写于 作者: 郭叶军 提交者: GitHub

deepspeed/launcher/launch.py: add option '--enable_each_rank_log logdir' (#2409)

上级 be4ffb82
......@@ -92,6 +92,12 @@ def parse_args():
default=0,
help="main launching process pid, for internal pid tracking")
parser.add_argument(
"--enable_each_rank_log",
default="None",
type=str,
help="redirect the stdout and stderr from each rank into different log files")
# positional
parser.add_argument("training_script",
type=str,
......@@ -197,6 +203,22 @@ def main():
cmd = []
if not args.enable_elastic_training:
if args.enable_each_rank_log != "None":
# prepare the log path and the file name prefix
if os.path.isfile(args.enable_each_rank_log):
raise ValueError(
f"{args.enable_each_rank_log} should not be a file, it should be a directory."
)
if not os.path.exists(args.enable_each_rank_log):
try:
os.makedirs(args.enable_each_rank_log)
except Exception as e:
print(e)
raise ValueError(
f"unable to create directory {args.enable_each_rank_log} for each rank log."
)
log_name_prefix = time.strftime("%Y%m%d%H%M%S", time.localtime())
for local_rank in range(0, num_local_procs):
# each process's rank
dist_rank = global_rank_mapping[local_node][local_rank]
......@@ -219,7 +241,17 @@ def main():
cmd.append(f"--local_rank={local_rank}")
cmd += args.training_script_args
process = subprocess.Popen(cmd, env=current_env)
if args.enable_each_rank_log != "None":
log_file = os.path.join(args.enable_each_rank_log,
f"{log_name_prefix}_rank{dist_rank}.log")
log_fd = open(log_file, 'w')
process = subprocess.Popen(cmd,
env=current_env,
stdout=log_fd,
stderr=log_fd)
else:
process = subprocess.Popen(cmd, env=current_env)
processes.append(process)
else:
from ..elasticity import DSElasticAgent
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册