Commit e78d7f57 authored by WangXi, committed by gongweibao

Print the rank of the trainer that errored in launch.py, test=develop (#20838)

Parent 48669aa8
@@ -51,7 +51,7 @@ logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 log_handler = logging.StreamHandler()
 log_format = logging.Formatter(
-    '%(asctime)s - %(filename)s:%(lineno)d - %(levelname)s: %(message)s')
+    '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
 log_handler.setFormatter(log_format)
 logger.addHandler(log_handler)
@@ -214,10 +214,14 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
     procs = []
     cmds = []
+    ranks = []
     for i in range(0, selected_gpus_num):
+        rank = (node_id * selected_gpus_num + i)
         current_env.update({
             "FLAGS_selected_gpus": "%s" % selected_gpus[i],
-            "PADDLE_TRAINER_ID": "%d" % (node_id * selected_gpus_num + i),
+            "PADDLE_TRAINER_ID": "%d" % rank,
             "PADDLE_CURRENT_ENDPOINT":
             "%s:%d" % (current_node_ip, args.started_port + i),
             "PADDLE_TRAINERS_NUM": "%d" % nranks,
@@ -242,19 +246,22 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
         proc = subprocess.Popen(cmd, env=current_env)
         procs.append(proc)
+        ranks.append(rank)

     try:
         alive = True
         error = False
+        error_rank = []
         # wait all process finish or one error
         while alive and not error:
             alive = False
-            for p in procs:
+            for rank, p in zip(ranks, procs):
                 ret = p.poll()
                 if ret is None:
                     alive = True
                 elif ret != 0:
                     error = True
+                    error_rank.append(rank)
             time.sleep(1)

         if error:
@@ -266,11 +273,15 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
         terminate_procs(procs)
         raise
     except SystemExit:
-        logger.error("One trainer process abort, exit")
+        logger.error(
+            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
+            format(nranks, error_rank))
         terminate_procs(procs)
         raise
     except:
-        logger.error("Trainer process abort, exit")
+        logger.error(
+            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
+            format(nranks, error_rank))
         terminate_procs(procs)
         raise
     finally:
......
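Taken together, the change keeps a ranks list parallel to procs so that the poll loop can report exactly which trainer exited abnormally instead of only "one trainer process abort". A condensed, self-contained sketch of that pattern (the two child commands below are made up for illustration; launch.py spawns the real per-GPU trainers and terminates the survivors via terminate_procs):

import subprocess
import time

# Hypothetical stand-ins for the per-GPU trainer processes: one succeeds
# after a short sleep, the other exits with a non-zero code.
cmds = [["python", "-c", "import time; time.sleep(2)"],
        ["python", "-c", "import sys; sys.exit(3)"]]

procs, ranks = [], []
for rank, cmd in enumerate(cmds):
    procs.append(subprocess.Popen(cmd))
    ranks.append(rank)

alive, error, error_rank = True, False, []
while alive and not error:
    alive = False
    for rank, p in zip(ranks, procs):
        ret = p.poll()
        if ret is None:            # still running
            alive = True
        elif ret != 0:             # exited with a failure code
            error = True
            error_rank.append(rank)
    time.sleep(1)

if error:
    # launch.py logs this, terminates the remaining procs, and re-raises.
    print("ABORT!!! Out of all {} trainers, the trainer process with rank={} "
          "was aborted. Please check its log.".format(len(cmds), error_rank))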