Commit de130e95 authored by WangXi, committed by gongweibao

[Cherry-pick 1.6] Print the rank of trainer & remove nccl sync in launch.py (#20937)

Parent 3db61dc0
@@ -51,7 +51,7 @@ logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 log_handler = logging.StreamHandler()
 log_format = logging.Formatter(
-    '%(asctime)s - %(filename)s:%(lineno)d - %(levelname)s: %(message)s')
+    '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
 log_handler.setFormatter(log_format)
 logger.addHandler(log_handler)
@@ -71,7 +71,7 @@ def _parse_args():
     parser = ArgumentParser(
         description='''start paddle training using multi-process mode.
 NOTE: your train program ***must*** run as distributed nccl2 mode,
-see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
+see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
 And your train program must read environment variables below in order to let different
 process init properly:
 FLAGS_selected_gpus
@@ -147,9 +147,6 @@ def terminate_procs(procs):
 def start_procs(args):
     """
     """
-    procs = []
-    log_fns = []
-
     default_env = os.environ.copy()
     current_node_ip = args.node_ip
@@ -213,48 +210,49 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
     current_env.pop("https_proxy", None)
     procs = []
+    log_fns = []
     cmds = []
+    ranks = []
     for i in range(0, selected_gpus_num):
+        rank = (node_id * selected_gpus_num + i)
         current_env.update({
             "FLAGS_selected_gpus": "%s" % selected_gpus[i],
-            "PADDLE_TRAINER_ID": "%d" % (node_id * selected_gpus_num + i),
+            "PADDLE_TRAINER_ID": "%d" % rank,
             "PADDLE_CURRENT_ENDPOINT":
             "%s:%d" % (current_node_ip, args.started_port + i),
             "PADDLE_TRAINERS_NUM": "%d" % nranks,
             "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
         })
+        if num_nodes > 1:
+            current_env.update({"FLAGS_sync_nccl_allreduce": "0"})
         cmd = [sys.executable, "-u", args.training_script
                ] + args.training_script_args
         cmds.append(cmd)
         if args.log_dir is not None:
             os.system("mkdir -p {}".format(args.log_dir))
             fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
             log_fns.append(fn)
             proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
         else:
             proc = subprocess.Popen(cmd, env=current_env)
         procs.append(proc)
+        ranks.append(rank)
     try:
         alive = True
         error = False
+        error_rank = []
         # wait all process finish or one error
         while alive and not error:
             alive = False
-            for p in procs:
+            for rank, p in zip(ranks, procs):
                 ret = p.poll()
                 if ret is None:
                     alive = True
                 elif ret != 0:
                     error = True
+                    error_rank.append(rank)
             time.sleep(1)
         if error:
@@ -266,11 +264,15 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
             terminate_procs(procs)
             raise
     except SystemExit:
-        logger.error("One trainer process abort, exit")
+        logger.error(
+            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
+            format(nranks, error_rank))
         terminate_procs(procs)
         raise
     except:
-        logger.error("Trainer process abort, exit")
+        logger.error(
+            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
+            format(nranks, error_rank))
         terminate_procs(procs)
         raise
     finally:
...
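As the help text in _parse_args() above says, the training script must read the environment variables that launch.py exports in order to initialize each process. The following is a minimal consumer sketch, not code from this commit; the variable names come from the diff above, while the parsing choices (the comma-separated endpoint list, the fallback defaults) are assumptions for illustration.

import os

# Sketch of a worker process reading what launch.py exports (illustration
# only; parsing details are assumptions, not taken from this commit).
def read_launch_env():
    # Per-process rank; launch.py computes it as node_id * selected_gpus_num + i.
    rank = int(os.environ.get("PADDLE_TRAINER_ID", "0"))
    # Total number of trainer processes across all nodes (nranks in launch.py).
    nranks = int(os.environ.get("PADDLE_TRAINERS_NUM", "1"))
    # This process's own ip:port endpoint.
    endpoint = os.environ.get("PADDLE_CURRENT_ENDPOINT", "")
    # Assumed to be a comma-separated list of all trainer endpoints.
    endpoints = os.environ.get("PADDLE_TRAINER_ENDPOINTS", "").split(",")
    # GPU assigned to this process.
    gpu = os.environ.get("FLAGS_selected_gpus", "0")
    # Multi-node runs get FLAGS_sync_nccl_allreduce=0 from this commit;
    # single-node runs keep the framework default.
    sync = os.environ.get("FLAGS_sync_nccl_allreduce")
    return rank, nranks, endpoint, endpoints, gpu, sync

if __name__ == "__main__":
    print(read_launch_env())

Taken together, the two changes in this commit are visible in the sketch's last two fields: the new `if num_nodes > 1` branch disables synchronous NCCL allreduce for multi-node jobs (the "remove nccl sync" part of the title), and the `ranks`/`error_rank` bookkeeping lets the launcher report which trainer rank aborted instead of a generic failure message.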