提交 9d8ec423 作者: WangXi 提交者: gongweibao

launch.py remove setting for nccl sync, test=develop (#20909)

上级 b6260f38
...@@ -71,7 +71,7 @@ def _parse_args(): ...@@ -71,7 +71,7 @@ def _parse_args():
parser = ArgumentParser( parser = ArgumentParser(
description='''start paddle training using multi-process mode. description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode, NOTE: your train program ***must*** run as distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2- see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
And your train program must read environment variables below in order to let different And your train program must read environment variables below in order to let different
process init properly: process init properly:
FLAGS_selected_gpus FLAGS_selected_gpus
...@@ -147,9 +147,6 @@ def terminate_procs(procs): ...@@ -147,9 +147,6 @@ def terminate_procs(procs):
def start_procs(args): def start_procs(args):
""" """
""" """
procs = []
log_fns = []
default_env = os.environ.copy() default_env = os.environ.copy()
current_node_ip = args.node_ip current_node_ip = args.node_ip
...@@ -213,12 +210,11 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips)) ...@@ -213,12 +210,11 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
current_env.pop("https_proxy", None) current_env.pop("https_proxy", None)
procs = [] procs = []
log_fns = []
cmds = [] cmds = []
ranks = [] ranks = []
for i in range(0, selected_gpus_num): for i in range(0, selected_gpus_num):
rank = (node_id * selected_gpus_num + i) rank = (node_id * selected_gpus_num + i)
current_env.update({ current_env.update({
"FLAGS_selected_gpus": "%s" % selected_gpus[i], "FLAGS_selected_gpus": "%s" % selected_gpus[i],
"PADDLE_TRAINER_ID": "%d" % rank, "PADDLE_TRAINER_ID": "%d" % rank,
...@@ -228,19 +224,14 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips)) ...@@ -228,19 +224,14 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
"PADDLE_TRAINER_ENDPOINTS": trainers_endpoints "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
}) })
if num_nodes > 1:
current_env.update({"FLAGS_sync_nccl_allreduce": "0"})
cmd = [sys.executable, "-u", args.training_script cmd = [sys.executable, "-u", args.training_script
] + args.training_script_args ] + args.training_script_args
cmds.append(cmd) cmds.append(cmd)
if args.log_dir is not None: if args.log_dir is not None:
os.system("mkdir -p {}".format(args.log_dir)) os.system("mkdir -p {}".format(args.log_dir))
fn = open("%s/workerlog.%d" % (args.log_dir, i), "w") fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
log_fns.append(fn) log_fns.append(fn)
proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn) proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
else: else:
proc = subprocess.Popen(cmd, env=current_env) proc = subprocess.Popen(cmd, env=current_env)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册