未验证 提交 cf1f1601 编写于 作者: C Chunyang Wen 提交者: GitHub

Use fstr in launcher (#1521)

* Use fstr in launcher

* Fix wrong condition for word_info

* Fix typo
上级 f9b37801
......@@ -70,23 +70,21 @@ def main():
for k in current_env.keys():
if "NCCL" in k:
logger.info("%s %s %s", args.node_rank, k, current_env[k])
logger.info(f"{args.node_rank} {k}={current_env[k]}")
world_info = None
assert args.world_info != "None", "must provide world info dict"
if args.world_info == "None":
raise ValueError("world_info can not be None")
world_info = base64.urlsafe_b64decode(args.world_info)
world_info = json.loads(world_info)
logger.info("WORLD INFO DICT: {}".format(world_info))
logger.info(f"WORLD INFO DICT: {world_info}")
node_list = list(world_info.keys())
args.nnodes = len(node_list)
local_node = node_list[args.node_rank]
local_gpu_ids = world_info[local_node]
num_local_procs = len(local_gpu_ids)
logger.info(
"nnodes={}, num_local_procs={}, node_rank={}".format(args.nnodes,
num_local_procs,
args.node_rank),
f"nnodes={args.nnodes}, num_local_procs={num_local_procs}, node_rank={args.node_rank}"
)
global_rank_mapping = defaultdict(list)
......@@ -98,12 +96,10 @@ def main():
for gid in gids:
global_rank_mapping[node_id].append(curr_global_rank)
curr_global_rank += 1
logger.info("global_rank_mapping={}".format(global_rank_mapping))
logger.info("dist_world_size={}".format(dist_world_size))
logger.info(f"global_rank_mapping={global_rank_mapping}")
logger.info(f"dist_world_size={dist_world_size}")
current_env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, local_gpu_ids))
logger.info("Setting CUDA_VISIBLE_DEVICES={}".format(
current_env["CUDA_VISIBLE_DEVICES"]))
exclusion_counts_per_node = None
logger.info(f"Setting CUDA_VISIBLE_DEVICES={current_env['CUDA_VISIBLE_DEVICES']}")
# set PyTorch distributed related environmental variables
current_env["MASTER_ADDR"] = args.master_addr
......@@ -111,6 +107,7 @@ def main():
current_env["WORLD_SIZE"] = str(dist_world_size)
processes = []
cmd = []
for local_rank in range(0, num_local_procs):
# each process's rank
dist_rank = global_rank_mapping[local_node][local_rank]
......@@ -118,36 +115,34 @@ def main():
current_env["LOCAL_RANK"] = str(local_rank)
# spawn the processes
cmd = [
sys.executable,
"-u",
args.training_script,
"--local_rank={}".format(local_rank)
] + args.training_script_args
sig_names = {2: "SIGINT", 15: "SIGTERM"}
last_return_code = None
def sigkill_handler(signum, frame):
for process in processes:
print(f"Killing subprocess {process.pid}")
try:
process.kill()
except Exception as e:
pass
if last_return_code is not None:
raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
if signum in sig_names:
print(f"Main process received {sig_names[signum]}, exiting")
sys.exit(1)
# pass SIGINT/SIGTERM to children if the parent is being terminated
signal.signal(signal.SIGINT, sigkill_handler)
signal.signal(signal.SIGTERM, sigkill_handler)
cmd = [sys.executable,
"-u",
args.training_script,
f"--local_rank={local_rank}"] + args.training_script_args
process = subprocess.Popen(cmd, env=current_env)
processes.append(process)
sig_names = {2: "SIGINT", 15: "SIGTERM"}
last_return_code = None
def sigkill_handler(signum, frame):
for process in processes:
logger.info(f"Killing subprocess {process.pid}")
try:
process.kill()
except Exception:
pass
if last_return_code is not None:
raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
if signum in sig_names:
logger.info(f"Main process received {sig_names[signum]}, exiting")
sys.exit(1)
# pass SIGINT/SIGTERM to children if the parent is being terminated
signal.signal(signal.SIGINT, sigkill_handler)
signal.signal(signal.SIGTERM, sigkill_handler)
alive_processes = set(processes)
while len(alive_processes):
finished_processes = []
......@@ -161,6 +156,7 @@ def main():
sigkill_handler(signal.SIGTERM, None) # not coming back
else:
# exited cleanly
logger.info(f"Process {process.pid} exits successfully.")
finished_processes.append(process)
alive_processes = set(alive_processes) - set(finished_processes)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册