Unverified commit f5331eaa, authored by Songtao Liu, committed by GitHub

fix bug of multi-machine training (#240)

fix(tools): fix bug of mm training
Parent d776311e
......@@ -106,6 +106,12 @@ python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o
* -b: total batch size, the recommended number for -b is num-gpu * 8
* --fp16: mixed precision training
**Multi Machine Training**
We also support multi-node training. Just add the following args:
* --num_machines: the total number of training nodes
* --machine_rank: the rank of the current node
When using -f, the above commands are equivalent to:
```shell
......
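For orientation, here is a quick sketch (not part of this commit; the 2-node, 8-GPU values are assumptions) of how the two new flags combine into per-process ranks. The arithmetic mirrors the formula in `yolox/core/launch.py` further down this diff.

```python
# Illustrative values for a 2-node, 8-GPU-per-node job (assumed, not from the diff).
num_machines = 2          # --num_machines
num_gpus_per_machine = 8  # -d
world_size = num_machines * num_gpus_per_machine  # 16 training processes in total

def global_rank(machine_rank: int, local_rank: int) -> int:
    # Same arithmetic as _distributed_worker in yolox/core/launch.py.
    return machine_rank * num_gpus_per_machine + local_rank

assert global_rank(machine_rank=0, local_rank=0) == 0   # master process
assert global_rank(machine_rank=1, local_rank=3) == 11  # GPU 3 on the second node
```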
......@@ -41,7 +41,7 @@ def make_parser():
"--local_rank", default=0, type=int, help="local rank for dist training"
)
parser.add_argument(
"--num_machine", default=1, type=int, help="num of node for training"
"--num_machines", default=1, type=int, help="num of node for training"
)
parser.add_argument(
"--machine_rank", default=0, type=int, help="node rank for multi-node training"
......@@ -104,9 +104,6 @@ def make_parser():
@logger.catch
def main(exp, args, num_gpu):
if not args.experiment_name:
args.experiment_name = exp.exp_name
if args.seed is not None:
random.seed(args.seed)
torch.manual_seed(args.seed)
......@@ -118,16 +115,11 @@ def main(exp, args, num_gpu):
is_distributed = num_gpu > 1
# set environment variables for distributed training
configure_nccl()
cudnn.benchmark = True
rank = args.local_rank
# rank = get_local_rank()
if rank == 0:
if os.path.exists("./" + args.experiment_name + "ip_add.txt"):
os.remove("./" + args.experiment_name + "ip_add.txt")
file_name = os.path.join(exp.output_dir, args.experiment_name)
if rank == 0:
......@@ -198,13 +190,16 @@ if __name__ == "__main__":
exp = get_exp(args.exp_file, args.name)
exp.merge(args.opts)
if not args.experiment_name:
args.experiment_name = exp.exp_name
num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
assert num_gpu <= torch.cuda.device_count()
launch(
main,
num_gpu,
args.num_machine,
args.num_machines,
args.machine_rank,
backend=args.dist_backend,
dist_url=args.dist_url,
......
......@@ -9,7 +9,6 @@ import torch.backends.cudnn as cudnn
from yolox.core import Trainer, launch
from yolox.exp import get_exp
from yolox.utils import configure_nccl
import argparse
import random
......@@ -57,7 +56,7 @@ def make_parser():
help="resume training start epoch",
)
parser.add_argument(
"--num_machine", default=1, type=int, help="num of node for training"
"--num_machines", default=1, type=int, help="num of node for training"
)
parser.add_argument(
"--machine_rank", default=0, type=int, help="node rank for multi-node training"
......@@ -88,9 +87,6 @@ def make_parser():
@logger.catch
def main(exp, args):
if not args.experiment_name:
args.experiment_name = exp.exp_name
if exp.seed is not None:
random.seed(exp.seed)
torch.manual_seed(exp.seed)
......@@ -102,7 +98,6 @@ def main(exp, args):
)
# set environment variables for distributed training
configure_nccl()
cudnn.benchmark = True
trainer = Trainer(exp, args)
......@@ -114,13 +109,16 @@ if __name__ == "__main__":
exp = get_exp(args.exp_file, args.name)
exp.merge(args.opts)
if not args.experiment_name:
args.experiment_name = exp.exp_name
num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
assert num_gpu <= torch.cuda.device_count()
launch(
main,
num_gpu,
args.num_machine,
args.num_machines,
args.machine_rank,
backend=args.dist_backend,
dist_url=args.dist_url,
......
......@@ -12,6 +12,7 @@ import torch.distributed as dist
import torch.multiprocessing as mp
import yolox.utils.dist as comm
from yolox.utils import configure_nccl
import os
import subprocess
......@@ -63,11 +64,13 @@ def launch(
os.environ.get("MASTER_PORT", "None"),
)
local_rank = int(os.environ.get("LOCAL_RANK", "0"))
world_size = int(os.environ.get("WORLD_SIZE", "1"))
_distributed_worker(
local_rank,
main_func,
world_size,
num_gpus_per_machine,
num_machines,
machine_rank,
backend,
dist_url,
......@@ -99,29 +102,30 @@ def launch_by_subprocess(
assert (
world_size > 1
), "subprocess mode doesn't support single GPU, use spawn mode instead"
machine_rank = int(os.getenv("RLAUNCH_REPLICA", machine_rank))
if dist_url is None:
master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8")
master_ip = str(master_ip).strip()
dist_url = "tcp://{}".format(master_ip)
# ------------------------hack for multi-machine training -------------------- #
if num_machines > 1:
ip_add_file = "./" + args[1].experiment_name + "ip_add.txt"
master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8")
master_ip = str(master_ip).strip()
dist_url = "tcp://{}".format(master_ip)
ip_add_file = "./" + args[1].experiment_name + "_ip_add.txt"
if machine_rank == 0:
port = _find_free_port()
with open(ip_add_file, "w") as ip_add:
ip_add.write(dist_url)
ip_add.write(dist_url+'\n')
ip_add.write(str(port))
else:
while not os.path.exists(ip_add_file):
time.sleep(0.5)
with open(ip_add_file, "r") as ip_add:
dist_url = ip_add.readline()
dist_url = ip_add.readline().strip()
port = ip_add.readline()
else:
dist_url = "tcp://127.0.0.1"
port = _find_free_port()
port = _find_free_port()
# set PyTorch distributed related environmental variables
current_env = os.environ.copy()
current_env["MASTER_ADDR"] = dist_url
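This hunk is the core of the fix. When `num_machines > 1`, rank 0 publishes the master address and a free port in a small per-experiment file (now named with an underscore separator and written with a newline between the two values), while the other nodes poll for that file and strip the trailing newline when they read it back. A condensed, self-contained sketch of that handshake (assumed helper names, not the exact repository code):

```python
import os
import socket
import subprocess
import time


def _find_free_port():
    # Ask the OS for an unused TCP port (same idea as the helper in launch.py).
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]


def rendezvous(machine_rank, experiment_name):
    """File-based handshake for num_machines > 1 (condensed sketch)."""
    ip_add_file = "./" + experiment_name + "_ip_add.txt"  # underscore separator added by this fix
    if machine_rank == 0:
        master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8").strip()
        dist_url = "tcp://{}".format(master_ip)
        port = str(_find_free_port())
        with open(ip_add_file, "w") as f:
            f.write(dist_url + "\n")  # newline added by this fix, so the url and
            f.write(port)             # the port end up on separate lines
    else:
        while not os.path.exists(ip_add_file):  # wait for the master node to publish
            time.sleep(0.5)
        with open(ip_add_file, "r") as f:
            dist_url = f.readline().strip()      # strip() added by this fix
            port = f.readline()
    return dist_url, port
```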
......@@ -166,6 +170,7 @@ def _distributed_worker(
main_func,
world_size,
num_gpus_per_machine,
num_machines,
machine_rank,
backend,
dist_url,
......@@ -174,6 +179,7 @@ def _distributed_worker(
assert (
torch.cuda.is_available()
), "cuda is not available. Please check your installation."
configure_nccl()
global_rank = machine_rank * num_gpus_per_machine + local_rank
logger.info("Rank {} initialization finished.".format(global_rank))
try:
......@@ -190,10 +196,16 @@ def _distributed_worker(
# See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
comm.synchronize()
if global_rank == 0 and os.path.exists(
"./" + args[1].experiment_name + "_ip_add.txt"
):
os.remove("./" + args[1].experiment_name + "_ip_add.txt")
assert num_gpus_per_machine <= torch.cuda.device_count()
torch.cuda.set_device(local_rank)
args[1].local_rank = local_rank
args[1].num_machines = num_machines
# Setup the local process group (which contains ranks within the same machine)
# assert comm._LOCAL_PROCESS_GROUP is None
......
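After initialization, the workers synchronize and rank 0 deletes the rendezvous file, so a stale file from a previous run can no longer poison the next one. A minimal sketch of that pattern, with `torch.distributed.barrier` standing in for the repo's `comm.synchronize()` (it assumes an already-initialized process group):

```python
import os
import torch.distributed as dist


def cleanup_rendezvous_file(global_rank, experiment_name):
    # Wait until every rank has read the file, then let rank 0 delete it.
    # dist.barrier() stands in for yolox.utils.dist.synchronize() here.
    dist.barrier()
    path = "./" + experiment_name + "_ip_add.txt"
    if global_rank == 0 and os.path.exists(path):
        os.remove(path)
```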
......@@ -55,11 +55,6 @@ class Trainer:
self.meter = MeterBuffer(window_size=exp.print_interval)
self.file_name = os.path.join(exp.output_dir, args.experiment_name)
if self.rank == 0 and os.path.exists(
"./" + args.experiment_name + "ip_add.txt"
):
os.remove("./" + args.experiment_name + "ip_add.txt")
if self.rank == 0:
os.makedirs(self.file_name, exist_ok=True)
......
......@@ -206,7 +206,7 @@ class COCOEvaluator:
try:
from yolox.layers import COCOeval_opt as COCOeval
except ImportError:
from .cocoeval_mr import COCOeval
from pycocotools import cocoeval as COCOeval
logger.warning("Use standard COCOeval.")
......
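This hunk swaps the vendored `cocoeval_mr` fallback for pycocotools. For reference, the evaluator class in pycocotools lives at `pycocotools.cocoeval.COCOeval`, so a generic optional-dependency fallback that needs the class (rather than the module) would look like this sketch:

```python
from loguru import logger

try:
    # Optional fast evaluator shipped with YOLOX.
    from yolox.layers import COCOeval_opt as COCOeval
except ImportError:
    # Plain pycocotools fallback: COCOeval is the class inside pycocotools.cocoeval.
    from pycocotools.cocoeval import COCOeval
    logger.warning("Use standard COCOeval.")
```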