Unverified commit f5331eaa, authored by Songtao Liu, committed by GitHub

fix bug of multi-machine training (#240)

fix(tools): fix bug of mm training
Parent d776311e
......@@ -106,6 +106,12 @@ python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o
* -b: total batch size, the recommended number for -b is num-gpu * 8
* --fp16: mixed precision training
**Multi Machine Training**
We also support multi-node training. Just add the following args:
* --num_machines: the total number of training nodes
* --machine_rank: the rank of the current node
When using -f, the above commands are equivalent to:
```shell
......
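For orientation, here is a quick sketch (not part of this commit; the 2-node, 8-GPU values are assumptions) of how the two new flags combine into per-process ranks. The arithmetic mirrors the formula in `yolox/core/launch.py` further down this diff.

```python
# Illustrative values for a 2-node, 8-GPU-per-node job (assumed, not from the diff).
num_machines = 2          # --num_machines
num_gpus_per_machine = 8  # -d
world_size = num_machines * num_gpus_per_machine  # 16 training processes in total

def global_rank(machine_rank: int, local_rank: int) -> int:
    # Same arithmetic as _distributed_worker in yolox/core/launch.py.
    return machine_rank * num_gpus_per_machine + local_rank

assert global_rank(machine_rank=0, local_rank=0) == 0   # master process
assert global_rank(machine_rank=1, local_rank=3) == 11  # GPU 3 on the second node
```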
......@@ -41,7 +41,7 @@ def make_parser():
"--local_rank", default=0, type=int, help="local rank for dist training"
)
parser.add_argument(
"--num_machine", default=1, type=int, help="num of node for training"
"--num_machines", default=1, type=int, help="num of node for training"
)
parser.add_argument(
"--machine_rank", default=0, type=int, help="node rank for multi-node training"
......@@ -104,9 +104,6 @@ def make_parser():
@logger.catch
def main(exp, args, num_gpu):
if not args.experiment_name:
args.experiment_name = exp.exp_name
if args.seed is not None:
random.seed(args.seed)
torch.manual_seed(args.seed)
......@@ -118,16 +115,11 @@ def main(exp, args, num_gpu):
is_distributed = num_gpu > 1
# set environment variables for distributed training
configure_nccl()
cudnn.benchmark = True
rank = args.local_rank
# rank = get_local_rank()
if rank == 0:
if os.path.exists("./" + args.experiment_name + "ip_add.txt"):
os.remove("./" + args.experiment_name + "ip_add.txt")
file_name = os.path.join(exp.output_dir, args.experiment_name)
if rank == 0:
......@@ -198,13 +190,16 @@ if __name__ == "__main__":
exp = get_exp(args.exp_file, args.name)
exp.merge(args.opts)
if not args.experiment_name:
args.experiment_name = exp.exp_name
num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
assert num_gpu <= torch.cuda.device_count()
launch(
main,
num_gpu,
args.num_machine,
args.num_machines,
args.machine_rank,
backend=args.dist_backend,
dist_url=args.dist_url,
......
......@@ -9,7 +9,6 @@ import torch.backends.cudnn as cudnn
from yolox.core import Trainer, launch
from yolox.exp import get_exp
from yolox.utils import configure_nccl
import argparse
import random
......@@ -57,7 +56,7 @@ def make_parser():
help="resume training start epoch",
)
parser.add_argument(
"--num_machine", default=1, type=int, help="num of node for training"
"--num_machines", default=1, type=int, help="num of node for training"
)
parser.add_argument(
"--machine_rank", default=0, type=int, help="node rank for multi-node training"
......@@ -88,9 +87,6 @@ def make_parser():
@logger.catch
def main(exp, args):
if not args.experiment_name:
args.experiment_name = exp.exp_name
if exp.seed is not None:
random.seed(exp.seed)
torch.manual_seed(exp.seed)
......@@ -102,7 +98,6 @@ def main(exp, args):
)
# set environment variables for distributed training
configure_nccl()
cudnn.benchmark = True
trainer = Trainer(exp, args)
......@@ -114,13 +109,16 @@ if __name__ == "__main__":
exp = get_exp(args.exp_file, args.name)
exp.merge(args.opts)
if not args.experiment_name:
args.experiment_name = exp.exp_name
num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
assert num_gpu <= torch.cuda.device_count()
launch(
main,
num_gpu,
args.num_machine,
args.num_machines,
args.machine_rank,
backend=args.dist_backend,
dist_url=args.dist_url,
......
......@@ -12,6 +12,7 @@ import torch.distributed as dist
import torch.multiprocessing as mp
import yolox.utils.dist as comm
from yolox.utils import configure_nccl
import os
import subprocess
......@@ -63,11 +64,13 @@ def launch(
os.environ.get("MASTER_PORT", "None"),
)
local_rank = int(os.environ.get("LOCAL_RANK", "0"))
world_size = int(os.environ.get("WORLD_SIZE", "1"))
_distributed_worker(
local_rank,
main_func,
world_size,
num_gpus_per_machine,
num_machines,
machine_rank,
backend,
dist_url,
......@@ -99,29 +102,30 @@ def launch_by_subprocess(
assert (
world_size > 1
), "subprocess mode doesn't support single GPU, use spawn mode instead"
machine_rank = int(os.getenv("RLAUNCH_REPLICA", machine_rank))
if dist_url is None:
master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8")
master_ip = str(master_ip).strip()
dist_url = "tcp://{}".format(master_ip)
# ------------------------hack for multi-machine training -------------------- #
if num_machines > 1:
ip_add_file = "./" + args[1].experiment_name + "ip_add.txt"
master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8")
master_ip = str(master_ip).strip()
dist_url = "tcp://{}".format(master_ip)
ip_add_file = "./" + args[1].experiment_name + "_ip_add.txt"
if machine_rank == 0:
port = _find_free_port()
with open(ip_add_file, "w") as ip_add:
ip_add.write(dist_url)
ip_add.write(dist_url+'\n')
ip_add.write(str(port))
else:
while not os.path.exists(ip_add_file):
time.sleep(0.5)
with open(ip_add_file, "r") as ip_add:
dist_url = ip_add.readline()
dist_url = ip_add.readline().strip()
port = ip_add.readline()
else:
dist_url = "tcp://127.0.0.1"
port = _find_free_port()
port = _find_free_port()
# set PyTorch distributed related environmental variables
current_env = os.environ.copy()
current_env["MASTER_ADDR"] = dist_url
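This hunk is the core of the fix. When `num_machines > 1`, rank 0 publishes the master address and a free port in a small per-experiment file (now named with an underscore separator and written with a newline between the two values), while the other nodes poll for that file and strip the trailing newline when they read it back. A condensed, self-contained sketch of that handshake (assumed helper names, not the exact repository code):

```python
import os
import socket
import subprocess
import time


def _find_free_port():
    # Ask the OS for an unused TCP port (same idea as the helper in launch.py).
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]


def rendezvous(machine_rank, experiment_name):
    """File-based handshake for num_machines > 1 (condensed sketch)."""
    ip_add_file = "./" + experiment_name + "_ip_add.txt"  # underscore separator added by this fix
    if machine_rank == 0:
        master_ip = subprocess.check_output(["hostname", "--fqdn"]).decode("utf-8").strip()
        dist_url = "tcp://{}".format(master_ip)
        port = str(_find_free_port())
        with open(ip_add_file, "w") as f:
            f.write(dist_url + "\n")  # newline added by this fix, so the url and
            f.write(port)             # the port end up on separate lines
    else:
        while not os.path.exists(ip_add_file):  # wait for the master node to publish
            time.sleep(0.5)
        with open(ip_add_file, "r") as f:
            dist_url = f.readline().strip()      # strip() added by this fix
            port = f.readline()
    return dist_url, port
```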
......@@ -166,6 +170,7 @@ def _distributed_worker(
main_func,
world_size,
num_gpus_per_machine,
num_machines,
machine_rank,
backend,
dist_url,
......@@ -174,6 +179,7 @@ def _distributed_worker(
assert (
torch.cuda.is_available()
), "cuda is not available. Please check your installation."
configure_nccl()
global_rank = machine_rank * num_gpus_per_machine + local_rank
logger.info("Rank {} initialization finished.".format(global_rank))
try:
......@@ -190,10 +196,16 @@ def _distributed_worker(
# See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
comm.synchronize()
if global_rank == 0 and os.path.exists(
"./" + args[1].experiment_name + "_ip_add.txt"
):
os.remove("./" + args[1].experiment_name + "_ip_add.txt")
assert num_gpus_per_machine <= torch.cuda.device_count()
torch.cuda.set_device(local_rank)
args[1].local_rank = local_rank
args[1].num_machines = num_machines
# Setup the local process group (which contains ranks within the same machine)
# assert comm._LOCAL_PROCESS_GROUP is None
......
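After initialization, the workers synchronize and rank 0 deletes the rendezvous file, so a stale file from a previous run can no longer poison the next one. A minimal sketch of that pattern, with `torch.distributed.barrier` standing in for the repo's `comm.synchronize()` (it assumes an already-initialized process group):

```python
import os
import torch.distributed as dist


def cleanup_rendezvous_file(global_rank, experiment_name):
    # Wait until every rank has read the file, then let rank 0 delete it.
    # dist.barrier() stands in for yolox.utils.dist.synchronize() here.
    dist.barrier()
    path = "./" + experiment_name + "_ip_add.txt"
    if global_rank == 0 and os.path.exists(path):
        os.remove(path)
```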
......@@ -55,11 +55,6 @@ class Trainer:
self.meter = MeterBuffer(window_size=exp.print_interval)
self.file_name = os.path.join(exp.output_dir, args.experiment_name)
if self.rank == 0 and os.path.exists(
"./" + args.experiment_name + "ip_add.txt"
):
os.remove("./" + args.experiment_name + "ip_add.txt")
if self.rank == 0:
os.makedirs(self.file_name, exist_ok=True)
......
......@@ -206,7 +206,7 @@ class COCOEvaluator:
try:
from yolox.layers import COCOeval_opt as COCOeval
except ImportError:
from .cocoeval_mr import COCOeval
from pycocotools import cocoeval as COCOeval
logger.warning("Use standard COCOeval.")
......
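This hunk swaps the vendored `cocoeval_mr` fallback for pycocotools. For reference, the evaluator class in pycocotools lives at `pycocotools.cocoeval.COCOeval`, so a generic optional-dependency fallback that needs the class (rather than the module) would look like this sketch:

```python
from loguru import logger

try:
    # Optional fast evaluator shipped with YOLOX.
    from yolox.layers import COCOeval_opt as COCOeval
except ImportError:
    # Plain pycocotools fallback: COCOeval is the class inside pycocotools.cocoeval.
    from pycocotools.cocoeval import COCOeval
    logger.warning("Use standard COCOeval.")
```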