# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
import os
import signal
import copy
import sys
import subprocess
from contextlib import closing
import socket
from distutils.util import strtobool

from paddle.distributed.fleet.launch_utils import get_backend_by_compile_flag
from ..utils.log_utils import get_logger

logger = get_logger("INFO", "root")


def get_cluster_from_args(args, selected_gpus):
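    """Build the cluster description from parsed launch arguments.

    For a single local node with no explicit port, free ports are probed;
    otherwise a contiguous range starting at ``args.started_port``
    (default 6070) is used, one port per selected GPU.
    """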
    node_ips = [x.strip() for x in args.cluster_node_ips.split(',')]
    node_ip = args.node_ip
    node_rank = node_ips.index(node_ip)

    logger.debug(
        "parsed from args:node_ips:{} node_ip:{} node_rank:{}".format(
            node_ips, node_ip, node_rank
        )
    )

    free_ports = None
    if (
        not args.use_paddlecloud
        and len(node_ips) <= 1
        and args.started_port is None
    ):
        free_ports = find_free_ports(len(selected_gpus))
        if free_ports is not None:
            free_ports = list(free_ports)
    else:
        started_port = 6070
        if args.started_port is not None:
            started_port = args.started_port

        free_ports = [
            x for x in range(started_port, started_port + len(selected_gpus))
        ]

    trainer_endpoints = []
    for ip in node_ips:
        trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
    return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus)


def get_gpus(selected_gpus):
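    """Resolve ``selected_gpus`` into the device ids visible to this process.

    Illustrative example (assumed invocation): with
    CUDA_VISIBLE_DEVICES="4,5,6,7" and selected_gpus="4,5", the returned
    ids are the relative indices [0, 1].
    """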
    if selected_gpus is None:
        from paddle.fluid import core

        gpus_num = core.get_cuda_device_count()
        gpus = [str(x) for x in range(0, gpus_num)]
    else:
        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
        if cuda_visible_devices is None or cuda_visible_devices == "":
            gpus = [x.strip() for x in selected_gpus.split(',')]
        else:
            # change selected_gpus into relative values
            # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
            # therefore selected_gpus=0,1,2,3
            cuda_visible_devices_list = cuda_visible_devices.split(',')
            for x in selected_gpus.split(','):
                assert x in cuda_visible_devices_list, (
                    "Can't find "
                    "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."
                    % (x, cuda_visible_devices)
                )
            gpus = [
                cuda_visible_devices_list.index(x.strip())
                for x in selected_gpus.split(',')
            ]
            logger.info(
                "Change selected_gpus into relative values. "
                "--selected_gpus:{} will change into relative ids:{} "
                "according to your CUDA_VISIBLE_DEVICES:{}".format(
                    selected_gpus, gpus, cuda_visible_devices_list
                )
            )

    return gpus


class Hdfs(object):
    def __init__(self):
        self.hdfs_ugi = None
        self.hdfs_name = None
        self.hdfs_path = None

    def is_valid(self):
        return (
            self.hdfs_ugi is not None
            and self.hdfs_name is not None
            and self.hdfs_path is not None
        )

    def __str__(self):
        return "hdfs_ugi:{} hdfs_name:{} hdfs_path{}".format(
            self.hdfs_ugi, self.hdfs_name, self.hdfs_path
        )

    def __eq__(self, n):
        return (
            self.hdfs_ugi == n.hdfs_ugi
            and self.hdfs_name == n.hdfs_name
            and self.hdfs_path == n.hdfs_path
        )

    def __ne__(self, n):
        return not self == n


class Cluster(object):
    def __init__(self, hdfs):
        self.job_server = None
        self.pods = []
        self.hdfs = hdfs
        self.job_stage_flag = None

    def __str__(self):
        return "job_server:{} pods:{} job_stage_flag:{} hdfs:{}".format(
            self.job_server,
            [str(pod) for pod in self.pods],
            self.job_stage_flag,
            self.hdfs,
        )

    def __eq__(self, cluster):
        if len(self.pods) != len(cluster.pods):
            return False

        for a, b in zip(self.pods, cluster.pods):
            if a != b:
                return False

        if self.job_stage_flag != cluster.job_stage_flag:
            return False

        return True

    def __ne__(self, cluster):
        return not self.__eq__(cluster)

    def update_pods(self, cluster):
        self.pods = copy.copy(cluster.pods)

    def trainers_nranks(self):
        return len(self.trainers_endpoints())

    def pods_nranks(self):
        return len(self.pods)

    def trainers_endpoints(self):
        r = []
        for pod in self.pods:
            for t in pod.trainers:
                r.append(t.endpoint)
        return r

    def pods_endpoints(self):
        r = []
        for pod in self.pods:
            ep = "{}:{}".format(pod.addr, pod.port)
            assert (
                pod.port is not None and pod.addr is not None
            ), "{} not a valid endpoint".format(ep)
            r.append(ep)

        return r

    def get_pod_by_id(self, pod_id):
        for pod in self.pods:
            if str(pod_id) == str(pod.id):
                return pod

        return None


class JobServer(object):
    def __init__(self):
        self.endpoint = None

    def __str__(self):
        return "{}".format(self.endpoint)

    def __eq__(self, j):
        return self.endpoint == j.endpoint

    def __ne__(self, j):
        return not self == j


class Trainer(object):
    def __init__(self):
        self.gpus = []
        self.endpoint = None
        self.rank = None

    def __str__(self):
        return "gpu:{} endpoint:{} rank:{}".format(
            self.gpus, self.endpoint, self.rank
        )

    def __eq__(self, t):
        if len(self.gpus) != len(t.gpus):
            return False

        if self.endpoint != t.endpoint or self.rank != t.rank:
            return False

        for a, b in zip(self.gpus, t.gpus):
            if a != b:
                return False

        return True

    def __ne__(self, t):
        return not self == t

    def get_rank(self):
        return self.rank


class Pod(object):
    def __init__(self):
        self.rank = None
        self.id = None
        self.addr = None
        self.port = None
        self.trainers = []
        self.gpus = []

    def __str__(self):
        return (
            "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{}".format(
                self.rank,
                self.id,
                self.addr,
                self.port,
                self.gpus,
                [str(t) for t in self.trainers],
            )
        )

    def __eq__(self, pod):
        if (
            self.rank != pod.rank
            or self.id != pod.id
            or self.addr != pod.addr
            or self.port != pod.port
        ):
            logger.debug("pod {} != {}".format(self, pod))
            return False

        if len(self.trainers) != len(pod.trainers):
            logger.debug(
                "trainers {} != {}".format(self.trainers, pod.trainers)
            )
            return False

        for i in range(len(self.trainers)):
            if self.trainers[i] != pod.trainers[i]:
                logger.debug(
                    "trainer {} != {}".format(self.trainers[i], pod.trainers[i])
                )
                return False

        return True

    def __ne__(self, pod):
        return not self == pod

    def parse_response(self, res_pods):
        pass

    def get_visible_gpus(self):
        r = ",".join(str(g) for g in self.gpus)
        assert r != "", "this pod {} can't see any gpus".format(self)
        return r


def get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus):
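    """Assemble the Cluster/Pod/Trainer hierarchy from ips and endpoints.

    Returns the whole cluster plus the pod matching ``node_ip``. As an
    illustrative sketch (values assumed), a single node with two GPUs:

        cluster, pod = get_cluster(
            ["10.0.0.1"],
            "10.0.0.1",
            [["10.0.0.1:6070", "10.0.0.1:6071"]],
            ["0", "1"],
        )
    """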
    assert isinstance(trainer_endpoints, list), "trainer_endpoints must be list"
    cluster = Cluster(hdfs=None)
    trainer_rank = 0
    for node_rank, ip in enumerate(node_ips):
        pod = Pod()
        pod.rank = node_rank
        pod.addr = ip
        cur_node_endpoints = trainer_endpoints[node_rank]
        # When using paddlecloud, the number of endpoints may exceed the
        # number of user-defined selected_gpus.
        assert len(cur_node_endpoints) >= len(
            selected_gpus
        ), "current trainer_endpoints size should be greater than or equal to selected_gpus size."
        for i in range(len(selected_gpus)):
            trainer = Trainer()
            trainer.gpus.append(selected_gpus[i])
            trainer.endpoint = cur_node_endpoints[i]
            trainer.rank = trainer_rank
            trainer_rank += 1

            pod.trainers.append(trainer)
        cluster.pods.append(pod)

    pod_rank = node_ips.index(node_ip)
    return cluster, cluster.pods[pod_rank]


def terminate_local_procs(procs):
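    """Terminate all still-running local trainer processes and close their
    log files; SIGKILL any survivors and exit the launcher on failure."""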
    for p in procs:
        if p.proc.poll() is None:
            p.proc.terminate()
            if p.log_fn:
                p.log_fn.close()
            logger.debug("terminate process id:{}".format(p.proc.pid))

    # wait for all processes to terminate
    time.sleep(3)
    for step in range(0, 50):
        alive = False
        for p in procs:
            if p.proc.poll() is None:  # not terminated yet
                os.kill(p.proc.pid, signal.SIGKILL)
                alive = True

        if not alive:
            logger.info("terminate all the procs")
            return

        time.sleep(3)

    logger.fatal("can't kill all processes, exiting")
    sys.exit(1)


def get_host_name_ip():
    try:
        host_name = socket.gethostname()
        host_ip = socket.gethostbyname(host_name)
        return host_name, host_ip
    except OSError:
        return None


def add_arguments(argname, type, default, help, argparser, **kwargs):
    """Add an argparse argument.
    Usage:
    .. code-block:: python
        parser = argparse.ArgumentParser()
        add_arguments("name", str, "John", "User name.", parser)
        args = parser.parse_args()
    """
    type = strtobool if type == bool else type
    argparser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs
    )


def find_free_ports(num):
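    """Ask the OS for ``num`` distinct free ports by binding to port 0.

    Returns a set of ports, or None if not enough ports are found after
    100 attempts. The ports are released again before use, so a race with
    other processes is possible.
    """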
    def __free_port():
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(('', 0))
            return s.getsockname()[1]

    port_set = set()
    step = 0
    while True:
        port = __free_port()
        if port not in port_set:
            port_set.add(port)

        if len(port_set) >= num:
            return port_set

        step += 1
        if step > 100:
            print(
                "can't find an available port, falling back to the specified static port!"
            )
            return None


def _prepare_trainer_env(cluster, trainer, backend=None):
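    """Build the per-trainer environment variables for the given backend.

    For example (illustrative values), an 'nccl' trainer with rank 0 on
    GPU 0 at 127.0.0.1:6070 in a two-trainer job gets roughly:

        {
            "FLAGS_selected_gpus": "0",
            "PADDLE_TRAINER_ID": "0",
            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:6070",
            "PADDLE_TRAINERS_NUM": "2",
            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:6070,127.0.0.1:6071",
        }
    """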
    if backend is None:
        backend = get_backend_by_compile_flag()  # for compatibility
    if backend == 'bkcl':
        proc_env = {
            "FLAGS_selected_xpus": "%s"
            % ",".join([str(g) for g in trainer.gpus]),
            "PADDLE_TRAINER_ID": "%d" % trainer.rank,
            "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
        }
    elif backend == 'nccl':
        proc_env = {
            "FLAGS_selected_gpus": "%s"
            % ",".join([str(g) for g in trainer.gpus]),
            "PADDLE_TRAINER_ID": "%d" % trainer.rank,
            "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
        }
    elif backend == 'cncl':
        proc_env = {
            "FLAGS_selected_mlus": "%s"
            % ",".join([str(g) for g in trainer.gpus]),
            "PADDLE_TRAINER_ID": "%d" % trainer.rank,
            "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
        }
    elif backend == 'gloo':
        # NOTE(xiongkun): by default, fall back to CPU only
        proc_env = {
            "PADDLE_TRAINER_ID": "%d" % trainer.rank,
            "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
            "PADDLE_DISTRI_BACKEND": backend,  # only add here, other will be auto
        }
    else:
        raise ValueError("backend must be one of 'gloo', 'nccl', 'bkcl' or 'cncl'")

    return proc_env


class TrainerProc(object):
    def __init__(self):
        self.proc = None
        self.log_fn = None
        self.log_offset = None
        self.rank = None
        self.local_rank = None
        self.cmd = None


def start_local_trainers(
    cluster, pod, training_script, training_script_args, log_dir=None
):
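    """Spawn one subprocess per trainer in ``pod`` and return a list of
    TrainerProc handles. If ``log_dir`` is given, each worker's stdout and
    stderr are appended to ``<log_dir>/workerlog.<local_rank>``."""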
    current_env = os.environ.copy()
    # Paddle broadcasts ncclUniqueId over sockets, and a proxy may make
    # trainers unreachable, so delete the proxy variables. Setting them to ""
    # would make grpc log the error message "bad uri", so just delete them.
    current_env.pop("http_proxy", None)
    current_env.pop("https_proxy", None)

    procs = []
    for idx, t in enumerate(pod.trainers):
        proc_env = _prepare_trainer_env(cluster, t)
        current_env.update(proc_env)

        logger.debug("trainer proc env:{}".format(current_env))

        cmd = [sys.executable, "-u", training_script] + training_script_args

        logger.info("start trainer proc:{} env:{}".format(cmd, proc_env))

        fn = None
        if log_dir is not None:
            os.makedirs(log_dir, exist_ok=True)
            fn = open("%s/workerlog.%d" % (log_dir, idx), "a")
            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
        else:
            proc = subprocess.Popen(cmd, env=current_env)

        tp = TrainerProc()
        tp.proc = proc
        tp.rank = t.rank
        tp.local_rank = idx
        tp.log_fn = fn
        tp.log_offset = fn.tell() if fn else None
        tp.cmd = cmd

        procs.append(tp)

    return procs


def pull_worker_log(tp):
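    """Stream new lines from a trainer's log file to stdout, resuming
    from the last read offset."""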
    if tp.log_fn:
        with open(tp.log_fn.name, 'r') as fin:
            fin.seek(tp.log_offset, 0)
            for line in fin:
                try:
                    sys.stdout.write(line)
                except UnicodeEncodeError:
                    sys.stdout.write(
                        'UnicodeEncodeError occurred on this line. '
                        'Please refer to the original log file "%s"\n'
                        % tp.log_fn.name
                    )
            tp.log_offset = fin.tell()


def watch_local_trainers(procs, nranks):
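    """Poll the local trainer processes once: forward the local-rank-0
    log, tear everything down if any trainer failed, and return whether
    any process is still alive."""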
    try:
        error = False
        error_rank = []
        # wait until all processes finish or one errors
        alive = False
        for p in procs:
            if p.log_fn and p.local_rank == 0:
                pull_worker_log(p)

            ret = p.proc.poll()
            if ret is None:
                alive = True
            elif ret != 0:
                error = True
                error_rank.append(p.rank)

        if error:
            terminate_local_procs(procs)
            sys.exit(1)

    except KeyboardInterrupt:
        logger.warning("KeyboardInterrupt, exit")
        terminate_local_procs(procs)
        raise
    except SystemExit:
        logger.error(
            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".format(
                nranks, error_rank
            )
        )
        terminate_local_procs(procs)
        raise
    except:
        logger.error(
            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".format(
                nranks, error_rank
            )
        )
        terminate_local_procs(procs)
        raise

    return alive
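
# A minimal driver sketch (illustrative only; the argument names below are
# assumptions, not part of this module):
#
#     gpus = get_gpus(args.selected_gpus)
#     cluster, pod = get_cluster_from_args(args, gpus)
#     procs = start_local_trainers(
#         cluster, pod, args.training_script, args.training_script_args,
#         log_dir=args.log_dir)
#     while watch_local_trainers(procs, cluster.trainers_nranks()):
#         time.sleep(3)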


def _print_arguments(args):
    print("-----------  Configuration Arguments -----------")
    for arg, value in sorted(vars(args).items()):
        print("%s: %s" % (arg, value))
    print("------------------------------------------------")