# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import subprocess
import os
import time
import six
import copy
import unittest
import paddle.fluid as fluid

from argparse import ArgumentParser, REMAINDER
R
Roc 已提交
25
from paddle.distributed.utils.launch_utils import _print_arguments, get_gpus, get_cluster_from_args
W
WangXi 已提交
26
from paddle.distributed.fleet.launch_utils import find_free_ports
27 28 29 30


def _parse_args():
    parser = ArgumentParser(
31 32 33 34 35 36 37 38 39 40 41
        description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
And your train program must read environment variables below in order to let different
process init properly:
FLAGS_selected_gpus
PADDLE_TRAINER_ID
PADDLE_CURRENT_ENDPOINT
PADDLE_TRAINERS_NUM
PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
42 43
''')

44
    #Optional arguments for the launch helper
45 46 47 48 49
    parser.add_argument(
        "--cluster_node_ips",
        type=str,
        default="127.0.0.1",
        help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
50 51 52 53
    parser.add_argument("--node_ip",
                        type=str,
                        default="127.0.0.1",
                        help="The current node ip. ")
54 55 56
    parser.add_argument(
        "--use_paddlecloud",
        action='store_true',
57 58
        help=
        "wheter to use paddlecloud platform to run your multi-process job. If false, no need to set this argument."
59
    )
60 61 62 63
    parser.add_argument("--started_port",
                        type=int,
                        default=None,
                        help="The trainer's started port on a single node")
64

65 66 67 68
    parser.add_argument("--print_config",
                        type=bool,
                        default=True,
                        help="Print the config or not")
69 70 71 72 73

    parser.add_argument(
        "--selected_gpus",
        type=str,
        default=None,
74 75
        help=
        "It's for gpu training and the training process will run on the selected_gpus,"
76 77 78 79 80 81
        "each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training."
    )

    parser.add_argument(
        "--log_level",
        type=int,
82
        default=
83
        20,  # logging.INFO, details are here:https://docs.python.org/3/library/logging.html#levels
84 85 86 87 88
        help="Logging level, default is logging.INFO")

    parser.add_argument(
        "--log_dir",
        type=str,
89 90
        help=
        "The path for each process's log.If it's not set, the log will printed to default pipe."
91 92
    )

93 94 95 96 97 98 99
    #positional
    parser.add_argument("training_script",
                        type=str,
                        help="The full path to the single GPU training "
                        "program/script to be launched in parallel, "
                        "followed by all the arguments for the "
                        "training script")
100

101
    #rest from the training program
102 103 104 105 106
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()


class TestCoverage(unittest.TestCase):
    """Smoke tests that call the launch helpers without asserting results.

    Each test passes as long as the helpers it calls do not raise.
    """

    def test_gpus(self):
        """Call _print_arguments, get_gpus and get_cluster_from_args."""
        parsed = _parse_args()
        if parsed.print_config:
            _print_arguments(parsed)

        # The return value is not inspected; only the call is exercised.
        get_gpus(None)

        parsed.use_paddlecloud = True
        # Unpacking is kept so a non-pair return value still fails the test.
        _cluster, _pod = get_cluster_from_args(parsed, "0")

    def test_find_free_ports(self):
        """Call find_free_ports with an argument of 2."""
        find_free_ports(2)

122 123 124

# Run the test cases in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()