test_launch_coverage.py 3.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import sys
import subprocess
import os
import time
import six
import copy
import unittest
import paddle.fluid as fluid

from argparse import ArgumentParser, REMAINDER
from paddle.distributed.utils import _print_arguments, get_gpus, get_cluster_from_args
W
WangXi 已提交
27
from paddle.distributed.fleet.launch_utils import find_free_ports
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118


def _parse_args():
    parser = ArgumentParser(
        description='''start paddle training using multi-process mode.	
NOTE: your train program ***must*** run as distributed nccl2 mode,	
see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-	
And your train program must read environment variables below in order to let different	
process init properly:	
FLAGS_selected_gpus	
PADDLE_TRAINER_ID	
PADDLE_CURRENT_ENDPOINT	
PADDLE_TRAINERS_NUM	
PADDLE_TRAINER_ENDPOINTS	
POD_IP (current node ip address, not needed for local training)	
''')

    #Optional arguments for the launch helper	
    parser.add_argument(
        "--cluster_node_ips",
        type=str,
        default="127.0.0.1",
        help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
    parser.add_argument(
        "--node_ip",
        type=str,
        default="127.0.0.1",
        help="The current node ip. ")
    parser.add_argument(
        "--use_paddlecloud",
        action='store_true',
        help="wheter to use paddlecloud platform to run your multi-process job. If false, no need to set this argument."
    )
    parser.add_argument(
        "--started_port",
        type=int,
        default=None,
        help="The trainer's started port on a single node")

    parser.add_argument(
        "--print_config",
        type=bool,
        default=True,
        help="Print the config or not")

    parser.add_argument(
        "--selected_gpus",
        type=str,
        default=None,
        help="It's for gpu training and the training process will run on the selected_gpus,"
        "each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training."
    )

    parser.add_argument(
        "--log_level",
        type=int,
        default=20,  # logging.INFO, details are here:https://docs.python.org/3/library/logging.html#levels	
        help="Logging level, default is logging.INFO")

    parser.add_argument(
        "--log_dir",
        type=str,
        help="The path for each process's log.If it's not set, the log will printed to default pipe."
    )

    #positional	
    parser.add_argument(
        "training_script",
        type=str,
        help="The full path to the single GPU training "
        "program/script to be launched in parallel, "
        "followed by all the arguments for the "
        "training script")

    #rest from the training program	
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()


class TestCoverage(unittest.TestCase):
    def test_gpus(self):
        args = _parse_args()

        if args.print_config:
            _print_arguments(args)

        gpus = get_gpus(None)

        args.use_paddlecloud = True
        cluster, pod = get_cluster_from_args(args, "0")

W
WangXi 已提交
119 120 121
    def test_find_free_ports(self):
        find_free_ports(2)

122 123 124

if __name__ == '__main__':
    unittest.main()