Unverified commit 1358397e authored by gongweibao and committed by GitHub

Clean up the redundant files and unify the launch interface. (#28928)

Parent 47af5c3c
@@ -14,7 +14,7 @@
import os
import paddle
-from paddle.distributed.utils import get_cluster, logger
+from paddle.distributed.utils import get_cluster, logger, get_gpus, get_cluster_from_args

def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus):
@@ -94,5 +94,26 @@ paddlecloud environment.".format(args_node_ips, node_ips))
    return cluster, cluster.pods[node_rank]

-def get_trainers_num():
+def _get_trainers_num():
    return int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
def get_cluster_and_pod(args):
    # parse arguments, used for cloud-single-machine and local
    selected_gpus = get_gpus(args.selected_gpus)
    trainers_num = _get_trainers_num()
    logger.debug("parsed from args trainers_num:{} selected_gpus:{}".format(
        trainers_num, selected_gpus))

    cluster = None
    pod = None

    if args.use_paddlecloud and trainers_num != 1:
        cluster, pod = get_cloud_cluster(args.cluster_node_ips, args.node_ip,
                                         args.started_port, selected_gpus)
        logger.info("get cluster from cloud:{}".format(cluster))
    else:
        cluster, pod = get_cluster_from_args(args, selected_gpus)
        logger.info("get cluster from args:{}".format(cluster))
    return cluster, pod
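For reference, a minimal standalone sketch (not part of this commit) of the argument object that get_cluster_and_pod above expects; the attribute names come from the function body, the concrete values are only illustrative.

    # Hypothetical caller-side illustration; attribute names taken from get_cluster_and_pod.
    from argparse import Namespace

    args = Namespace(
        selected_gpus="0,1",            # string parsed by get_gpus()
        use_paddlecloud=False,          # True only when running on paddlecloud
        cluster_node_ips="127.0.0.1",   # comma-separated node ips
        node_ip="127.0.0.1",
        started_port=6170)
    # cluster, pod = get_cluster_and_pod(args)  # would build a single-node cluster here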
@@ -17,9 +17,12 @@ import paddle
from paddle.distributed.fleet.launch_utils import get_cluster, logger

-def get_cloud_cluster(args_node_ips, selected_gpus, args_port=6170):
+def get_cloud_cluster(args_node_ips,
+                      device_mode,
+                      devices_per_proc,
+                      args_port=6170):
    """
-    args_node_ips: string, selected_gpus: list, args_port: int
+    args_node_ips: string, device_mode: DeviceMode(IntEnum), devices_per_proc: list, args_port: int
    """
#you can automatically get ip info while using paddlecloud multi nodes mode. #you can automatically get ip info while using paddlecloud multi nodes mode.
node_ips = os.getenv("PADDLE_TRAINERS") node_ips = os.getenv("PADDLE_TRAINERS")
...@@ -55,7 +58,7 @@ paddlecloud environment.".format(args_node_ips, node_ips)) ...@@ -55,7 +58,7 @@ paddlecloud environment.".format(args_node_ips, node_ips))
paddle_port = int(os.getenv("PADDLE_PORT", "")) paddle_port = int(os.getenv("PADDLE_PORT", ""))
    if paddle_ports_num >= len(
-            selected_gpus) and paddle_port != args_port:
+            devices_per_proc) and paddle_port != args_port:
logger.warning("Use Cloud specified port:{}.".format( logger.warning("Use Cloud specified port:{}.".format(
paddle_port)) paddle_port))
started_port = paddle_port started_port = paddle_port
...@@ -67,7 +70,7 @@ paddlecloud environment.".format(args_node_ips, node_ips)) ...@@ -67,7 +70,7 @@ paddlecloud environment.".format(args_node_ips, node_ips))
if started_port is None: if started_port is None:
started_port = 6170 started_port = 6170
ports = [ ports = [
-        x for x in range(started_port, started_port + len(selected_gpus))
+        x for x in range(started_port, started_port + len(devices_per_proc))
] ]
trainer_endpoints = [] trainer_endpoints = []
for ip in node_ips: for ip in node_ips:
...@@ -85,7 +88,7 @@ paddlecloud environment.".format(args_node_ips, node_ips)) ...@@ -85,7 +88,7 @@ paddlecloud environment.".format(args_node_ips, node_ips))
.format(node_ips, node_ip, node_rank, trainer_endpoints)) .format(node_ips, node_ip, node_rank, trainer_endpoints))
    cluster, pod = get_cluster(node_ips, node_ip, trainer_endpoints,
-                               selected_gpus)
+                               device_mode, devices_per_proc)
return cluster, cluster.pods[node_rank] return cluster, cluster.pods[node_rank]
......
...@@ -68,7 +68,9 @@ import copy ...@@ -68,7 +68,9 @@ import copy
from argparse import ArgumentParser, REMAINDER from argparse import ArgumentParser, REMAINDER
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.distributed.fleet import launch_utils
# TODO(danleifeng): Don't import * from a module
from paddle.distributed.fleet.launch_utils import * from paddle.distributed.fleet.launch_utils import *
import paddle.distributed.fleet.cloud_utils as cloud_utils import paddle.distributed.fleet.cloud_utils as cloud_utils
...@@ -98,12 +100,21 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra ...@@ -98,12 +100,21 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
help="The path for each process's log.If it's not set, the log will printed to default pipe." help="The path for each process's log.If it's not set, the log will printed to default pipe."
) )
base_group.add_argument(
"--nproc_per_node",
type=int,
default=None,
help="The number of processes to launch on a node."
"In gpu training, it should be less or equal to the gpus number of you system(or you set by --gpus). And so each process can"
" bound to one or average number of gpus.")
base_group.add_argument( base_group.add_argument(
"--gpus", "--gpus",
type=str, type=str,
default=None, default=None,
help="It's for gpu training and the training process will run on the gpus," help="It's for gpu training."
"each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training." "For example:"
"--gpus=\"0,1,2,3\" will launch four training processes each bound to one gpu."
) )
base_group.add_argument( base_group.add_argument(
...@@ -146,14 +157,13 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra ...@@ -146,14 +157,13 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
return parser.parse_args() return parser.parse_args()
-def get_cluster_from_args(args, gpus):
+def get_cluster_from_args(args, device_mode, devices_per_proc):
node_ips = [x.strip() for x in args.ips.split(',')] node_ips = [x.strip() for x in args.ips.split(',')]
if len(node_ips) == 1: if len(node_ips) == 1:
node_ip = node_ips[0] node_ip = node_ips[0]
else: else:
_, node_ip = get_host_name_ip() _, node_ip = get_host_name_ip()
# node_ip = args.node_ip
assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \ assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
% (node_ip, node_ips) % (node_ip, node_ips)
node_rank = node_ips.index(node_ip) node_rank = node_ips.index(node_ip)
...@@ -164,7 +174,7 @@ def get_cluster_from_args(args, gpus): ...@@ -164,7 +174,7 @@ def get_cluster_from_args(args, gpus):
free_ports = None free_ports = None
if not cloud_utils.use_paddlecloud() and len( if not cloud_utils.use_paddlecloud() and len(
node_ips) <= 1 and os.environ.get('FLAGS_START_PORT') is None: node_ips) <= 1 and os.environ.get('FLAGS_START_PORT') is None:
-        free_ports = find_free_ports(len(gpus))
+        free_ports = find_free_ports(len(devices_per_proc))
if free_ports is not None: if free_ports is not None:
free_ports = list(free_ports) free_ports = list(free_ports)
else: else:
...@@ -172,20 +182,23 @@ def get_cluster_from_args(args, gpus): ...@@ -172,20 +182,23 @@ def get_cluster_from_args(args, gpus):
if os.environ.get('FLAGS_START_PORT') is not None: if os.environ.get('FLAGS_START_PORT') is not None:
start_port = int(os.environ.get('FLAGS_START_PORT')) start_port = int(os.environ.get('FLAGS_START_PORT'))
-        free_ports = [x for x in range(start_port, start_port + len(gpus))]
+        free_ports = [
+            x for x in range(start_port, start_port + len(devices_per_proc))
+        ]
trainer_endpoints = [] trainer_endpoints = []
for ip in node_ips: for ip in node_ips:
trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
-    return get_cluster(node_ips, node_ip, trainer_endpoints, gpus)
+    return get_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
+                       devices_per_proc)
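A small standalone sketch (not the Paddle API itself, and assuming FLAGS_START_PORT carries an integer port) of the port-range rule used above: one consecutive port per launched process, starting at FLAGS_START_PORT or at the 6170 default.

    import os

    def pick_ports(devices_per_proc, default_start=6170):
        # mirrors the logic above: contiguous ports, one per process
        start_port = int(os.environ.get('FLAGS_START_PORT', default_start))
        return list(range(start_port, start_port + len(devices_per_proc)))

    # e.g. FLAGS_START_PORT=6170 with 4 processes -> [6170, 6171, 6172, 6173]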
def launch_collective(args):
    # parse arguments, used for cloud-single-machine and local
-    gpus = get_gpus(args.gpus)
+    (device_mode, devices_per_proc) = launch_utils.get_device_proc_info(args)
    trainers_num = cloud_utils.get_trainers_num()
-    logger.debug("parsed from args trainers_num:{} gpus:{}".format(
-        trainers_num, gpus))
+    logger.debug("parsed from args trainers_num:{} mode:{} devices:{}".format(
+        trainers_num, device_mode, devices_per_proc))
cluster = None cluster = None
pod = None pod = None
...@@ -194,11 +207,13 @@ def launch_collective(args): ...@@ -194,11 +207,13 @@ def launch_collective(args):
if os.environ.get('FLAGS_START_PORT') is not None: if os.environ.get('FLAGS_START_PORT') is not None:
start_port = os.environ.get('FLAGS_START_PORT') start_port = os.environ.get('FLAGS_START_PORT')
if cloud_utils.use_paddlecloud() and trainers_num != 1: if cloud_utils.use_paddlecloud() and trainers_num != 1:
-        cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus, start_port)
+        cluster, pod = cloud_utils.get_cloud_cluster(
+            args.ips, device_mode, devices_per_proc, start_port)
logger.debug("get cluster from cloud:{}".format(cluster)) logger.debug("get cluster from cloud:{}".format(cluster))
else: else:
# trainers_num = 1 or not use paddlecloud ips="a,b" # trainers_num = 1 or not use paddlecloud ips="a,b"
-        cluster, pod = get_cluster_from_args(args, gpus)
+        cluster, pod = get_cluster_from_args(args, device_mode,
+                                             devices_per_proc)
logger.debug("get cluster from args:{}".format(cluster)) logger.debug("get cluster from args:{}".format(cluster))
global_envs = copy.copy(os.environ.copy()) global_envs = copy.copy(os.environ.copy())
......
...@@ -26,6 +26,8 @@ import shutil ...@@ -26,6 +26,8 @@ import shutil
from contextlib import closing from contextlib import closing
import socket import socket
import warnings import warnings
import six
from enum import IntEnum
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
...@@ -33,7 +35,7 @@ logger = logging.getLogger("root") ...@@ -33,7 +35,7 @@ logger = logging.getLogger("root")
logger.propagate = False logger.propagate = False
-class DistributeMode:
+class DistributeMode(IntEnum):
""" """
There are various mode for fleetrun, each of them is designed for different model. There are various mode for fleetrun, each of them is designed for different model.
""" """
...@@ -42,6 +44,16 @@ class DistributeMode: ...@@ -42,6 +44,16 @@ class DistributeMode:
PS_HETER = 2 PS_HETER = 2
class DeviceMode(IntEnum):
"""
Training devices type
"""
CPU = 0
GPU = 1
KUNLUN = 2
UNKNOWN = 3
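Deriving DistributeMode and DeviceMode from IntEnum keeps them interchangeable with the plain integers they used to be, so existing comparisons and log formatting keep working. A tiny standalone check (illustrative only):

    from enum import IntEnum

    class DeviceMode(IntEnum):
        CPU = 0
        GPU = 1
        KUNLUN = 2
        UNKNOWN = 3

    assert DeviceMode.GPU == 1          # IntEnum members compare equal to plain ints
    assert int(DeviceMode.KUNLUN) == 2  # and convert back losslessly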
class Cluster(object): class Cluster(object):
def __init__(self, hdfs): def __init__(self, hdfs):
self.job_server = None self.job_server = None
...@@ -243,7 +255,8 @@ def get_logger(log_level=20, name="root"): ...@@ -243,7 +255,8 @@ def get_logger(log_level=20, name="root"):
return logger return logger
-def get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus):
+def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
+                devices_per_proc):
assert type(trainer_endpoints) is list, "trainer_endpoints must be list" assert type(trainer_endpoints) is list, "trainer_endpoints must be list"
cluster = Cluster(hdfs=None) cluster = Cluster(hdfs=None)
trainer_rank = 0 trainer_rank = 0
...@@ -252,13 +265,17 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus): ...@@ -252,13 +265,17 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus):
pod.rank = node_rank pod.rank = node_rank
pod.addr = ip pod.addr = ip
cur_node_endpoints = trainer_endpoints[node_rank] cur_node_endpoints = trainer_endpoints[node_rank]
-        # when use paddlecloud, endpoints may > selected_gpus(user_defined)
+        # when use paddlecloud, endpoints may > devices_per_proc(user_defined)
        assert len(cur_node_endpoints) >= len(
-            selected_gpus
+            devices_per_proc
        ), "current trainer_endpoints size should be greater than or equal to the number of devices per process."
-        for i in range(len(selected_gpus)):
+        for i in range(len(devices_per_proc)):
            trainer = Trainer()
-            trainer.gpus.append(selected_gpus[i])
+            if device_mode == DeviceMode.GPU:
+                if isinstance(devices_per_proc[i], (list, tuple)):
+                    trainer.gpus.extend(devices_per_proc[i])
+                else:
+                    trainer.gpus.append(devices_per_proc[i])
trainer.endpoint = "%s" % (cur_node_endpoints[i]) trainer.endpoint = "%s" % (cur_node_endpoints[i])
trainer.rank = trainer_rank trainer.rank = trainer_rank
trainer_rank += 1 trainer_rank += 1
...@@ -432,13 +449,16 @@ def start_local_trainers(cluster, ...@@ -432,13 +449,16 @@ def start_local_trainers(cluster,
procs = [] procs = []
for idx, t in enumerate(pod.trainers): for idx, t in enumerate(pod.trainers):
proc_env = { proc_env = {
"FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]),
"PADDLE_TRAINER_ID": "%d" % t.rank, "PADDLE_TRAINER_ID": "%d" % t.rank,
"PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
"PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
} }
if len(t.gpus) > 0:
proc_env["FLAGS_selected_gpus"] = "%s" % ",".join(
[str(g) for g in t.gpus])
current_env.update(proc_env) current_env.update(proc_env)
cmd = [sys.executable, "-u", training_script] + training_script_args cmd = [sys.executable, "-u", training_script] + training_script_args
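From the trainer side, the contract is just a handful of environment variables; FLAGS_selected_gpus is now set only when the process actually owns GPUs. A reader-side sketch (names taken from proc_env above, values illustrative):

    import os

    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
    nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
    endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",")
    gpus = os.getenv("FLAGS_selected_gpus")  # None in CPU mode, e.g. "0,1" in GPU mode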
...@@ -565,6 +585,47 @@ def get_gpus(gpus): ...@@ -565,6 +585,47 @@ def get_gpus(gpus):
return res_gpus return res_gpus
def get_device_mode():
    # TODO(gongwb): Add XPU support
if not fluid.core.is_compiled_with_cuda(
) or fluid.core.get_cuda_device_count() <= 0:
print("launch train in CPU mode")
return DeviceMode.CPU
print("launch train in GPU mode")
return DeviceMode.GPU
def get_device_proc_info(args):
# device_mode
device_mode = get_device_mode()
# devices
devices_per_proc = []
if device_mode == DeviceMode.GPU:
gpus = get_gpus(args.gpus)
if args.nproc_per_node is not None:
            assert (len(gpus) % int(args.nproc_per_node)) == 0, \
                "gpus' number:{} mod args.nproc_per_node:{} must == 0".format(len(gpus), args.nproc_per_node)
n = int(len(gpus) / int(args.nproc_per_node))
devices_per_proc = [
gpus[i:i + n] for i in six.moves.range(0, len(gpus), n)
]
else:
devices_per_proc = gpus
elif device_mode == DeviceMode.CPU:
if args.nproc_per_node is None:
devices_per_proc = [0]
else:
devices_per_proc = [x for x in range(0, args.nproc_per_node)]
else:
assert False, "Can't support device_mode:{}, support only cpu and gpu now.".format(
device_mode)
return (device_mode, devices_per_proc)
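A standalone sketch of the splitting rule implemented above (not the Paddle API itself): in GPU mode the visible GPUs are divided evenly across --nproc_per_node processes, in CPU mode each process simply gets an index.

    def split_devices(gpus, nproc_per_node):
        # same arithmetic as the GPU branch of get_device_proc_info
        assert len(gpus) % nproc_per_node == 0
        n = len(gpus) // nproc_per_node
        return [gpus[i:i + n] for i in range(0, len(gpus), n)]

    print(split_devices([0, 1, 2, 3], 2))  # [[0, 1], [2, 3]] -> 2 procs, 2 GPUs each
    print(split_devices([0, 1, 2, 3], 4))  # [[0], [1], [2], [3]] -> 1 GPU per proc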
def direct_start(args): def direct_start(args):
# run ps-cpu mode on paddlecloud, using given envs # run ps-cpu mode on paddlecloud, using given envs
cmd = [sys.executable, "-u", args.training_script] + \ cmd = [sys.executable, "-u", args.training_script] + \
......
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
paddle.distributed.launch is a module that spawns multiple distributed
process on each training node for gpu training.
Usage:
In both of single node training or multiple node training, this module
launch a process on each of the given gpu card.
1. for single node training with all visible gpu cards:
python -m paddle.distributed.launch \
your_training_py (arg1 arg2 and all others)
2. for single node training with [0,4) cards
python -m paddle.distributed.launch --selected_gpus="0,1,2,3" \
your_training_py (arg1 arg2 and all others)
3. for multiple node training such as two node:192.168.0.16, 192.168.0.17
on 192.168.0.16:
python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
--node_ip=192.168.0.16 \
your_training_py (arg1 arg2 and all others)
on 192.168.0.17:
python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
--node_ip=192.168.0.17 \
your_training_py (arg1 arg2 and all others)
"""
-from __future__ import print_function
-import sys
+from paddle.distributed.fleet import launch
+launch.launch()
from sys import version
import subprocess
import os
import time
import six
import copy
from argparse import ArgumentParser, REMAINDER
from paddle.distributed.utils import *
from paddle.distributed import cloud_utils
def _print_arguments(args):
print("----------- Configuration Arguments -----------")
for arg, value in sorted(six.iteritems(vars(args))):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def _parse_args():
"""
Helper function parsing the command line options
@retval ArgumentParser
"""
parser = ArgumentParser(
description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
And your train program must read environment variables below in order to let different
process init properly:
FLAGS_selected_gpus
PADDLE_TRAINER_ID
PADDLE_CURRENT_ENDPOINT
PADDLE_TRAINERS_NUM
PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
''')
#Optional arguments for the launch helper
parser.add_argument(
"--cluster_node_ips",
type=str,
default="127.0.0.1",
help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
parser.add_argument(
"--node_ip",
type=str,
default="127.0.0.1",
help="The current node ip. ")
parser.add_argument(
"--use_paddlecloud",
action='store_true',
help="wheter to use paddlecloud platform to run your multi-process job. If false, no need to set this argument."
)
parser.add_argument(
"--started_port",
type=int,
default=None,
help="The trainer's started port on a single node")
parser.add_argument(
"--print_config",
type=bool,
default=True,
help="Print the config or not")
parser.add_argument(
"--selected_gpus",
type=str,
default=None,
help="It's for gpu training and the training process will run on the selected_gpus,"
"each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training."
)
parser.add_argument(
"--log_level",
type=int,
default=20, # logging.INFO, details are here:https://docs.python.org/3/library/logging.html#levels
help="Logging level, default is logging.INFO")
parser.add_argument(
"--log_dir",
type=str,
help="The path for each process's log.If it's not set, the log will printed to default pipe."
)
#positional
parser.add_argument(
"training_script",
type=str,
help="The full path to the single GPU training "
"program/script to be launched in parallel, "
"followed by all the arguments for the "
"training script")
#rest from the training program
parser.add_argument('training_script_args', nargs=REMAINDER)
return parser.parse_args()
def get_cluster_from_args(args, selected_gpus):
node_ips = [x.strip() for x in args.cluster_node_ips.split(',')]
node_ip = args.node_ip
node_rank = node_ips.index(node_ip)
logger.debug("parsed from args:node_ips:{} node_ip:{} node_rank:{}".format(
node_ips, node_ip, node_rank))
free_ports = None
if not args.use_paddlecloud and len(
node_ips) <= 1 and args.started_port is None:
free_ports = find_free_ports(len(selected_gpus))
if free_ports is not None:
free_ports = list(free_ports)
else:
started_port = 6070
if args.started_port is not None:
started_port = args.started_port
free_ports = [
x for x in range(started_port, started_port + len(selected_gpus))
]
trainer_endpoints = []
for ip in node_ips:
trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus)
def get_gpus(selected_gpus):
if selected_gpus is None:
from paddle.fluid import core
gpus_num = core.get_cuda_device_count()
gpus = [str(x) for x in range(0, gpus_num)]
else:
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
if cuda_visible_devices is None or cuda_visible_devices == "":
gpus = [x.strip() for x in selected_gpus.split(',')]
else:
# change selected_gpus into relative values
# e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
# therefore selected_gpus=0,1,2,3
cuda_visible_devices_list = cuda_visible_devices.split(',')
for x in selected_gpus.split(','):
assert x in cuda_visible_devices_list, "Can't find "\
"your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
% (x, cuda_visible_devices)
gpus = [
cuda_visible_devices_list.index(x.strip())
for x in selected_gpus.split(',')
]
logger.info("Change selected_gpus into reletive values. --ips:{} "
"will change into relative_ips:{} according to your "
"CUDA_VISIBLE_DEVICES:{}".format(
selected_gpus, gpus, cuda_visible_devices_list))
return gpus
def get_cluster_and_pod(args):
# parse arguments, used for cloud-single-machine and local
selected_gpus = get_gpus(args.selected_gpus)
trainers_num = cloud_utils.get_trainers_num()
logger.debug("parsed from args trainerss_num:{} selected_gpus:{}".format(
trainers_num, selected_gpus))
cluster = None
pod = None
if args.use_paddlecloud and trainers_num != 1:
cluster, pod = cloud_utils.get_cloud_cluster(
args.cluster_node_ips, args.node_ip, args.started_port,
selected_gpus)
logger.info("get cluster from cloud:{}".format(cluster))
else:
cluster, pod = get_cluster_from_args(args, selected_gpus)
logger.info("get cluster from args:{}".format(cluster))
return cluster, pod
def launch(args):
cluster, pod = get_cluster_and_pod(args)
procs = start_local_trainers(
cluster,
pod,
training_script=args.training_script,
training_script_args=args.training_script_args,
log_dir=args.log_dir)
while True:
alive = watch_local_trainers(procs, cluster.trainers_nranks())
if not alive:
logger.info("Local procs complete, POD info:{}".format(pod))
break
time.sleep(3)
if __name__ == "__main__":
args = _parse_args()
logger = get_logger(args.log_level)
if args.print_config:
_print_arguments(args)
launch(args)
...@@ -21,8 +21,8 @@ import six ...@@ -21,8 +21,8 @@ import six
import sys import sys
import warnings import warnings
-from paddle.distributed.launch import get_cluster_and_pod, _print_arguments
-from paddle.distributed.utils import _prepare_trainer_env
+from paddle.distributed.utils import _print_arguments, _prepare_trainer_env
+from paddle.distributed.cloud_utils import get_cluster_and_pod
from paddle.device import get_device from paddle.device import get_device
# deprecated module import # deprecated module import
...@@ -30,10 +30,6 @@ from paddle.fluid import core ...@@ -30,10 +30,6 @@ from paddle.fluid import core
from paddle.fluid.framework import _cpu_num from paddle.fluid.framework import _cpu_num
# NOTE(chenweihang): The existence of this class leads to
# the maintenance of two arguments. When the launch.py arguments
# is updated, the arguments here also need to be updated,
# but I have not thought of a better way here
class ParallelEnvArgs(object): class ParallelEnvArgs(object):
def __init__(self): def __init__(self):
# Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17.. # Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..
...@@ -136,7 +132,6 @@ def _get_subprocess_env_list(nprocs, options): ...@@ -136,7 +132,6 @@ def _get_subprocess_env_list(nprocs, options):
args.use_paddlecloud = options.get('use_paddlecloud', False) args.use_paddlecloud = options.get('use_paddlecloud', False)
args.print_config = options.get('print_config', False) args.print_config = options.get('print_config', False)
# reuse code of launch.py
cluster, pod = get_cluster_and_pod(args) cluster, pod = get_cluster_and_pod(args)
# prepare subprocess env list # prepare subprocess env list
...@@ -151,7 +146,7 @@ def _get_subprocess_env_list(nprocs, options): ...@@ -151,7 +146,7 @@ def _get_subprocess_env_list(nprocs, options):
def _remove_risky_env(): def _remove_risky_env():
-    # remove useless env vars, same as launch.py
+    # remove useless env vars
# no copy, each process will hold env vars itself # no copy, each process will hold env vars itself
os.environ.pop("http_proxy", None) os.environ.pop("http_proxy", None)
os.environ.pop("https_proxy", None) os.environ.pop("https_proxy", None)
......
...@@ -20,6 +20,7 @@ import os ...@@ -20,6 +20,7 @@ import os
import signal import signal
import copy import copy
import sys import sys
import six
import subprocess import subprocess
from contextlib import closing from contextlib import closing
import socket import socket
...@@ -28,6 +29,72 @@ logger = logging.getLogger("root") ...@@ -28,6 +29,72 @@ logger = logging.getLogger("root")
logger.propagate = False logger.propagate = False
def get_cluster_from_args(args, selected_gpus):
node_ips = [x.strip() for x in args.cluster_node_ips.split(',')]
node_ip = args.node_ip
node_rank = node_ips.index(node_ip)
logger.debug("parsed from args:node_ips:{} node_ip:{} node_rank:{}".format(
node_ips, node_ip, node_rank))
free_ports = None
if not args.use_paddlecloud and len(
node_ips) <= 1 and args.started_port is None:
free_ports = find_free_ports(len(selected_gpus))
if free_ports is not None:
free_ports = list(free_ports)
else:
started_port = 6070
if args.started_port is not None:
started_port = args.started_port
free_ports = [
x for x in range(started_port, started_port + len(selected_gpus))
]
trainer_endpoints = []
for ip in node_ips:
trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus)
def get_gpus(selected_gpus):
if selected_gpus is None:
from paddle.fluid import core
gpus_num = core.get_cuda_device_count()
gpus = [str(x) for x in range(0, gpus_num)]
else:
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
if cuda_visible_devices is None or cuda_visible_devices == "":
gpus = [x.strip() for x in selected_gpus.split(',')]
else:
# change selected_gpus into relative values
# e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
# therefore selected_gpus=0,1,2,3
cuda_visible_devices_list = cuda_visible_devices.split(',')
for x in selected_gpus.split(','):
assert x in cuda_visible_devices_list, "Can't find "\
"your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
% (x, cuda_visible_devices)
gpus = [
cuda_visible_devices_list.index(x.strip())
for x in selected_gpus.split(',')
]
logger.info("Change selected_gpus into reletive values. --ips:{} "
"will change into relative_ips:{} according to your "
"CUDA_VISIBLE_DEVICES:{}".format(
selected_gpus, gpus, cuda_visible_devices_list))
return gpus
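The remapping above exists because CUDA renumbers the visible devices from zero, so absolute ids passed via --selected_gpus/--gpus must be converted into indices within CUDA_VISIBLE_DEVICES. A standalone sketch of the same rule (illustrative, not the Paddle helper itself):

    def to_relative(selected_gpus, cuda_visible_devices):
        # map each absolute device id to its position inside CUDA_VISIBLE_DEVICES
        visible = cuda_visible_devices.split(',')
        return [visible.index(x.strip()) for x in selected_gpus.split(',')]

    print(to_relative("4,5,6,7", "4,5,6,7"))  # [0, 1, 2, 3]
    print(to_relative("5,7", "4,5,6,7"))      # [1, 3]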
def _print_arguments(args):
print("----------- Configuration Arguments -----------")
for arg, value in sorted(six.iteritems(vars(args))):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
class Hdfs(object): class Hdfs(object):
def __init__(self): def __init__(self):
self.hdfs_ugi = None self.hdfs_ugi = None
......
...@@ -69,7 +69,7 @@ class ParallelEnv(object): ...@@ -69,7 +69,7 @@ class ParallelEnv(object):
This class is used to obtain the environment variables required for This class is used to obtain the environment variables required for
the parallel execution of ``paddle.nn.Layer`` in dynamic mode. the parallel execution of ``paddle.nn.Layer`` in dynamic mode.
The parallel execution in dynamic mode needs to be started using ``paddle.distributed.launch`` The parallel execution in dynamic mode needs to be started using ``paddle.distributed.launch``
or ``paddle.distributed.spawn`` . or ``paddle.distributed.spawn`` .
Examples: Examples:
...@@ -104,7 +104,11 @@ class ParallelEnv(object): ...@@ -104,7 +104,11 @@ class ParallelEnv(object):
def __init__(self): def __init__(self):
self._rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) self._rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
self._world_size = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) self._world_size = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
self._device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
# imperative only support one gpu
selected_gpus = os.getenv("FLAGS_selected_gpus", "0").split(",")
self._device_id = int(selected_gpus[0])
self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
"").split(",") "").split(",")
self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
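Imperative (dygraph) training uses a single GPU per process, so ParallelEnv now takes only the first entry of FLAGS_selected_gpus instead of assuming the variable holds a single integer. A minimal sketch of the new parsing (value illustrative):

    import os

    os.environ["FLAGS_selected_gpus"] = "2,3"
    device_id = int(os.getenv("FLAGS_selected_gpus", "0").split(",")[0])
    print(device_id)  # 2 -- only the first card is used in imperative mode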
...@@ -347,7 +351,7 @@ class DataParallel(layers.Layer): ...@@ -347,7 +351,7 @@ class DataParallel(layers.Layer):
2. start by ``paddle.distributed.launch`` module, for example: 2. start by ``paddle.distributed.launch`` module, for example:
-        ``python -m paddle.distributed.launch --selected_gpus=0,1 demo.py`` .
+        ``python -m paddle.distributed.launch --gpus=0,1 demo.py`` .
And the content of `demo.py` is the code of examples. And the content of `demo.py` is the code of examples.
......
...@@ -26,14 +26,18 @@ list(APPEND MIXED_DIST_TEST_OPS test_dgc_optimizer) ...@@ -26,14 +26,18 @@ list(APPEND MIXED_DIST_TEST_OPS test_dgc_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler) list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler)
list(APPEND MIXED_DIST_TEST_OPS test_recv_save_op) list(APPEND MIXED_DIST_TEST_OPS test_recv_save_op)
list(APPEND MIXED_DIST_TEST_OPS test_transpiler_ops) list(APPEND MIXED_DIST_TEST_OPS test_transpiler_ops)
list(APPEND MIXED_DIST_TEST_OPS test_launch)
list(APPEND MIXED_DIST_TEST_OPS test_c_comm_init_op) list(APPEND MIXED_DIST_TEST_OPS test_c_comm_init_op)
list(APPEND MIXED_DIST_TEST_OPS test_launch_ps)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_async) list(APPEND MIXED_DIST_TEST_OPS test_communicator_async)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo) list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_half_async) list(APPEND MIXED_DIST_TEST_OPS test_communicator_half_async)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_sync) list(APPEND MIXED_DIST_TEST_OPS test_communicator_sync)
-list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ps)
list(APPEND MIXED_DIST_TEST_OPS test_launch_coverage)
list(APPEND MIXED_DIST_TEST_OPS test_fleetrun)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_run_random_port)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_async)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_cloud)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_nproc)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input) list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint) list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint)
list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer)
...@@ -494,14 +498,17 @@ if(WITH_DISTRIBUTE) ...@@ -494,14 +498,17 @@ if(WITH_DISTRIBUTE)
endif() endif()
if(NOT APPLE) if(NOT APPLE)
if(WITH_GPU) if(WITH_GPU)
# NOTE. test_launch only work in gpu collective mode
bash_test_modules(test_launch START_BASH test_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
bash_test_modules(test_c_comm_init_op START_BASH test_c_comm_init_op.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_c_comm_init_op START_BASH test_c_comm_init_op.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint) py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint)
py_test_modules(test_launch_coverage MODULES test_launch_coverage)
endif() endif()
-bash_test_modules(test_launch_ps START_BASH test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-bash_test_modules(test_fleet_launch START_BASH test_fleet_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+bash_test_modules(test_fleetrun START_BASH test_fleetrun.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
bash_test_modules(test_fleet_run_random_port START_BASH test_fleet_run_random_port.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
bash_test_modules(test_fleet_launch_async START_BASH test_fleet_launch_async.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
bash_test_modules(test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
bash_test_modules(test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
# port range (20000, 23000) is reserved for dist-ops # port range (20000, 23000) is reserved for dist-ops
set(dist_ut_port 20001) set(dist_ut_port 20001)
...@@ -624,9 +631,7 @@ if (WITH_DISTRIBUTE AND NOT APPLE) ...@@ -624,9 +631,7 @@ if (WITH_DISTRIBUTE AND NOT APPLE)
if(WITH_GPU) if(WITH_GPU)
set_tests_properties(test_c_comm_init_op PROPERTIES TIMEOUT 120) set_tests_properties(test_c_comm_init_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_fleet_checkpoint PROPERTIES TIMEOUT 120) set_tests_properties(test_fleet_checkpoint PROPERTIES TIMEOUT 120)
set_tests_properties(test_launch PROPERTIES TIMEOUT 120)
endif() endif()
set_tests_properties(test_fleet_launch PROPERTIES TIMEOUT 120)
endif() endif()
# setting timeout value as 15S # setting timeout value as 15S
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import sys
import paddle.fluid as fluid
print("compile with cuda:", fluid.core.is_compiled_with_cuda())
print("get_cuda_device_count:", fluid.core.get_cuda_device_count())
if fluid.core.is_compiled_with_cuda() and fluid.core.get_cuda_device_count(
) > 0:
sys.exit(0)
else:
sys.exit(1)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
def train(prefix):
selected_gpus = os.getenv("FLAGS_selected_gpus")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
worker_endpoints = worker_endpoints_env
trainers_num = len(worker_endpoints.split(','))
name = "selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
.format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
print(name)
with open("{}.check_{}.log".format(prefix, trainer_id), "w") as f:
f.write(name)
if __name__ == '__main__':
prefix = sys.argv[1]
train(prefix)
#!/bin/bash
set -e
function test_launch_ps(){
fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test pserver launch succeed"
else
echo "test pserver launch failed"
exit -1
fi
fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1:6782,127.0.0.1:6783" fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test pserver launch succeed"
else
echo "test pserver launch failed"
exit -1
fi
fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1,127.0.0.1" fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test pserver launch succeed"
else
echo "test pserver launch failed"
exit -1
fi
}
function test_launch_ps_heter(){
fleetrun --server_num=2 --worker_num=2 --heter_worker_num=2 fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test heter pserver launch succeed"
else
echo "test pserver launch failed"
exit -1
fi
}
if [[ ${WITH_GPU} == "OFF" ]]; then
echo "in cpu test mode"
test_launch_ps
exit 0
fi
echo "No.1 unittest"
test_launch_ps
test_launch_ps_heter
# use default values
echo "No.2 unittest"
fleetrun multi_process.py fleetrun
# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2
echo "No.3 unittest"
distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
file_0="multi_process_fleetrun.check_0.log"
file_1="multi_process_fleetrun.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
if grep -q "$str2" "$file_1"; then
echo "find trainer 1"
else
echo "not find trainer 1"
exit -1
fi
# test async poll process
if [ -f $file_0 ]; then
rm $file_0
fi
if [ -f $file_1 ]; then
rm $file_1
fi
# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud
unset PADDLE_PORT
export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171
echo "No.4 unittest"
echo "paddle.distributed.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then
echo "train abort as planned"
fi
abort_str1="abort>>> selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
if grep -q "$abort_str1" "$file_0"; then
echo "trainer 0 abort as planned"
else
echo "trainer 0 not abort as planned"
exit -1
fi
if [ ! -f $file_1 ]; then
echo "trainer 1 terminate as planned"
else
echo "trainer 1 not terminate as planned"
exit -1
fi
#test for random ports
file_0_0="test_launch_filelock_0_0.log"
file_1_0="test_launch_filelock_1_0.log"
rm -rf $file_0_0 $file_0_1
distributed_args="--gpus=0,1 --log_dir=testlog"
export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
echo "No.5 unittest"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} find_ports.py
str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"
#!/bin/bash #!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e set -e
# use default values
# FIXME: random fails on Unknown command lines -c (or -m).
launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
python ${launch_py} multi_process.py launch
-# use paddlecloud
-echo "begin test use paddlecloud"
-cluster_node_ips="10.0.0.1"
-node_ip="10.0.0.1"
+# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud
+unset PADDLE_PORT
+export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171
+export cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
-export PADDLE_PORT=35019
export TRAINER_PORTS_NUM=2
-distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
-CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch
+file_0="multi_process_fleetrun.check_0.log"
+file_1="multi_process_fleetrun.check_1.log"
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0="multi_process_launch.check_0.log"
file_1="multi_process_launch.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
if grep -q "$str2" "$file_1"; then
echo "find trainer 1"
else
echo "not find trainer 1"
exit -1
fi
-# test async poll process
+distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
if [ -f $file_0 ]; then
rm $file_0
fi
if [ -f $file_1 ]; then
rm $file_1
fi
-# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud
-unset PADDLE_PORT
-export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171
-echo ""
-echo "paddle.distributed.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch abort; then
+echo "paddle.distributed.fleet.launch async poll process test"
+if ! CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process.py fleetrun abort; then
    echo "train abort as planned"
fi
...@@ -73,13 +52,3 @@ else ...@@ -73,13 +52,3 @@ else
echo "trainer 1 not terminate as planned" echo "trainer 1 not terminate as planned"
exit -1 exit -1
fi fi
#test for random ports
file_0_0="test_launch_filelock_0_0.log"
file_1_0="test_launch_filelock_1_0.log"
rm -rf $file_0_0 $file_0_1
distributed_args="--selected_gpus=0,1 --log_dir=testlog"
export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} find_ports.py
str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2
distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process.py fleetrun
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
file_0="multi_process_fleetrun.check_0.log"
file_1="multi_process_fleetrun.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
if grep -q "$str2" "$file_1"; then
echo "find trainer 1"
else
echo "not find trainer 1"
exit -1
fi
# test async poll process
if [ -f $file_0 ]; then
rm $file_0
fi
if [ -f $file_1 ]; then
rm $file_1
fi
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
export FLAGS_START_PORT=35789
#local_ip=`ip route get 1 | awk '{print $NF;exit}'`
file_0="fleet_nproc_0.check_0.log"
function test_nproc_0(){
gpus=$1
rm -f ${file_0}
distributed_args="--log_dir=testlog --nproc_per_node=1"
# nproc_per_node=1, each with 2 gpus
python -m paddle.distributed.launch ${distributed_args} nproc_process.py fleet_nproc_0
str0="selected_gpus:${gpus} worker_endpoints:127.0.0.1:35789 trainers_num:1 current_endpoint:127.0.0.1:35789 trainer_id:0"
if grep -q "$str0" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
}
# unittest1:gpu
if python detected_gpu.py ; then
echo "begin ut 1:"
export CUDA_VISIBLE_DEVICES=0,1
test_nproc_0 "0,1"
fi
# unittest2:cpu
if ! python detected_gpu.py ; then
echo "begin ut 2:"
export CUDA_VISIBLE_DEVICES=""
test_nproc_0 ""
fi
function test_nproc_1_gpu(){
file_0="fleet_nproc_1.check_0.log"
file_1="fleet_nproc_1.check_1.log"
rm -f ${file_0} ${file_1}
distributed_args="--log_dir=testlog --nproc_per_node=2"
python -m paddle.distributed.launch ${distributed_args} nproc_process.py fleet_nproc_1
str0="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35789 trainer_id:0"
if grep -q "$str0" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
str1="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35790 trainer_id:1"
if grep -q "$str1" "$file_1"; then
echo "find trainer 1"
else
echo "not find trainer 1"
exit -1
fi
}
# unittest3: nproc_per_node=2, each with 1 gpus
if python detected_gpu.py ; then
echo "begin ut 3:"
export CUDA_VISIBLE_DEVICES=0,1
test_nproc_1_gpu
fi
function test_nproc_1_cpu(){
file_0="fleet_nproc_1.check_0.log"
file_1="fleet_nproc_1.check_1.log"
rm -f ${file_0} ${file_1}
distributed_args="--log_dir=testlog --nproc_per_node=2"
python -m paddle.distributed.launch ${distributed_args} nproc_process.py fleet_nproc_1
str0="selected_gpus: worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35789 trainer_id:0"
if grep -q "$str0" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
str1="selected_gpus: worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35790 trainer_id:1"
if grep -q "$str1" "$file_1"; then
echo "find trainer 1"
else
echo "not find trainer 1"
exit -1
fi
}
# unittest4: nproc_per_node=2, cpu
if ! python detected_gpu.py ; then
echo "begin ut 4:"
export CUDA_VISIBLE_DEVICES=""
test_nproc_1_cpu
fi
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
function test_launch_ps(){
python -m paddle.distributed.fleet.launch --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test pserver launch succeed"
else
echo "test pserver launch failed"
exit -1
fi
python -m paddle.distributed.fleet.launch --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1:6782,127.0.0.1:6783" fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test pserver launch succeed"
else
echo "test pserver launch failed"
exit -1
fi
python -m paddle.distributed.fleet.launch --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1,127.0.0.1" fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test pserver launch succeed"
else
echo "test pserver launch failed"
exit -1
fi
}
function test_launch_ps_heter(){
python -m paddle.distributed.fleet.launch --server_num=2 --worker_num=2 --heter_worker_num=2 fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test heter pserver launch succeed"
else
echo "test pserver launch failed"
exit -1
fi
}
if [[ ${WITH_GPU} == "OFF" ]]; then
echo "in cpu test mode"
test_launch_ps
exit 0
fi
test_launch_ps
test_launch_ps_heter
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
#test for random ports
file_0_0="test_launch_filelock_0_0.log"
file_1_0="test_launch_filelock_1_0.log"
rm -rf $file_0_0 $file_0_1
distributed_args="--gpus=0,1 --log_dir=testlog"
export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} find_ports.py
str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# use default values
fleetrun multi_process.py fleetrun
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,34 +13,55 @@
# limitations under the License.
from __future__ import print_function from __future__ import print_function
from __future__ import unicode_literals
import subprocess
import sys import sys
import subprocess
import os import os
import time
import six
import copy import copy
from argparse import ArgumentParser, REMAINDER import unittest
import paddle.fluid as fluid
from argparse import ArgumentParser, REMAINDER
def parse_args(): from paddle.distributed.utils import _print_arguments, get_gpus, get_cluster_from_args
# Optional arguments for the launch helper
parser = ArgumentParser(description="Distributed training")
def _parse_args():
parser = ArgumentParser(
description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
And your train program must read environment variables below in order to let different
process init properly:
FLAGS_selected_gpus
PADDLE_TRAINER_ID
PADDLE_CURRENT_ENDPOINT
PADDLE_TRAINERS_NUM
PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
''')
#Optional arguments for the launch helper
parser.add_argument( parser.add_argument(
"--cluster_node_ips", "--cluster_node_ips",
type=str, type=str,
default="127.0.0.1", default="127.0.0.1",
help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..") help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
parser.add_argument( parser.add_argument(
"--node_ip", "--node_ip",
type=str, type=str,
default="127.0.0.1", default="127.0.0.1",
help="The current node ip. ") help="The current node ip. ")
parser.add_argument( parser.add_argument(
"--start_port", "--use_paddlecloud",
action='store_true',
help="wheter to use paddlecloud platform to run your multi-process job. If false, no need to set this argument."
)
parser.add_argument(
"--started_port",
type=int, type=int,
default=6170, default=None,
help="The trainer's start port on a single node") help="The trainer's started port on a single node")
parser.add_argument( parser.add_argument(
"--print_config", "--print_config",
...@@ -49,22 +70,26 @@ def parse_args(): ...@@ -49,22 +70,26 @@ def parse_args():
help="Print the config or not") help="Print the config or not")
parser.add_argument( parser.add_argument(
"--endpoints", type=str, default="", help="User defined endpoints") "--selected_gpus",
type=str,
parser.add_argument( default=None,
"--worker_num", type=int, default=2, help="number of workers") help="It's for gpu training and the training process will run on the selected_gpus,"
"each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training."
)
parser.add_argument( parser.add_argument(
"--server_num", type=int, default=2, help="number of servers") "--log_level",
type=int,
default=20, # logging.INFO, details are here:https://docs.python.org/3/library/logging.html#levels
help="Logging level, default is logging.INFO")
parser.add_argument( parser.add_argument(
"--log_dir", "--log_dir",
default="logs",
type=str, type=str,
help="The path for each process's log.If it's not set, the log will printed to default pipe." help="The path for each process's log.If it's not set, the log will printed to default pipe."
) )
# positional #positional
parser.add_argument( parser.add_argument(
"training_script", "training_script",
type=str, type=str,
...@@ -73,93 +98,23 @@ def parse_args(): ...@@ -73,93 +98,23 @@ def parse_args():
"followed by all the arguments for the " "followed by all the arguments for the "
"training script") "training script")
# rest from the training program #rest from the training program
parser.add_argument('training_script_args', nargs=REMAINDER) parser.add_argument('training_script_args', nargs=REMAINDER)
return parser.parse_args() return parser.parse_args()
-def start_procs(args):
-    worker_num = args.worker_num
-    server_num = args.server_num
-    start_port = args.start_port
-    default_env = os.environ.copy()
-    current_env = copy.copy(default_env)
-    current_env.pop("http_proxy", None)
-    current_env.pop("https_proxy", None)
-    procs = []
-    cmds = []
-    log_fns = []
-    ports = range(start_port, start_port + server_num, 1)
-    default_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
-    user_endpoints = ""
-    if args.endpoints == "":
+class TestCoverage(unittest.TestCase):
+    def test_gpus(self):
+        args = _parse_args()
+
+        if args.print_config:
+            _print_arguments(args)
+
+        gpus = get_gpus(None)
+
+        args.use_paddlecloud = True
+        cluster, pod = get_cluster_from_args(args, "0")
+
+
+if __name__ == '__main__':
+    unittest.main()
user_endpoints = default_endpoints
else:
user_endpoints = args.endpoints
user_endpoints_ips = [x.split(":")[0] for x in user_endpoints.split(",")]
user_endpoints_port = [x.split(":")[1] for x in user_endpoints.split(",")]
for i in range(server_num):
current_env.update({
"PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
"PADDLE_PORT": user_endpoints_port[i],
"TRAINING_ROLE": "PSERVER",
"PADDLE_TRAINERS_NUM": str(worker_num),
"POD_IP": user_endpoints_ips[i]
})
cmd = [sys.executable, "-u", args.training_script
] + args.training_script_args
cmds.append(cmd)
if args.log_dir is not None:
os.system("mkdir -p {}".format(args.log_dir))
fn = open("%s/serverlog.%d" % (args.log_dir, i), "w")
log_fns.append(fn)
proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
else:
proc = subprocess.Popen(cmd, env=current_env)
procs.append(proc)
for i in range(worker_num):
current_env.update({
"PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
"PADDLE_TRAINERS_NUM": str(worker_num),
"TRAINING_ROLE": "TRAINER",
"PADDLE_TRAINER_ID": str(i)
})
cmd = [sys.executable, "-u", args.training_script
] + args.training_script_args
cmds.append(cmd)
if args.log_dir is not None:
os.system("mkdir -p {}".format(args.log_dir))
fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
log_fns.append(fn)
proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
else:
proc = subprocess.Popen(cmd, env=current_env)
procs.append(proc)
# only wait worker to finish here
for i, proc in enumerate(procs):
if i < server_num:
continue
procs[i].wait()
if len(log_fns) > 0:
log_fns[i].close()
print("all workers exit, going to finish parameter server", file=sys.stderr)
for i in range(server_num):
if len(log_fns) > 0:
log_fns[i].close()
procs[i].terminate()
print("all parameter server are killed", file=sys.stderr)
def launch():
args = parse_args()
if args.print_config:
start_procs(args)
# server num, worker num
if __name__ == "__main__":
launch()
#!/bin/bash
set -e
# use default values
launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch_ps.py
python ${launch_py} fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "succeed"
else
echo "failed"
exit -1
fi