From d424e5b4c9edf6fdbb2200f04967e2c3bde9f011 Mon Sep 17 00:00:00 2001 From: Yan Xu Date: Tue, 12 Feb 2019 15:48:06 +0800 Subject: [PATCH] add launch mp distributed job py module test=develop (#15620) * add launch mp distributed mode module test=develop * delete unused file test=develop * refine usage test=develop * refine usage test=develop * move distributed package test=develop * add to whl package test=develop --- python/paddle/__init__.py | 1 + python/paddle/distributed/__init__.py | 13 +++++++ .../paddle/distributed/launch.py | 38 +++++++++++-------- python/paddle/fluid/__init__.py | 1 - python/setup.py.in | 1 + 5 files changed, 37 insertions(+), 17 deletions(-) create mode 100644 python/paddle/distributed/__init__.py rename tools/run_mp.py => python/paddle/distributed/launch.py (83%) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 53746afdb..fe2ae67ec 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -25,4 +25,5 @@ import paddle.reader import paddle.dataset import paddle.batch import paddle.compat +import paddle.distributed batch = batch.batch diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py new file mode 100644 index 000000000..d0c32e260 --- /dev/null +++ b/python/paddle/distributed/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tools/run_mp.py b/python/paddle/distributed/launch.py similarity index 83% rename from tools/run_mp.py rename to python/paddle/distributed/launch.py index 2485400ab..03c407877 100644 --- a/tools/run_mp.py +++ b/python/paddle/distributed/launch.py @@ -37,7 +37,7 @@ default_envs = { GPUS = 8 -def start_procs(gpus, cmd, log_dir): +def start_procs(gpus, entrypoint, entrypoint_args, log_dir): procs = [] log_fns = [] os.system("mkdir -p %s" % log_dir) @@ -73,12 +73,11 @@ def start_procs(gpus, cmd, log_dir): "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints }) - print("starting process ", i, cmd, curr_env) + print("starting process ", i, entrypoint, entrypoint_args, curr_env) fn = open("%s/workerlog.%d" % (log_dir, i), "w") log_fns.append(fn) - procs.append( - subprocess.Popen( - cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env)) + cmd = [sys.executable, "-u", entrypoint] + entrypoint_args + procs.append(subprocess.Popen(cmd, stdout=fn, stderr=fn, env=curr_env)) for i in range(gpus): try: @@ -89,7 +88,8 @@ def start_procs(gpus, cmd, log_dir): pass -def main(): +def parse_args(): + parser = argparse.ArgumentParser( description='''start paddle training using multi-process mode. NOTE: your train program ***must*** run as distributed nccl2 mode, @@ -108,21 +108,27 @@ POD_IP (current node ip address, not needed for local training) type=int, default=8, help='start number of processes for every gpu') - parser.add_argument( - '--cmd', - type=str, - default="", - help='command to run for each process, e.g. python train.py --lr 0.1') parser.add_argument( '--log_dir', type=str, default="mylog", help='directory to put logs per process.') - args = parser.parse_args() - if args.cmd == "": - parser.print_help() - exit(0) - start_procs(args.gpus, args.cmd, args.log_dir) + parser.add_argument( + 'entrypoint_script', + type=str, + help="The entrypoint script to be launched in parallel," + "followed by all the arguments for each process," + "e.g. train.py --lr 0.1") + parser.add_argument('entrypoint_args', nargs=argparse.REMAINDER) + return parser.parse_args() + + +def main(): + args = parse_args() + + # launch multiple training process + start_procs(args.gpus, args.entrypoint_script, args.entrypoint_args, + args.log_dir) if __name__ == "__main__": diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 396f36e18..aa1f85734 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -161,7 +161,6 @@ def __bootstrap__(): 'times_excess_than_required_tmp_allocation', 'enable_inplace_whitelist' ] - core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) diff --git a/python/setup.py.in b/python/setup.py.in index f93f0cd13..a7c1e91f9 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -100,6 +100,7 @@ packages=['paddle', 'paddle.utils', 'paddle.dataset', 'paddle.reader', + 'paddle.distributed', 'paddle.fluid', 'paddle.fluid.imperative', 'paddle.fluid.proto', -- GitLab