From 1cfd3cb13b0ea4bf091757d59ec24e021563518f Mon Sep 17 00:00:00 2001 From: Qiyang Min Date: Mon, 11 Jun 2018 03:11:22 -0500 Subject: [PATCH] Add some dist-training robust cases into fluid benchmark test (#11207) * 1. add weight decay feature into fluid benchmark test 2. add learning rate decay feature into fluid benchmark test 3. add L1&L2 regularization feature into fluid benchmark test 4. add error clipping feature into fluid benchmark test 5. add gradient clipping feature into fluid benchmark test * Add some document to README.md under benchmark/fluid/ repo * Add model_base.py * Fix bugs in test_listen_and_serv_op * 1. remove args out of fluid_benchmark.py 2. remove lr_decay, regularization, clipping out of fluid_benchmark.py * add async_mode description to doc and remove the clipping description out * for restart build * to restart build * remove optimization args from args.py * 1. remove optimization from models 2. fix bug in test_listen_and_serv_op * change the name retry_times to left_time * change retry_times to the pserver start left time --- benchmark/fluid/README.md | 4 +- benchmark/fluid/args.py | 126 ++++++++++++++++++ benchmark/fluid/fluid_benchmark.py | 114 ++-------------- .../unittests/test_listen_and_serv_op.py | 9 +- 4 files changed, 143 insertions(+), 110 deletions(-) create mode 100644 benchmark/fluid/args.py diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index f40f3c129..28cade463 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -24,10 +24,12 @@ Currently supported `--model` argument include: * Run the following command to start a benchmark job locally: ```bash - python fluid_benchmark.py --model mnist --device GPU + python fluid_benchmark.py --model mnist --device GPU ``` You can choose to use GPU/CPU training. With GPU training, you can specify `--gpus ` to run multi GPU training. + You can set async mode parameter server. With async mode, you can specify + `--async_mode` to train model asynchronous. * Run distributed training with parameter servers: * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example. * start parameter servers: diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py new file mode 100644 index 000000000..68a3d42d7 --- /dev/null +++ b/benchmark/fluid/args.py @@ -0,0 +1,126 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +__all__ = ['parse_args', ] + +BENCHMARK_MODELS = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] + + +def parse_args(): + parser = argparse.ArgumentParser('Fluid model benchmarks.') + parser.add_argument( + '--model', + type=str, + choices=BENCHMARK_MODELS, + default='resnet', + help='The model to run benchmark with.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + # args related to learning rate + parser.add_argument( + '--learning_rate', type=float, default=0.001, help='The learning rate.') + # TODO(wuyi): add "--use_fake_data" option back. + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--gpus', + type=int, + default=1, + help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') + # this option is available only for vgg and resnet. + parser.add_argument( + '--cpus', + type=int, + default=1, + help='If cpus > 1, will use ParallelDo to run, else use Executor.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--no_test', + action='store_true', + help='If set, do not test the testset during training.') + parser.add_argument( + '--memory_optimize', + action='store_true', + help='If set, optimize runtime memory before start.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='If set ommit the actual read data operators.') + parser.add_argument( + '--profile', action='store_true', help='If set, profile a few steps.') + parser.add_argument( + '--update_method', + type=str, + default='local', + choices=['local', 'pserver', 'nccl2'], + help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--no_split_var', + action='store_true', + default=False, + help='Whether split variables into blocks when update_method is pserver') + parser.add_argument( + '--async_mode', + action='store_true', + default=False, + help='Whether start pserver in async mode to support ASGD') + parser.add_argument( + '--use_reader_op', + action='store_true', + help='Whether to use reader op, and must specify the data path if set this to true.' + ) + parser.add_argument( + '--data_path', + type=str, + default="", + help='Directory that contains all the training recordio files.') + args = parser.parse_args() + return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 62a05234c..902dca209 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -24,108 +24,7 @@ import paddle.fluid.core as core import paddle.fluid.profiler as profiler import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler -BENCHMARK_MODELS = [ - "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" -] - - -def parse_args(): - parser = argparse.ArgumentParser('Fluid model benchmarks.') - parser.add_argument( - '--model', - type=str, - choices=BENCHMARK_MODELS, - default='resnet', - help='The model to run benchmark with.') - parser.add_argument( - '--batch_size', - type=int, - default=32, - help='The batch size on each gpu.') - parser.add_argument( - '--learning_rate', type=float, default=0.001, help='The learning rate.') - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', - type=int, - default=80, - help='The number of minibatches, set to -1 to run all batches.') - parser.add_argument( - '--pass_num', type=int, default=100, help='The number of passes.') - parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data data_format, now only support NCHW.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--gpus', - type=int, - default=1, - help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') - # this option is available only for vgg and resnet. - parser.add_argument( - '--cpus', - type=int, - default=1, - help='If cpus > 1, will use ParallelDo to run, else use Executor.') - parser.add_argument( - '--data_set', - type=str, - default='flowers', - choices=['cifar10', 'flowers', 'imagenet'], - help='Optional dataset for benchmark.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--no_test', - action='store_true', - help='If set, do not test the testset during training.') - parser.add_argument( - '--memory_optimize', - action='store_true', - help='If set, optimize runtime memory before start.') - parser.add_argument( - '--use_fake_data', - action='store_true', - help='If set ommit the actual read data operators.') - parser.add_argument( - '--profile', action='store_true', help='If set, profile a few steps.') - parser.add_argument( - '--update_method', - type=str, - default='local', - choices=['local', 'pserver', 'nccl2'], - help='Choose parameter update method, can be local, pserver, nccl2.') - parser.add_argument( - '--use_reader_op', - action='store_true', - help='Whether to use reader op, and must specify the data path if set this to true.' - ) - parser.add_argument( - '--data_path', - type=str, - default="", - help='Directory that contains all the training recordio files.') - args = parser.parse_args() - return args +from args import * def append_nccl2_prepare(trainer_id): @@ -160,7 +59,7 @@ def append_nccl2_prepare(trainer_id): "nccl-based dist train.") -def dist_transpile(trainer_id): +def dist_transpile(trainer_id, args): if trainer_id < 0: return None, None @@ -182,7 +81,12 @@ def dist_transpile(trainer_id): training_role = os.getenv("PADDLE_TRAINING_ROLE") t = distribute_transpiler.DistributeTranspiler() - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + t.transpile( + trainer_id, + pservers=pserver_endpoints, + trainers=trainers, + sync_mode=not args.async_mode, + slice_var_up=not args.no_split_var) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program(current_endpoint, @@ -417,7 +321,7 @@ def main(): fluid.memory_optimize(fluid.default_main_program()) if args.update_method == "pserver": - train_prog, startup_prog = dist_transpile(trainer_id) + train_prog, startup_prog = dist_transpile(trainer_id, args) if not train_prog: raise Exception( "Must configure correct environments to run dist train.") diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 1226027dd..d1d709551 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -70,17 +70,18 @@ class TestListenAndServOp(OpTest): return p.pid def _wait_ps_ready(self, pid): - retry_times = self.ps_timeout + start_left_time = self.ps_timeout + sleep_time = 0.5 while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(0.5) + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) try: # the listen_and_serv_op would touch a file which contains the listen port # on the /tmp directory until it was ready to process all the RPC call. os.stat("/tmp/paddle.%d.port" % pid) return except os.error: - retry_times -= 1 + start_left_time -= sleep_time def test_rpc_interfaces(self): # TODO(Yancey1989): need to make sure the rpc interface correctly. -- GitLab