From b14ecb8632e21af8b72586768f003f61da0c8b6e Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Tue, 29 Sep 2020 13:06:04 +0800
Subject: [PATCH] Polish api BuildStrategy/ExecutionStrategy doc & code
 example (#27662)

* polish BuildStrategy api doc & example

* polish ExecutionStrategy api doc & example

* polish details
---
 paddle/fluid/pybind/pybind.cc    | 254 +++++++++++++++++++++----------
 python/paddle/static/__init__.py |   4 +-
 2 files changed, 180 insertions(+), 78 deletions(-)

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index ae0b50461a1..fc5b0774667 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1980,27 +1980,34 @@ All parameter, weight, gradient are variables in Paddle.
     ExecutionStrategy allows the user to more precisely control how to run
     the program in ParallelExecutor by setting the property.
 
+    Returns:
+        ExecutionStrategy: An ExecutionStrategy object.
+
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-            y_predict = fluid.layers.fc(input=x, size=1, act=None)
+            import paddle
+            import paddle.static as static
+            import paddle.nn.functional as F
+
+            paddle.enable_static()
+
+            x = static.data(name='x', shape=[None, 13], dtype='float32')
+            y = static.data(name='y', shape=[None, 1], dtype='float32')
+            y_predict = static.nn.fc(input=x, size=1, act=None)
 
-            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_loss = fluid.layers.mean(cost)
+            cost = F.square_error_cost(input=y_predict, label=y)
+            avg_loss = paddle.mean(cost)
 
-            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+            sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
             sgd_optimizer.minimize(avg_loss)
 
-            exec_strategy = fluid.ExecutionStrategy()
+            exec_strategy = static.ExecutionStrategy()
             exec_strategy.num_threads = 4
 
-            train_exe = fluid.ParallelExecutor(use_cuda=False,
-                                               loss_name=avg_loss.name,
-                                               exec_strategy=exec_strategy)
-
+            train_exe = static.ParallelExecutor(use_cuda=False,
+                                                loss_name=avg_loss.name,
+                                                exec_strategy=exec_strategy)
         )DOC");
 
   exec_strategy.def(py::init())
@@ -2010,7 +2017,8 @@ All parameter, weight, gradient are variables in Paddle.
           [](ExecutionStrategy &self, size_t num_threads) {
             self.num_threads_ = num_threads;
           },
-          R"DOC(The type is INT, num_threads represents the size of thread pool that
+          R"DOC(
+        The type is INT, num_threads represents the size of thread pool that
         used to run the operators of the current program in ParallelExecutor.
         If :math:`num\_threads=1`, all the operators will execute one by one,
         but the order may differ between iterations.
@@ -2018,7 +2026,19 @@ All parameter, weight, gradient are variables in Paddle.
        device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
        :math:`num\_threads=CPU\_NUM*4`, the explanation of :math:`CPU\_NUM` is in ParallelExecutor.
        If it is not set, ParallelExecutor will get the cpu count by calling
-        `multiprocessing.cpu_count()`. Default 0.)DOC")
+        `multiprocessing.cpu_count()`. Default 0.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                exec_strategy = static.ExecutionStrategy()
+                exec_strategy.num_threads = 4
+        )DOC")
       .def_property(
           "use_cuda",
           [](const ExecutionStrategy &self) { return self.use_cuda_; },
@@ -2050,13 +2070,24 @@ All parameter, weight, gradient are variables in Paddle.
         many iterations to clean up the temp variables which are generated during
         execution. It may make the execution faster, because the temp variable's
         shape may be the same between two iterations.
-        Default 1.
+        Default 100.
+
+        .. note::
+            1. If you fetch data when calling the 'run', the ParallelExecutor
+               will clean up the temp variables at the end of the current iteration.
+            2. In some NLP models, it may cause GPU memory to be insufficient;
+               in this case, you should reduce `num_iteration_per_drop_scope`.
+
+        Examples:
+            .. code-block:: python
 
-        NOTES:
-            1. If you fetch data when calling the 'run', the ParallelExecutor
-            will clean up the temp variables at the end of the current iteration.
-            2. In some NLP model, it may cause the GPU memory is insufficient,
-            in this case, you should reduce `num_iteration_per_drop_scope`.
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                exec_strategy = static.ExecutionStrategy()
+                exec_strategy.num_iteration_per_drop_scope = 10
         )DOC")
       .def_property(
          "num_iteration_per_run",
          [](const ExecutionStrategy &self) {
            return self.num_iteration_per_run_;
          },
          [](ExecutionStrategy &self, size_t num_iteration_per_run) {
            self.num_iteration_per_run_ = num_iteration_per_run;
          },
-          R"DOC(This config that how many iteration the executor will run when
-                user call exe.run() in python
+          R"DOC(This config defines how many iterations the executor will run
+                when the user calls exe.run() in Python. Default: 1.
+
+          Examples:
+              .. code-block:: python
+
+                  import paddle
+                  import paddle.static as static
+
+                  paddle.enable_static()
+
+                  exec_strategy = static.ExecutionStrategy()
+                  exec_strategy.num_iteration_per_run = 10
           )DOC")
       .def_property(
          "use_thread_barrier",
@@ -2097,29 +2139,34 @@ All parameter, weight, gradient are variables in Paddle.
     BuildStrategy allows the user to more precisely control how to
     build the SSA Graph in ParallelExecutor by setting the property.
 
+    Returns:
+        BuildStrategy: A BuildStrategy object.
+
     Examples:
         .. code-block:: python
 
             import os
-            import numpy as np
-            import paddle.fluid as fluid
+            import paddle
+            import paddle.static as static
+
+            paddle.enable_static()
 
-            os.environ["CPU_NUM"] = '2'
-            places = fluid.cpu_places()
+            os.environ['CPU_NUM'] = str(2)
+            places = static.cpu_places()
 
-            data = fluid.layers.data(name="x", shape=[1], dtype="float32")
-            hidden = fluid.layers.fc(input=data, size=10)
-            loss = fluid.layers.mean(hidden)
-            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+            data = static.data(name="x", shape=[None, 1], dtype="float32")
+            hidden = static.nn.fc(input=data, size=10)
+            loss = paddle.mean(hidden)
+            paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
 
-            build_strategy = fluid.BuildStrategy()
+            build_strategy = static.BuildStrategy()
             build_strategy.enable_inplace = True
             build_strategy.memory_optimize = True
-            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-            program = fluid.compiler.CompiledProgram(fluid.default_main_program())
+            build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
+            program = static.CompiledProgram(static.default_main_program())
             program = program.with_data_parallel(loss_name=loss.name,
-                                                 build_strategy=build_strategy,
-                                                 places=places)
+                                                  build_strategy=build_strategy,
+                                                  places=places)
 )DOC");
 
   py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
@@ -2154,9 +2201,13 @@ All parameter, weight, gradient are variables in Paddle.
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                build_strategy = fluid.BuildStrategy()
-                build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                build_strategy = static.BuildStrategy()
+                build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
         )DOC")
       .def_property(
           "gradient_scale_strategy",
@@ -2178,50 +2229,51 @@ All parameter, weight, gradient are variables in Paddle.
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                import paddle.fluid.compiler as compiler
                 import numpy
                 import os
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
 
                 use_cuda = True
-                place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-                exe = fluid.Executor(place)
+                place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+                exe = static.Executor(place)
 
                 # NOTE: If you use CPU to run the program, you need
-                # to specify the CPU_NUM, otherwise, fluid will use
+                # to specify the CPU_NUM, otherwise, paddle will use
                 # all the number of the logic core as the CPU_NUM,
                 # in that case, the batch size of the input should be
                 # greater than CPU_NUM, if not, the process will fail
                 # with an exception.
                 if not use_cuda:
                     os.environ['CPU_NUM'] = str(2)
-                    places = fluid.cpu_places()
+                    places = static.cpu_places()
                 else:
-                    places = places = fluid.cuda_places()
+                    places = static.cuda_places()
 
-                data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-                hidden = fluid.layers.fc(input=data, size=10)
-                loss = fluid.layers.mean(hidden)
-                fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+                data = static.data(name='X', shape=[None, 1], dtype='float32')
+                hidden = static.nn.fc(input=data, size=10)
+                loss = paddle.mean(hidden)
+                paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
 
-                fluid.default_startup_program().random_seed=1
-                exe.run(fluid.default_startup_program())
+                exe.run(static.default_startup_program())
 
-                build_strategy = fluid.BuildStrategy()
+                build_strategy = static.BuildStrategy()
                 build_strategy.gradient_scale_strategy = \
-                         fluid.BuildStrategy.GradientScaleStrategy.Customized
-                compiled_prog = compiler.CompiledProgram(
-                         fluid.default_main_program()).with_data_parallel(
+                         static.BuildStrategy.GradientScaleStrategy.Customized
+                compiled_prog = static.CompiledProgram(
+                         static.default_main_program()).with_data_parallel(
                                   loss_name=loss.name, build_strategy=build_strategy,
-                                  places = places)
+                                  places=places)
 
                 dev_count = len(places)
                 x = numpy.random.random(size=(10, 1)).astype('float32')
                 loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01
                 loss_grad_name = loss.name+"@GRAD"
                 loss_data = exe.run(compiled_prog,
-                                    feed={"X": x, loss_grad_name : loss_grad},
-                                    fetch_list=[loss.name, loss_grad_name])
+                                     feed={"X": x, loss_grad_name : loss_grad},
+                                     fetch_list=[loss.name, loss_grad_name])
 )DOC")
       .def_property(
           "debug_graphviz_path",
@@ -2240,10 +2292,13 @@ All parameter, weight, gradient are variables in Paddle.
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                build_strategy = fluid.BuildStrategy()
-                build_strategy.debug_graphviz_path = "./graph"
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
 
+                build_strategy = static.BuildStrategy()
+                build_strategy.debug_graphviz_path = "./graph"
         )DOC")
       .def_property(
           "enable_sequential_execution",
@@ -2263,8 +2318,12 @@ All parameter, weight, gradient are variables in Paddle.
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                build_strategy = fluid.BuildStrategy()
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                build_strategy = static.BuildStrategy()
                 build_strategy.enable_sequential_execution = True
         )DOC")
       .def_property(
@@ -2285,8 +2344,12 @@ All parameter, weight, gradient are variables in Paddle.
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                build_strategy = fluid.BuildStrategy()
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                build_strategy = static.BuildStrategy()
                 build_strategy.remove_unnecessary_lock = True
         )DOC")
       .def_property(
@@ -2351,8 +2414,12 @@ All parameter, weight, gradient are variables in Paddle.
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                build_strategy = fluid.BuildStrategy()
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                build_strategy = static.BuildStrategy()
                 build_strategy.fuse_elewise_add_act_ops = True
         )DOC")
       .def_property(
@@ -2372,8 +2439,12 @@ All parameter, weight, gradient are variables in Paddle.
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                build_strategy = fluid.BuildStrategy()
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                build_strategy = static.BuildStrategy()
                 build_strategy.fuse_bn_act_ops = True
         )DOC")
       .def_property(
@@ -2394,8 +2465,12 @@ All parameter, weight, gradient are variables in Paddle.
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                build_strategy = fluid.BuildStrategy()
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                build_strategy = static.BuildStrategy()
                 build_strategy.enable_auto_fusion = True
         )DOC")
       .def_property(
@@ -2419,8 +2494,12 @@ All parameter, weight, gradient are variables in Paddle.
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                build_strategy = fluid.BuildStrategy()
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                build_strategy = static.BuildStrategy()
                 build_strategy.fuse_relu_depthwise_conv = True
         )DOC")
       .def_property("fuse_broadcast_ops",
@@ -2445,8 +2524,12 @@ All parameter, weight, gradient are variables in Paddle.
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                build_strategy = fluid.BuildStrategy()
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                build_strategy = static.BuildStrategy()
                 build_strategy.fuse_broadcast_ops = True
         )DOC")
       .def_property("fuse_all_optimizer_ops",
@@ -2481,8 +2564,12 @@ All parameter, weight, gradient are variables in Paddle.
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                build_strategy = fluid.BuildStrategy()
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                build_strategy = static.BuildStrategy()
                 build_strategy.sync_batch_norm = True
         )DOC")
       .def_property(
@@ -2512,7 +2599,20 @@ All parameter, weight, gradient are variables in Paddle.
         Default None. None means framework would choose to use or not use this strategy
         automatically. Currently, None means that it is enabled when GC is disabled, and
         disabled when GC is enabled.
-        True means enabling and False means disabling. Default is None.)DOC")
+        True means enabling and False means disabling.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                build_strategy = static.BuildStrategy()
+                build_strategy.memory_optimize = True
+
+        )DOC")
       .def_property(
           "is_distribution",
           [](const BuildStrategy &self) { return self.is_distribution_; },
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py
index e0a9bc6eec3..0f65083dc52 100644
--- a/python/paddle/static/__init__.py
+++ b/python/paddle/static/__init__.py
@@ -19,7 +19,7 @@ __all__ = [
     'name_scope', 'ParallelExecutor', 'program_guard', 'WeightNormParamAttr',
     'default_main_program', 'default_startup_program', 'Program', 'data',
     'InputSpec', 'save', 'load', 'save_inference_model', 'load_inference_model',
-    'load_program_state', 'set_program_state'
+    'load_program_state', 'set_program_state', 'cpu_places', 'cuda_places'
 ]
 
 from . import nn
@@ -38,6 +38,8 @@ from ..fluid.framework import default_startup_program  #DEFINE_ALIAS
 from ..fluid.framework import Program  #DEFINE_ALIAS
 from ..fluid.framework import name_scope  #DEFINE_ALIAS
 from ..fluid.framework import program_guard  #DEFINE_ALIAS
+from ..fluid.framework import cpu_places  #DEFINE_ALIAS
+from ..fluid.framework import cuda_places  #DEFINE_ALIAS
 from ..fluid.layers.control_flow import Print  #DEFINE_ALIAS
 from ..fluid.layers.nn import py_func  #DEFINE_ALIAS
 from ..fluid.parallel_executor import ParallelExecutor  #DEFINE_ALIAS
-- 
GitLab
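
For reference, the docstring examples polished by this patch compose into a single
runnable program. Below is a minimal end-to-end sketch assembled from those examples,
assuming a CPU-only build in static-graph mode; the `CPU_NUM=2` value and the batch
size of 10 are arbitrary choices, not taken from the patch:

    .. code-block:: python

        import os
        import numpy

        import paddle
        import paddle.static as static

        paddle.enable_static()

        # On CPU runs, CPU_NUM must be set, and the fed batch size must be
        # at least CPU_NUM so the batch can be split across the places.
        os.environ['CPU_NUM'] = str(2)
        places = static.cpu_places()  # exported from paddle.static by this patch

        x = static.data(name='x', shape=[None, 13], dtype='float32')
        hidden = static.nn.fc(input=x, size=10)
        loss = paddle.mean(hidden)
        paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)

        exe = static.Executor(paddle.CPUPlace())
        exe.run(static.default_startup_program())

        # Apply both strategy objects documented above to one compiled program.
        exec_strategy = static.ExecutionStrategy()
        exec_strategy.num_threads = 4

        build_strategy = static.BuildStrategy()
        build_strategy.memory_optimize = True

        compiled = static.CompiledProgram(
            static.default_main_program()).with_data_parallel(
                loss_name=loss.name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy,
                places=places)

        loss_val, = exe.run(compiled,
                            feed={'x': numpy.random.random((10, 13)).astype('float32')},
                            fetch_list=[loss.name])

Note that with_data_parallel splits the fed batch across the given places, which is
why the docstring examples insist that the batch size be at least CPU_NUM on CPU.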