diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 6dd6c842f05c63b7f5bc7dabc5f7d225390e2b9e..f763e69809010b89375e98945adde8059e985e6b 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1283,12 +1283,13 @@ All parameter, weight, gradient are variables in Paddle.
             PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
             self.reduce_ = strategy;
           },
-          R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor,
-                'AllReduce' and 'Reduce'. If you want that all the parameters'
-                optimization are done on all devices independently, you should choose 'AllReduce';
-                if you choose 'Reduce', all the parameters' optimization will be evenly distributed
-                to different devices, and then broadcast the optimized parameter to other devices.
-                In some models, `Reduce` is faster. Default 'AllReduce'.
+          R"DOC(The type is fluid.BuildStrategy.ReduceStrategy. There are two reduce
+                strategies in ParallelExecutor: AllReduce and Reduce. If you want
+                the optimization of all parameters to be done on every device
+                independently, choose AllReduce; if you choose Reduce, parameter
+                optimization will be evenly distributed across devices, and the
+                optimized parameters will then be broadcast to the other devices.
+                Default is 'AllReduce'.
 
                 Examples:
                     .. code-block:: python
@@ -1302,21 +1303,62 @@ All parameter, weight, gradient are variables in Paddle.
           [](const BuildStrategy &self) { return self.gradient_scale_; },
           [](BuildStrategy &self,
              BuildStrategy::GradientScaleStrategy strategy) {
-            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
             self.gradient_scale_ = strategy;
           },
-          R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in
-                ParallelExecutor, 'CoeffNumDevice', 'One' and 'Customized'. By default,
-                ParallelExecutor sets the :math:`loss@grad` according to the number of devices.
-                If you want to customize :math:`loss@grad`, you can choose 'Customized'.
-                Default 'CoeffNumDevice'.
+          R"DOC(The type is fluid.BuildStrategy.GradientScaleStrategy. There are three
+                ways of defining :math:`loss@grad` in ParallelExecutor: CoeffNumDevice,
+                One and Customized. By default, ParallelExecutor sets :math:`loss@grad`
+                according to the number of devices. If you want to customize
+                :math:`loss@grad`, choose Customized. Default is 'CoeffNumDevice'.
 
                 Examples:
                     .. code-block:: python
 
                         import paddle.fluid as fluid
+                        import paddle.fluid.compiler as compiler
+                        import numpy
+                        import os
+
+                        use_cuda = True
+                        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+                        exe = fluid.Executor(place)
+
+                        # NOTE: If you use CPU to run the program, you need
+                        # to specify CPU_NUM; otherwise, fluid will use the
+                        # number of logical cores as CPU_NUM. In that case,
+                        # the batch size of the input should be greater than
+                        # CPU_NUM; if it is not, the process will fail with
+                        # an exception.
+                        if not use_cuda:
+                            os.environ['CPU_NUM'] = str(2)
+                            places = fluid.cpu_places()
+                        else:
+                            places = fluid.cuda_places()
+
+                        data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                        hidden = fluid.layers.fc(input=data, size=10)
+                        loss = fluid.layers.mean(hidden)
+                        fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+
+                        fluid.default_startup_program().random_seed = 1
+                        exe.run(fluid.default_startup_program())
+
                         build_strategy = fluid.BuildStrategy()
-                        build_strategy.gradient_scale_strategy = True
+                        build_strategy.gradient_scale_strategy = \
+                            fluid.BuildStrategy.GradientScaleStrategy.Customized
+                        compiled_prog = compiler.CompiledProgram(
+                            fluid.default_main_program()).with_data_parallel(
+                                loss_name=loss.name, build_strategy=build_strategy,
+                                places=places)
+
+                        dev_count = len(places)
+                        x = numpy.random.random(size=(10, 1)).astype('float32')
+                        loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01
+                        loss_grad_name = loss.name + "@GRAD"
+                        loss_data = exe.run(compiled_prog,
+                                            feed={"X": x, loss_grad_name: loss_grad},
+                                            fetch_list=[loss.name, loss_grad_name])
 )DOC")
       .def_property(
           "debug_graphviz_path",
@@ -1325,7 +1367,7 @@ All parameter, weight, gradient are variables in Paddle.
             PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
             self.debug_graphviz_path_ = path;
           },
-          R"DOC(The type is STR, debug_graphviz_path indicate the path that
+          R"DOC(The type is STR, debug_graphviz_path indicates the path for
                 writing the SSA Graph to file in the form of graphviz.
                 It is useful for debugging. Default ""
 
@@ -1334,7 +1376,8 @@ All parameter, weight, gradient are variables in Paddle.
                         import paddle.fluid as fluid
 
                         build_strategy = fluid.BuildStrategy()
-                        build_strategy.debug_graphviz_path = ""
+                        build_strategy.debug_graphviz_path = "./graph"
+
           )DOC")
       .def_property(
           "enable_sequential_execution",
@@ -1345,7 +1388,8 @@ All parameter, weight, gradient are variables in Paddle.
             PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
             self.enable_sequential_execution_ = b;
           },
-          R"DOC(The type is BOOL. If set True, the execution order of ops would be the same as what is in the program. Default False.
+          R"DOC(The type is BOOL. If set True, the execution order of ops would
+                be the same as what is in the program. Default False.
 
                 Examples:
                     .. code-block:: python
@@ -1363,7 +1407,8 @@ All parameter, weight, gradient are variables in Paddle.
             PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
             self.remove_unnecessary_lock_ = b;
           },
-          R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default True.
+          R"DOC(The type is BOOL. If set True, some locks in GPU ops would be
+                released and ParallelExecutor would run faster. Default True.
 
                 Examples:
                     .. code-block:: python
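For reference, a minimal combined sketch of the BuildStrategy options this patch
documents. The attribute names debug_graphviz_path, enable_sequential_execution
and remove_unnecessary_lock come from the def_property calls above; the
reduce_strategy attribute name is assumed from its docstring, since the
corresponding def_property line falls outside these hunks.

    .. code-block:: python

        import paddle.fluid as fluid
        import paddle.fluid.compiler as compiler

        # A small network so there is a loss to parallelize over.
        data = fluid.layers.data(name='X', shape=[1], dtype='float32')
        hidden = fluid.layers.fc(input=data, size=10)
        loss = fluid.layers.mean(hidden)
        fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

        build_strategy = fluid.BuildStrategy()
        # Assumed attribute name (see note above): distribute parameter
        # optimization across devices instead of repeating it on each one.
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        # Write the SSA Graph in graphviz form to ./graph for debugging.
        build_strategy.debug_graphviz_path = "./graph"
        # Execute ops in the same order as they appear in the program.
        build_strategy.enable_sequential_execution = True
        # Release some GPU op locks so ParallelExecutor runs faster.
        build_strategy.remove_unnecessary_lock = True

        compiled_prog = compiler.CompiledProgram(
            fluid.default_main_program()).with_data_parallel(
                loss_name=loss.name, build_strategy=build_strategy)

As the PADDLE_ENFORCE(!self.IsFinalized(), ...) checks show, each of these
attributes must be set before the BuildStrategy is finalized, i.e. before the
strategy is handed to with_data_parallel.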