[with_data_parallel][part13] remove with_data_parallel in example code (#51588)

* remove with_data_parallel in example code * revert python/paddle/fluid/data_feeder.py * fix static.nn.fc api

[with_data_parallel][part13] remove with_data_parallel in example code (#51588)
* remove with_data_parallel in example code * revert python/paddle/fluid/data_feeder.py * fix static.nn.fc api
14f1973d · kangguangli · GitHub · 5b3c7ee7 · 14f1973d · 14f1973d
3 changed file
--- a/paddle/fluid/pybind/parallel_executor.cc
+++ b/paddle/fluid/pybind/parallel_executor.cc
@@ -372,17 +372,13 @@ void BindParallelExecutor(pybind11::module &m) {  // NOLINT
    Examples:
        .. code-block:: python
-            import os
            import paddle
            import paddle.static as static
            paddle.enable_static()
-            os.environ['CPU_NUM'] = str(2)
-            places = static.cpu_places()
            data = static.data(name="x", shape=[None, 1], dtype="float32")
-            hidden = static.nn.fc(input=data, size=10)
+            hidden = static.nn.fc(data, size=10)
            loss = paddle.mean(hidden)
            paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
@@ -390,10 +386,7 @@ void BindParallelExecutor(pybind11::module &m) {  // NOLINT
            build_strategy.enable_inplace = True
            build_strategy.memory_optimize = True
            build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
-            program = static.CompiledProgram(static.default_main_program())
+            program = static.CompiledProgram(static.default_main_program(), build_strategy=build_strategy)
-            program = program.with_data_parallel(loss_name=loss.name,
-                                                  build_strategy=build_strategy,
-                                                  places=places)
 )DOC");
  py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
@@ -461,7 +454,6 @@ void BindParallelExecutor(pybind11::module &m) {  // NOLINT
                    .. code-block:: python
                        import numpy
-                        import os
                        import paddle
                        import paddle.static as static
@@ -471,20 +463,8 @@ void BindParallelExecutor(pybind11::module &m) {  // NOLINT
                        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
                        exe = static.Executor(place)
-                        # NOTE: If you use CPU to run the program, you need
-                        # to specify the CPU_NUM, otherwise, paddle will use
-                        # all the number of the logic core as the CPU_NUM,
-                        # in that case, the batch size of the input should be
-                        # greater than CPU_NUM, if not, the process will be
-                        # failed by an exception.
-                        if not use_cuda:
-                            os.environ['CPU_NUM'] = str(2)
-                            places = static.cpu_places()
-                        else:
-                            places = static.cuda_places()
                        data = static.data(name='X', shape=[None, 1], dtype='float32')
-                        hidden = static.nn.fc(input=data, size=10)
+                        hidden = static.nn.fc(data, size=10)
                        loss = paddle.mean(hidden)
                        paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
@@ -492,19 +472,18 @@ void BindParallelExecutor(pybind11::module &m) {  // NOLINT
                        build_strategy = static.BuildStrategy()
                        build_strategy.gradient_scale_strategy = \
-                                  static.BuildStrategy.GradientScaleStrategy.Customized
+                                    static.BuildStrategy.GradientScaleStrategy.Customized
                        compiled_prog = static.CompiledProgram(
-                                  static.default_main_program()).with_data_parallel(
+                                    static.default_main_program(),
-                                          loss_name=loss.name, build_strategy=build_strategy,
+                                    build_strategy=build_strategy,
-                                          places=places)
+                        )
-                        dev_count =  len(places)
                        x = numpy.random.random(size=(10, 1)).astype('float32')
-                        loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01
+                        loss_grad = numpy.ones((1)).astype("float32") * 0.01
                        loss_grad_name = loss.name+"@GRAD"
                        loss_data = exe.run(compiled_prog,
-                                              feed={"X": x, loss_grad_name : loss_grad},
+                                                feed={"X": x, loss_grad_name : loss_grad},
-                                              fetch_list=[loss.name, loss_grad_name])
+                                                fetch_list=[loss.name, loss_grad_name])
                   )DOC")
      .def_property(
          "debug_graphviz_path",

--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -82,18 +82,6 @@ def _has_optimize_op(block):
    return False
-def _has_optimizer_in_control_flow(program):
-    if not program:
-        program = framework.default_main_program()
-    for op in program.global_block().ops:
-        if op.type == "conditional_block_grad":
-            sub_block = program.block(op._block_attr_id("sub_block"))
-            if _has_optimize_op(sub_block):
-                return True
-    return False
 def _should_broadcast_or_not_exists(program, var_name):
    block = program.global_block()
    var = block.vars.get(var_name, None)

--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -798,21 +798,13 @@ class DataLoader:
                # Define network
                loss = simple_net(image, label)
-                # Set data source of DataLoader
-                #
-                # If DataLoader is iterable, places must be given and the number of places must be the same with device number.
-                #  - If you are using GPU, call `paddle.static.cuda_places()` to get all GPU places.
-                #  - If you are using CPU, call `paddle.static.cpu_places()` to get all CPU places.
-                #
-                # If DataLoader is not iterable, places can be None.
                places = static.cuda_places() if USE_GPU else static.cpu_places()
                set_data_source(loader, places)
                exe = static.Executor(places[0])
                exe.run(static.default_startup_program())
-                prog = static.CompiledProgram(static.default_main_program()).with_data_parallel(loss_name=loss.name)
+                prog = static.CompiledProgram(static.default_main_program())
                if loader.iterable:
                    train_iterable(exe, prog, loss, loader)
                else:
@@ -890,54 +882,6 @@ class DataLoader:
                        print("Epoch {} batch {}: loss = {}".format(
                            epoch_id, batch_id, np.mean(loss.numpy())))
-        Examples 3:
-            .. code-block:: python
-                '''
-                Example of `drop_last` using in static graph multi-cards mode
-                '''
-                import paddle
-                import paddle.static as static
-                import numpy as np
-                import os
-                # We use 2 CPU cores to run inference network
-                os.environ['CPU_NUM'] = '2'
-                paddle.enable_static()
-                # The data source has only 3 batches, which can not be
-                # divided evenly to each CPU core
-                def batch_generator():
-                    for i in range(3):
-                        yield np.array([i+1]).astype('float32'),
-                x = static.data(name='x', shape=[None], dtype='float32')
-                y = x * x
-                def run_inference(drop_last):
-                    loader = paddle.io.DataLoader.from_generator(feed_list=[x],
-                            capacity=8, drop_last=drop_last)
-                    loader.set_batch_generator(batch_generator, static.cpu_places())
-                    exe = static.Executor(paddle.CPUPlace())
-                    prog = static.CompiledProgram(static.default_main_program())
-                    prog = prog.with_data_parallel()
-                    result = []
-                    for data in loader():
-                        each_ret, = exe.run(prog, feed=data, fetch_list=[y])
-                        result.extend(each_ret)
-                    return result
-                # Set drop_last to True, so that the last batch whose
-                # number is less than CPU core number would be discarded.
-                print(run_inference(drop_last=True)) # [1.0, 4.0]
-                # Set drop_last to False, so that the last batch whose
-                # number is less than CPU core number can be tested.
-                print(run_inference(drop_last=False)) # [1.0, 4.0, 9.0]
        """
        if _non_static_mode():
            return DygraphGeneratorLoader(