diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc
index 5c49c1d904eb27598015e8b64e72864a56b145e8..d23b9e568148725742e0ccf6f6d81141f3558f5a 100644
--- a/paddle/fluid/pybind/parallel_executor.cc
+++ b/paddle/fluid/pybind/parallel_executor.cc
@@ -372,17 +372,13 @@ void BindParallelExecutor(pybind11::module &m) {  // NOLINT
     Examples:
         .. code-block:: python
 
-            import os
             import paddle
             import paddle.static as static
 
             paddle.enable_static()
 
-            os.environ['CPU_NUM'] = str(2)
-            places = static.cpu_places()
-
             data = static.data(name="x", shape=[None, 1], dtype="float32")
-            hidden = static.nn.fc(input=data, size=10)
+            hidden = static.nn.fc(data, size=10)
             loss = paddle.mean(hidden)
             paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
 
@@ -390,10 +386,7 @@ void BindParallelExecutor(pybind11::module &m) {  // NOLINT
             build_strategy.enable_inplace = True
             build_strategy.memory_optimize = True
             build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
-            program = static.CompiledProgram(static.default_main_program())
-            program = program.with_data_parallel(loss_name=loss.name,
-                                                  build_strategy=build_strategy,
-                                                  places=places)
+            program = static.CompiledProgram(static.default_main_program(), build_strategy=build_strategy)
 )DOC");
 
   py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
@@ -461,7 +454,6 @@ void BindParallelExecutor(pybind11::module &m) {  // NOLINT
                     .. code-block:: python
 
                         import numpy
-                        import os
                         import paddle
                         import paddle.static as static
 
@@ -471,20 +463,8 @@ void BindParallelExecutor(pybind11::module &m) {  // NOLINT
                         place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
                         exe = static.Executor(place)
 
-                        # NOTE: If you use CPU to run the program, you need
-                        # to specify the CPU_NUM, otherwise, paddle will use
-                        # all the number of the logic core as the CPU_NUM,
-                        # in that case, the batch size of the input should be
-                        # greater than CPU_NUM, if not, the process will be
-                        # failed by an exception.
-                        if not use_cuda:
-                            os.environ['CPU_NUM'] = str(2)
-                            places = static.cpu_places()
-                        else:
-                            places = static.cuda_places()
-
                         data = static.data(name='X', shape=[None, 1], dtype='float32')
-                        hidden = static.nn.fc(input=data, size=10)
+                        hidden = static.nn.fc(data, size=10)
                         loss = paddle.mean(hidden)
                         paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
 
@@ -492,19 +472,18 @@ void BindParallelExecutor(pybind11::module &m) {  // NOLINT
 
                         build_strategy = static.BuildStrategy()
                         build_strategy.gradient_scale_strategy = \
-                                  static.BuildStrategy.GradientScaleStrategy.Customized
+                                    static.BuildStrategy.GradientScaleStrategy.Customized
                         compiled_prog = static.CompiledProgram(
-                                  static.default_main_program()).with_data_parallel(
-                                          loss_name=loss.name, build_strategy=build_strategy,
-                                          places=places)
+                                    static.default_main_program(),
+                                    build_strategy=build_strategy,
+                        )
 
-                        dev_count =  len(places)
                         x = numpy.random.random(size=(10, 1)).astype('float32')
-                        loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01
+                        loss_grad = numpy.ones((1,)).astype("float32") * 0.01
                         loss_grad_name = loss.name+"@GRAD"
                         loss_data = exe.run(compiled_prog,
-                                              feed={"X": x, loss_grad_name : loss_grad},
-                                              fetch_list=[loss.name, loss_grad_name])
+                                                feed={"X": x, loss_grad_name : loss_grad},
+                                                fetch_list=[loss.name, loss_grad_name])
                    )DOC")
       .def_property(
           "debug_graphviz_path",
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index 8be433ae828397f2884d6935ae57456ede5dc0e8..977fcbdf15f0d053570ee55b058f45337bd02cfd 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -82,18 +82,6 @@ def _has_optimize_op(block):
     return False
 
 
-def _has_optimizer_in_control_flow(program):
-    if not program:
-        program = framework.default_main_program()
-    for op in program.global_block().ops:
-        if op.type == "conditional_block_grad":
-            sub_block = program.block(op._block_attr_id("sub_block"))
-            if _has_optimize_op(sub_block):
-                return True
-
-    return False
-
-
 def _should_broadcast_or_not_exists(program, var_name):
     block = program.global_block()
     var = block.vars.get(var_name, None)
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index 8e36dd28005be7cf0c21423ae93d8e6a62c2d88e..b9ed17304c8910e53590bfa410d4e08436935ed5 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -798,21 +798,13 @@ class DataLoader:
                 # Define network
                 loss = simple_net(image, label)
 
-                # Set data source of DataLoader
-                #
-                # If DataLoader is iterable, places must be given and the number of places must be the same with device number.
-                #  - If you are using GPU, call `paddle.static.cuda_places()` to get all GPU places.
-                #  - If you are using CPU, call `paddle.static.cpu_places()` to get all CPU places.
-                #
-                # If DataLoader is not iterable, places can be None.
                 places = static.cuda_places() if USE_GPU else static.cpu_places()
                 set_data_source(loader, places)
 
                 exe = static.Executor(places[0])
                 exe.run(static.default_startup_program())
 
-                prog = static.CompiledProgram(static.default_main_program()).with_data_parallel(loss_name=loss.name)
-
+                prog = static.CompiledProgram(static.default_main_program())
                 if loader.iterable:
                     train_iterable(exe, prog, loss, loader)
                 else:
@@ -890,54 +882,6 @@ class DataLoader:
                         print("Epoch {} batch {}: loss = {}".format(
                             epoch_id, batch_id, np.mean(loss.numpy())))
 
-        Examples 3:
-
-            .. code-block:: python
-
-                '''
-                Example of `drop_last` using in static graph multi-cards mode
-                '''
-                import paddle
-                import paddle.static as static
-                import numpy as np
-                import os
-
-                # We use 2 CPU cores to run inference network
-                os.environ['CPU_NUM'] = '2'
-
-                paddle.enable_static()
-
-                # The data source has only 3 batches, which can not be
-                # divided evenly to each CPU core
-                def batch_generator():
-                    for i in range(3):
-                        yield np.array([i+1]).astype('float32'),
-
-                x = static.data(name='x', shape=[None], dtype='float32')
-                y = x * x
-
-                def run_inference(drop_last):
-                    loader = paddle.io.DataLoader.from_generator(feed_list=[x],
-                            capacity=8, drop_last=drop_last)
-                    loader.set_batch_generator(batch_generator, static.cpu_places())
-
-                    exe = static.Executor(paddle.CPUPlace())
-                    prog = static.CompiledProgram(static.default_main_program())
-                    prog = prog.with_data_parallel()
-
-                    result = []
-                    for data in loader():
-                        each_ret, = exe.run(prog, feed=data, fetch_list=[y])
-                        result.extend(each_ret)
-                    return result
-
-                # Set drop_last to True, so that the last batch whose
-                # number is less than CPU core number would be discarded.
-                print(run_inference(drop_last=True)) # [1.0, 4.0]
-
-                # Set drop_last to False, so that the last batch whose
-                # number is less than CPU core number can be tested.
-                print(run_inference(drop_last=False)) # [1.0, 4.0, 9.0]
         """
         if _non_static_mode():
             return DygraphGeneratorLoader(