diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index ae0b50461a113dd90a0b457b5a26030394657f20..fc5b07746679437d6ba5197d1f5c45ae930a3def 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1980,27 +1980,34 @@ All parameter, weight, gradient are variables in Paddle.
     ExecutionStrategy allows the user to more precisely control how to run
     the program in ParallelExecutor by setting the property.
 
+    Returns:
+        ExecutionStrategy: An ExecutionStrategy object.
+
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-          y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-          y_predict = fluid.layers.fc(input=x, size=1, act=None)
+          import paddle
+          import paddle.static as static
+          import paddle.nn.functional as F
+
+          paddle.enable_static()
+
+          x = static.data(name='x', shape=[None, 13], dtype='float32')
+          y = static.data(name='y', shape=[None, 1], dtype='float32')
+          y_predict = static.nn.fc(input=x, size=1, act=None)
 
-          cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-          avg_loss = fluid.layers.mean(cost)
+          cost = F.square_error_cost(input=y_predict, label=y)
+          avg_loss = paddle.mean(cost)
 
-          sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+          sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
           sgd_optimizer.minimize(avg_loss)
 
-          exec_strategy = fluid.ExecutionStrategy()
+          exec_strategy = static.ExecutionStrategy()
           exec_strategy.num_threads = 4
 
-          train_exe = fluid.ParallelExecutor(use_cuda=False,
-                                             loss_name=avg_loss.name,
-                                             exec_strategy=exec_strategy)
-
+          train_exe = static.ParallelExecutor(use_cuda=False,
+                                              loss_name=avg_loss.name,
+                                              exec_strategy=exec_strategy)
         )DOC");
 
   exec_strategy.def(py::init<>())
@@ -2010,7 +2017,8 @@ All parameter, weight, gradient are variables in Paddle.
             [](ExecutionStrategy &self, size_t num_threads) {
               self.num_threads_ = num_threads;
             },
-            R"DOC(The type is INT, num_threads represents the size of thread pool that
+            R"DOC(
+            The type is INT, num_threads represents the size of the thread pool that is
             used to run the operators of the current program in ParallelExecutor.
             If :math:`num\_threads=1`, all the operators will execute one by one,
             but the order may be different between iterations.
@@ -2018,7 +2026,19 @@ All parameter, weight, gradient are variables in Paddle.
             device type and device count, for GPU, :math:`num\_threads=device\_count*4`,
             for CPU, :math:`num\_threads=CPU\_NUM*4`, the explanation of :math:`CPU\_NUM`
             is in ParallelExecutor. If it is not set, ParallelExecutor will get the CPU count by calling
-            `multiprocessing.cpu_count()`. Default 0.)DOC")
+            `multiprocessing.cpu_count()`. Default 0.
+
+            Examples:
+                .. code-block:: python
+
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    exec_strategy = static.ExecutionStrategy()
+                    exec_strategy.num_threads = 4
+        )DOC")
         .def_property(
             "use_cuda",
             [](const ExecutionStrategy &self) { return self.use_cuda_; },
@@ -2050,13 +2070,24 @@ All parameter, weight, gradient are variables in Paddle.
             many iterations to clean up the temp variables which are generated during
             execution. It may make the execution faster, because the temp variable's
             shape may be the same between two iterations.
-            Default 1.
+            Default 100.
+
+            .. note::
+                1. If you fetch data when calling the 'run', the ParallelExecutor
+                will clean up the temp variables at the end of the current iteration.
+                2. In some NLP models, it may cause the GPU memory to be insufficient,
+                in this case, you should reduce `num_iteration_per_drop_scope`.
+
+            Examples:
+                .. code-block:: python
-
-            NOTES:
-                1. If you fetch data when calling the 'run', the ParallelExecutor
-                will clean up the temp variables at the end of the current iteration.
-                2. In some NLP model, it may cause the GPU memory is insufficient,
-                in this case, you should reduce `num_iteration_per_drop_scope`.
+
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    exec_strategy = static.ExecutionStrategy()
+                    exec_strategy.num_iteration_per_drop_scope = 10
         )DOC")
         .def_property(
             "num_iteration_per_run",
@@ -2067,7 +2098,18 @@ All parameter, weight, gradient are variables in Paddle.
             [](const ExecutionStrategy &self) { return self.num_iteration_per_run_; },
             [](ExecutionStrategy &self, size_t num_iteration_per_run) {
               self.num_iteration_per_run_ = num_iteration_per_run;
             },
             R"DOC(This config defines how many iterations the executor will run when
-            user call exe.run() in python
+            user calls exe.run() in Python. Default: 1.
+
+            Examples:
+                .. code-block:: python
+
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    exec_strategy = static.ExecutionStrategy()
+                    exec_strategy.num_iteration_per_run = 10
         )DOC")
         .def_property(
             "use_thread_barrier",
@@ -2097,29 +2139,34 @@ All parameter, weight, gradient are variables in Paddle.
     BuildStrategy allows the user to more precisely control how to
     build the SSA Graph in ParallelExecutor by setting the property.
 
+    Returns:
+        BuildStrategy: A BuildStrategy object.
+
     Examples:
         .. code-block:: python
 
             import os
-            import numpy as np
-            import paddle.fluid as fluid
+            import paddle
+            import paddle.static as static
+
+            paddle.enable_static()
 
-            os.environ["CPU_NUM"] = '2'
-            places = fluid.cpu_places()
+            os.environ['CPU_NUM'] = str(2)
+            places = static.cpu_places()
 
-            data = fluid.layers.data(name="x", shape=[1], dtype="float32")
-            hidden = fluid.layers.fc(input=data, size=10)
-            loss = fluid.layers.mean(hidden)
-            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+            data = static.data(name="x", shape=[None, 1], dtype="float32")
+            hidden = static.nn.fc(input=data, size=10)
+            loss = paddle.mean(hidden)
+            paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
 
-            build_strategy = fluid.BuildStrategy()
+            build_strategy = static.BuildStrategy()
             build_strategy.enable_inplace = True
             build_strategy.memory_optimize = True
-            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-            program = fluid.compiler.CompiledProgram(fluid.default_main_program())
+            build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
+            program = static.CompiledProgram(static.default_main_program())
             program = program.with_data_parallel(loss_name=loss.name,
-                                                 build_strategy=build_strategy,
-                                                 places=places)
+                                                  build_strategy=build_strategy,
+                                                  places=places)
 )DOC");
 
   py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
@@ -2154,9 +2201,13 @@ All parameter, weight, gradient are variables in Paddle.
             Examples:
                 .. code-block:: python
 
-                    import paddle.fluid as fluid
-                    build_strategy = fluid.BuildStrategy()
-                    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    build_strategy = static.BuildStrategy()
+                    build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
         )DOC")
         .def_property(
             "gradient_scale_strategy",
@@ -2178,50 +2229,51 @@ All parameter, weight, gradient are variables in Paddle.
             Examples:
                 .. code-block:: python
 
-                    import paddle.fluid as fluid
-                    import paddle.fluid.compiler as compiler
                     import numpy
                     import os
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
 
                     use_cuda = True
-                    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-                    exe = fluid.Executor(place)
+                    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+                    exe = static.Executor(place)
 
                     # NOTE: If you use CPU to run the program, you need
-                    # to specify the CPU_NUM, otherwise, fluid will use
+                    # to specify the CPU_NUM, otherwise, paddle will use
                     # all of the logical cores as the CPU_NUM,
                     # in that case, the batch size of the input should be
                     # greater than CPU_NUM, if not, the process will
                     # fail with an exception.
                     if not use_cuda:
                         os.environ['CPU_NUM'] = str(2)
-                        places = fluid.cpu_places()
+                        places = static.cpu_places()
                     else:
-                        places = places = fluid.cuda_places()
+                        places = static.cuda_places()
 
-                    data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-                    hidden = fluid.layers.fc(input=data, size=10)
-                    loss = fluid.layers.mean(hidden)
-                    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+                    data = static.data(name='X', shape=[None, 1], dtype='float32')
+                    hidden = static.nn.fc(input=data, size=10)
+                    loss = paddle.mean(hidden)
+                    paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
 
-                    fluid.default_startup_program().random_seed=1
-                    exe.run(fluid.default_startup_program())
+                    exe.run(static.default_startup_program())
 
-                    build_strategy = fluid.BuildStrategy()
+                    build_strategy = static.BuildStrategy()
                     build_strategy.gradient_scale_strategy = \
-                        fluid.BuildStrategy.GradientScaleStrategy.Customized
-                    compiled_prog = compiler.CompiledProgram(
-                        fluid.default_main_program()).with_data_parallel(
+                        static.BuildStrategy.GradientScaleStrategy.Customized
+                    compiled_prog = static.CompiledProgram(
+                        static.default_main_program()).with_data_parallel(
                             loss_name=loss.name, build_strategy=build_strategy,
-                            places = places)
+                            places=places)
 
                     dev_count = len(places)
                     x = numpy.random.random(size=(10, 1)).astype('float32')
                     loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01
                     loss_grad_name = loss.name+"@GRAD"
                     loss_data = exe.run(compiled_prog,
-                                        feed={"X": x, loss_grad_name : loss_grad},
-                                        fetch_list=[loss.name, loss_grad_name])
+                                         feed={"X": x, loss_grad_name : loss_grad},
+                                         fetch_list=[loss.name, loss_grad_name])
         )DOC")
         .def_property(
             "debug_graphviz_path",
@@ -2240,10 +2292,13 @@ All parameter, weight, gradient are variables in Paddle.
             Examples:
                 .. code-block:: python
 
-                    import paddle.fluid as fluid
-                    build_strategy = fluid.BuildStrategy()
-                    build_strategy.debug_graphviz_path = "./graph"
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+                    build_strategy = static.BuildStrategy()
+                    build_strategy.debug_graphviz_path = "./graph"
         )DOC")
         .def_property(
             "enable_sequential_execution",
@@ -2263,8 +2318,12 @@ All parameter, weight, gradient are variables in Paddle.
             Examples:
                 .. code-block:: python
 
-                    import paddle.fluid as fluid
-                    build_strategy = fluid.BuildStrategy()
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    build_strategy = static.BuildStrategy()
                     build_strategy.enable_sequential_execution = True
         )DOC")
         .def_property(
@@ -2285,8 +2344,12 @@ All parameter, weight, gradient are variables in Paddle.
             Examples:
                 .. code-block:: python
 
-                    import paddle.fluid as fluid
-                    build_strategy = fluid.BuildStrategy()
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    build_strategy = static.BuildStrategy()
                     build_strategy.remove_unnecessary_lock = True
         )DOC")
         .def_property(
@@ -2351,8 +2414,12 @@ All parameter, weight, gradient are variables in Paddle.
             Examples:
                 .. code-block:: python
 
-                    import paddle.fluid as fluid
-                    build_strategy = fluid.BuildStrategy()
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    build_strategy = static.BuildStrategy()
                     build_strategy.fuse_elewise_add_act_ops = True
         )DOC")
         .def_property(
@@ -2372,8 +2439,12 @@ All parameter, weight, gradient are variables in Paddle.
             Examples:
                 .. code-block:: python
 
-                    import paddle.fluid as fluid
-                    build_strategy = fluid.BuildStrategy()
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    build_strategy = static.BuildStrategy()
                     build_strategy.fuse_bn_act_ops = True
         )DOC")
         .def_property(
@@ -2394,8 +2465,12 @@ All parameter, weight, gradient are variables in Paddle.
             Examples:
                 .. code-block:: python
 
-                    import paddle.fluid as fluid
-                    build_strategy = fluid.BuildStrategy()
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    build_strategy = static.BuildStrategy()
                     build_strategy.enable_auto_fusion = True
         )DOC")
         .def_property(
@@ -2419,8 +2494,12 @@ All parameter, weight, gradient are variables in Paddle.
             Examples:
                 .. code-block:: python
 
-                    import paddle.fluid as fluid
-                    build_strategy = fluid.BuildStrategy()
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    build_strategy = static.BuildStrategy()
                     build_strategy.fuse_relu_depthwise_conv = True
         )DOC")
         .def_property("fuse_broadcast_ops",
@@ -2445,8 +2524,12 @@ All parameter, weight, gradient are variables in Paddle.
             Examples:
                 .. code-block:: python
 
-                    import paddle.fluid as fluid
-                    build_strategy = fluid.BuildStrategy()
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    build_strategy = static.BuildStrategy()
                     build_strategy.fuse_broadcast_ops = True
         )DOC")
         .def_property("fuse_all_optimizer_ops",
@@ -2481,8 +2564,12 @@ All parameter, weight, gradient are variables in Paddle.
             Examples:
                 .. code-block:: python
 
-                    import paddle.fluid as fluid
-                    build_strategy = fluid.BuildStrategy()
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    build_strategy = static.BuildStrategy()
                     build_strategy.sync_batch_norm = True
         )DOC")
         .def_property(
@@ -2512,7 +2599,20 @@ All parameter, weight, gradient are variables in Paddle.
             Default None. None means the framework would choose to use or not use this
            strategy automatically. Currently, None means that it is enabled when GC is
            disabled, and disabled when GC is enabled.
-            True means enabling and False means disabling. Default is None.)DOC")
+            True means enabling and False means disabling.
+
+            Examples:
+                .. code-block:: python
+
+                    import paddle
+                    import paddle.static as static
+
+                    paddle.enable_static()
+
+                    build_strategy = static.BuildStrategy()
+                    build_strategy.memory_optimize = True
+
+        )DOC")
         .def_property(
             "is_distribution",
             [](const BuildStrategy &self) { return self.is_distribution_; },
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py
index e0a9bc6eec3ba37049cc670a63fbf54f68c5defb..0f65083dc52e7fbf711b92e97d95e9907c753be7 100644
--- a/python/paddle/static/__init__.py
+++ b/python/paddle/static/__init__.py
@@ -19,7 +19,7 @@ __all__ = [
     'name_scope', 'ParallelExecutor', 'program_guard', 'WeightNormParamAttr',
     'default_main_program', 'default_startup_program', 'Program', 'data',
     'InputSpec', 'save', 'load', 'save_inference_model', 'load_inference_model',
-    'load_program_state', 'set_program_state'
+    'load_program_state', 'set_program_state', 'cpu_places', 'cuda_places'
 ]
 
 from . import nn
@@ -38,6 +38,8 @@ from ..fluid.framework import default_startup_program  #DEFINE_ALIAS
 from ..fluid.framework import Program  #DEFINE_ALIAS
 from ..fluid.framework import name_scope  #DEFINE_ALIAS
 from ..fluid.framework import program_guard  #DEFINE_ALIAS
+from ..fluid.framework import cpu_places  #DEFINE_ALIAS
+from ..fluid.framework import cuda_places  #DEFINE_ALIAS
 from ..fluid.layers.control_flow import Print  #DEFINE_ALIAS
 from ..fluid.layers.nn import py_func  #DEFINE_ALIAS
 from ..fluid.parallel_executor import ParallelExecutor  #DEFINE_ALIAS
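
Usage note: the snippet below is a minimal sketch, not part of the patch itself, showing how the newly exported `paddle.static.cpu_places` alias combines with the ExecutionStrategy and BuildStrategy documented above. It assumes this patch is applied; passing `exec_strategy` to `with_data_parallel` alongside `build_strategy` is assumed to work the same way the docstring examples pass it to ParallelExecutor.

    import os

    import paddle
    import paddle.static as static

    paddle.enable_static()

    # Make the CPU device count explicit before calling cpu_places().
    os.environ['CPU_NUM'] = str(2)
    places = static.cpu_places()

    # A tiny static-graph program, mirroring the docstring examples.
    x = static.data(name='x', shape=[None, 13], dtype='float32')
    hidden = static.nn.fc(input=x, size=10)
    loss = paddle.mean(hidden)
    paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)

    # Both strategies attach to the same compiled program.
    exec_strategy = static.ExecutionStrategy()
    exec_strategy.num_threads = 4
    build_strategy = static.BuildStrategy()
    build_strategy.memory_optimize = True

    program = static.CompiledProgram(
        static.default_main_program()).with_data_parallel(
            loss_name=loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy,
            places=places)

    exe = static.Executor(paddle.CPUPlace())
    exe.run(static.default_startup_program())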