diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index a9b446d8c0b1e5f186a00cbda475e7517ff594ba..fb5f64b2ec48f711a9051fb8875c622d969c1e12 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -15,12 +15,12 @@ paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=N
 paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912'))
 paddle.fluid.in_dygraph_mode (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f06314a1cb30c96b5808dde2219c2dae'))
 paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
+paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3a584496aa1343f36eebf3c46b323a74'))
 paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '9c7decb955b9c4f718114179c8985581'))
-paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d'))
+paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', '4cfcd9c15b766a51b584cc46d38f1ad8'))
 paddle.fluid.Executor.train_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', 'd521011d79e71080fe9b5bb179b43518'))
-paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0'))
-paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2'))
+paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f65788d9ead293ada47551339df12203'))
+paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'e480208ccc0c9abf084867206dab4d2c'))
 paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680'))
 paddle.fluid.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8'))
@@ -31,7 +31,7 @@ paddle.fluid.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'pr
 paddle.fluid.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ac4114d3df16264f1946deb3a8434a6f'))
 paddle.fluid.DistributeTranspilerConfig.__init__ 
 paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '2cb4bd74481861345c70228a0f57620c'))
+paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '33ce6ec50f8eeb05d340e6b114b026fd'))
 paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', 'b82ea20e2dc5ff2372e0643169ca47ff'))
 paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '74dc6d23185d90a7a50fbac19f5b65fb'))
 paddle.fluid.DataFeedDesc.__init__ (ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -40,7 +40,7 @@ paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], v
 paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'eb894b464bbcd1b4bc8038398954f766'))
 paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', '415c56600ce4e198c071cad01409a690'))
 paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a8c7793803cf976680d9478e378fa356'))
+paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', '0e17773521634ef798fddd7d2ea3ef96'))
 paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8'))
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index 624c9934d5392b57526edea68254ddf45bd79f4c..f01a6dd9da2dd518227d0f45bab9a140191d38de 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -58,20 +58,34 @@ class CompiledProgram(object):
     optimizations, for example.
       * Pre-compute some logic once so that each run is faster.
       * Transform the program so that it can run in multiple devices.
-      * TODO: transform the program for optimized inference or distributed
-        training.
+      * Transform the program for optimized inference or distributed
+        training. **Note: this part is not finished yet.**
 
     Example:
         .. code-block:: python
-          place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
-          exe = fluid.Executor(place)
-          exe.run(startup)
-          compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
-              loss_name=loss.name)
-          for i in range(5):
-              test_loss, = exe.run(compiled_prog,
-                                   feed=feed_dict,
-                                   fetch_list=[loss.name])
+
+            import paddle.fluid as fluid
+            import paddle.fluid.compiler as compiler
+            import numpy
+            import os
+
+            place = fluid.CUDAPlace(0) # fluid.CPUPlace()
+            exe = fluid.Executor(place)
+
+            data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+            hidden = fluid.layers.fc(input=data, size=10)
+            loss = fluid.layers.mean(hidden)
+            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+
+            fluid.default_startup_program().random_seed=1
+            exe.run(fluid.default_startup_program())
+            compiled_prog = compiler.CompiledProgram(
+                fluid.default_main_program())
+
+            x = numpy.random.random(size=(10, 1)).astype('float32')
+            loss_data, = exe.run(compiled_prog,
+                                 feed={"X": x},
+                                 fetch_list=[loss.name])
 
     Args:
         program_or_graph (Graph|Program): If it's Program, it will be first
@@ -108,6 +122,44 @@ class CompiledProgram(object):
                            places=None):
         """Configs the program to run in data parallel way.
 
+        Example:
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+                import paddle.fluid.compiler as compiler
+                import numpy
+                import os
+
+                use_cuda = True
+                place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+                # NOTE: If you use CPU to run the program, you need
+                # to specify the CPU_NUM, otherwise fluid will use
+                # all the logical cores as the CPU_NUM. In that case,
+                # the batch size of the input should be greater than
+                # CPU_NUM; if not, the process will fail with an
+                # exception.
+                if not use_cuda:
+                    os.environ['CPU_NUM'] = str(2)
+
+                exe = fluid.Executor(place)
+
+                data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                hidden = fluid.layers.fc(input=data, size=10)
+                loss = fluid.layers.mean(hidden)
+                fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+
+                fluid.default_startup_program().random_seed=1
+                exe.run(fluid.default_startup_program())
+                compiled_prog = compiler.CompiledProgram(
+                    fluid.default_main_program()).with_data_parallel(
+                        loss_name=loss.name)
+
+                x = numpy.random.random(size=(10, 1)).astype('float32')
+                loss_data, = exe.run(compiled_prog,
+                                     feed={"X": x},
+                                     fetch_list=[loss.name])
+
         Args:
             loss_name (str): The loss name must set in training. Default None.
             build_strategy(BuildStrategy): build_strategy is used to
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 0b9a23e6769389715535a4ea9dea77bfd3c2707b..53cf1e6743262afee5a93b708d35ffe296874377 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -38,6 +38,15 @@ def global_scope():
     Get the global/default scope instance. There are a lot of APIs use
     :code:`global_scope` as its default value, e.g., :code:`Executor.run`
 
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import numpy
+
+            fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace())
+            numpy.array(fluid.global_scope().find_var("data").get_tensor())
+
     Returns:
         Scope: The global/default scope instance.
""" @@ -58,10 +67,15 @@ def scope_guard(scope): variable in runtime will assigned to the new scope. Examples: - >>> import paddle.fluid as fluid - >>> new_scope = fluid.Scope() - >>> with fluid.scope_guard(new_scope): - >>> ... + .. code-block:: python + + import paddle.fluid as fluid + import numpy + + new_scope = fluid.Scope() + with fluid.scope_guard(new_scope): + fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace()) + numpy.array(new_scope.find_var("data").get_tensor()) Args: scope: The new global/default scope. @@ -75,11 +89,18 @@ def as_numpy(tensor): """ Convert a Tensor to a numpy.ndarray, its only support Tensor without LoD information. For higher dimensional sequence data, please use LoDTensor directly. + Examples: - >>> import paddle.fluid as fluid - >>> outs = executor.run(...) - >>> np_outs = map(lambda x: as_numpy(x), outs) - >>> ... + .. code-block:: python + + import paddle.fluid as fluid + import numpy + + new_scope = fluid.Scope() + with fluid.scope_guard(new_scope): + fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace()) + tensor = new_scope.find_var("data").get_tensor() + fluid.executor.as_numpy(tensor) # or numpy.array(new_scope.find_var("data").get_tensor()) Args: tensor(Variable): a instance of Tensor @@ -263,42 +284,70 @@ def _as_lodtensor(data, place): class Executor(object): """ - An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running. - Python executor takes a program, adds feed operators and fetch operators to this program according - to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides - the variables(or names) that user wants to get after program runs. Note: the executor will run all - operators in the program but not only the operators dependent by the fetch_list. - It stores the global variables into the global scope, and creates a local scope for the temporary - variables. The contents in local scope may be discarded after every minibatch forward/backward - finished. But the global scope variables will be persistent through different runs. - - - Example: + An Executor in Python, supports single/multiple-GPU running, + and single/multiple-CPU running. Python executor takes a program, + adds feed operators and fetch operators to this program according + to feed map and fetch_list. Feed map provides input data for the + program. fetch_list provides the variables(or names) that user wants + to get after program runs. Note: the executor will run all operators + in the program but not only the operators dependent by the fetch_list. + It stores the global variables into the global scope, and creates a + local scope for the temporary variables. The contents in local scope + may be discarded after every minibatch forward/backward finished. + But the global scope variables will be persistent through different runs. + Examples: .. code-block:: python - # First create the Executor. - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - - # Run the startup program once and only once. - # Not need to optimize/compile the startup program. - exe.run(fluid.default_startup_program()) - - # Run the main program directly without compile. - loss, = exe.run(fluid.default_main_program(), - feed=feed_dict, - fetch_list=[loss.name]) - # Or, compiled the program and run. See `CompiledProgram` for more detail. 
-            compiled_prog = compiler.CompiledProgram(
-                            fluid.default_main_program()).with_data_parallel(
-                            loss_name=loss.name)
-            loss, = exe.run(compiled_prog,
-                            feed=feed_dict,
-                            fetch_list=[loss.name])
+            import paddle.fluid as fluid
+            import paddle.fluid.compiler as compiler
+            import numpy
+            import os
+
+            use_cuda = True
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+
+            train_program = fluid.Program()
+            startup_program = fluid.Program()
+            with fluid.program_guard(train_program, startup_program):
+                data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                hidden = fluid.layers.fc(input=data, size=10)
+                loss = fluid.layers.mean(hidden)
+                fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+
+            # Run the startup program once and only once.
+            # No need to optimize/compile the startup program.
+            startup_program.random_seed=1
+            exe.run(startup_program)
+
+            # Run the main program directly without compile.
+            x = numpy.random.random(size=(10, 1)).astype('float32')
+            loss_data, = exe.run(train_program,
+                                 feed={"X": x},
+                                 fetch_list=[loss.name])
+
+            # Or, compile the program before running it. See `CompiledProgram`
+            # for more detail.
+            # NOTE: If you use CPU to run the program, you need
+            # to specify the CPU_NUM, otherwise fluid will use
+            # all the logical cores as the CPU_NUM. In that case,
+            # the batch size of the input should be greater than
+            # CPU_NUM; if not, the process will fail with an
+            # exception.
+            if not use_cuda:
+                os.environ['CPU_NUM'] = str(2)
+
+            compiled_prog = compiler.CompiledProgram(
+                train_program).with_data_parallel(
+                    loss_name=loss.name)
+            loss_data, = exe.run(compiled_prog,
+                                 feed={"X": x},
+                                 fetch_list=[loss.name])
 
     Args:
-        place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
+        place(fluid.CPUPlace|fluid.CUDAPlace(n)): indicates which device the executor runs on.
+
     """
 
     def __init__(self, place):
@@ -392,14 +441,18 @@ class Executor(object):
         Close this executor.
 
         You can no longer use this executor after calling this method.
-        For the distributed training, this method would free the resource on PServers related to
-        the current Trainer.
-
-        Example:
-            >>> cpu = core.CPUPlace()
-            >>> exe = Executor(cpu)
-            >>> ...
-            >>> exe.close()
+        For the distributed training, this method frees the resources
+        on PServers related to the current Trainer.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+
+                cpu = fluid.CPUPlace()
+                exe = fluid.Executor(cpu)
+                # execute training or testing
+                exe.close()
         """
         if not self._closed:
             self._default_executor.close()
@@ -490,13 +543,37 @@
                 return_numpy=True,
                 use_program_cache=False):
         """
-        Run program by this Executor. Feed data by feed map, fetch result by fetch_list.
-        Python executor takes a program, add feed operators and fetch operators to this program according
-        to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
+        Run program by this Executor. Feed data by feed map, fetch result by
+        fetch_list. Python executor takes a program, adds feed operators and
+        fetch operators to this program according to feed map and fetch_list.
+        Feed map provides input data for the program. fetch_list provides
         the variables(or names) that user want to get after program run.
-        Note: the executor will run all
-        operators in the program but not only the operators dependent by the fetch_list
+        Note: the executor will run all operators in the program, not
+        only the operators that the fetch_list depends on.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+                import numpy
+
+                # First create the Executor.
+                place = fluid.CPUPlace() # fluid.CUDAPlace(0)
+                exe = fluid.Executor(place)
+
+                data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                hidden = fluid.layers.fc(input=data, size=10)
+                loss = fluid.layers.mean(hidden)
+                adam = fluid.optimizer.Adam()
+                adam.minimize(loss)
+
+                # Run the startup program once and only once.
+                exe.run(fluid.default_startup_program())
+
+                x = numpy.random.random(size=(10, 1)).astype('float32')
+                outs = exe.run(feed={'X': x},
+                               fetch_list=[loss.name])
 
         Args:
             program(Program|CompiledProgram): the program that need to run,
@@ -520,26 +597,6 @@
 
         Returns:
             list(numpy.array): fetch result according to fetch_list.
-
-
-        Examples:
-
-            >>> data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-            >>> out = fluid.layers.create_tensor(dtype='float32')
-            >>> hidden = fluid.layers.fc(input=data, size=10)
-            >>> fluid.layers.assign(hidden,out)
-            >>> loss = fluid.layers.mean(out)
-            >>> adam = fluid.optimizer.Adam()
-            >>> adam.minimize(loss)
-
-            >>> cpu = core.CPUPlace()
-            >>> exe = fluid.Executor(cpu)
-            >>> exe.run(fluid.default_startup_program())
-
-            >>> x = numpy.random.random(size=(10, 1)).astype('float32')
-            >>> outs = exe.run(
-            >>>     feed={'X': x},
-            >>>     fetch_list=[loss.name])
         """
 
         if self._closed:
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index cf10f590ce2c90450047ff046ee3ed206b38322e..ad32f157d0e694477e009b5a9e3650701787d79e 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -37,6 +37,53 @@ class ParallelExecutor(object):
     is not found, ParallelExecutor will call `multiprocessing.cpu_count`
     to get the number of CPUs in the system.
 
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import numpy
+            import os
+
+            use_cuda = True
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+            # NOTE: If you use CPU to run the program, you need
+            # to specify the CPU_NUM, otherwise fluid will use
+            # all the logical cores as the CPU_NUM. In that case,
+            # the batch size of the input should be greater than
+            # CPU_NUM; if not, the process will fail with an
+            # exception.
+            if not use_cuda:
+                os.environ['CPU_NUM'] = str(2)
+
+            exe = fluid.Executor(place)
+
+            train_program = fluid.Program()
+            startup_program = fluid.Program()
+            with fluid.program_guard(train_program, startup_program):
+                data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                hidden = fluid.layers.fc(input=data, size=10)
+                loss = fluid.layers.mean(hidden)
+                test_program = fluid.default_main_program().clone(for_test=True)
+                fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+
+            startup_program.random_seed=1
+            exe.run(startup_program)
+
+            train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                               main_program=train_program,
+                                               loss_name=loss.name)
+            test_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                              main_program=test_program,
+                                              share_vars_from=train_exe)
+
+            x = numpy.random.random(size=(10, 1)).astype('float32')
+            loss_data, = train_exe.run(feed={"X": x},
+                                       fetch_list=[loss.name])
+
+            loss_data, = test_exe.run(feed={"X": x},
+                                      fetch_list=[loss.name])
+
     Args:
         use_cuda (bool): Whether to use CUDA or not.
         loss_name (str): The loss name must set in training. Default None.
@@ -66,16 +113,6 @@ class ParallelExecutor(object):
 
     Raises:
         TypeError: If share_vars_from is provided, but not ParallelExecutor
            object.
 
-    Examples:
-        .. code-block:: python
-
-          train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
-          test_exe = fluid.ParallelExecutor(use_cuda=True,
-                                            main_program=test_program,
-                                            share_vars_from=train_exe)
-
-          train_loss, = train_exe.run([loss.name], feed=feed_dict)
-          test_loss, = test_exe.run([loss.name], feed=feed_dict)
     """
 
     def __init__(self,
@@ -152,24 +189,58 @@ class ParallelExecutor(object):
            assume the data has been splitted into multiple devices, the each
            element in the list will be copied to each device directly.
-           For example, if the feed is a dict:
-
-           >>> exe = ParallelExecutor()
-           >>> # the image will be splitted into devices. If there is two devices
-           >>> # each device will process an image with shape (24, 1, 28, 28)
-           >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
-
-           For example, if the feed is a list:
+        Examples:
+            .. code-block:: python
 
-           >>> exe = ParallelExecutor()
-           >>> # each device will process each element in the list.
-           >>> # the 1st device will process an image with shape (48, 1, 28, 28)
-           >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
-           >>> #
-           >>> # you can use exe.device_count to get the device number.
-           >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
-           >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
-           >>>              ])
+                import paddle.fluid as fluid
+                import numpy
+                import os
+
+                use_cuda = True
+                place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+                # NOTE: If you use CPU to run the program, you need
+                # to specify the CPU_NUM, otherwise fluid will use
+                # all the logical cores as the CPU_NUM. In that case,
+                # the batch size of the input should be greater than
+                # CPU_NUM; if not, the process will fail with an
+                # exception.
+                if not use_cuda:
+                    os.environ['CPU_NUM'] = str(2)
+
+                exe = fluid.Executor(place)
+
+                train_program = fluid.Program()
+                startup_program = fluid.Program()
+                with fluid.program_guard(train_program, startup_program):
+                    data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                    hidden = fluid.layers.fc(input=data, size=10)
+                    loss = fluid.layers.mean(hidden)
+                    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+
+                startup_program.random_seed=1
+                exe.run(startup_program)
+
+                train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                                   main_program=train_program,
+                                                   loss_name=loss.name)
+
+                # If the feed is a dict:
+                # the input will be split among the devices. If there are two devices,
+                # each device will process an input with shape (5, 1)
+                x = numpy.random.random(size=(10, 1)).astype('float32')
+                loss_data, = train_exe.run(feed={"X": x},
+                                           fetch_list=[loss.name])
+
+                # If the feed is a list:
+                # each device will process each element in the list.
+                # the 1st device will process an input with shape (10, 1)
+                # the 2nd device will process an input with shape (9, 1)
+                #
+                # you can use exe.device_count to get the device number.
+                x2 = numpy.random.random(size=(9, 1)).astype('float32')
+                loss_data, = train_exe.run(feed=[{"X": x}, {"X": x2}],
+                                           fetch_list=[loss.name])
 
         Args:
             fetch_list(list): The fetched variable names
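
For reference (not part of the diff above), the pieces documented in this change compose into one short end-to-end script: define a toy network, run the startup program once, compile the main program with `CompiledProgram.with_data_parallel`, and execute it with `Executor.run`. The sketch below only reuses calls that already appear in the new docstring examples; the toy network, the batch size of 10, and `CPU_NUM=2` are illustrative choices, not API requirements.

    .. code-block:: python

        import os
        import numpy
        import paddle.fluid as fluid
        import paddle.fluid.compiler as compiler

        use_cuda = False
        # On CPU, set CPU_NUM explicitly so the batch size (10) stays larger than it.
        if not use_cuda:
            os.environ['CPU_NUM'] = str(2)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)

        # A toy network: a single fc layer reduced to a scalar loss (illustrative only).
        data = fluid.layers.data(name='X', shape=[1], dtype='float32')
        hidden = fluid.layers.fc(input=data, size=10)
        loss = fluid.layers.mean(hidden)
        fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

        # Run the startup program once, then compile the main program for data-parallel execution.
        exe.run(fluid.default_startup_program())
        compiled_prog = compiler.CompiledProgram(
            fluid.default_main_program()).with_data_parallel(loss_name=loss.name)

        x = numpy.random.random(size=(10, 1)).astype('float32')
        loss_data, = exe.run(compiled_prog,
                             feed={"X": x},
                             fetch_list=[loss.name])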