Polish Executor and Compiler doc (#17262)

* polish doc test=develop * update parallel executor doc test=develop * update API.spec test=develop * polish code test=develop

Polish Executor and Compiler doc (#17262)
* polish doc test=develop * update parallel executor doc test=develop * update API.spec test=develop * polish code test=develop
8f534696 · chengduo · GitHub · dd86b400 · 8f534696 · 8f534696
4 changed files
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -15,12 +15,12 @@ paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=N
 paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912'))
 paddle.fluid.in_dygraph_mode (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f06314a1cb30c96b5808dde2219c2dae'))
 paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
+paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3a584496aa1343f36eebf3c46b323a74'))
 paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '9c7decb955b9c4f718114179c8985581'))
-paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d'))
+paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', '4cfcd9c15b766a51b584cc46d38f1ad8'))
 paddle.fluid.Executor.train_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', 'd521011d79e71080fe9b5bb179b43518'))
-paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0'))
+paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f65788d9ead293ada47551339df12203'))
-paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2'))
+paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'e480208ccc0c9abf084867206dab4d2c'))
 paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680'))
 paddle.fluid.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8'))
@@ -31,7 +31,7 @@ paddle.fluid.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'pr
 paddle.fluid.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ac4114d3df16264f1946deb3a8434a6f'))
 paddle.fluid.DistributeTranspilerConfig.__init__ 
 paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '2cb4bd74481861345c70228a0f57620c'))
+paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '33ce6ec50f8eeb05d340e6b114b026fd'))
 paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', 'b82ea20e2dc5ff2372e0643169ca47ff'))
 paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '74dc6d23185d90a7a50fbac19f5b65fb'))
 paddle.fluid.DataFeedDesc.__init__ (ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -40,7 +40,7 @@ paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], v
 paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'eb894b464bbcd1b4bc8038398954f766'))
 paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', '415c56600ce4e198c071cad01409a690'))
 paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a8c7793803cf976680d9478e378fa356'))
+paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', '0e17773521634ef798fddd7d2ea3ef96'))
 paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8'))
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None

--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -58,19 +58,33 @@ class CompiledProgram(object):
    optimizations, for example.
      * Pre-compute some logic once so that each run is faster.
      * Transform the program so that it can run in multiple devices.
-      * TODO: transform the program for optimized inference or distributed
+      * Transform the program for optimized inference or distributed
-              training.
+        training. **Note that: this part is not finished.**
    Example:
        .. code-block:: python
-            place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+          import paddle.fluid as fluid
+          import paddle.fluid.compiler as compiler
+          import numpy
+          import os
+          place = fluid.CUDAPlace(0) # fluid.CPUPlace()
          exe = fluid.Executor(place)
-            exe.run(startup)
-            compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
+          data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-                loss_name=loss.name)
+          hidden = fluid.layers.fc(input=data, size=10)
-            for i in range(5):
+          loss = fluid.layers.mean(hidden)
-                test_loss, = exe.run(compiled_prog,
+          fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
-                                     feed=feed_dict,
+          fluid.default_startup_program().random_seed=1
+          exe.run(fluid.default_startup_program())
+          compiled_prog = compiler.CompiledProgram(
+                   fluid.default_main_program())
+          x = numpy.random.random(size=(10, 1)).astype('float32')
+          loss_data, = exe.run(compiled_prog,
+                               feed={"X": x},
                               fetch_list=[loss.name])
    Args:
@@ -108,6 +122,44 @@ class CompiledProgram(object):
                           places=None):
        """Configs the program to run in data parallel way.
+        Example:
+            .. code-block:: python
+              import paddle.fluid as fluid
+              import paddle.fluid.compiler as compiler
+              import numpy
+              import os
+              use_cuda = True
+              place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+              # NOTE: If you use CPU to run the program, you need
+              # to specify the CPU_NUM, otherwise, fluid will use
+              # all the number of the logic core as the CPU_NUM,
+              # in that case, the batch size of the input should be
+              # greater than CPU_NUM, if not, the process will be
+              # failed by an exception.
+              if not use_cuda:
+                  os.environ['CPU_NUM'] = str(2)
+              exe = fluid.Executor(place)
+              data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+              hidden = fluid.layers.fc(input=data, size=10)
+              loss = fluid.layers.mean(hidden)
+              fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+              fluid.default_startup_program().random_seed=1
+              exe.run(fluid.default_startup_program())
+              compiled_prog = compiler.CompiledProgram(
+                       fluid.default_main_program()).with_data_parallel(
+                                loss_name=loss.name)
+              x = numpy.random.random(size=(10, 1)).astype('float32')
+              loss_data, = exe.run(compiled_prog,
+                                   feed={"X": x},
+                                   fetch_list=[loss.name])
        Args:
            loss_name (str): The loss name must set in training. Default None.
            build_strategy(BuildStrategy): build_strategy is used to

--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -38,6 +38,15 @@ def global_scope():
    Get the global/default scope instance. A lot of APIs use
    :code:`global_scope` as their default value, e.g., :code:`Executor.run`
+    Examples:
+        .. code-block:: python
+          import paddle.fluid as fluid
+          import numpy
+          fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace())
+          numpy.array(fluid.global_scope().find_var("data").get_tensor())
    Returns:
        Scope: The global/default scope instance.
    """
@@ -58,10 +67,15 @@ def scope_guard(scope):
    variable in runtime will be assigned to the new scope.
    Examples:
-        >>> import paddle.fluid as fluid
+        .. code-block:: python
-        >>> new_scope = fluid.Scope()
-        >>> with fluid.scope_guard(new_scope):
+          import paddle.fluid as fluid
-        >>>     ...
+          import numpy
+          new_scope = fluid.Scope()
+          with fluid.scope_guard(new_scope):
+              fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace())
+          numpy.array(new_scope.find_var("data").get_tensor())
    Args:
        scope: The new global/default scope.
@@ -75,11 +89,18 @@ def as_numpy(tensor):
    """
    Convert a Tensor to a numpy.ndarray, it only supports Tensors without LoD information.
    For higher dimensional sequence data, please use LoDTensor directly.
    Examples:
-        >>> import paddle.fluid as fluid
+        .. code-block:: python
-        >>> outs = executor.run(...)
-        >>> np_outs = map(lambda x: as_numpy(x), outs)
+          import paddle.fluid as fluid
-        >>>     ...
+          import numpy
+          new_scope = fluid.Scope()
+          with fluid.scope_guard(new_scope):
+              fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace())
+          tensor = new_scope.find_var("data").get_tensor()
+          fluid.executor.as_numpy(tensor) # or numpy.array(new_scope.find_var("data").get_tensor())
    Args:
       tensor(Variable): an instance of Tensor
@@ -263,42 +284,70 @@ def _as_lodtensor(data, place):
 class Executor(object):
    """
-    An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
+    An Executor in Python, supports single/multiple-GPU running,
-    Python executor takes a program, adds feed operators and fetch operators to this program according
+    and single/multiple-CPU running. Python executor takes a program,
-    to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
+    adds feed operators and fetch operators to this program according
-    the variables(or names) that user wants to get after program runs. Note: the executor will run all
+    to feed map and fetch_list. Feed map provides input data for the
-    operators in the program but not only the operators dependent by the fetch_list.
+    program. fetch_list provides the variables(or names) that user wants
-    It stores the global variables into the global scope, and creates a local scope for the temporary
+    to get after program runs. Note: the executor will run all operators
-    variables. The contents in local scope may be discarded after every minibatch forward/backward
+    in the program but not only the operators dependent by the fetch_list.
-    finished. But the global scope variables will be persistent through different runs.
+    It stores the global variables into the global scope, and creates a
+    local scope for the temporary variables. The contents in local scope
+    may be discarded after every minibatch forward/backward finished.
-    Example:
+    But the global scope variables will be persistent through different runs.
+    Examples:
        .. code-block:: python
-            # First create the Executor.
+          import paddle.fluid as fluid
+          import paddle.fluid.compiler as compiler
+          import numpy
+          import os
+          use_cuda = True
          place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
          exe = fluid.Executor(place)
+          train_program = fluid.Program()
+          startup_program = fluid.Program()
+          with fluid.program_guard(train_program, startup_program):
+              data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+              hidden = fluid.layers.fc(input=data, size=10)
+              loss = fluid.layers.mean(hidden)
+              fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
          # Run the startup program once and only once.
          # Not need to optimize/compile the startup program.
-            exe.run(fluid.default_startup_program())
+          startup_program.random_seed=1
+          exe.run(startup_program)
          # Run the main program directly without compile.
-            loss, = exe.run(fluid.default_main_program(),
+          x = numpy.random.random(size=(10, 1)).astype('float32')
-                            feed=feed_dict,
+          loss_data, = exe.run(train_program,
+                               feed={"X": x},
                               fetch_list=[loss.name])
-            # Or, compiled the program and run. See `CompiledProgram` for more detail.
+          # Or, compiled the program and run. See `CompiledProgram`
+          # for more detail.
+          # NOTE: If you use CPU to run the program, you need
+          # to specify the CPU_NUM, otherwise, fluid will use
+          # all the number of the logic core as the CPU_NUM,
+          # in that case, the batch size of the input should be
+          # greater than CPU_NUM, if not, the process will be
+          # failed by an exception.
+          if not use_cuda:
+              os.environ['CPU_NUM'] = str(2)
          compiled_prog = compiler.CompiledProgram(
-                fluid.default_main_program()).with_data_parallel(
+              train_program).with_data_parallel(
              loss_name=loss.name)
-            loss, = exe.run(compiled_prog,
+          loss_data, = exe.run(compiled_prog,
-                            feed=feed_dict,
+                               feed={"X": x},
                               fetch_list=[loss.name])
    Args:
-        place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
+        place(fluid.CPUPlace|fluid.CUDAPlace(n)): indicate the executor run on which device.
    """
    def __init__(self, place):
@@ -392,14 +441,18 @@ class Executor(object):
        Close this executor.
        You can no longer use this executor after calling this method.
-        For the distributed training, this method would free the resource on PServers related to
+        For the distributed training, this method would free the resource
-        the current Trainer.
+        on PServers related to the current Trainer.
-        Example:
+        Examples:
-            >>> cpu = core.CPUPlace()
+            .. code-block:: python
-            >>> exe = Executor(cpu)
-            >>> ...
+              import paddle.fluid as fluid
-            >>> exe.close()
+              cpu = fluid.CPUPlace()
+              exe = fluid.Executor(cpu)
+              # execute training or testing
+              exe.close()
        """
        if not self._closed:
            self._default_executor.close()
@@ -490,13 +543,37 @@ class Executor(object):
            return_numpy=True,
            use_program_cache=False):
        """
-        Run program by this Executor. Feed data by feed map, fetch result by fetch_list.
+        Run program by this Executor. Feed data by feed map, fetch result by
-        Python executor takes a program, add feed operators and fetch operators to this program according
+        fetch_list. Python executor takes a program, add feed operators and
-        to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
+        fetch operators to this program according to feed map and fetch_list.
+        Feed map provides input data for the program. fetch_list provides
        the variables(or names) that user want to get after program run.
-        Note: the executor will run all
+        Note: the executor will run all operators in the program but not
-        operators in the program but not only the operators dependent by the fetch_list
+        only the operators dependent by the fetch_list.
+        Examples:
+            .. code-block:: python
+              import paddle.fluid as fluid
+              import numpy
+              # First create the Executor.
+              place = fluid.CPUPlace() # fluid.CUDAPlace(0)
+              exe = fluid.Executor(place)
+              data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+              hidden = fluid.layers.fc(input=data, size=10)
+              loss = fluid.layers.mean(hidden)
+              adam = fluid.optimizer.Adam()
+              adam.minimize(loss)
+              # Run the startup program once and only once.
+              exe.run(fluid.default_startup_program())
+              x = numpy.random.random(size=(10, 1)).astype('float32')
+              outs = exe.run(feed={'X': x},
+                             fetch_list=[loss.name])
        Args:
            program(Program|CompiledProgram): the program that need to run,
@@ -520,26 +597,6 @@ class Executor(object):
        Returns:
            list(numpy.array): fetch result according to fetch_list.
-        Examples:
-            >>> data = fluid.layers.data(name='X', shape=[1], dtype='float32')
-            >>> out = fluid.layers.create_tensor(dtype='float32')
-            >>> hidden = fluid.layers.fc(input=data, size=10)
-            >>> fluid.layers.assign(hidden,out)
-            >>> loss = fluid.layers.mean(out)
-            >>> adam = fluid.optimizer.Adam()
-						>>> adam.minimize(loss)
-            >>> cpu = core.CPUPlace()
-            >>> exe = fluid.Executor(cpu)
-            >>> exe.run(fluid.default_startup_program())
-            >>> x = numpy.random.random(size=(10, 1)).astype('float32')
-            >>> outs = exe.run(
-            >>>     feed={'X': x},
-            >>>     fetch_list=[loss.name])
        """
        if self._closed:

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -37,6 +37,53 @@ class ParallelExecutor(object):
    is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
    of CPUs in the system.
+    Examples:
+        .. code-block:: python
+          import paddle.fluid as fluid
+          import numpy
+          import os
+          use_cuda = True
+          place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+          # NOTE: If you use CPU to run the program, you need
+          # to specify the CPU_NUM, otherwise, fluid will use
+          # all the number of the logic core as the CPU_NUM,
+          # in that case, the batch size of the input should be
+          # greater than CPU_NUM, if not, the process will be
+          # failed by an exception.
+          if not use_cuda:
+              os.environ['CPU_NUM'] = str(2)
+          exe = fluid.Executor(place)
+          train_program = fluid.Program()
+          startup_program = fluid.Program()
+          with fluid.program_guard(train_program, startup_program):
+              data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+              hidden = fluid.layers.fc(input=data, size=10)
+              loss = fluid.layers.mean(hidden)
+              test_program = fluid.default_main_program().clone(for_test=True)
+              fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+          startup_program.random_seed=1
+          exe.run(startup_program)
+          train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                             main_program=train_program,
+                                             loss_name=loss.name)
+          test_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                            main_program=test_program,
+                                            share_vars_from=train_exe)
+          x = numpy.random.random(size=(10, 1)).astype('float32')
+          loss_data, = train_exe.run(feed={"X": x},
+                                     fetch_list=[loss.name])
+          loss_data, = test_exe.run(feed={"X": x},
+                                    fetch_list=[loss.name])
    Args:
        use_cuda (bool): Whether to use CUDA or not.
        loss_name (str): The loss name must set in training. Default None.
@@ -66,16 +113,6 @@ class ParallelExecutor(object):
    Raises:
        TypeError: If share_vars_from is provided, but not ParallelExecutor object.
-    Examples:
-        .. code-block:: python
-          train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
-          test_exe = fluid.ParallelExecutor(use_cuda=True,
-                                            main_program=test_program,
-                                            share_vars_from=train_exe)
-          train_loss, = train_exe.run([loss.name], feed=feed_dict)
-          test_loss, = test_exe.run([loss.name], feed=feed_dict)
    """
    def __init__(self,
@@ -152,24 +189,58 @@ class ParallelExecutor(object):
        assume the data has been splitted into multiple devices, the each
        element in the list will be copied to each device directly.
-        For example, if the feed is a dict:
+        Examples:
+            .. code-block:: python
-        >>> exe = ParallelExecutor()
-        >>> # the image will be splitted into devices. If there is two devices
-        >>> # each device will process an image with shape (24, 1, 28, 28)
-        >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
-        For example, if the feed is a list:
-        >>> exe = ParallelExecutor()
+              import paddle.fluid as fluid
-        >>> # each device will process each element in the list.
+              import numpy
-        >>> # the 1st device will process an image with shape (48, 1, 28, 28)
+              import os
-        >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
-        >>> #
+              use_cuda = True
-        >>> # you can use exe.device_count to get the device number.
+              place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
-        >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
+              # NOTE: If you use CPU to run the program, you need
-        >>>              ])
+              # to specify the CPU_NUM, otherwise, fluid will use
+              # all the number of the logic core as the CPU_NUM,
+              # in that case, the batch size of the input should be
+              # greater than CPU_NUM, if not, the process will be
+              # failed by an exception.
+              if not use_cuda:
+                  os.environ['CPU_NUM'] = str(2)
+              exe = fluid.Executor(place)
+              train_program = fluid.Program()
+              startup_program = fluid.Program()
+              with fluid.program_guard(train_program, startup_program):
+                  data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                  hidden = fluid.layers.fc(input=data, size=10)
+                  loss = fluid.layers.mean(hidden)
+                  fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)
+              startup_program.random_seed=1
+              exe.run(startup_program)
+              train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                                 main_program=train_program,
+                                                 loss_name=loss.name)
+              # If the feed is a dict:
+              # the image will be split across devices. If there are two devices
+              # each device will process an image with shape (5, 1)
+              x = numpy.random.random(size=(10, 1)).astype('float32')
+              loss_data, = train_exe.run(feed={"X": x},
+                                         fetch_list=[loss.name])
+              # If the feed is a list:
+              # each device will process each element in the list.
+              # the 1st device will process an image with shape (10, 1)
+              # the 2nd device will process an image with shape (9, 1)
+              #
+              # you can use exe.device_count to get the device number.
+              x2 = numpy.random.random(size=(9, 1)).astype('float32')
+              loss_data, = train_exe.run(feed=[{"X": x}, {"X": x2}],
+                                         fetch_list=[loss.name])
        Args:
            fetch_list(list): The fetched variable names