diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index a4b2ea837fe26eb4b87f49608091497c7e8a8c40..1e6714479d4a6ec3d08acec2c52a857a8cf986cc 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -34,6 +34,10 @@ class CompiledProgram(object):
     """
     Compiles a Program for execution.
 
+    1. Users first create the program with layers.
+    2. Optionally, users use CompiledProgram to optimize the program before running it.
+    3. The original program or CompiledProgram is run by the executor.
+
     The CompiledProgram is used to transform a program for various
     optimizations, for example.
       * Pre-compute some logic once so that each run is faster.
@@ -42,11 +46,19 @@ class CompiledProgram(object):
         training.
 
     Example:
-
+        .. code-block:: python
+          place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+          exe = fluid.Executor(place)
+          exe.run(startup)
+          compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
+              loss_name=loss.name)
+          for i in range(5):
+              test_loss, = exe.run(compiled_prog,
+                                   feed=feed_dict,
+                                   fetch_list=[loss.name])
 
     Args:
         program: Program instance that contains the model logic.
-
     """
 
     def __init__(self, program):
@@ -57,11 +69,32 @@ class CompiledProgram(object):
         self._compiled = False
         self._is_data_parallel = False
 
-    def _with_data_parallel(self,
-                            loss_name=None,
-                            build_strategy=None,
-                            exec_strategy=None,
-                            share_vars_from=None):
+    def with_data_parallel(self,
+                           loss_name=None,
+                           build_strategy=None,
+                           exec_strategy=None,
+                           share_vars_from=None):
+        """Configures the program to run in a data parallel way.
+
+        Args:
+            loss_name (str): The loss name must be set during training. Default None.
+            build_strategy(BuildStrategy): build_strategy is used to
+                build the graph so it can run on multiple devices/cores with
+                optimized topology.
+                For more information, please refer to fluid.BuildStrategy.
+                Default None.
+            exec_strategy(ExecutionStrategy): exec_strategy is used to
+                select the way to execute the graph, for example how many
+                threads are used, how many iterations to clean up the temp
+                variables. For more information, please refer
+                to fluid.ExecutionStrategy. Default None.
+            share_vars_from(CompiledProgram): If provided, this CompiledProgram
+                will share variables from `share_vars_from`. `share_vars_from`
+                must be run by the executor before this CompiledProgram so that
+                vars are ready.
+        Returns:
+            self
+        """
         assert not self._is_data_parallel, "Already compiled with parallel."
         self._is_data_parallel = True
         self._build_strategy = build_strategy
@@ -145,6 +178,16 @@ class CompiledProgram(object):
                 self._exec_strategy, self._build_strategy)
 
     def _compile(self, scope, place):
+        """Compile the program based on the configs.
+
+        Args:
+            scope: The variables (resources) that are associated with
+                this compiled program.
+            place: The location that the compiled program will be run on.
+
+        Returns:
+            self
+        """
         if self._compiled:
             if scope and self._scope != scope:
                 raise ValueError("Cannot compile with different scope")
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 784fe64c4ec682a4f0ebb24912a933c721c6b555..1ba47d5a57665e00f08ffde9ebf3b5b10412c2ee 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -81,7 +81,7 @@ class TestParallelExecutorBase(unittest.TestCase):
         if use_cuda and core.is_compiled_with_cuda():
             build_strategy.remove_unnecessary_lock = True
         if use_parallel_executor:
-            binary = compiler.CompiledProgram(main)._with_data_parallel(
+            binary = compiler.CompiledProgram(main).with_data_parallel(
                 loss_name=loss.name,
                 build_strategy=build_strategy,
                 exec_strategy=exec_strategy)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index aacf52e01126b0ed1c486d191d7adbe6bbf0e803..3fcdc57906c214bdc8179c55b576e2e9e8d80973 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -132,7 +132,7 @@ class TestDistRunnerBase(object):
             build_stra.num_trainers = 1
             build_stra.trainer_id = 0
 
-        binary = compiler.CompiledProgram(trainer_prog)._with_data_parallel(
+        binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
             loss_name=avg_cost.name,
             build_strategy=build_stra,
             exec_strategy=strategy)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index 3cc954a77a902ccbfdb15e45d2750eea3cfa7f6e..d89fd87a38be460c561dbff656cdaa069ffbbd53 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -62,13 +62,12 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
         exe.run(startup)
         feed_dict = {'image': image, 'label': label}
 
-        train_cp = compiler.CompiledProgram(main)._with_data_parallel(
+        train_cp = compiler.CompiledProgram(main).with_data_parallel(
             loss_name=loss.name, build_strategy=build_strategy)
-        test_cp = compiler.CompiledProgram(
-            test_program)._with_data_parallel(
-                loss_name=loss.name,
-                build_strategy=build_strategy,
-                share_vars_from=train_cp)
+        test_cp = compiler.CompiledProgram(test_program).with_data_parallel(
+            loss_name=loss.name,
+            build_strategy=build_strategy,
+            share_vars_from=train_cp)
 
         for i in range(5):
             exe.run(train_cp, feed=feed_dict, fetch_list=[loss.name])
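
For reviewers who want to try the patch, a minimal end-to-end sketch of the renamed public API follows. Only CompiledProgram.with_data_parallel itself comes from this diff; the single-fc network, the variable names, and the random feed data are illustrative assumptions.

    import numpy
    import paddle.fluid as fluid
    import paddle.fluid.compiler as compiler

    # A toy program (not from the patch): one fc layer whose mean output
    # serves as the loss.
    main = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(main, startup):
        data = fluid.layers.data(name='X', shape=[1], dtype='float32')
        hidden = fluid.layers.fc(input=data, size=10)
        loss = fluid.layers.mean(hidden)
        fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)

    # The method is now public: with_data_parallel, no leading underscore.
    compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
        loss_name=loss.name)

    x = numpy.random.random(size=(10, 1)).astype('float32')
    loss_val, = exe.run(compiled_prog, feed={'X': x}, fetch_list=[loss.name])

As the last hunk shows, a second CompiledProgram (for example, one built from a test program) can pass share_vars_from=compiled_prog to reuse the trained variables, provided the source program has been run by the executor first.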