diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index e264906b57f006810bb37dba8a411fa34cea0ad8..6c4e0e9168a932dcf80e87ce489a751d0db682b2 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -386,12 +386,16 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
         CreateComputationalOps(&result, node, places_.size());
       }
 
-      // if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) {
-      // insert synchronous ops at the backpropagation; and
-      // insert synchronous ops if the graph contains mutilple places.
+// insert synchronous ops at the backpropagation; and
+// insert synchronous ops if the graph contains multiple places.
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       if (!is_forwarding &&
           (places_.size() > 1 || num_trainers > 1 ||
            (nccl_ctxs_ && nccl_ctxs_->contexts_.size() > 1))) {
+#else
+      if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) {
+#endif
         // Currently, we assume that once gradient is generated, it can be
         // broadcast, and each gradient is only broadcast once.
         if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h
index 517d66974433e1e7ae93bf86c8c17d85d46e7a8f..635483158fc2892d408c419a10ae0358b9ef98de 100644
--- a/paddle/fluid/operators/reader/ctr_reader.h
+++ b/paddle/fluid/operators/reader/ctr_reader.h
@@ -95,7 +95,7 @@ class CTRReader : public framework::FileReader {
     queue_->ReOpen();
     VLOG(3) << "reopen success";
    VLOG(3) << "thread_num " << thread_num_;
-    for (int thread_id = 0; thread_id < thread_num_; thread_id++) {
+    for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) {
       read_threads_.emplace_back(new std::thread(
           std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_,
                     thread_id, &read_thread_status_, queue_)));
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 9cebdda69382e1c865592b95214651eff5f261af..3beb93e7b3ebe83e5b5c82afad280512f043e175 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -789,7 +789,18 @@ All parameter, weight, gradient are variables in Paddle.
           [](ExecutionStrategy &self, ExecutionStrategy::ExecutorType type) {
             self.type_ = type;
           },
-          R"DOC()DOC");
+          R"DOC(The type is ExecutorType, an enum whose values are Default,
+ParallelGraph and Experimental:
+
+Default: Compile the main_program into a multi-devices graph,
+    and execute this graph on multi-devices with multiple threads, whose
+    number is specified by ExecutionStrategy.num_threads.
+ParallelGraph: Compile the main_program into multiple graphs, and execute each of the graphs on one
+    device with one thread. Please note that this mode only supports all-reduce mode and use_cuda=True.
+    This approach can achieve better performance in some scenarios.
+Experimental: Compile the main_program into a multi-devices graph,
+    and execute this graph with a faster execution mode than Default;
+    this approach is still experimental.)DOC");
 
   py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
   BuildStrategy allows the user to more preciously control how to
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
index 18d95c94ad36316b7149eb5412260b40a57ac002..eff76ce0d49df52b0219ba920103a3252e6cc026 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
@@ -17,6 +17,8 @@ import unittest
 import logging
 import six
 
+ExecutorType = fluid.ExecutionStrategy().ExecutorType
+
 
 class TestBase(unittest.TestCase):
     def main(self,
@@ -24,7 +26,7 @@ class TestBase(unittest.TestCase):
              iter=10,
              iter_per_pe=10,
              use_gpu=True,
-             use_experimental_executor=False):
+             exec_type=ExecutorType.Default):
         if use_gpu and not fluid.core.is_compiled_with_cuda():
             logging.warning(
                 "Paddle is not compiled with CUDA, skip GPU unittests")
@@ -43,7 +45,7 @@ class TestBase(unittest.TestCase):
             for _ in six.moves.xrange(iter):
                 exe_strategy = fluid.ExecutionStrategy()
                 exe_strategy._dry_run = True
-                exe_strategy.use_experimental_executor = use_experimental_executor
+                exe_strategy.executor_type = exec_type
                 pe = fluid.ParallelExecutor(
                     use_cuda=use_gpu,
                     loss_name=loss.name,
@@ -56,11 +58,11 @@ class TestBase(unittest.TestCase):
 class TestMNISTDryRun(TestBase):
     def test_mnist_dry_run(self):
         for use_gpu in (False, True):
-            for use_experimental_executor in (False, True):
+            for exec_type in (ExecutorType.Default, ExecutorType.Experimental):
                 self.main(
                     network_func=TestMNISTDryRun.network_func,
                     use_gpu=use_gpu,
-                    use_experimental_executor=use_experimental_executor)
+                    exec_type=exec_type)
 
     @staticmethod
     def network_func():
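
For reference, below is a minimal usage sketch (not part of the patch) of the executor_type knob that this diff exposes, following the pattern in test_parallel_executor_dry_run.py above. The tiny network, optimizer, and place selection are placeholder assumptions added only to make the snippet self-contained; per the new docstring, ParallelGraph additionally requires use_cuda=True.

import paddle.fluid as fluid

ExecutorType = fluid.ExecutionStrategy().ExecutorType

# Placeholder network so the sketch runs on its own.
img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
prediction = fluid.layers.fc(input=img, size=10, act='softmax')
loss = fluid.layers.mean(
    fluid.layers.cross_entropy(input=prediction, label=label))
fluid.optimizer.SGD(learning_rate=0.001).minimize(loss)

use_cuda = fluid.core.is_compiled_with_cuda()
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
fluid.Executor(place).run(fluid.default_startup_program())

# Select the executor type instead of the old use_experimental_executor flag.
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.executor_type = ExecutorType.Experimental  # or Default / ParallelGraph

pe = fluid.ParallelExecutor(
    use_cuda=use_cuda,
    loss_name=loss.name,
    exec_strategy=exec_strategy)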