Commit 23eb8c42 authored by Yancey1989

fix ci test=develop

Parent 106e2852
@@ -386,12 +386,16 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
CreateComputationalOps(&result, node, places_.size());
}
// if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) {
// insert synchronous ops at the backpropagation; and
// insert synchronous ops if the graph contains multiple places.
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (!is_forwarding &&
(places_.size() > 1 || num_trainers > 1 ||
(nccl_ctxs_ && nccl_ctxs_->contexts_.size() > 1))) {
#else
if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) {
#endif
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
......
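For readers skimming the hunk above: the new #if branch only widens the condition under which synchronization (all-reduce/broadcast) ops are inserted into the backward part of the graph. On CUDA builds, having more than one NCCL context is now enough, even with a single place and a single trainer; otherwise the old places/trainers check still applies. A minimal fluid-1.x style sketch of a setup that reaches this graph pass (the tiny fc network and hyperparameters are illustrative, not from this commit):

    import paddle.fluid as fluid

    # Tiny network so the sketch is self-contained.
    x = fluid.layers.data(name='x', shape=[4], dtype='float32')
    y = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(y)
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

    place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() \
        else fluid.CPUPlace()
    fluid.Executor(place).run(fluid.default_startup_program())

    build_strategy = fluid.BuildStrategy()
    # All-reduce gradient aggregation; with more than one place, trainer, or
    # NCCL context, the pass above inserts synchronization ops right after
    # each gradient is generated in the backward pass.
    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

    pe = fluid.ParallelExecutor(
        use_cuda=fluid.core.is_compiled_with_cuda(),
        loss_name=loss.name,
        build_strategy=build_strategy)

With one GPU, one trainer, and a single NCCL context the condition stays false and no synchronization ops are added, which is exactly the case this commit guards on non-CUDA and Windows builds.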
@@ -95,7 +95,7 @@ class CTRReader : public framework::FileReader {
queue_->ReOpen();
VLOG(3) << "reopen success";
VLOG(3) << "thread_num " << thread_num_;
for (int thread_id = 0; thread_id < thread_num_; thread_id++) {
for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) {
read_threads_.emplace_back(new std::thread(
std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_,
thread_id, &read_thread_status_, queue_)));
......
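The CTRReader hunk only changes the loop index type to match thread_num_, but the surrounding code shows the thread-per-file-group reader pattern: each thread reads its own group of files and feeds a shared blocking queue. A rough, self-contained Python analogue of that pattern (illustrative only, not Paddle's CTRReader API; the file names and batch size are made up):

    import threading
    import queue

    def read_thread(files, batch_size, thread_id, out_queue):
        # Stand-in for ReadThread: parse each file and enqueue batches.
        for f in files:
            out_queue.put((thread_id, f, batch_size))

    file_groups = [['part-0', 'part-2'], ['part-1', 'part-3']]  # hypothetical
    batch_queue = queue.Queue(maxsize=64)

    read_threads = []
    for thread_id in range(len(file_groups)):  # thread_num == len(file_groups)
        t = threading.Thread(
            target=read_thread,
            args=(file_groups[thread_id], 32, thread_id, batch_queue))
        t.start()
        read_threads.append(t)

    for t in read_threads:
        t.join()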
@@ -789,7 +789,18 @@ All parameter, weight, gradient are variables in Paddle.
[](ExecutionStrategy &self, ExecutionStrategy::ExecutorType type) {
self.type_ = type;
},
R"DOC()DOC");
R"DOC(The type is ExecutorType which is the enum ranging from Default,
ParallelGraph and Experiment:
Default: Compile the main_program into a multi-devices graph,
and execute this graph on multi-devices with multiple threads which
specified by build_strategy.num_threads.
ParallelGraph: Compile the main_program into multiple graphs, and execute each of the graphs on one
device with one thread. Please note, this mode only supports all-reduce mode and use_cuda=True.
This approach can achieve better performance in some scenarios.
Experimental: Compile the main_program into a multi-devices graph,
and executor this graph with a faster execution mode than the Default,
this approach is on the experiments.)DOC");
py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
BuildStrategy allows the user to more precisely control how to
......
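The docstring above documents the executor_type property that the updated unit test below exercises. A short usage sketch based on what the test does (anything beyond setting the property is the usual fluid wiring and is assumed here):

    import paddle.fluid as fluid

    ExecutorType = fluid.ExecutionStrategy().ExecutorType

    exec_strategy = fluid.ExecutionStrategy()
    # One of Default / ParallelGraph / Experimental, as described above.
    exec_strategy.executor_type = ExecutorType.Experimental

The strategy is then handed to fluid.ParallelExecutor through its exec_strategy argument, which is exactly how the dry-run test below drives the different executor types.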
@@ -17,6 +17,8 @@ import unittest
import logging
import six
ExecutorType = fluid.ExecutionStrategy().ExecutorType
class TestBase(unittest.TestCase):
def main(self,
@@ -24,7 +26,7 @@ class TestBase(unittest.TestCase):
iter=10,
iter_per_pe=10,
use_gpu=True,
use_experimental_executor=False):
exec_type=ExecutorType.Default):
if use_gpu and not fluid.core.is_compiled_with_cuda():
logging.warning(
"Paddle is not compiled with CUDA, skip GPU unittests")
@@ -43,7 +45,7 @@ class TestBase(unittest.TestCase):
for _ in six.moves.xrange(iter):
exe_strategy = fluid.ExecutionStrategy()
exe_strategy._dry_run = True
exe_strategy.use_experimental_executor = use_experimental_executor
exe_strategy.executor_type = exec_type
pe = fluid.ParallelExecutor(
use_cuda=use_gpu,
loss_name=loss.name,
@@ -56,11 +58,11 @@ class TestBase(unittest.TestCase):
class TestMNISTDryRun(TestBase):
def test_mnist_dry_run(self):
for use_gpu in (False, True):
for use_experimental_executor in (False, True):
for exec_type in (ExecutorType.Default, ExecutorType.Experimental):
self.main(
network_func=TestMNISTDryRun.network_func,
use_gpu=use_gpu,
use_experimental_executor=use_experimental_executor)
exec_type=exec_type)
@staticmethod
def network_func():
......