Commit 4a4ccac1 authored by Yancey1989

update by comment test=develop

Parent 4f304eaa
......@@ -107,22 +107,20 @@ void AllReduceOpHandle::RunImpl() {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
ncclSum, comm, stream));
if (!nccl_ctxs_->need_group_call_) cudaStreamSynchronize(stream);
// TODO(Yancey1989): synchronizing here can get better performance
// if not using an NCCL group call, but needs more profiling.
if (local_scopes_.size() == 1UL) cudaStreamSynchronize(stream);
});
}
this->RunAndRecordEvent([&] {
// TODO(Yancey1989): need an allreduce operator to avoid this flag
if (nccl_ctxs_->need_group_call_) {
if (all_reduce_calls.size() == 1UL) {
all_reduce_calls[0]();
} else {
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
call();
}
} else {
// only used when executor_type == ParallelGraph, one thread per GPU
// TODO(Yancey1989): use an allreduce operator to avoid this trick.
PADDLE_ENFORCE(all_reduce_calls.size() == 1UL);
all_reduce_calls[0]();
}
});
......
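The hunk above drops the need_group_call_ flag: when there is only a single all-reduce call (one place driven per thread, as in the ParallelGraph executor) it is launched and synchronized directly, otherwise the calls are batched inside platform::NCCLGroupGuard, which wraps ncclGroupStart/ncclGroupEnd. A minimal sketch of the underlying pattern in plain NCCL/CUDA (not Paddle code; the function name is illustrative and error codes are ignored for brevity):

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

void AllReduceAcrossDevices(const std::vector<ncclComm_t>& comms,
                            const std::vector<cudaStream_t>& streams,
                            const std::vector<float*>& buffers, size_t numel) {
  if (comms.size() == 1) {
    // One device per thread (ParallelGraph-style): launch and sync directly.
    ncclAllReduce(buffers[0], buffers[0], numel, ncclFloat, ncclSum, comms[0],
                  streams[0]);
    cudaStreamSynchronize(streams[0]);
  } else {
    // One thread driving all devices: batch the calls so NCCL can schedule
    // them together instead of deadlocking on each other.
    ncclGroupStart();
    for (size_t i = 0; i < comms.size(); ++i) {
      ncclAllReduce(buffers[i], buffers[i], numel, ncclFloat, ncclSum,
                    comms[i], streams[i]);
    }
    ncclGroupEnd();
    for (auto s : streams) cudaStreamSynchronize(s);
  }
}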
......@@ -386,8 +386,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
CreateComputationalOps(&result, node, places_.size());
}
// insert synchronous ops during backpropagation; and
// insert synchronous ops if the graph contains multiple places.
// insert collective ops during backpropagation; and
// insert collective ops if the graph contains multiple places.
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (!is_forwarding &&
......
......@@ -52,6 +52,7 @@ void OpHandleBase::Run(bool use_cuda) {
#else
PADDLE_ENFORCE(!use_cuda);
#endif
RunImpl();
}
......
......@@ -216,6 +216,7 @@ void ThreadedSSAGraphExecutor::RunOp(
if (LIKELY(!strategy_.dry_run_)) {
op->Run(strategy_.use_cuda_);
}
VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--;
ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << " Signal posted";
......
......@@ -231,7 +231,6 @@ ParallelExecutor::ParallelExecutor(
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
ncclUniqueId *nccl_id = nullptr;
bool need_group_call = true;
if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) {
// parallel graph mode should initialize nccl by ncclCommInitRank since
// it calls the nccl operator per device per thread.
......@@ -243,17 +242,16 @@ ParallelExecutor::ParallelExecutor(
} else {
nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
}
need_group_call = false;
} else if (nccl_id_var != nullptr) { // the other executor type.
// distributed training with nccl mode initializes the nccl id in the
// startup_program.
nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
} else {
// initialize NCCL by ncclCommInitAll, do not need nccl_id.
// initialize NCCL by ncclCommInitAll, no need to initialize the nccl_id.
}
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, num_trainers, trainer_id, need_group_call));
member_->places_, nccl_id, num_trainers, trainer_id));
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
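The branch above selects how the NCCL communicators are created: ParallelGraph mode obtains an ncclUniqueId and initializes one communicator per rank with ncclCommInitRank, distributed training reuses the id created by the startup program, and plain single-process multi-GPU training falls back to ncclCommInitAll, which needs no id at all. A hedged sketch of those two raw NCCL initialization paths (single process assumed; the helper name InitComms is made up for illustration):

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

std::vector<ncclComm_t> InitComms(const std::vector<int>& dev_ids,
                                  const ncclUniqueId* nccl_id) {
  std::vector<ncclComm_t> comms(dev_ids.size());
  if (nccl_id == nullptr) {
    // Single-trainer case: ncclCommInitAll creates one communicator per
    // listed device and never needs a shared unique id.
    ncclCommInitAll(comms.data(), static_cast<int>(dev_ids.size()),
                    dev_ids.data());
  } else {
    // Multi-rank case: every rank joins the clique identified by the shared
    // ncclUniqueId; the calls must be grouped when one process initializes
    // several ranks.
    int nranks = static_cast<int>(dev_ids.size());  // assumption: one trainer
    ncclGroupStart();
    for (int i = 0; i < nranks; ++i) {
      cudaSetDevice(dev_ids[i]);
      ncclCommInitRank(&comms[i], nranks, *nccl_id, /*rank=*/i);
    }
    ncclGroupEnd();
  }
  return comms;
}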
......@@ -288,6 +286,14 @@ ParallelExecutor::ParallelExecutor(
graphs.push_back(std::move(graph));
#endif
auto max_memory_size = GetEagerDeletionThreshold();
// TODO(Yancey1989): fix gc failure on the ParallelGraph executor.
if (max_memory_size >= 0 &&
exec_strategy.type_ != ExecutionStrategy::kParallelGraph) {
graphs[0] = member_->PrepareGCAndRefCnts(
std::move(graphs[0]), static_cast<size_t>(max_memory_size));
}
// Step 3. Create vars in each scope. Passes may also create new vars.
// skip control vars and empty vars
std::vector<details::VariableInfo> var_infos;
......
......@@ -27,6 +27,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
struct ExceptionHandler {
mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
explicit ExceptionHandler(
......
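The only change in this hunk is marking the stored future mutable. The likely reason: std::future::get() is a non-const member function, so a const call operator (or other const method) on the handler could not otherwise consume the result. A self-contained illustration of that rule in plain C++ (not Paddle code):

#include <future>
#include <iostream>

struct Handler {
  mutable std::future<int> future_;  // without `mutable`, operator() below fails to compile

  void operator()() const {
    // get() moves the result out of the shared state; it needs a non-const
    // future, which `mutable` provides inside a const member function.
    std::cout << "result: " << future_.get() << std::endl;
  }
};

int main() {
  Handler h{std::async(std::launch::async, [] { return 42; })};
  h();  // prints "result: 42"
}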
......@@ -46,6 +46,7 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
sin >> num;
place = platform::CUDAPlace(static_cast<int>(num));
}
out->Reset(framework::MakeDecoratedReader<BufferedReader>(underlying_reader,
place, 2));
}
......
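The Reset above rebuilds the decorated reader as a BufferedReader with a buffer size of 2, i.e. classic double buffering: one batch is consumed while the next is prefetched (possibly onto the CUDA place parsed just before). A generic sketch of that shape, independent of Paddle's reader classes (the BoundedQueue name is made up):

#include <condition_variable>
#include <mutex>
#include <queue>
#include <utility>

template <typename T>
class BoundedQueue {
 public:
  explicit BoundedQueue(size_t capacity) : capacity_(capacity) {}

  void Push(T item) {
    std::unique_lock<std::mutex> lock(mu_);
    not_full_.wait(lock, [this] { return queue_.size() < capacity_; });
    queue_.push(std::move(item));
    not_empty_.notify_one();
  }

  T Pop() {
    std::unique_lock<std::mutex> lock(mu_);
    not_empty_.wait(lock, [this] { return !queue_.empty(); });
    T item = std::move(queue_.front());
    queue_.pop();
    not_full_.notify_one();
    return item;
  }

 private:
  const size_t capacity_;
  std::mutex mu_;
  std::condition_variable not_empty_, not_full_;
  std::queue<T> queue_;
};

// Usage: a producer thread Push()es prefetched batches into a
// BoundedQueue<Batch> buffer(2) while the consumer Pop()s them, so reading
// the next batch overlaps with computing on the current one.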
......@@ -82,15 +82,12 @@ struct NCCLContext {
struct NCCLContextMap {
std::unordered_map<int, NCCLContext> contexts_;
std::vector<int> order_;
bool need_group_call_;
explicit NCCLContextMap(const std::vector<platform::Place> &places,
ncclUniqueId *nccl_id = nullptr,
size_t num_trainers = 1, size_t trainer_id = 0,
bool need_group_call = true) {
size_t num_trainers = 1, size_t trainer_id = 0) {
PADDLE_ENFORCE(!places.empty());
order_.reserve(places.size());
need_group_call_ = need_group_call;
for (auto &p : places) {
int dev_id = boost::get<CUDAPlace>(p).device;
order_.emplace_back(dev_id);
......
......@@ -123,7 +123,7 @@ class TestMNIST(TestParallelExecutorBase):
self.check_simple_fc_convergence(False)
def test_simple_fc_with_new_strategy(self):
# use_cuda, use_reducea
# use_cuda, use_reduce
self._compare_reduce_and_allreduce(simple_fc_net, True)
self._compare_reduce_and_allreduce(simple_fc_net, False)
......