Reorganize Code

f28ae6e4 · Yu Yang · 5c333e41 · f28ae6e4 · f28ae6e4 · f28ae6e4
5 changed files
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -87,9 +87,15 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
 framework_proto backward glog lod_rank_table feed_fetch_method)
+# Extra parallel_executor dependencies: empty by default, and the NCCL
+# all-reduce op handle only when WITH_GPU is enabled.
+set(parallel_executor_cuda_deps)
+if(WITH_GPU)
+  set(parallel_executor_cuda_deps nccl_all_reduce_op_handle)
+endif()
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope
        framework_proto backward glog lod_rank_table simple_threadpool scale_loss_grad_op_handle
-        fetch_op_handle)
+        fetch_op_handle ${parallel_executor_cuda_deps})
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -2,3 +2,5 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+# Library target for the NCCL all-reduce op handle (declared via nv_library).
+nv_library(nccl_all_reduce_op_handle
+        SRCS nccl_all_reduce_op_handle.cc
+        DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda)
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Binds the handle to the caller's scopes, places and NCCL context map, then
+// stores in dev_ctx_ the device context that the map reports for each place.
+NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    const platform::NCCLContextMap &ctxs)
+    : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
+  for (size_t idx = 0; idx < places_.size(); ++idx) {
+    const auto &place = places_[idx];
+    dev_ctx_[place] = nccl_ctxs_.DevCtx(place);
+  }
+}
+// Sums (ncclSum) the variable named by the first input across every local
+// scope, issuing one in-place ncclAllReduce per scope/place pair.
+// Returns immediately when there is exactly one input.
+// Raises through PADDLE_ENFORCE when a scope does not hold the variable or
+// when an NCCL call reports an error.
+void NCCLAllReduceOpHandle::RunImpl() {
+  if (inputs_.size() == 1) {
+    return;  // No need to all reduce when GPU count = 1;
+  }
+  // Wait input done
+  for (auto *in : inputs_) {
+    auto &p = static_cast<VarHandle *>(in)->place_;
+    in->generated_op_->Wait(dev_ctx_[p]);
+  }
+  auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
+  // Element type and count are read once and reused for every device: dtype
+  // from the first tensor, numel from the first tensor with a non-zero count.
+  int dtype = -1;
+  size_t numel = 0;
+  // NOTE(review): presumably brackets the calls below in one NCCL group;
+  // confirm against platform/nccl_helper.h.
+  platform::NCCLGroupGuard guard;
+  for (size_t i = 0; i < local_scopes_.size(); ++i) {
+    auto &p = places_[i];
+    auto *s = local_scopes_[i];
+    int dev_id = boost::get<platform::CUDAPlace>(p).device;
+    auto *var = s->FindVar(var_name);
+    // FindVar's result is dereferenced right below, so reject a null pointer
+    // with a readable error instead of crashing.
+    PADDLE_ENFORCE(var != nullptr,
+                   "Variable %s is not found in the scope of device %d",
+                   var_name, dev_id);
+    auto &lod_tensor = var->Get<LoDTensor>();
+    // The same buffer is passed as send and receive buffer, hence the
+    // const_cast on the tensor's data pointer.
+    void *buffer = const_cast<void *>(lod_tensor.data<void>());
+    uintptr_t buf = reinterpret_cast<uintptr_t>(buffer);
+    if (buf % sizeof(float) != 0) {
+      // Only logged; the all-reduce is still issued on this buffer.
+      VLOG(3) << "Buffer is not aligned " << buf;
+    }
+    if (dtype == -1) {
+      dtype = platform::ToNCCLDataType(lod_tensor.type());
+    }
+    if (numel == 0) {
+      numel = static_cast<size_t>(lod_tensor.numel());
+    }
+    auto &nccl_ctx = nccl_ctxs_.at(dev_id);
+    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+        buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
+        nccl_ctx.comm_, nccl_ctx.stream()));
+  }
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Op handle whose RunImpl all-reduces one variable across the local scopes
+// through NCCL.
+// The three data members are references to the constructor arguments, not
+// copies, so the referenced objects must outlive this handle.
+struct NCCLAllReduceOpHandle : public OpHandleBase {
+  NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                        const std::vector<platform::Place> &places,
+                        const platform::NCCLContextMap &ctxs);
+  // Scopes and places are indexed together: entry i of one pairs with
+  // entry i of the other.
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+  // Source of the per-device NCCL communicator, stream and device context.
+  const platform::NCCLContextMap &nccl_ctxs_;
+ protected:
+  // Performs the all-reduce; see nccl_all_reduce_op_handle.cc.
+  void RunImpl() override;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "lod_tensor_array.h"
 #include "op_registry.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
 #include "paddle/fluid/framework/details/var_handle.h"
@@ -28,6 +29,7 @@ namespace framework {
 using details::DummyVarHandle;
 using details::FetchOpHandle;
+using details::NCCLAllReduceOpHandle;
 using details::OpHandleBase;
 using details::ScaleLossGradOpHandle;
 using details::VarHandle;
@@ -123,69 +125,6 @@ class ParallelExecutorPrivate {
    var.place_ = place;
    op_handle->AddOutput(&var);
  }
-};  // namespace framework
-struct NCCLAllReduceOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
-  const platform::NCCLContextMap &nccl_ctxs_;
-  explicit NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
-                                 const std::vector<platform::Place> &places,
-                                 const platform::NCCLContextMap &ctxs)
-      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
-    for (auto &p : places_) {
-      this->dev_ctx_[p] = nccl_ctxs_.DevCtx(p);
-    }
-  }
-  void Wait(platform::DeviceContext *waited_dev) override {
-    OpHandleBase::Wait(waited_dev);
-  }
- protected:
-  void RunImpl() override {
-    if (inputs_.size() == 1) {
-      return;  // No need to all reduce when GPU count = 1;
-    } else {
-      // Wait input done
-      for (auto *in : inputs_) {
-        auto &p = static_cast<VarHandle *>(in)->place_;
-        in->generated_op_->Wait(dev_ctx_[p]);
-      }
-      auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
-      int dtype = -1;
-      size_t numel = 0;
-      platform::NCCLGroupGuard guard;
-      for (size_t i = 0; i < local_scopes_.size(); ++i) {
-        auto &p = places_[i];
-        auto *s = local_scopes_[i];
-        int dev_id = boost::get<platform::CUDAPlace>(p).device;
-        auto &lod_tensor = s->FindVar(var_name)->Get<framework::LoDTensor>();
-        void *buffer = const_cast<void *>(lod_tensor.data<void>());
-        uintptr_t buf = reinterpret_cast<uintptr_t>(buffer);
-        if (buf % sizeof(float) != 0) {
-          VLOG(3) << "Buffer is not aligned " << buf;
-        }
-        if (dtype == -1) {
-          dtype = platform::ToNCCLDataType(lod_tensor.type());
-        }
-        if (numel == 0) {
-          numel = static_cast<size_t>(lod_tensor.numel());
-        }
-        auto &nccl_ctx = nccl_ctxs_.at(dev_id);
-        PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-            buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
-            nccl_ctx.comm_, nccl_ctx.stream()));
-      }
-    }
-  }
 };
 struct ComputationOpHandle : public OpHandleBase {