diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 4779647435411ff838dbad6481d3527887634ddd..d6811aa6e0c3bda832a935a1a6c7bb04308f1c95 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -2,8 +2,6 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-        dynload_cuda)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
 
@@ -11,12 +9,16 @@ cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 
 if(WITH_GPU)
+  nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+          dynload_cuda)
   set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+  nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim dynload_cuda)
 else()
   set(multi_devices_graph_builder_deps)
+  cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim)
 endif()
 
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-           scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
+        scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
 
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
@@ -24,11 +26,10 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS
 
 cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
-cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim)
 
 cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context gather_op_handle)
 cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
-        device_context reduce_op_handle)
+        device_context reduce_op_handle )
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index ecaa83eb7ebfc227d1e563deca8fbea8caee4cc5..c805d15fbbf99381ce84731c12ca2be8b85ecd81 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -13,30 +13,16 @@
 // limitations under the License.
 
#include "paddle/fluid/framework/details/reduce_op_handle.h" -#include "paddle/fluid/framework/details/gather_op_handle.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" -#include "paddle/fluid/platform/nccl_helper.h" namespace paddle { namespace framework { namespace details { -std::vector GetValidVarHandle( - const std::vector &inputs) { - std::vector in_var_handles; - for (auto *in : inputs) { - auto *in_handle = dynamic_cast(in); - if (in_handle) { - in_var_handles.push_back(in_handle); - } - } - return in_var_handles; -} - void ReduceOpHandle::RunImpl() { // the input and output may have dummy var. - std::vector in_var_handles = GetValidVarHandle(inputs_); - std::vector out_var_handles = GetValidVarHandle(outputs_); + std::vector in_var_handles = GetValidVarHandles(inputs_); + std::vector out_var_handles = GetValidVarHandles(outputs_); PADDLE_ENFORCE_EQ( in_var_handles.size(), places_.size(), @@ -45,15 +31,10 @@ void ReduceOpHandle::RunImpl() { "The number of output should be one."); // Wait input done, this Wait is asynchronous operation - if (in_var_handles[0]->generated_op_) { - for (auto *in : in_var_handles) { - auto &in_p = in->place_; - in_var_handles[0]->generated_op_->Wait(dev_ctxes_[in_p]); - } - } + WaitEvents(in_var_handles); // check in the same place - auto in_0_handle = static_cast(in_var_handles[0]); + auto in_0_handle = in_var_handles[0]; auto pre_place = in_0_handle->place_; std::vector in_places; @@ -120,6 +101,7 @@ void ReduceOpHandle::RunImpl() { for (size_t i = 0; i < local_scopes_.size(); ++i) { auto &p = in_places[i]; auto &lod_tensor = lod_tensors[i]; + int dev_id = boost::get(p).device; auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); @@ -139,18 +121,41 @@ void ReduceOpHandle::RunImpl() { }); } - platform::NCCLGroupGuard guard; - for (auto &call : all_reduce_calls) { - call(); - } + this->RunAndRecordEvent([&] { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + }); #else PADDLE_THROW("CUDA is not support."); #endif } else { - PADDLE_THROW("Error"); + PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); } } } + +void ReduceOpHandle::WaitEvents( + const std::vector &in_var_handles) { + if (in_var_handles[0]->generated_op_) { + for (auto *in : in_var_handles) { + in_var_handles[0]->generated_op_->Wait(dev_ctxes_[in->place_]); + } + } +} + +std::vector ReduceOpHandle::GetValidVarHandles( + const std::vector &inputs) { + std::vector in_var_handles; + for (auto *in : inputs) { + auto *in_handle = dynamic_cast(in); + if (in_handle) { + in_var_handles.push_back(in_handle); + } + } + return in_var_handles; +} std::string ReduceOpHandle::Name() const { return "reduce"; } } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 0e91ad20695d7599737f02b9856535326a171808..7b36ce4a7bceaeb93ceb03730b2d54d0f36fed3d 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -23,7 +23,9 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" +#endif namespace paddle { namespace framework { @@ -57,6 +59,10 @@ struct ReduceOpHandle : public OpHandleBase { protected: void RunImpl() override; + std::vector GetValidVarHandles( + const std::vector &inputs); + + void 
 };
 
 }  // namespace details
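
Note: the refactor above hoists two helpers into ReduceOpHandle: GetValidVarHandles, which drops the DummyVarHandle placeholders that exist only to encode execution order, and WaitEvents, which makes the reduce wait on its producers' device contexts. Below is a minimal, self-contained sketch of the dummy-var filtering idiom; VarHandleBase, VarHandle, and DummyVarHandle here are simplified stand-ins, not the real Paddle classes.

```cpp
#include <iostream>
#include <string>
#include <vector>

struct VarHandleBase {
  virtual ~VarHandleBase() = default;  // polymorphic base, so dynamic_cast works
};

// A real variable produced or consumed by an op.
struct VarHandle : VarHandleBase {
  explicit VarHandle(std::string name) : name_(std::move(name)) {}
  std::string name_;
};

// A placeholder edge that only encodes ordering between ops.
struct DummyVarHandle : VarHandleBase {};

// The filtering idiom: dynamic_cast yields nullptr for DummyVarHandle,
// so only genuine VarHandles survive.
std::vector<VarHandle *> GetValidVarHandles(
    const std::vector<VarHandleBase *> &inputs) {
  std::vector<VarHandle *> handles;
  for (auto *in : inputs) {
    if (auto *h = dynamic_cast<VarHandle *>(in)) {
      handles.push_back(h);
    }
  }
  return handles;
}

int main() {
  VarHandle a("grad@gpu0"), b("grad@gpu1");
  DummyVarHandle dep;  // dependency-only edge, skipped by the filter
  std::vector<VarHandleBase *> inputs{&a, &dep, &b};
  for (auto *h : GetValidVarHandles(inputs)) {
    std::cout << h->name_ << "\n";  // prints grad@gpu0, grad@gpu1
  }
  return 0;
}
```

Filtering via dynamic_cast keeps dependency-only edges out of the reduction arithmetic without adding a type tag to the handles; on the GPU path, the diff likewise wraps the NCCL group calls in RunAndRecordEvent so the reduce records a completion event for downstream ops instead of leaving them to block on raw streams.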