diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 181f08d028919d3d55821186d777f3a8a636ae3a..6f990e28666829dd2f2fe6f07362188a77ae6468 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -2,8 +2,6 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-    dynload_cuda)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
@@ -11,12 +9,17 @@ cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 
 if(WITH_GPU)
+  nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+      dynload_cuda)
   set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+  nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim dynload_cuda)
 else()
   set(multi_devices_graph_builder_deps)
+  cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim)
 endif()
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-            scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
+        scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
+
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
     simple_threadpool device_context)
@@ -30,3 +33,5 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context gather_op_handle)
+cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+        device_context reduce_op_handle)
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index e587210b357ea6caa3272903d8aa6b3e4b2e8228..28f9139987faa3dfee1e7733fb846a4d4efadc7b 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -13,8 +13,8 @@ // limitations under the License.
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" - #include +#include "paddle/fluid/framework/details/reduce_and_gather.h" namespace paddle { namespace framework { @@ -29,32 +29,6 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( } } -struct ReduceLoDTensor { - const std::vector &src_tensors_; - LoDTensor &dst_tensor_; - - ReduceLoDTensor(const std::vector &src, LoDTensor *dst) - : src_tensors_(src), dst_tensor_(*dst) {} - - template - void operator()() const { - PADDLE_ENFORCE(!src_tensors_.empty()); - auto &t0 = src_tensors_[0]; - PADDLE_ENFORCE_NE(t0.numel(), 0); - dst_tensor_.Resize(t0.dims()); - T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); - std::copy(t0.data(), t0.data() + t0.numel(), dst); - - for (size_t i = 1; i < src_tensors_.size(); ++i) { - auto &t = src_tensors_[i]; - PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); - PADDLE_ENFORCE_EQ(t.type(), t0.type()); - std::transform(t.data(), t.data() + t.numel(), dst, dst, - [](T a, T b) -> T { return a + b; }); - } - } -}; - void NCCLAllReduceOpHandle::RunImpl() { if (inputs_.size() == 1) { return; // No need to all reduce when GPU count = 1; diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h new file mode 100644 index 0000000000000000000000000000000000000000..7957fba8a449f7dc05588fad335df0b45a34b575 --- /dev/null +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -0,0 +1,94 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
new file mode 100644
index 0000000000000000000000000000000000000000..7957fba8a449f7dc05588fad335df0b45a34b575
--- /dev/null
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/selected_rows.h"
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct ReduceLoDTensor {
+  const std::vector<LoDTensor> &src_tensors_;
+  LoDTensor &dst_tensor_;
+
+  ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
+      : src_tensors_(src), dst_tensor_(*dst) {}
+
+  template <typename T>
+  void operator()() const {
+    PADDLE_ENFORCE(!src_tensors_.empty());
+    auto &t0 = src_tensors_[0];
+    PADDLE_ENFORCE_NE(t0.numel(), 0);
+    dst_tensor_.Resize(t0.dims());
+    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
+    std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
+
+    for (size_t i = 1; i < src_tensors_.size(); ++i) {
+      auto &t = src_tensors_[i];
+      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
+      PADDLE_ENFORCE_EQ(t.type(), t0.type());
+      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
+                     [](T a, T b) -> T { return a + b; });
+    }
+  }
+};
+
+inline void GatherSelectedRows(
+    const std::vector<const SelectedRows *> &src_selecte_rows_,
+    const std::vector<platform::Place> &in_places,
+    const std::unordered_map<platform::Place, platform::DeviceContext *,
+                             platform::PlaceHash> &dev_ctxes,
+    const platform::Place &out_place, SelectedRows *dst_selecte_rows) {
+  PADDLE_ENFORCE(!src_selecte_rows_.empty());
+
+  std::vector<Tensor> in_tensors;
+  std::vector<int64_t> out_rows;
+
+  for (auto in_sr_ptr : src_selecte_rows_) {
+    auto &in_sr = *in_sr_ptr;
+    in_tensors.emplace_back(in_sr.value());
+    out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
+  }
+
+  auto &pre_in = src_selecte_rows_[0];
+
+  auto &dst_tensor = *dst_selecte_rows;
+  dst_tensor.set_height(pre_in->height());
+  dst_tensor.set_rows(out_rows);
+  size_t rows = out_rows.size();
+  DDim out_dim = pre_in->GetCompleteDims();
+  out_dim[0] = static_cast<int64_t>(rows);
+  dst_tensor.mutable_value()->Resize(out_dim);
+  dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
+  Tensor *out_tensor = dst_tensor.mutable_value();
+
+  // copy
+  int s = 0, e = 0;
+  for (size_t j = 0; j < in_tensors.size(); ++j) {
+    e += in_tensors[j].dims()[0];
+    auto sub_out = out_tensor->Slice(s, e);
+    paddle::framework::TensorCopy(in_tensors[j], out_place,
+                                  *(dev_ctxes.at(in_places[j])), &sub_out);
+    s = e;
+  }
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
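GatherSelectedRows above concatenates rather than sums: the per-device row indices are appended in input order, and each input's value block is copied into the matching slice of the output, tracked by the running offsets s and e. A hedged, framework-free sketch of the same bookkeeping, with a hypothetical SimpleRows type standing in for SelectedRows:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Stand-in for SelectedRows: a list of logical row indices plus a dense
    // value block of rows.size() x width elements, stored row-major.
    struct SimpleRows {
      std::vector<int64_t> rows;
      std::vector<float> value;
    };

    // Mirror of the offset bookkeeping in GatherSelectedRows: rows are appended
    // in input order, and each input's value block is copied into the output
    // slice [s, e).
    inline SimpleRows GatherRows(const std::vector<const SimpleRows *> &ins,
                                 int64_t width) {
      SimpleRows out;
      for (const SimpleRows *in : ins) {
        out.rows.insert(out.rows.end(), in->rows.begin(), in->rows.end());
      }
      out.value.resize(out.rows.size() * width);
      int64_t s = 0, e = 0;  // running slice offsets, as in the new header
      for (const SimpleRows *in : ins) {
        e += static_cast<int64_t>(in->rows.size());
        std::copy(in->value.begin(), in->value.end(),
                  out.value.begin() + s * width);
        s = e;
      }
      return out;
    }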
+ +#include "paddle/fluid/framework/details/reduce_op_handle.h" +#include "paddle/fluid/framework/details/reduce_and_gather.h" + +namespace paddle { +namespace framework { +namespace details { + +void ReduceOpHandle::RunImpl() { + // the input and output may have dummy var. + std::vector in_var_handles = GetValidVarHandles(inputs_); + std::vector out_var_handles = GetValidVarHandles(outputs_); + + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The number of output should equal to the number of places."); + PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, + "The number of output should be one."); + + // Wait input done, this Wait is asynchronous operation + WaitEvents(in_var_handles); + + // check in the same place + auto in_0_handle = in_var_handles[0]; + auto pre_place = in_0_handle->place_; + + std::vector in_places; + for (auto *in_handle : in_var_handles) { + auto in_p = in_handle->place_; + PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(), + "Places must be all on CPU or all on CUDA."); + in_places.emplace_back(in_p); + } + + auto out_var = local_scopes_[out_var_handles[0]->scope_idx_]->FindVar( + out_var_handles[0]->name_); + + auto pre_in_var = + local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_); + + if (pre_in_var->IsType()) { + auto &pre_in = pre_in_var->Get(); + std::vector in_selected_rows; + + for (auto *in_handle : in_var_handles) { + auto in_var = + local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_); + auto &in_sr = in_var->Get(); + + PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(), + "The type of input is not consistent."); + + in_selected_rows.emplace_back(&in_sr); + } + auto trg = out_var->GetMutable(); + GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, + out_var_handles[0]->place_, trg); + } else { + auto pre_in = pre_in_var->Get(); + std::vector lod_tensors; + + // can be refined + for (auto *in_handle : in_var_handles) { + auto in_var = + local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_); + auto &in_sr = in_var->Get(); + + PADDLE_ENFORCE_EQ(in_sr.type(), pre_in.type(), + "The type of input is not consistent."); + + lod_tensors.emplace_back(in_sr); + } + + auto trg = out_var->GetMutable(); + trg->Resize(pre_in.dims()); + trg->mutable_data(out_var_handles[0]->place_, pre_in.type()); + + if (paddle::platform::is_cpu_place(pre_place)) { + ReduceLoDTensor func(lod_tensors, trg); + VisitDataType(ToDataType(lod_tensors[0].type()), func); + } else if (paddle::platform::is_gpu_place(pre_place)) { +#ifdef PADDLE_WITH_CUDA + auto out_p = out_var_handles[0]->place_; + int root = boost::get(out_p).device; + + std::vector> all_reduce_calls; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = in_places[i]; + auto &lod_tensor = lod_tensors[i]; + + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + + void *buffer = const_cast(lod_tensor.data()); + void *recvbuffer = nullptr; + if (root == dev_id) { + recvbuffer = trg->mutable_data(out_var_handles[0]->place_); + } + + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(platform::dynload::ncclReduce( + buffer, recvbuffer, static_cast(lod_tensor.numel()), + platform::ToNCCLDataType(lod_tensor.type()), ncclSum, root, comm, + stream)); + }); + } + + this->RunAndRecordEvent([&] { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + }); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { + 
PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); + } + } +} + +void ReduceOpHandle::WaitEvents( + const std::vector &in_var_handles) { + if (in_var_handles[0]->generated_op_) { + for (auto *in : in_var_handles) { + in_var_handles[0]->generated_op_->Wait(dev_ctxes_[in->place_]); + } + } +} + +std::vector ReduceOpHandle::GetValidVarHandles( + const std::vector &inputs) { + std::vector in_var_handles; + for (auto *in : inputs) { + auto *in_handle = dynamic_cast(in); + if (in_handle) { + in_var_handles.push_back(in_handle); + } + } + return in_var_handles; +} +std::string ReduceOpHandle::Name() const { return "reduce"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..7b36ce4a7bceaeb93ceb03730b2d54d0f36fed3d --- /dev/null +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +struct ReduceOpHandle : public OpHandleBase { + const std::vector &local_scopes_; + const std::vector &places_; + +#ifdef PADDLE_WITH_CUDA + const platform::NCCLContextMap *nccl_ctxs_; + ReduceOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *nccl_ctxs) + : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) { + if (nccl_ctxs_) { + for (auto &p_ctx : nccl_ctxs_->contexts_) { + dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); + } + } + } +#else + ReduceOpHandle(const std::vector &local_scopes, + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} +#endif + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; + std::vector GetValidVarHandles( + const std::vector &inputs); + + void WaitEvents(const std::vector &in_var_handles); +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed6a1355a3a4c4e5c0b70ef6cb705be0a768280f --- /dev/null +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -0,0 +1,275 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ed6a1355a3a4c4e5c0b70ef6cb705be0a768280f
--- /dev/null
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -0,0 +1,275 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/reduce_op_handle.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+struct TestReduceOpHandle {
+  bool use_gpu_;
+  Scope g_scope_;
+  std::vector<Scope *> local_scopes_;
+  std::unique_ptr<OpHandleBase> op_handle_;
+  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  std::vector<p::Place> gpu_list_;
+  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
+#endif
+
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+#ifdef PADDLE_WITH_CUDA
+    if (nccl_ctxs_) {
+      nccl_ctxs_->WaitAll();
+    }
+#endif
+  }
+
+  void InitCtxOnGpu(bool use_gpu) {
+    use_gpu_ = use_gpu;
+    if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Reduce, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
+#else
+      PADDLE_THROW("CUDA is not supported.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
+#ifdef PADDLE_WITH_CUDA
+      nccl_ctxs_.reset(nullptr);
+#endif
+    }
+  }
+
+  void InitReduceOp(size_t input_scope_idx) {
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      local_scopes_[j]->Var("out");
+    }
+    local_scopes_[input_scope_idx]->Var("input");
+
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      PADDLE_THROW("CUDA is not supported.");
+#endif
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_));
+#endif
+    }
+
+    // add input
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      if (!use_gpu_) {
+        op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      }
+      auto *in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
+      in_var_handle->generated_op_ = nullptr;
+      vars_.emplace_back(in_var_handle);
+      op_handle_->AddInput(in_var_handle);
+    }
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle *in_dummy_var_handle =
+        static_cast<DummyVarHandle *>(vars_.back().get());
+    in_dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(in_dummy_var_handle);
+
+    // add output
+    auto *out_var_handle =
+        new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
+    vars_.emplace_back(out_var_handle);
+    op_handle_->AddOutput(out_var_handle);
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle *dummy_var_handle =
+        static_cast<DummyVarHandle *>(vars_.back().get());
+    op_handle_->AddOutput(dummy_var_handle);
+  }
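In the SelectedRows test that follows, every scope is filled with the same rows and the same value block, and the reduce gathers rather than sums, so the expected output is simply those rows and values repeated once per place; that is why the assertions index with k % rows.size() and j % send_vector.size(). A hedged sketch of the expected layout:

    #include <cstdint>
    #include <vector>

    // Build the result TestReduceSelectedRows expects: the per-device rows and
    // values are repeated once per device (num_devices == gpu_list_.size()).
    inline void ExpectedGather(const std::vector<int64_t> &rows,
                               const std::vector<float> &values,
                               size_t num_devices,
                               std::vector<int64_t> *out_rows,
                               std::vector<float> *out_values) {
      for (size_t d = 0; d < num_devices; ++d) {
        out_rows->insert(out_rows->end(), rows.begin(), rows.end());
        out_values->insert(out_values->end(), values.begin(), values.end());
      }
    }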
+
+  void TestReduceSelectedRows(size_t output_scope_idx) {
+    int height = kDims[0] * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    std::vector<float> send_vector(f::product(kDims));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+
+    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
+         ++input_scope_idx) {
+      auto in_var = local_scopes_[input_scope_idx]->Var("input");
+      auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+      auto value = in_selected_rows->mutable_value();
+      value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+
+      in_selected_rows->set_height(height);
+      in_selected_rows->set_rows(rows);
+
+      paddle::framework::TensorFromVector<float>(
+          send_vector, *(ctxs_[input_scope_idx]), value);
+      value->Resize(kDims);
+    }
+
+    auto out_var = local_scopes_[output_scope_idx]->Var("out");
+    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
+
+    auto in_var = local_scopes_[output_scope_idx]->Var("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+
+    out_selected_rows->mutable_value()->ShareDataWith(
+        in_selected_rows->value());
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+
+    auto &out_select_rows = out_var->Get<f::SelectedRows>();
+    auto rt = out_select_rows.value();
+
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                      "height is not equal.");
+    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
+    }
+
+    f::Tensor result_tensor;
+    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
+    float *ct = result_tensor.data<float>();
+
+    for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
+    }
+  }
+
+  void TestReduceLodTensors(size_t output_scope_idx) {
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+    f::LoD lod{{0, 10, 20}};
+
+    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
+         ++input_scope_idx) {
+      auto in_var = local_scopes_[input_scope_idx]->Var("input");
+      auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
+      in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+      in_lod_tensor->set_lod(lod);
+
+      paddle::framework::TensorFromVector<float>(
+          send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
+    }
+
+    auto out_var = local_scopes_[output_scope_idx]->Var("out");
+    auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
+
+    auto in_var = local_scopes_[output_scope_idx]->Var("input");
+    auto in_lodtensor = in_var->Get<f::LoDTensor>();
+
+    out_lodtensor->ShareDataWith(in_lodtensor);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+
+    auto &rt = out_var->Get<f::LoDTensor>();
+
+    f::Tensor result_tensor;
+    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
+    float *ct = result_tensor.data<float>();
+
+    for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
+    }
+  }
+};
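TestReduceLodTensors above feeds an identical send_vector into every scope and the reduce sums element-wise, so each output element should equal the input value times the number of places, which is exactly what ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5) checks. A hedged helper for the expected tensor:

    #include <cstddef>
    #include <vector>

    // Expected LoDTensor result: an elementwise sum over identical inputs is
    // just the input scaled by the number of participating places.
    inline std::vector<float> ExpectedSum(const std::vector<float> &send_vector,
                                          size_t num_places) {
      std::vector<float> out(send_vector.size());
      for (size_t j = 0; j < out.size(); ++j) {
        out[j] = send_vector[j] * static_cast<float>(num_places);
      }
      return out;
    }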
+
+TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
+  TestReduceOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitReduceOp(input_scope_idx);
+  test_op.TestReduceSelectedRows(input_scope_idx);
+}
+TEST(ReduceTester, TestCPUReduceTestLodTensor) {
+  TestReduceOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitReduceOp(input_scope_idx);
+  test_op.TestReduceLodTensors(input_scope_idx);
+}
+#ifdef PADDLE_WITH_CUDA
+
+TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
+  TestReduceOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitReduceOp(input_scope_idx);
+  test_op.TestReduceSelectedRows(input_scope_idx);
+}
+
+TEST(ReduceTester, TestGPUReduceTestLodTensor) {
+  TestReduceOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitReduceOp(input_scope_idx);
+  test_op.TestReduceLodTensors(input_scope_idx);
+}
+#endif
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle