diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 53e8f9f36653ea5861f6ffb8e4dbb63d82f9ccf1..24115cae819e68c90631295e1f26ee20d1a5dcfb 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -18,45 +18,74 @@ namespace paddle { namespace framework { namespace details { +Tensor *GetTensorFromVar(Variable *in_var) { + if (in_var->IsType()) { + return in_var->GetMutable(); + } else if (in_var->IsType()) { + return in_var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Var should be LoDTensor or SelectedRows"); + } + return nullptr; +} + BroadcastOpHandle::BroadcastOpHandle(const std::vector &local_scopes, const std::vector &places) : local_scopes_(local_scopes), places_(places) {} void BroadcastOpHandle::RunImpl() { - PADDLE_ENFORCE_EQ(this->inputs_.size(), 1, + // the input may have dummy var. + std::vector in_var_handle; + for (auto *in : inputs_) { + auto *out_handle = dynamic_cast(in); + if (out_handle) { + in_var_handle.push_back(out_handle); + } + } + PADDLE_ENFORCE_EQ(in_var_handle.size(), 1, "The number of input should be one."); + + // the output may have dummy var. 
+ std::vector out_var_handles; + for (auto *out : outputs_) { + auto *out_handle = dynamic_cast(out); + if (out_handle) { + out_var_handles.push_back(out_handle); + } + } + PADDLE_ENFORCE_EQ( - this->outputs_.size(), places_.size(), + out_var_handles.size(), places_.size(), "The number of output should equal to the number of places."); // Wait input done, this Wait is asynchronous operation - auto in_var_handle = static_cast(this->inputs_[0]); - auto &in_place = in_var_handle->place_; - if (inputs_[0]->generated_op_) { - inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]); - for (auto *out : outputs_) { - auto out_handle = static_cast(out); - auto &out_p = out_handle->place_; - inputs_[0]->generated_op_->Wait(dev_ctxes_[out_p]); + auto &in_place = in_var_handle[0]->place_; + if (in_var_handle[0]->generated_op_) { + in_var_handle[0]->generated_op_->Wait(dev_ctxes_[in_place]); + for (auto *out : out_var_handles) { + auto &out_p = out->place_; + if (platform::is_same_place(in_place, out_p)) continue; + in_var_handle[0]->generated_op_->Wait(dev_ctxes_[out_p]); } } - auto in_scope_idx = in_var_handle->scope_idx_; + // + auto in_scope_idx = in_var_handle[0]->scope_idx_; PADDLE_ENFORCE_LT(in_scope_idx, local_scopes_.size(), "The input(%s) is not in the local_scopes.", - in_var_handle->name_); - auto in_var = local_scopes_[in_scope_idx]->FindVar(in_var_handle->name_); - + in_var_handle[0]->name_); + auto in_var = local_scopes_[in_scope_idx]->FindVar(in_var_handle[0]->name_); Tensor *in_tensor = GetTensorFromVar(in_var); - for (auto *out : outputs_) { - auto out_handle = static_cast(out); - auto &out_p = out_handle->place_; - auto out_scope_idx = out_handle->scope_idx_; + for (auto *out : out_var_handles) { + auto &out_p = out->place_; + + auto out_scope_idx = out->scope_idx_; PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(), - "%s is not in the local_scopes ", out_handle->name_); + "%s is not in the local_scopes ", out->name_); + auto *s = 
local_scopes_[out_scope_idx]; - auto out_var = s->FindVar(out_handle->name_); + auto out_var = s->FindVar(out->name_); PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(), "The place of input and output should be the same."); @@ -89,7 +118,7 @@ void BroadcastOpHandle::RunImpl() { auto dst_gpu_place = boost::get(out_p); void *dst_ptr = out_tensor->mutable_data(out_p); void *src_ptr = in_tensor->data(); - int64_t size = in_tensor->numel(); + int64_t size = in_tensor->numel() * SizeOfType(in_tensor->type()); memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, reinterpret_cast(dev_ctxes_[out_p]) diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index 9bf72f03602c1533d2500bac9e3ba663a73aa832..dfc52b012f8b6bf5cf1a3feab90dc1ec7842ad6c 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -27,8 +27,20 @@ namespace p = paddle::platform; // test data amount const f::DDim kDims = {20, 20}; -class BroadcastTester : public ::testing::Test { - public: +struct TestBroadcastOpHandle { + std::vector> ctxs_; + std::vector local_scopes_; + Scope g_scope_; + std::unique_ptr op_handle_; + std::vector> vars_; + std::vector gpu_list_; + + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } + } + void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { #ifdef PADDLE_WITH_CUDA @@ -57,61 +69,56 @@ class BroadcastTester : public ::testing::Test { } } - void BroadcastInitOp(int input_scope_idx) { + void InitBroadcastOp(size_t input_scope_idx) { for (size_t j = 0; j < gpu_list_.size(); ++j) { - local_scope_.push_back(&g_scope_.NewScope()); - local_scope_[j]->Var("out"); + local_scopes_.push_back(&(g_scope_.NewScope())); + local_scopes_[j]->Var("out"); } - local_scope_[input_scope_idx]->Var("input"); + local_scopes_[input_scope_idx]->Var("input"); - bc_op_handle_ = new 
f::details::BroadcastOpHandle(local_scope_, gpu_list_); + op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_)); - f::details::VarHandle* in_var_handle = new f::details::VarHandle(); + vars_.emplace_back(new VarHandle()); + VarHandle* in_var_handle = static_cast(vars_.back().get()); in_var_handle->place_ = gpu_list_[input_scope_idx]; in_var_handle->name_ = "input"; in_var_handle->version_ = 1; in_var_handle->scope_idx_ = input_scope_idx; in_var_handle->generated_op_ = nullptr; - bc_op_handle_->AddInput(in_var_handle); + op_handle_->AddInput(in_var_handle); + + // add dummy var + vars_.emplace_back(new DummyVarHandle()); + DummyVarHandle* dummy_var_handle = + static_cast(vars_.back().get()); + dummy_var_handle->generated_op_ = nullptr; + op_handle_->AddInput(dummy_var_handle); for (size_t j = 0; j < gpu_list_.size(); ++j) { - bc_op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j]; - f::details::VarHandle* out_var_handle = new f::details::VarHandle(); + op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get(); + vars_.emplace_back(new VarHandle()); + VarHandle* out_var_handle = static_cast(vars_.back().get()); out_var_handle->place_ = gpu_list_[j]; out_var_handle->name_ = "out"; out_var_handle->version_ = 2; out_var_handle->scope_idx_ = j; - bc_op_handle_->AddOutput(out_var_handle); - } - } - void BroadcastOpDestroy() { - for (auto in : bc_op_handle_->inputs_) { - delete in; - } - for (auto out : bc_op_handle_->outputs_) { - delete out; + op_handle_->AddOutput(out_var_handle); } - delete bc_op_handle_; - for (size_t j = 0; j < ctxs_.size(); ++j) { - delete ctxs_[j]; - } - } - void WaitAll() { - for (size_t j = 0; j < ctxs_.size(); ++j) { - ctxs_[j]->Wait(); - } + // add dummy var + vars_.emplace_back(new DummyVarHandle()); + DummyVarHandle* out_dummy_var_handle = + static_cast(vars_.back().get()); + out_dummy_var_handle->generated_op_ = nullptr; + op_handle_->AddOutput(out_dummy_var_handle); } - void TestBroadcastLodTensor() { - int input_scope_idx = 0; - 
BroadcastInitOp(input_scope_idx); - - auto in_var = local_scope_[input_scope_idx]->Var("input"); + void TestBroadcastLodTensor(size_t input_scope_idx) { + auto in_var = local_scopes_[input_scope_idx]->Var("input"); auto in_lod_tensor = in_var->GetMutable(); in_lod_tensor->mutable_data(kDims, gpu_list_[input_scope_idx]); - std::vector send_vector(f::product(kDims), input_scope_idx + 12); + std::vector send_vector(static_cast(f::product(kDims))); for (size_t k = 0; k < send_vector.size(); ++k) { send_vector[k] = k; } @@ -120,13 +127,13 @@ class BroadcastTester : public ::testing::Test { send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor); in_lod_tensor->set_lod(lod); - bc_op_handle_->Run(false); + op_handle_->Run(false); WaitAll(); p::CPUPlace cpu_place; for (size_t j = 0; j < gpu_list_.size(); ++j) { - auto out_var = local_scope_[j]->Var("out"); + auto out_var = local_scopes_[j]->Var("out"); auto out_tensor = out_var->Get(); PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal."); @@ -134,42 +141,37 @@ class BroadcastTester : public ::testing::Test { f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor); float* ct = result_tensor.mutable_data(cpu_place); - for (int64_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], send_vector[j], 1e-5); + for (int64_t i = 0; i < f::product(kDims); ++i) { + ASSERT_NEAR(ct[i], send_vector[i], 1e-5); } } - - BroadcastOpDestroy(); } - void TestBroadcastSelectedRows() { - int input_scope_idx = 0; - BroadcastInitOp(input_scope_idx); - - auto in_var = local_scope_[input_scope_idx]->Var("input"); + void TestBroadcastSelectedRows(size_t input_scope_idx) { + auto in_var = local_scopes_[input_scope_idx]->Var("input"); auto in_selected_rows = in_var->GetMutable(); auto value = in_selected_rows->mutable_value(); value->mutable_data(kDims, gpu_list_[input_scope_idx]); - int height = kDims[0] * 2; + int height = static_cast(kDims[0]) * 2; std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, 2, 4, 6, 3, 1, 1, 1, 1, 3, 
7}; in_selected_rows->set_height(height); in_selected_rows->set_rows(rows); - std::vector send_vector(f::product(kDims)); + std::vector send_vector(static_cast(f::product(kDims))); for (size_t k = 0; k < send_vector.size(); ++k) { send_vector[k] = k; } paddle::framework::TensorFromVector( send_vector, *(ctxs_[input_scope_idx]), value); - bc_op_handle_->Run(false); + op_handle_->Run(false); WaitAll(); p::CPUPlace cpu_place; for (size_t j = 0; j < gpu_list_.size(); ++j) { - auto out_var = local_scope_[j]->Var("out"); + auto out_var = local_scopes_[j]->Var("out"); auto& out_select_rows = out_var->Get(); auto rt = out_select_rows.value(); @@ -183,41 +185,44 @@ class BroadcastTester : public ::testing::Test { f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor); float* ct = result_tensor.data(); - for (int64_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], send_vector[j], 1e-5); + for (int64_t i = 0; i < f::product(kDims); ++i) { + ASSERT_NEAR(ct[i], send_vector[i], 1e-5); } } - - BroadcastOpDestroy(); } - - public: - f::Scope g_scope_; - std::vector ctxs_; - std::vector local_scope_; - std::vector gpu_list_; - f::details::BroadcastOpHandle* bc_op_handle_; }; -TEST_F(BroadcastTester, TestCPUBroadcastTestLodTensor) { - InitCtxOnGpu(false); - TestBroadcastLodTensor(); +TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) { + TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastLodTensor(input_scope_idx); } -TEST_F(BroadcastTester, TestCPUBroadcastTestSelectedRows) { - InitCtxOnGpu(false); - TestBroadcastSelectedRows(); +TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) { + TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastSelectedRows(input_scope_idx); } #ifdef PADDLE_WITH_CUDA -TEST_F(BroadcastTester, TestGPUBroadcastTestLodTensor) { - 
InitCtxOnGpu(true); - TestBroadcastLodTensor(); +TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) { + TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(true); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastLodTensor(input_scope_idx); } -TEST_F(BroadcastTester, TestGPUBroadcastTestSelectedRows) { - InitCtxOnGpu(true); - TestBroadcastSelectedRows(); +TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) { + TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(true); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastSelectedRows(input_scope_idx); } #endif diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index f9dfb2f5c698751fe85e748b1e5baed342dc5978..3c3054c03d95eaee3f5efdeb3e9863223458bb33 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -23,32 +23,54 @@ GatherOpHandle::GatherOpHandle(const std::vector &local_scopes, : local_scopes_(local_scopes), places_(places) {} void GatherOpHandle::RunImpl() { + // the input may have dummy var. + std::vector in_var_handles; + for (auto *in : inputs_) { + auto *in_handle = dynamic_cast(in); + if (in_handle) { + in_var_handles.push_back(in_handle); + } + } PADDLE_ENFORCE_EQ( - this->inputs_.size(), places_.size(), - "The number of inputs should be equal to the number of place."); - PADDLE_ENFORCE_EQ(this->outputs_.size(), 1, + in_var_handles.size(), places_.size(), + "The number of input should be equal to the number of places."); + + // the output may have dummy var. 
+ std::vector out_var_handles; + for (auto *out : outputs_) { + auto *out_handle = dynamic_cast(out); + if (out_handle) { + out_var_handles.push_back(out_handle); + } + } + PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, "The number of output should be one."); - auto in_0_handle = static_cast(inputs_[0]); + + auto in_0_handle = static_cast(in_var_handles[0]); auto pre_in_var = local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_); + auto pre_place = in_0_handle->place_; + PADDLE_ENFORCE(pre_in_var->IsType(), "Currently, gather_op only can gather SelectedRows."); - auto pre_place = in_0_handle->place_; + + PADDLE_ENFORCE_EQ(out_var_handles[0]->place_.which(), pre_place.which(), + "The place of input and output should be the same."); // Wait input done, this Wait is asynchronous operation - for (auto *in : inputs_) { - if (inputs_[0]->generated_op_) { - auto &p = static_cast(in)->place_; - in->generated_op_->Wait(dev_ctxes_[p]); + for (auto *in : in_var_handles) { + if (in->generated_op_) { + in->generated_op_->Wait(dev_ctxes_[in->place_]); } } std::vector out_rows; - std::vector in_tensors; + std::vector in_tensors; std::vector in_places; + auto &pre_in = pre_in_var->Get(); // gather the inputs - for (auto *in : inputs_) { + for (auto *in : in_var_handles) { auto in_handle = static_cast(in); auto in_p = in_handle->place_; in_places.push_back(in_p); @@ -58,63 +80,46 @@ void GatherOpHandle::RunImpl() { "The place of input should be the same."); auto *s = local_scopes_[in_handle->scope_idx_]; auto in_var = s->FindVar(in_handle->name_); - PADDLE_ENFORCE_EQ(in_var->Type(), pre_in_var->Type(), + + auto &in_sr = in_var->Get(); + + PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(), "The type of input is not consistent."); + PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(), + "The height of inputs is not consistent."); + PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), + "The dims of inputs is not consistent."); - if 
(in_var->IsType()) { - auto &pre_in = pre_in_var->Get(); - auto &in_sr = in_var->Get(); - auto in_sr_rows = in_sr.rows(); - out_rows.insert(out_rows.begin(), in_sr_rows.begin(), in_sr_rows.end()); - PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(), - "The height of inputs is not consistent."); - PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), , - "The dims of inputs is not consistent."); - } else if (in_var->IsType()) { - auto &pre_in = pre_in_var->Get(); - auto &in_lodtensor = in_var->Get(); - PADDLE_ENFORCE_EQ(in_lodtensor.lod(), pre_in.lod(), - "The lod of inputs is not consistent."); - PADDLE_ENFORCE_EQ(in_lodtensor.dims(), pre_in.dims(), - "The dims of inputs is not consistent."); - } else { - PADDLE_THROW("Var should be LoDTensor or SelectedRows."); - } - in_tensors.push_back(GetTensorFromVar(in_var)); - pre_in_var = in_var; + auto in_sr_rows = in_sr.rows(); + out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end()); + + in_tensors.emplace_back(in_sr.value()); } // write the output - auto out_handle = static_cast(this->outputs_[0]); - auto &out_place = out_handle->place_; - auto out_scope_idx = out_handle->scope_idx_; - auto out_var = local_scopes_[out_scope_idx]->FindVar(out_handle->name_); - PADDLE_ENFORCE_EQ(out_place.which(), pre_place.which(), - "The place of input and output should be the same."); - if (pre_in_var->IsType()) { - auto &pre_in = pre_in_var->Get(); - auto out = out_var->GetMutable(); - out->set_height(pre_in.height()); - out->set_rows(out_rows); - size_t rows = out_rows.size(); - DDim out_dim = pre_in.GetCompleteDims(); - out_dim[0] = static_cast(rows); - out->mutable_value()->Resize(out_dim); - out->mutable_value()->mutable_data(out_place, pre_in.value().type()); - auto out_tensor = out->mutable_value(); - // copy - int s = 0, e = 0; - for (size_t j = 0; j < in_tensors.size(); ++j) { - e += in_tensors[j]->dims()[0]; - auto sub_out = out_tensor->Slice(s, e); - 
paddle::framework::TensorCopy(*(in_tensors[j]), out_place, - *(dev_ctxes_[in_places[j]]), &sub_out); - s = e; - } - } else if (pre_in_var->IsType()) { - PADDLE_THROW("Currently, Var only can be SelectedRows."); - } else { - PADDLE_THROW("Var should be SelectedRows."); + auto &out_place = out_var_handles[0]->place_; + auto out_scope_idx = out_var_handles[0]->scope_idx_; + auto out_var = + local_scopes_[out_scope_idx]->FindVar(out_var_handles[0]->name_); + + auto out = out_var->GetMutable(); + out->set_height(pre_in.height()); + out->set_rows(out_rows); + size_t rows = out_rows.size(); + DDim out_dim = pre_in.GetCompleteDims(); + out_dim[0] = static_cast(rows); + out->mutable_value()->Resize(out_dim); + out->mutable_value()->mutable_data(out_place, pre_in.value().type()); + Tensor *out_tensor = out->mutable_value(); + + // copy + int s = 0, e = 0; + for (size_t j = 0; j < in_tensors.size(); ++j) { + e += in_tensors[j].dims()[0]; + auto sub_out = out_tensor->Slice(s, e); + paddle::framework::TensorCopy(in_tensors[j], out_place, + *(dev_ctxes_[in_places[j]]), &sub_out); + s = e; } } diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 3cf21553207eed7784fcaf23286e00c3480440be..10839f239d59e97946575297a6d125968a1458f4 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -26,14 +26,26 @@ namespace p = paddle::platform; // test data amount const f::DDim kDims = {20, 20}; -class GatherTester : public ::testing::Test { - public: +struct TestGatherOpHandle { + std::vector> ctxs_; + std::vector local_scopes_; + Scope g_scope_; + std::unique_ptr op_handle_; + std::vector> vars_; + std::vector gpu_list_; + + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } + } + void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { #ifdef PADDLE_WITH_CUDA int count = p::GetCUDADeviceCount(); if (count <= 
1) { - LOG(WARNING) << "Cannot test multi-gpu Gather, because the CUDA " + LOG(WARNING) << "Cannot test multi-gpu Gather, because the CUDA " "device count is " << count; exit(0); @@ -56,57 +68,51 @@ class GatherTester : public ::testing::Test { } } - void InitGatherOp(int input_scope_idx) { + void InitGatherOp(size_t input_scope_idx) { for (size_t j = 0; j < gpu_list_.size(); ++j) { - local_scope_.push_back(&g_scope_.NewScope()); - local_scope_[j]->Var("input"); + local_scopes_.push_back(&(g_scope_.NewScope())); + local_scopes_[j]->Var("out"); } - local_scope_[input_scope_idx]->Var("out"); - - gather_op_handle_ = new f::details::GatherOpHandle(local_scope_, gpu_list_); - - f::details::VarHandle* out_var_handle = new f::details::VarHandle(); - out_var_handle->place_ = gpu_list_[input_scope_idx]; - out_var_handle->name_ = "out"; - out_var_handle->version_ = 2; - out_var_handle->scope_idx_ = input_scope_idx; - out_var_handle->generated_op_ = gather_op_handle_; - gather_op_handle_->AddOutput(out_var_handle); + local_scopes_[input_scope_idx]->Var("input"); + op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_)); + // add input for (size_t j = 0; j < gpu_list_.size(); ++j) { - gather_op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j]; - f::details::VarHandle* in_var_handle = new f::details::VarHandle(); + op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get(); + vars_.emplace_back(new VarHandle()); + VarHandle* in_var_handle = static_cast(vars_.back().get()); in_var_handle->place_ = gpu_list_[j]; in_var_handle->name_ = "input"; in_var_handle->version_ = 1; in_var_handle->scope_idx_ = j; in_var_handle->generated_op_ = nullptr; - gather_op_handle_->AddInput(in_var_handle); - } - } - void GatherOpDestroy() { - for (auto in : gather_op_handle_->inputs_) { - delete in; - } - for (auto out : gather_op_handle_->outputs_) { - delete out; - } - delete gather_op_handle_; - for (size_t j = 0; j < ctxs_.size(); ++j) { - delete ctxs_[j]; + op_handle_->AddInput(in_var_handle); 
} - } - void WaitAll() { - for (size_t j = 0; j < ctxs_.size(); ++j) { - ctxs_[j]->Wait(); - } - } + // add dummy var + vars_.emplace_back(new DummyVarHandle()); + DummyVarHandle* in_dummy_var_handle = + static_cast(vars_.back().get()); + in_dummy_var_handle->generated_op_ = nullptr; + op_handle_->AddInput(in_dummy_var_handle); + + // add output + vars_.emplace_back(new VarHandle()); + VarHandle* out_var_handle = static_cast(vars_.back().get()); + out_var_handle->place_ = gpu_list_[input_scope_idx]; + out_var_handle->name_ = "out"; + out_var_handle->version_ = 2; + out_var_handle->scope_idx_ = input_scope_idx; + op_handle_->AddOutput(out_var_handle); - void TestGatherSelectedRows() { - int output_scope_idx = 0; - InitGatherOp(output_scope_idx); + // add dummy var + vars_.emplace_back(new DummyVarHandle()); + DummyVarHandle* dummy_var_handle = + static_cast(vars_.back().get()); + op_handle_->AddOutput(dummy_var_handle); + } + void TestGatherSelectedRows(size_t output_scope_idx) { int height = kDims[0] * 2; std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; @@ -117,7 +123,7 @@ class GatherTester : public ::testing::Test { for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); ++input_scope_idx) { - auto in_var = local_scope_[input_scope_idx]->Var("input"); + auto in_var = local_scopes_[input_scope_idx]->Var("input"); auto in_selected_rows = in_var->GetMutable(); auto value = in_selected_rows->mutable_value(); value->mutable_data(kDims, gpu_list_[input_scope_idx]); @@ -130,13 +136,21 @@ class GatherTester : public ::testing::Test { value->Resize(kDims); } - gather_op_handle_->Run(false); + auto out_var = local_scopes_[output_scope_idx]->Var("out"); + auto out_selected_rows = out_var->GetMutable(); + + auto in_var = local_scopes_[output_scope_idx]->Var("input"); + auto in_selected_rows = in_var->GetMutable(); + + out_selected_rows->mutable_value()->ShareDataWith( + in_selected_rows->value()); + + op_handle_->Run(false); 
 WaitAll(); p::CPUPlace cpu_place; - auto out_var = local_scope_[output_scope_idx]->Var("out"); auto& out_select_rows = out_var->Get(); auto rt = out_select_rows.value(); @@ -152,28 +166,25 @@ class GatherTester : public ::testing::Test { for (int64_t j = 0; j < f::product(kDims); ++j) { ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); } - - GatherOpDestroy(); } - - public: - f::Scope g_scope_; - std::vector ctxs_; - std::vector local_scope_; - std::vector gpu_list_; - f::details::GatherOpHandle* gather_op_handle_; }; -TEST_F(GatherTester, TestCPUGatherTestSelectedRows) { - InitCtxOnGpu(false); - TestGatherSelectedRows(); +TEST(GatherTester, TestCPUGatherTestSelectedRows) { + TestGatherOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitGatherOp(input_scope_idx); + test_op.TestGatherSelectedRows(input_scope_idx); } #ifdef PADDLE_WITH_CUDA -TEST_F(GatherTester, TestGPUGatherTestSelectedRows) { - InitCtxOnGpu(true); - TestGatherSelectedRows(); +TEST(GatherTester, TestGPUGatherTestSelectedRows) { + TestGatherOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(true); + test_op.InitGatherOp(input_scope_idx); + test_op.TestGatherSelectedRows(input_scope_idx); } #endif } // namespace details diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 0d7fbdfeab4d6a5c7e5618ee231302822d426b5b..e4194a7442f677ec8970dbc387bb01ebbbf579f1 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -17,21 +17,6 @@ namespace paddle { namespace framework { namespace details { - -// GetTensorFromVar is used in broadcast_op handle and gather_op handle, so it -// should be placed in a commonplace. I don't find an appropriate place, so I -// temporarily place it in op_handle_base. 
-Tensor *GetTensorFromVar(Variable *in_var) { - if (in_var->IsType()) { - return in_var->GetMutable(); - } else if (in_var->IsType()) { - return in_var->GetMutable()->mutable_value(); - } else { - PADDLE_THROW("Var should be LoDTensor or SelectedRows"); - } - return nullptr; -} - std::string OpHandleBase::DebugString() const { std::stringstream ss; ss << "("; diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index b733817dcd8c705f8a62c7b11b4df6a00dc553b1..fbdb54ba8d940c8dedd44a42a85825af5d2ec664 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -17,9 +17,6 @@ #include #include "paddle/fluid/framework/details/var_handle.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/macros.h" @@ -27,11 +24,6 @@ namespace paddle { namespace framework { namespace details { -// GetTensorFromVar is used in broadcast_op handle and gather_op handle, so it -// should be placed in a commonplace. I don't find an appropriate place, so I -// temporarily place it in op_handle. -Tensor *GetTensorFromVar(Variable *in_var); - constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; class OpHandleBase {