Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_vlog

test=develop

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_vlog
test=develop
3db9fad7 · minqiyang · 3da43dca · 387610aa · 3db9fad7 · 3db9fad7
37 changed file
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -174,6 +174,7 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))

--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -37,8 +37,9 @@ struct TestBroadcastOpHandle {
  std::vector<Scope*> local_scopes_;
  std::vector<Scope*> param_scopes_;
  Scope g_scope_;
-  std::unique_ptr<OpHandleBase> op_handle_;
-  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  OpHandleBase* op_handle_;
+  std::vector<VarHandleBase*> vars_;
+  std::vector<std::unique_ptr<ir::Node>> nodes_;
  std::vector<p::Place> place_list_;
  bool use_gpu_;
 #ifdef PADDLE_WITH_CUDA
@@ -90,6 +91,7 @@ struct TestBroadcastOpHandle {
  }

  void InitBroadcastOp(size_t input_scope_idx) {
+    nodes_.clear();
    for (size_t j = 0; j < place_list_.size(); ++j) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
      Scope& local_scope = local_scopes_.back()->NewScope();
@@ -101,39 +103,39 @@ struct TestBroadcastOpHandle {
    }
    param_scopes_[input_scope_idx]->Var("input");

-    std::unique_ptr<ir::Node> n =
-        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation);
+    nodes_.emplace_back(
+        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_,
-                                             place_list_, nccl_ctxs_.get()));
+      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
+                                         place_list_, nccl_ctxs_.get());
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
    } else {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_,
-                                             place_list_, nccl_ctxs_.get()));
+      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
+                                         place_list_, nccl_ctxs_.get());
 #else
-      op_handle_.reset(
-          new BroadcastOpHandle(n.get(), local_scopes_, place_list_));
+      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
+                                         place_list_);
 #endif
    }

-    std::unique_ptr<ir::Node> v =
-        ir::CreateNodeForTest("node1", ir::Node::Type::kVariable);
-    auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
-                                        place_list_[input_scope_idx]);
+    nodes_.emplace_back(
+        ir::CreateNodeForTest("node1", ir::Node::Type::kVariable));
+    auto* in_var_handle = new VarHandle(nodes_.back().get(), 1, input_scope_idx,
+                                        "input", place_list_[input_scope_idx]);
    vars_.emplace_back(in_var_handle);
    op_handle_->AddInput(in_var_handle);

    // add dummy var

-    std::unique_ptr<ir::Node> v2 =
-        ir::CreateNodeForTest("node2", ir::Node::Type::kVariable);
-    vars_.emplace_back(new DummyVarHandle(v2.get()));
+    nodes_.emplace_back(
+        ir::CreateNodeForTest("node2", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(nodes_.back().get()));
    DummyVarHandle* dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back().get());
+        static_cast<DummyVarHandle*>(vars_.back());
    dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddInput(dummy_var_handle);

@@ -141,20 +143,20 @@ struct TestBroadcastOpHandle {
      if (!use_gpu_) {
        op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get());
      }
-      std::unique_ptr<ir::Node> v3 =
-          ir::CreateNodeForTest("node3", ir::Node::Type::kVariable);
+      nodes_.emplace_back(
+          ir::CreateNodeForTest("node3", ir::Node::Type::kVariable));
      VarHandle* out_var_handle =
-          new VarHandle(v3.get(), 2, j, "out", place_list_[j]);
+          new VarHandle(nodes_.back().get(), 2, j, "out", place_list_[j]);
      vars_.emplace_back(out_var_handle);
      op_handle_->AddOutput(out_var_handle);
    }

    // add dummy var
-    std::unique_ptr<ir::Node> v4 =
-        ir::CreateNodeForTest("node4", ir::Node::Type::kVariable);
-    vars_.emplace_back(new DummyVarHandle(v4.get()));
+    nodes_.emplace_back(
+        ir::CreateNodeForTest("node4", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(nodes_.back().get()));
    DummyVarHandle* out_dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back().get());
+        static_cast<DummyVarHandle*>(vars_.back());
    out_dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddOutput(out_dummy_var_handle);
  }

--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -16,6 +16,7 @@
 #include <vector>
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"

 namespace paddle {
 namespace framework {
@@ -32,13 +33,11 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
      pool_(strategy.num_threads_ +
            1),  // add one more thread for generate op_deps
      fetch_ctxs_(places) {
-  auto &ops = graph_->Get<details::GraphOps>("ops");
-
-  for (auto &op : ops) {
+  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
    int dep = static_cast<int>(op->NotReadyInputSize());
-    op_deps_.emplace(op.get(), dep);
+    op_deps_.emplace(op, dep);
    if (dep == 0) {
-      bootstrap_ops_.emplace_back(op.get());
+      bootstrap_ops_.emplace_back(op);
    }
  }

@@ -54,13 +53,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
  paddle::framework::FeedFetchList fetches;
  fetches.resize(fetch_tensors.size());
  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
-  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
+  std::vector<FetchOpHandle *> fetch_ops;

  for (auto &fetch_var_name : fetch_tensors) {
    for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
      auto it = var_map.find(fetch_var_name);
      if (it != var_map.end()) {
-        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
+        fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
      }
    }
  }
@@ -110,7 +109,10 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
          complete_q->Pop();
        }
      }
-      exception_.ReThrow();
+      if (exception_.IsCaught()) {
+        ClearFetchOp(graph_.get(), &fetch_ops);
+        exception_.ReThrow();
+      }
    }
    num_complete += num_comp;
  }

--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -28,11 +28,7 @@ FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
      offset_(offset),
      local_scopes_(local_scopes) {}

-FetchOpHandle::~FetchOpHandle() {
-  for (auto *input_var : inputs_) {
-    input_var->RemoveOutput(this, this->Node());
-  }
-}
+FetchOpHandle::~FetchOpHandle() {}

 void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
  PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");

--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -22,8 +22,10 @@ namespace details {

 struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
  std::vector<std::string> out_varnames_;
+  std::vector<std::unique_ptr<ir::Node>> nodes_;

  void InitFusedBroadcastOp(std::vector<size_t> input_scope_idxes) {
+    nodes_.clear();
    // initialize scope and var
    for (size_t i = 0; i < place_list_.size(); ++i) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
@@ -39,41 +41,41 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
    }

    // create op handle node
-    std::unique_ptr<ir::Node> n =
-        ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation);
+    nodes_.emplace_back(
+        ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(new FusedBroadcastOpHandle(
-          n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
+      op_handle_ = new FusedBroadcastOpHandle(
+          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
      PADDLE_THROW("CUDA is not supported.");
 #endif
    } else {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(new FusedBroadcastOpHandle(
-          n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
+      op_handle_ = new FusedBroadcastOpHandle(
+          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
-      op_handle_.reset(
-          new FusedBroadcastOpHandle(n.get(), local_scopes_, place_list_));
+      op_handle_ = new FusedBroadcastOpHandle(nodes_.back().get(),
+                                              local_scopes_, place_list_);
 #endif
    }

    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
      // add input var handle
-      std::unique_ptr<ir::Node> in_node =
-          ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable);
+      nodes_.emplace_back(
+          ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable));
      VarHandle* in_var_handle =
-          new VarHandle(in_node.get(), 1, input_scope_idxes[i], "in_var" + i,
-                        place_list_[input_scope_idxes[i]]);
+          new VarHandle(nodes_.back().get(), 1, input_scope_idxes[i],
+                        "in_var" + i, place_list_[input_scope_idxes[i]]);
      vars_.emplace_back(in_var_handle);
      op_handle_->AddInput(in_var_handle);

      // add output var handle
      for (size_t j = 0; j < place_list_.size(); ++j) {
-        std::unique_ptr<ir::Node> out_node =
-            ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable);
-        VarHandle* out_var_handle =
-            new VarHandle(out_node.get(), 2, j, "out_var" + i, place_list_[j]);
+        nodes_.emplace_back(
+            ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable));
+        VarHandle* out_var_handle = new VarHandle(
+            nodes_.back().get(), 2, j, "out_var" + i, place_list_[j]);
        vars_.emplace_back(out_var_handle);
        op_handle_->AddOutput(out_var_handle);
      }

--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -31,9 +31,10 @@ struct TestGatherOpHandle {
  std::vector<Scope*> local_scopes_;
  std::vector<Scope*> param_scopes_;
  Scope g_scope_;
-  std::unique_ptr<OpHandleBase> op_handle_;
-  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  OpHandleBase* op_handle_;
+  std::vector<VarHandleBase*> vars_;
  std::vector<p::Place> gpu_list_;
+  std::vector<std::unique_ptr<ir::Node>> nodes_;

  void WaitAll() {
    for (size_t j = 0; j < ctxs_.size(); ++j) {
@@ -70,7 +71,7 @@ struct TestGatherOpHandle {
  }

  void InitGatherOp(size_t input_scope_idx) {
-    std::vector<std::unique_ptr<ir::Node>> nodes;
+    nodes_.clear();
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
      Scope& local_scope = local_scopes_.back()->NewScope();
@@ -82,44 +83,45 @@ struct TestGatherOpHandle {
    }
    param_scopes_[input_scope_idx]->Var("out");

-    nodes.emplace_back(
+    nodes_.emplace_back(
        ir::CreateNodeForTest("node", ir::Node::Type::kOperation).release());
-    op_handle_.reset(
-        new GatherOpHandle(nodes.back().get(), local_scopes_, gpu_list_));
+    op_handle_ =
+        new GatherOpHandle(nodes_.back().get(), local_scopes_, gpu_list_);
    // add input
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
-      nodes.emplace_back(
+      nodes_.emplace_back(
          ir::CreateNodeForTest("node1", ir::Node::Type::kVariable).release());
      auto* in_var_handle =
-          new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]);
+          new VarHandle(nodes_.back().get(), 1, j, "input", gpu_list_[j]);
      vars_.emplace_back(in_var_handle);
      op_handle_->AddInput(in_var_handle);
    }

    // add dummy var
-    nodes.emplace_back(
+    nodes_.emplace_back(
        ir::CreateNodeForTest("node2", ir::Node::Type::kVariable).release());
-    vars_.emplace_back(new DummyVarHandle(nodes.back().get()));
+    vars_.emplace_back(new DummyVarHandle(nodes_.back().get()));
    DummyVarHandle* in_dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back().get());
+        static_cast<DummyVarHandle*>(vars_.back());
    in_dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddInput(in_dummy_var_handle);

    // add output
-    nodes.emplace_back(
+    nodes_.emplace_back(
        ir::CreateNodeForTest("node3", ir::Node::Type::kVariable).release());
-    auto* out_var_handle = new VarHandle(nodes.back().get(), 2, input_scope_idx,
-                                         "out", gpu_list_[input_scope_idx]);
+    auto* out_var_handle =
+        new VarHandle(nodes_.back().get(), 2, input_scope_idx, "out",
+                      gpu_list_[input_scope_idx]);
    vars_.emplace_back(out_var_handle);
    op_handle_->AddOutput(out_var_handle);

    // add dummy var
-    nodes.emplace_back(
+    nodes_.emplace_back(
        ir::CreateNodeForTest("node4", ir::Node::Type::kVariable).release());
-    vars_.emplace_back(new DummyVarHandle(nodes.back().get()));
+    vars_.emplace_back(new DummyVarHandle(nodes_.back().get()));
    DummyVarHandle* dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back().get());
+        static_cast<DummyVarHandle*>(vars_.back());
    op_handle_->AddOutput(dummy_var_handle);
  }


--- a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
+++ b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
@@ -16,6 +16,7 @@
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/op_graph_view.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"

 namespace paddle {
 namespace framework {
@@ -35,10 +36,10 @@ static bool IsLockAndRecordEventFreeComputationOpHandle(

 std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
    std::unique_ptr<ir::Graph> ir_graph) const {
-  auto &all_ops = ir_graph->Get<GraphOps>(kGraphOps);
+  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*ir_graph);
  OpGraphView graph_view(all_ops);
  for (auto &op : all_ops) {
-    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
    if (compute_op == nullptr) continue;
    bool is_lock_and_record_event_free =
        IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view);

--- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include <string>
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"

 namespace paddle {
 namespace framework {
@@ -36,20 +37,20 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
  for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
    for (auto &name_pair : var_map) {
      for (auto &version_pair : name_pair.second) {
-        insert_pending_var(version_pair.get());
+        insert_pending_var(version_pair);
      }
    }
  }

  for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
-    insert_pending_var(var.get());
+    insert_pending_var(var);
  }

-  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
+  for (OpHandleBase *op : ir::FilterByNodeWrapper<OpHandleBase>(*graph)) {
    if (op->Inputs().empty()) {
-      ready_ops.insert(op.get());
+      ready_ops.insert(op);
    } else {
-      pending_ops.insert({op.get(), op.get()->NoDupInputSize()});
+      pending_ops.insert({op, op->NoDupInputSize()});
    }
  }

@@ -89,6 +90,4 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
 REGISTER_PASS(multi_devices_check_pass,
              paddle::framework::details::SSAGraghBuilderWithChecker)
    .RequireGraphAttr(paddle::framework::details::kGraphVars)
-    .RequireGraphAttr(paddle::framework::details::kGraphDepVars)
-    .RequireGraphAttr(paddle::framework::details::kGraphOps)
-    .RequireGraphAttr(paddle::framework::details::kShardedVarDevice);
+    .RequireGraphAttr(paddle::framework::details::kGraphDepVars);
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -34,7 +34,14 @@
 namespace paddle {
 namespace framework {
 namespace details {
+
 namespace {
+// TODO(panyx0718): Clean this up as well.
+// all operators. NOTE that even we use a vector here, the operators is
+// unordered.
+typedef std::vector<OpHandleBase *> GraphOps;
+const char kGraphOps[] = "ops";
+
 void PolishGraphToSupportDataHazards(ir::Graph *graph) {
  for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
    for (auto &name_pair : var_map) {
@@ -92,7 +99,7 @@ VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
    }
    var_holder.emplace_back(var);
  } else {
-    var = var_holder.rbegin()->get();
+    var = *var_holder.rbegin();
  }
  return var;
 }
@@ -154,7 +161,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result,
                                                ir::Node *node,
                                                size_t place_id) const {
  auto p = places_[place_id];
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
  op_handle->SetDeviceContext(p,
                              platform::DeviceContextPool::Instance().Get(p));

@@ -303,7 +310,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
  result.Set(kGraphVars, new GraphVars(places_.size()));
  result.Set(kGraphDepVars, new GraphDepVars);
  result.Set(kGraphOps, new GraphOps);
-  result.Set(kShardedVarDevice, new ShardedVarDevice);

  // find send/recv vars so that we can place the distributed training
  // related op in the place 0
@@ -317,11 +323,13 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
  bool is_forwarding = true;
  bool is_dist_train = false;

+  std::unordered_map<std::string, int> sharded_var_device;
+
  for (ir::Node *node : sorted_ops) {
    if (boost::get<int>(
            node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
        static_cast<int>(OpRole::kRPC)) {
-      int op_dev_id = CreateRPCOp(&result, node);
+      int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device);
      PADDLE_ENFORCE(op_dev_id != -1,
                     "Can not schedule the RPC operator to the right place.");
      if (node->Op()->Type() == "recv") {
@@ -337,7 +345,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
    } else if (boost::get<int>(node->Op()->GetAttr(
                   OpProtoAndCheckerMaker::OpRoleAttrName())) ==
               static_cast<int>(OpRole::kDist)) {
-      int op_dev_id = CreateDistTrainOp(&result, node);
+      int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device);
      if (node->Op()->Type() == "concat") {
        auto origin_param_name = node->Op()->OutputArgumentNames()[0];
        bcast_var_name_set[op_dev_id].emplace(origin_param_name);
@@ -356,12 +364,11 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
      // the block.
      is_forwarding = false;
    } else {
-      int op_dev_id = GetOpDeviceID(result, node);
+      int op_dev_id = GetOpDeviceID(result, node, sharded_var_device);
      if (op_dev_id != -1) {  // This op only runs on one specific device.
        CreateComputationalOp(&result, node, op_dev_id);
        for (ir::Node *n : node->outputs) {
-          graph->Get<ShardedVarDevice>(kShardedVarDevice)
-              .emplace(n->Name(), op_dev_id);
+          sharded_var_device.emplace(n->Name(), op_dev_id);
        }
      } else {
        // This op runs on all devices, and its output may have parameter's
@@ -398,8 +405,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
                  case BuildStrategy::ReduceStrategy::kReduce:
                    cur_device_id = GetAppropriateDeviceID({g_name});
                    CreateReduceOp(&result, g_name, cur_device_id);
-                    graph->Get<ShardedVarDevice>(kShardedVarDevice)
-                        .emplace(g_name, cur_device_id);
+                    sharded_var_device.emplace(g_name, cur_device_id);
                    if (!is_dist_train) {
                      bcast_var_name_set[cur_device_id].emplace(p_name);
                    }
@@ -458,7 +464,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   * Only variables should be the leaves of graph.
   */
  AddOutputToLeafOps(&result);
-  PADDLE_ENFORCE(!ir::HasCircle(result));
+  result.Erase<GraphOps>(kGraphOps);
  return graph;
 }

@@ -498,7 +504,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
  result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);

  auto *in =
-      result->Get<GraphVars>(kGraphVars).at(src_dev_id).at(p_name).back().get();
+      result->Get<GraphVars>(kGraphVars).at(src_dev_id).at(p_name).back();
  op_handle->AddInput(in);

  for (size_t i = 0; i < places_.size(); ++i) {
@@ -535,7 +541,7 @@ void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp(
  for (size_t dev_id = 0; dev_id < bcast_varnames.size(); ++dev_id) {
    for (auto &p_name : bcast_varnames[dev_id]) {
      auto *in =
-          result->Get<GraphVars>(kGraphVars).at(dev_id).at(p_name).back().get();
+          result->Get<GraphVars>(kGraphVars).at(dev_id).at(p_name).back();
      op_handle->AddInput(in);
      for (size_t out_dev_id = 0; out_dev_id < places_.size(); ++out_dev_id) {
        auto &p = places_[out_dev_id];
@@ -571,7 +577,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
      result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
      local_scopes_, places_));
 #endif
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();

  for (size_t i = 0; i < places_.size(); ++i) {
    auto &p = places_[i];
@@ -579,7 +585,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
    auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
    PADDLE_ENFORCE(!vars.empty());
    auto &prev_grad = vars.back();
-    op_handle->AddInput(prev_grad.get());
+    op_handle->AddInput(prev_grad);

    auto var =
        new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
@@ -600,14 +606,14 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
      result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
      local_scopes_, places_));
 #endif
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
  for (size_t i = 0; i < places_.size(); ++i) {
    auto &p = places_[i];
    SetCommunicationContext(op_handle, p);
    for (const std::string &d_name : datas) {
      auto &vars = result->Get<GraphVars>(kGraphVars)[i][d_name];
      PADDLE_ENFORCE(!vars.empty());
-      op_handle->AddInput(vars.back().get());
+      op_handle->AddInput(vars.back());
      auto var = new VarHandle(
          result->CreateEmptyNode(d_name, ir::Node::Type::kVariable),
          vars.size(), i, d_name, p);
@@ -617,8 +623,9 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
  }
 }

-int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
-                                           ir::Node *node) const {
+int MultiDevSSAGraphBuilder::GetOpDeviceID(
+    const ir::Graph &graph, ir::Node *node,
+    const std::unordered_map<std::string, int> &sharded_var_device) const {
  if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
    return -1;
  }
@@ -631,15 +638,15 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
      node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));

  PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
-  int dev_id = GetVarDeviceID(graph, param_grad[1]);
+  int dev_id = GetVarDeviceID(graph, param_grad[1], sharded_var_device);
  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]",
                    node->Op()->Type(), param_grad[0], param_grad[1]);
  return dev_id;
 }

-int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
-                                            const std::string &varname) const {
-  auto &sharded_var_device = graph.Get<ShardedVarDevice>(kShardedVarDevice);
+int MultiDevSSAGraphBuilder::GetVarDeviceID(
+    const ir::Graph &graph, const std::string &varname,
+    const std::unordered_map<std::string, int> &sharded_var_device) const {
  auto got = sharded_var_device.find(varname);
  return got == sharded_var_device.end() ? -1 : got->second;
 }
@@ -690,7 +697,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
      result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
      local_scopes_, places_));
 #endif
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();

  for (size_t i = 0; i < places_.size(); ++i) {
    auto &p = places_[i];
@@ -698,7 +705,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
    auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
    PADDLE_ENFORCE(!vars.empty());
    auto &prev_grad = vars.back();
-    op_handle->AddInput(prev_grad.get());
+    op_handle->AddInput(prev_grad);
  }
  auto &vars = result->Get<GraphVars>(kGraphVars)[dst_dev_id][og];
  auto var =
@@ -709,8 +716,9 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
  return var;
 }

-int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
-                                               ir::Node *node) const {
+int MultiDevSSAGraphBuilder::CreateDistTrainOp(
+    ir::Graph *result, ir::Node *node,
+    std::unordered_map<std::string, int> *sharded_var_device) const {
  int op_dev_id = -1;
  std::vector<std::string> input_var_names;
  std::vector<std::string> output_var_names;
@@ -725,23 +733,22 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
      node->Op()->Type() == "split_selected_rows" ||
      node->Op()->Type() == "split_ids") {
    // TODO(paddle-dev): getting the first var is not safe.
-    op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
+    op_dev_id =
+        GetVarDeviceID(*result, input_var_names[0], *sharded_var_device);
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
      op_dev_id = GetAppropriateDeviceID(input_var_names);
      for (auto &varname : input_var_names) {
-        result->Get<ShardedVarDevice>(kShardedVarDevice)
-            .emplace(varname, op_dev_id);
+        sharded_var_device->emplace(varname, op_dev_id);
      }
    }
    for (auto &varname : output_var_names) {
-      result->Get<ShardedVarDevice>(kShardedVarDevice)
-          .emplace(varname, op_dev_id);
+      sharded_var_device->emplace(varname, op_dev_id);
    }
  } else if (node->Op()->Type() == "concat") {
-    op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
+    op_dev_id =
+        GetVarDeviceID(*result, input_var_names[0], *sharded_var_device);
    for (auto &varname : output_var_names) {
-      result->Get<ShardedVarDevice>(kShardedVarDevice)
-          .emplace(varname, op_dev_id);
+      sharded_var_device->emplace(varname, op_dev_id);
    }
  } else {
    LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type();
@@ -759,14 +766,14 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
 }

 void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
-  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
  for (ir::Node *input : node->inputs) {
    VarHandle *var = nullptr;
    for (int place_offset = 0; place_offset < num_places; ++place_offset) {
      auto &var_holders = result->Get<GraphVars>(kGraphVars)[place_offset];
      auto &var_holder = var_holders[input->Name()];
      if (!var_holder.empty()) {
-        var = var_holder.rbegin()->get();
+        var = *var_holder.rbegin();
        op_handle->AddInput(var);
      }
    }
@@ -774,12 +781,14 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
 }

 // Create RPC related op handles that connects its in ops and out ops.
-int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
-                                         ir::Node *node) const {
+int MultiDevSSAGraphBuilder::CreateRPCOp(
+    ir::Graph *result, ir::Node *node,
+    std::unordered_map<std::string, int> *sharded_var_device) const {
  int op_dev_id = -1;
  if (node->Op()->Type() == "send") {
    // TODO(paddle-dev): getting the first var is not safe.
-    op_dev_id = GetVarDeviceID(*result, node->inputs[0]->Name());
+    op_dev_id =
+        GetVarDeviceID(*result, node->inputs[0]->Name(), *sharded_var_device);
    PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]),
                   "This hack no longer holds, please fix.");
    // the variable name which contains .block means it was splited by
@@ -797,11 +806,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
      VLOG(100) << "send grad " << input_var_names[0] << " origin "
                << send_param_grad[1] << " place: " << op_dev_id;
      for (auto &varname : input_var_names) {
-        result->Get<ShardedVarDevice>(kShardedVarDevice)
-            .emplace(varname, op_dev_id);
+        sharded_var_device->emplace(varname, op_dev_id);
      }
-      result->Get<ShardedVarDevice>(kShardedVarDevice)
-          .emplace(send_param_grad[1], op_dev_id);
+      sharded_var_device->emplace(send_param_grad[1], op_dev_id);
    }
  } else if (node->Op()->Type() == "recv") {
    std::vector<std::string> output_var_names;
@@ -811,7 +818,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
    auto recv_param_grad = boost::get<std::vector<std::string>>(
        node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
    if (recv_param_grad.size() == 2U) {
-      op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]);
+      op_dev_id =
+          GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device);
      VLOG(100) << "recv param " << recv_param_grad[0]
                << " get grad place: " << recv_param_grad[1]
                << " place: " << op_dev_id;
@@ -819,8 +827,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
      op_dev_id = GetAppropriateDeviceID(output_var_names);
    }
    for (auto &varname : output_var_names) {
-      result->Get<ShardedVarDevice>(kShardedVarDevice)
-          .emplace(varname, op_dev_id);
+      sharded_var_device->emplace(varname, op_dev_id);
    }
  } else {
    // send_barrier, fetch_barrier will run on place 0;
@@ -839,7 +846,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
    // send_barrier, recv, fetch_barrier's inputs are deps var, get them from
    // all places
    auto p = places_[op_dev_id];
-    auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
+    auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
    op_handle->SetDeviceContext(p,
                                platform::DeviceContextPool::Instance().Get(p));

@@ -847,7 +854,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
    for (ir::Node *output : node->outputs) {
      int outvar_dev_id = op_dev_id;
      if (node->Op()->Type() == "fetch_barrier") {
-        outvar_dev_id = GetVarDeviceID(*result, output->Name());
+        outvar_dev_id =
+            GetVarDeviceID(*result, output->Name(), *sharded_var_device);
        PADDLE_ENFORCE_NE(outvar_dev_id, -1);
      }
      p = places_[outvar_dev_id];

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -44,12 +44,18 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
  mutable platform::NCCLContextMap *nccl_ctxs_;
 #endif

-  int GetVarDeviceID(const ir::Graph &graph, const std::string &varname) const;
+  int GetVarDeviceID(
+      const ir::Graph &graph, const std::string &varname,
+      const std::unordered_map<std::string, int> &sharded_var_device) const;

  bool IsScaleLossOp(ir::Node *node) const;

-  int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
-  int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
+  int CreateRPCOp(
+      ir::Graph *result, ir::Node *node,
+      std::unordered_map<std::string, int> *sharded_var_device) const;
+  int CreateDistTrainOp(
+      ir::Graph *result, ir::Node *node,
+      std::unordered_map<std::string, int> *sharded_var_device) const;

  std::vector<std::string> FindDistTrainSendVars(
      const std::vector<ir::Node *> &nodes) const;
@@ -69,7 +75,9 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
  void CreateComputationalOp(ir::Graph *result, ir::Node *node,
                             int dev_id) const;

-  int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;
+  int GetOpDeviceID(
+      const ir::Graph &graph, ir::Node *node,
+      const std::unordered_map<std::string, int> &sharded_var_device) const;

  void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;


--- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
 #include <string>
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"

 namespace paddle {
 namespace framework {
@@ -62,7 +63,7 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
  });

  size_t op_id = 0;
-  for (auto &op : graph.Get<GraphOps>(kGraphOps)) {
+  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(graph)) {
    std::string op_name = "op_" + std::to_string(op_id++);
    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
         << std::endl;

--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -35,23 +35,14 @@ namespace details {
 // The outside vector is the device vector. Each element of this vector is a
 // map from variable name to variables. The variables, who have the same name,
 // will have a differsent version. The offset in the
-// `std::vector<std::unique_ptr<VarHandle>>` is the version of varaibles.
-typedef std::vector<
-    std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
+// `std::vector<VarHandle*>` is the version of varaibles.
+typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle*>>>
    GraphVars;
 const char kGraphVars[] = "vars";

 // aux variables to represent dependency. Useful to resolve data hazard.
-typedef std::unordered_set<std::unique_ptr<VarHandleBase>> GraphDepVars;
+typedef std::unordered_set<VarHandleBase*> GraphDepVars;
 const char kGraphDepVars[] = "dep_vars";
-
-// all operators. NOTE that even we use a vector here, the operators is
-// unordered.
-typedef std::vector<std::unique_ptr<OpHandleBase>> GraphOps;
-const char kGraphOps[] = "ops";
-
-typedef std::unordered_map<std::string, int> ShardedVarDevice;
-const char kShardedVarDevice[] = "sharded_var_device";
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/op_graph_view.cc
+++ b/paddle/fluid/framework/details/op_graph_view.cc
@@ -20,19 +20,16 @@ namespace paddle {
 namespace framework {
 namespace details {

-OpGraphView::OpGraphView(
-    const std::vector<std::unique_ptr<OpHandleBase>> &ops) {
-  Build(ops);
-}
+OpGraphView::OpGraphView(const std::vector<OpHandleBase *> &ops) { Build(ops); }

-void OpGraphView::Build(const std::vector<std::unique_ptr<OpHandleBase>> &ops) {
+void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
  for (auto &op : ops) {
-    preceding_ops_[op.get()];
-    pending_ops_[op.get()];
+    preceding_ops_[op];
+    pending_ops_[op];
    for (auto &var : op->Outputs()) {
      for (auto &pending_op : var->PendingOps()) {
-        preceding_ops_[pending_op].insert(op.get());
-        pending_ops_[op.get()].insert(pending_op);
+        preceding_ops_[pending_op].insert(op);
+        pending_ops_[op].insert(pending_op);
      }
    }
  }
@@ -41,8 +38,6 @@ void OpGraphView::Build(const std::vector<std::unique_ptr<OpHandleBase>> &ops) {
      "There are duplicate ops in graph.");
 }

-size_t OpGraphView::OpNumber() const { return preceding_ops_.size(); }
-
 std::unordered_set<OpHandleBase *> OpGraphView::AllOps() const {
  std::unordered_set<OpHandleBase *> ret;
  for (auto &pair : preceding_ops_) {
@@ -60,12 +55,6 @@ void OpGraphView::EnforceHasOp(OpHandleBase *op) const {
                 op == nullptr ? "nullptr" : op->DebugString());
 }

-const std::unordered_set<OpHandleBase *> &OpGraphView::PrecedingOps(
-    OpHandleBase *op) const {
-  EnforceHasOp(op);
-  return preceding_ops_.at(op);
-}
-
 const std::unordered_set<OpHandleBase *> &OpGraphView::PendingOps(
    OpHandleBase *op) const {
  EnforceHasOp(op);

--- a/paddle/fluid/framework/details/op_graph_view.h
+++ b/paddle/fluid/framework/details/op_graph_view.h
@@ -26,21 +26,16 @@ namespace details {

 class OpGraphView {
 public:
-  explicit OpGraphView(const std::vector<std::unique_ptr<OpHandleBase>> &ops);
-
-  size_t OpNumber() const;
+  explicit OpGraphView(const std::vector<OpHandleBase *> &ops);

  std::unordered_set<OpHandleBase *> AllOps() const;

-  const std::unordered_set<OpHandleBase *> &PrecedingOps(
-      OpHandleBase *op) const;
-
  const std::unordered_set<OpHandleBase *> &PendingOps(OpHandleBase *op) const;

  bool HasOp(OpHandleBase *op) const;

 private:
-  void Build(const std::vector<std::unique_ptr<OpHandleBase>> &ops);
+  void Build(const std::vector<OpHandleBase *> &ops);
  void EnforceHasOp(OpHandleBase *op) const;

  std::unordered_map<OpHandleBase *, std::unordered_set<OpHandleBase *>>

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -31,7 +31,10 @@ constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
 // It's responsible for populating necessary fields of ir::Node.
 class OpHandleBase {
 public:
-  explicit OpHandleBase(ir::Node *node) : node_(node) {}
+  // Owned by `node`. No need to be deleted explicitly.
+  explicit OpHandleBase(ir::Node *node) : node_(node) {
+    node_->WrappedBy(this);
+  }

  virtual ~OpHandleBase();


--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -30,8 +30,8 @@ struct TestReduceOpHandle {
  Scope g_scope_;
  std::vector<Scope *> local_scopes_;
  std::vector<Scope *> param_scopes_;
-  std::unique_ptr<OpHandleBase> op_handle_;
-  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  OpHandleBase *op_handle_;
+  std::vector<VarHandleBase *> vars_;
  std::vector<p::Place> gpu_list_;
  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;


--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -19,6 +19,7 @@
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/reference_count_pass.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"

 namespace paddle {
 namespace framework {
@@ -71,14 +72,13 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
  // Step 2: Find all variables in non-computation ops which refers to variables
  // in computation ops
  std::unordered_set<std::string> names;
-  std::unordered_map<OpHandleBase *, std::unique_ptr<ReferenceCountOpHandle>>
+  std::unordered_map<OpHandleBase *, ReferenceCountOpHandle *>
      compute_ref_cnt_map;

  auto get_ref_cnts_from_compute_op = [&](
-      const std::unique_ptr<OpHandleBase> &op,
-      const std::vector<VarHandleBase *> &vars) {
+      OpHandleBase *op, const std::vector<VarHandleBase *> &vars) {
    std::vector<std::string> var_names_in_op;
-    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
    if (compute_op == nullptr ||
        !platform::is_gpu_place(compute_op->GetPlace()))
      return var_names_in_op;
@@ -121,9 +121,8 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
  };

  auto update_ref_cnts_from_non_compute_op = [&](
-      const std::unique_ptr<OpHandleBase> &op,
-      const std::vector<VarHandleBase *> &vars) {
-    if (dynamic_cast<ComputationOpHandle *>(op.get()) != nullptr) return;
+      OpHandleBase *op, const std::vector<VarHandleBase *> &vars) {
+    if (dynamic_cast<ComputationOpHandle *>(op) != nullptr) return;
    for (VarHandleBase *var_handle_base : vars) {
      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;
@@ -151,21 +150,21 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
                ref_cnt_node, next_compute_op->GetScope(), place, {var_name},
                gcs[place.device].get(), cur_ref_cnts[place.device].get());
            AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get());
-            compute_ref_cnt_map[next_compute_op].reset(ref_cnt_handle);
+            compute_ref_cnt_map[next_compute_op] = ref_cnt_handle;
          }
        }
      }
    }
  };

-  auto &all_ops = graph->Get<GraphOps>(kGraphOps);
+  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
  for (auto &op : all_ops) {
    auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs());
    auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs());
    if (in_var_names.empty() && out_var_names.empty()) continue;
    in_var_names.insert(in_var_names.end(), out_var_names.begin(),
                        out_var_names.end());
-    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
    ir::Node *ref_cnt_node =
        graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation);
@@ -173,7 +172,7 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
        ref_cnt_node, compute_op->GetScope(), place, in_var_names,
        gcs[place.device].get(), cur_ref_cnts[place.device].get());
    AddDependencyBetween(compute_op, ref_cnt_handle, graph.get());
-    compute_ref_cnt_map[compute_op].reset(ref_cnt_handle);
+    compute_ref_cnt_map[compute_op] = ref_cnt_handle;
  }

  for (auto &op : all_ops) {
@@ -181,11 +180,11 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
    update_ref_cnts_from_non_compute_op(op, op->Outputs());
  }

-  std::vector<std::unique_ptr<OpHandleBase>> new_all_ops;
+  std::vector<OpHandleBase *> new_all_ops;
  new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size());
  for (auto &op : all_ops) {
    new_all_ops.emplace_back(std::move(op));
-    auto it = compute_ref_cnt_map.find(new_all_ops.back().get());
+    auto it = compute_ref_cnt_map.find(new_all_ops.back());
    if (it != compute_ref_cnt_map.end()) {
      // Add LeafNode to ReferenceCountOpHandle
      auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());

--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -19,14 +19,16 @@ namespace framework {
 namespace details {
 SSAGraphExecutor::~SSAGraphExecutor() {}

-void ClearFetchOp(ir::Graph* graph,
-                  std::vector<std::unique_ptr<FetchOpHandle>>* fetch_ops) {
+void ClearFetchOp(ir::Graph* graph, std::vector<FetchOpHandle*>* fetch_ops) {
  if (fetch_ops->empty()) return;

  for (auto& op : *fetch_ops) {
    for (auto& out_var : op->Node()->outputs) {
      graph->RemoveNode(out_var);
    }
+    for (auto& in_var : op->Inputs()) {
+      in_var->RemoveOutput(op, op->Node());
+    }
    graph->RemoveNode(op->Node());
  }
  fetch_ops->clear();

--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -38,8 +38,7 @@ class SSAGraphExecutor {
  virtual FeedFetchList Run(const std::vector<std::string>& fetch_tensors) = 0;
 };

-void ClearFetchOp(ir::Graph* graph,
-                  std::vector<std::unique_ptr<FetchOpHandle>>* fetch_ops);
+void ClearFetchOp(ir::Graph* graph, std::vector<FetchOpHandle*>* fetch_ops);
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"

 #include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"

 namespace paddle {
@@ -51,25 +52,25 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
    for (auto &name_pair : var_map) {
      for (auto &version_pair : name_pair.second) {
-        InsertPendingVar(&pending_vars, ready_vars.get(), version_pair.get());
+        InsertPendingVar(&pending_vars, ready_vars.get(), version_pair);
      }
    }
  }
  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
-    InsertPendingVar(&pending_vars, ready_vars.get(), var.get());
+    InsertPendingVar(&pending_vars, ready_vars.get(), var);
  }

-  for (auto &op : graph_->Get<details::GraphOps>(details::kGraphOps)) {
+  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
    if (op->Inputs().empty()) {  // Special case, Op has no input.
-      ready_ops.insert(op.get());
+      ready_ops.insert(op);
    } else {
-      InsertPendingOp(&pending_ops, op.get());
+      InsertPendingOp(&pending_ops, op);
    }
  }

  // Step 2. Insert FetchOps
-  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
-  std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
+  std::vector<FetchOpHandle *> fetch_ops;
+  std::unordered_set<VarHandleBase *> fetch_dependencies;
  FeedFetchList fetch_data(fetch_tensors.size());

  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
@@ -109,6 +110,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
        for (auto &run_op_future : run_op_futures_) {
          run_op_future.wait();
        }
+        ClearFetchOp(graph_.get(), &fetch_ops);
        exception_holder_.ReThrow();
      } else {
        continue;
@@ -140,8 +142,8 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(

 void ThreadedSSAGraphExecutor::InsertFetchOps(
    const std::vector<std::string> &fetch_tensors,
-    std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
-    std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
+    std::vector<FetchOpHandle *> *fetch_ops,
+    std::unordered_set<VarHandleBase *> *fetch_dependencies,
    std::unordered_map<OpHandleBase *, size_t> *pending_ops,
    std::unordered_set<VarHandleBase *> *pending_vars,
    BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data) {
@@ -151,7 +153,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
    for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
      auto it = var_map.find(fetch_var_name);
      if (it != var_map.end()) {
-        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
+        fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
      }
    }
  }

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -70,13 +70,13 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
                        BlockingQueue<VarHandleBase *> *ready_vars,
                        VarHandleBase *var) const;

-  void InsertFetchOps(
-      const std::vector<std::string> &fetch_tensors,
-      std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
-      std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
-      std::unordered_map<OpHandleBase *, size_t> *pending_ops,
-      std::unordered_set<VarHandleBase *> *pending_vars,
-      BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data);
+  void InsertFetchOps(const std::vector<std::string> &fetch_tensors,
+                      std::vector<FetchOpHandle *> *fetch_ops,
+                      std::unordered_set<VarHandleBase *> *fetch_dependencies,
+                      std::unordered_map<OpHandleBase *, size_t> *pending_ops,
+                      std::unordered_set<VarHandleBase *> *pending_vars,
+                      BlockingQueue<VarHandleBase *> *ready_vars,
+                      FeedFetchList *fetch_data);

 private:
  ExecutionStrategy strategy_;

--- a/paddle/fluid/framework/details/var_handle.cc
+++ b/paddle/fluid/framework/details/var_handle.cc
@@ -20,6 +20,8 @@ namespace details {

 VarHandleBase::~VarHandleBase() {}

+VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); }
+
 std::string VarHandle::DebugString() const {
  std::stringstream ss;
  ss << name_ << ":" << place_;
@@ -27,6 +29,10 @@ std::string VarHandle::DebugString() const {
 }

 std::string DummyVarHandle::DebugString() const { return node_->Name(); }
+
+DummyVarHandle::~DummyVarHandle() {
+  VLOG(4) << "deleting dummy var handle " << DebugString();
+}
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -35,7 +35,10 @@ class OpHandleBase;
 // A variable can only be generated by a single operator. i.e.
 // This is a single assignment graph.
 struct VarHandleBase {
-  explicit VarHandleBase(ir::Node* node) : node_(node) {}
+  // Owned by `node`. No need to be deleted explicitly.
+  explicit VarHandleBase(ir::Node* node) : node_(node) {
+    node_->WrappedBy(this);
+  }

  virtual ~VarHandleBase();

@@ -94,6 +97,8 @@ struct VarHandleBase {
 struct VarHandle : public VarHandleBase {
  explicit VarHandle(ir::Node* node) : VarHandleBase(node) {}

+  virtual ~VarHandle();
+
  std::string DebugString() const override;

  VarHandle(ir::Node* node, size_t version, size_t scope_index,
@@ -121,6 +126,8 @@ struct VarHandle : public VarHandleBase {
 struct DummyVarHandle : public VarHandleBase {
  explicit DummyVarHandle(ir::Node* node) : VarHandleBase(node) {}

+  virtual ~DummyVarHandle();
+
  std::string DebugString() const override;
 };


--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -53,6 +53,7 @@ set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")

 cc_library(pass_builder SRCS pass_builder.cc DEPS pass)

+cc_test(node_test SRCS node_test.cc DEPS node)
 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)

--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -102,6 +102,15 @@ class Graph {
    attr_dels_[attr_name] = []() {};
  }

+  template <typename AttrType>
+  void Erase(const std::string &attr_name) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph",
+                   attr_name);
+    attr_dels_[attr_name]();
+    attrs_.erase(attr_name);
+    attr_dels_.erase(attr_name);
+  }
+
  const std::unordered_set<ir::Node *> &Nodes() const { return node_set_; }

  // Create a normal variable with non-null VarDesc.

--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -37,6 +37,15 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
 std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
    const Graph &graph);

+template <typename T>
+std::vector<T *> FilterByNodeWrapper(const Graph &graph) {
+  std::vector<T *> ret;
+  for (ir::Node *n : graph.Nodes()) {
+    if (n->IsWrappedBy<T>()) ret.push_back(&n->Wrapper<T>());
+  }
+  return ret;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -15,7 +15,10 @@ limitations under the License. */
 #pragma once

 #include <string>
+#include <typeindex>
+#include <typeinfo>
 #include <vector>
+
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/macros.h"
@@ -24,9 +27,33 @@ namespace paddle {
 namespace framework {
 namespace ir {

-// Node should normally created by Graph::CreateXXXNode().
+// Node should only created by Graph::CreateXXXNode().
+// 1. Every Node should be part of a graph. No dangling Node exists.
+// 2. Node only contains members necessary for building graph structure.
+//    It doesn't contain other unrelated members, such as device, etc.
+//
+// Sometimes, for specific usages, Node needs to have additional members,
+// such as device_placement, version in order to be executed. It is suggested
+// to use composition pattern.
+//
+// class RunnableOp {
+//    RunnableOp(ir::Node* n) : n_(n) { n_.WrappedBy(this); }
+//
+//    int any_thing_;
+// }
+//
+// RunnableOp is owned by the ir::Node that composes it. In other words.
+// ir::Node will be responsible for deleting RunnableOp, say, when ir::Node
+// is deleted from the graph.
 class Node {
 public:
+  virtual ~Node() {
+    if (!wrapper_.empty()) {
+      VLOG(4) << "ir::Node deleting a wrapper node " << Name();
+      wrapper_deleter_();
+    }
+  }
+
  enum class Type { kOperation, kVariable };
 #if !defined(_WIN32)  // msvc not support constexpr correctly.
  static constexpr char kControlDepVarName[] = "__control_var";
@@ -48,6 +75,29 @@ class Node {
    return op_desc_.get();
  }

+  // Set the `wrapper` that wraps the Node. `wrapper` is owned by Node.
+  template <typename T>
+  void WrappedBy(T* wrapper) {
+    if (!wrapper_.empty()) {
+      wrapper_deleter_();
+    }
+    wrapper_ = wrapper;
+    wrapper_deleter_ = [wrapper]() { delete wrapper; };
+    wrapper_type_ = std::type_index(typeid(T));
+  }
+
+  // Return a reference to the `wrapper`.
+  template <typename T>
+  T& Wrapper() {
+    return *boost::any_cast<T*>(wrapper_);
+  }
+
+  // Test if the Node is wrapped by type T.
+  template <typename T>
+  bool IsWrappedBy() {
+    return std::type_index(typeid(T)) == wrapper_type_;
+  }
+
  // Please don't use this API!
  int id() const { return id_; }

@@ -99,6 +149,11 @@ class Node {
  static int count_;
  // Please don't use this API or make this public.
  static void ResetId() { count_ = 0; }
+
+  boost::any wrapper_;
+  std::function<void(void)> wrapper_deleter_;
+  std::type_index wrapper_type_ = std::type_index(typeid(void));
+
  DISABLE_COPY_AND_ASSIGN(Node);
 };


--- a/paddle/fluid/framework/ir/node_test.cc
+++ b/paddle/fluid/framework/ir/node_test.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class RunnableOp {
+ public:
+  RunnableOp(Node* node, bool* alive) : node_(node), alive_(alive) {
+    node_->WrappedBy(this);
+  }
+
+  virtual ~RunnableOp() { *alive_ = false; }
+
+ private:
+  Node* node_;
+  bool* alive_;
+};
+
+class RunnableOp2 {
+ public:
+  RunnableOp2(Node* node, bool* alive) : node_(node), alive_(alive) {
+    node_->WrappedBy(this);
+  }
+
+  virtual ~RunnableOp2() { *alive_ = false; }
+
+ private:
+  Node* node_;
+  bool* alive_;
+};
+
+TEST(NodeTest, Basic) {
+  bool alive1 = true;
+  bool alive2 = true;
+  std::unique_ptr<Node> n1(CreateNodeForTest("n1", Node::Type::kVariable));
+  std::unique_ptr<Node> n2(CreateNodeForTest("n2", Node::Type::kVariable));
+
+  EXPECT_FALSE(n1->IsWrappedBy<RunnableOp>());
+  EXPECT_FALSE(n1->IsWrappedBy<RunnableOp2>());
+  EXPECT_FALSE(n2->IsWrappedBy<RunnableOp>());
+  EXPECT_FALSE(n2->IsWrappedBy<RunnableOp2>());
+
+  new RunnableOp(n1.get(), &alive1);
+  new RunnableOp2(n2.get(), &alive2);
+
+  EXPECT_TRUE(n1->IsWrappedBy<RunnableOp>());
+  EXPECT_FALSE(n1->IsWrappedBy<RunnableOp2>());
+  EXPECT_FALSE(n2->IsWrappedBy<RunnableOp>());
+  EXPECT_TRUE(n2->IsWrappedBy<RunnableOp2>());
+
+  EXPECT_TRUE(alive1);
+  EXPECT_TRUE(alive2);
+
+  n1.reset(nullptr);
+  n2.reset(nullptr);
+  EXPECT_FALSE(alive1);
+  EXPECT_FALSE(alive2);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -38,8 +38,8 @@ if(WITH_TESTING)
                      ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
  set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
 endif()
-cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
-        ARGS --dirname=${PYTHON_TESTS_DIR}/book)
+cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps}
+        ARGS --dirname=${WORD2VEC_MODEL_DIR})

 if(WITH_GPU AND TENSORRT_FOUND)
 cc_library(paddle_inference_tensorrt_subgraph_engine

--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -24,7 +24,7 @@ using contrib::AnalysisConfig;

 TEST(AnalysisPredictor, ZeroCopy) {
  AnalysisConfig config;
-  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
+  config.model_dir = FLAGS_dirname;
  config.use_feed_fetch_ops = false;

  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);

--- a/paddle/fluid/operators/space_to_depth_op.cc
+++ b/paddle/fluid/operators/space_to_depth_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/space_to_depth_op.h"
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class SpaceToDepthOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SpaceToDepthOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SpaceToDepthOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 4, "input should be a 4D tensor");
+    auto blocksize = ctx->Attrs().Get<int64_t>("blocksize");
+
+    PADDLE_ENFORCE_GT(blocksize, 1, "The blocksize should be Greater than 1");
+    PADDLE_ENFORCE_GT(x_dims[1], 0, "input channel should be Greater than 0");
+    PADDLE_ENFORCE_GT(x_dims[2], 0, "input Height should be Greater than 0");
+    PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0");
+
+    PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0,
+                      "input channel should be divisible of the square of "
+                      "SpaceToDepthOp blocksize");
+    PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0,
+                      "input Height should be divisible of the square of "
+                      "SpaceToDepthOp blocksize");
+    PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0,
+                      "input Width should be divisible of the square of "
+                      "SpaceToDepthOp blocksize");
+
+    VLOG(3) << "SpaceToDepthOp operator x.shape=" << x_dims
+            << "Attribute blocksize" << blocksize << std::endl;
+
+    std::vector<int64_t> output_shape(4, 0);  // [B,C,H,W]
+    output_shape[0] = x_dims[0];
+    output_shape[1] = x_dims[1] * blocksize * blocksize;
+    output_shape[2] = x_dims[2] / blocksize;
+    output_shape[3] = x_dims[3] / blocksize;
+
+    auto out_dims = framework::make_ddim(output_shape);
+
+    ctx->SetOutputDim("Out", out_dims);
+
+    if (x_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
+  }
+};
+
+class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor). The input should be a 4D tensor B * C * W * H of "
+             "SpaceToDepthOp "
+             "operator.");
+    AddOutput("Out",
+              "(Tensor), The output should be a 4D tensor B * C2 * W2 * H2 of "
+              "SpaceToDepthOp operator.");
+    AddAttr<int64_t>(
+        "blocksize",
+        "(int64_t, default 2) blocksize used to do change Space To Depth.")
+        .SetDefault(2)
+        .GreaterThan(1);
+    AddComment(R"DOC(
+        reorg operator used in Yolo v2.
+        The equation is: C2 = C1/blocksize * blocksize, W2 = W1 ∗ blocksize + offset % blocksize, H2 = H1 ∗ blocksize + offset / blocksize, 
+
+        Reshape Input(X) into the shape according to Attr(blocksize). The
+        data in Input(X) are unchanged.
+
+        Examples:
+
+            1. Given a 4-D tensor Input(X) with a shape [128, 2048, 26, 26], and the blocksize is 2, the reorg operator will transform Input(X)
+            into a 4-D tensor with shape [128, 2048, 13, 13] and leaving Input(X)'s data unchanged.
+
+    )DOC");
+  }
+};
+
+class SpaceToDepthGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(space_to_depth, ops::SpaceToDepthOp, ops::SpaceToDepthOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(space_to_depth_grad, ops::SpaceToDepthGradOp);
+REGISTER_OP_CPU_KERNEL(
+    space_to_depth,
+    ops::SpaceToDepthKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SpaceToDepthKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SpaceToDepthKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    space_to_depth_grad,
+    ops::SpaceToDepthGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SpaceToDepthGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SpaceToDepthGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/space_to_depth_op.cu
+++ b/paddle/fluid/operators/space_to_depth_op.cu
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/space_to_depth_op.h"
+
+namespace plat = paddle::platform;
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    space_to_depth,
+    ops::SpaceToDepthKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SpaceToDepthKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SpaceToDepthKernel<paddle::platform::CUDADeviceContext, int64_t>);
+
+REGISTER_OP_CUDA_KERNEL(
+    space_to_depth_grad,
+    ops::SpaceToDepthGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SpaceToDepthGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SpaceToDepthGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/space_to_depth_op.h
+++ b/paddle/fluid/operators/space_to_depth_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_
+#define PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_
+#endif  // PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class space_to_depth_compute {
+ public:
+  HOSTDEVICE space_to_depth_compute(const T *x, int64_t w, int64_t h, int64_t c,
+                                    int64_t batch, int64_t blocksize,
+                                    int64_t forward, T *out)
+      : x_(x),
+        w_(w),
+        h_(h),
+        c_(c),
+        batch_(batch),
+        blocksize_(blocksize),
+        forward_(forward),
+        out_(out) {}
+
+  HOSTDEVICE void operator()(int64_t in_index) {
+    int64_t out_c = c_ / (blocksize_ * blocksize_);
+    // calculate each dim position with index of tensor
+    int64_t b = in_index / (c_ * h_ * w_);
+    int64_t k = (in_index % (c_ * h_ * w_)) / (h_ * w_);
+    int64_t j = ((in_index % (c_ * h_ * w_)) % (h_ * w_)) / w_;
+    int64_t i = ((in_index % (c_ * h_ * w_)) % (h_ * w_)) % w_;
+
+    int64_t c2 = k % out_c;
+    int64_t offset = k / out_c;
+    int64_t w2 = i * blocksize_ + offset % blocksize_;
+    int64_t h2 = j * blocksize_ + offset / blocksize_;
+    int64_t out_index =
+        w2 + w_ * blocksize_ * (h2 + h_ * blocksize_ * (c2 + out_c * b));
+    if (forward_)
+      out_[out_index] = x_[in_index];
+    else
+      out_[in_index] = x_[out_index];
+  }
+
+ private:
+  const T *x_;
+  int64_t w_, h_, c_, batch_, blocksize_, forward_;
+  T *out_;
+};
+
+template <typename DeviceContext, typename T>
+class SpaceToDepthKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *out = context.Output<framework::LoDTensor>("Out");
+    auto *x = context.Input<framework::LoDTensor>("X");
+    auto blocksize = context.Attr<int64_t>("blocksize");
+    auto in_dims = x->dims();
+    out->mutable_data(context.GetPlace(), x->type());
+
+    auto out_dims = out->dims();
+    auto B = in_dims[0];
+    auto C = in_dims[1];
+    auto H = in_dims[2];
+    auto W = in_dims[3];
+    platform::ForRange<DeviceContext> for_range(
+        context.template device_context<DeviceContext>(),
+        static_cast<size_t>(x->numel()));
+
+    auto *x_data = x->data<T>();
+    auto *out_data = out->data<T>();
+    paddle::operators::space_to_depth_compute<T> computer(
+        x_data, W, H, C, B, blocksize, 1, out_data);
+    for_range(computer);
+
+    out->Resize(out_dims);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SpaceToDepthGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *d_out =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto *d_x =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto blocksize = context.Attr<int64_t>("blocksize");
+    auto in_dims = d_x->dims();
+    d_x->mutable_data(context.GetPlace(), d_out->type());
+
+    auto B = in_dims[0];
+    auto C = in_dims[1];
+    auto H = in_dims[2];
+    auto W = in_dims[3];
+
+    platform::ForRange<DeviceContext> for_range(
+        context.template device_context<DeviceContext>(),
+        static_cast<size_t>(d_x->numel()));
+
+    auto *dx_data = d_x->data<T>();
+    auto *dout_data = d_out->data<T>();
+
+    paddle::operators::space_to_depth_compute<T> computer(
+        dout_data, W, H, C, B, blocksize, 0, dx_data);
+    for_range(computer);
+
+    d_x->Resize(in_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -154,6 +154,7 @@ __all__ = [
    'mul',
    'sigmoid_cross_entropy_with_logits',
    'maxout',
+    'space_to_depth',
    'affine_grid',
    'sequence_reverse',
    'affine_channel',
@@ -7674,6 +7675,66 @@ def maxout(x, groups, name=None):
    return out


+def space_to_depth(x, blocksize, name=None):
+    """
+    Gives a blocksize to space_to_depth the input LoDtensor with Layout: [batch, channel, height, width]
+    
+    This op rearranges blocks of spatial data, into depth. More specifically, this op outputs a copy of the 
+    input LoDtensor where values from the height and width dimensions are moved to the channel dimension. 
+    The attr blocksize indicates the input block size.
+    
+    space_to_depth will reorgnize the elements of input with shape[batch, channel, height, width] according 
+    to blocksize to construct output with shape [batch, channel * blocksize * blocksize, height/blocksize, width/blocksize]:
+    
+    space_to_depth is used to This operation is useful for resizing the activations between convolutions 
+    (but keeping all data)
+
+    - Non-overlapping blocks of size block_size x block size are rearranged into depth at each location.
+    - The depth of the output tensor is block_size * block_size * input channel 
+    - The Y, X coordinates within each block of the input become the high order component of the output channel index
+    - channel should be divisible by square of blocksize
+    - height, width should be divsible by blocksize
+
+
+    Args:
+        x(variable): The input LoDtensor.
+        blocksize(variable): The blocksize to select the element on each feature map should be > 2
+
+    Returns:
+        Variable: The output LoDtensor.
+
+    Raises:
+        TypeError: blocksize type must be a long.
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(
+                name='data', shape=[1, 4, 2, 2], dtype='float32')
+            space_to_depthed = fluid.layers.space_to_depth(
+                x=data, blocksize=2)
+    """
+
+    helper = LayerHelper("space_to_depth", **locals())
+
+    if not (isinstance(blocksize, int)):
+        raise ValueError("blocksize must be a python Int")
+
+    if name is None:
+        out = helper.create_variable_for_type_inference(
+            dtype=x.dtype)  #fix create
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+
+    helper.append_op(
+        type="space_to_depth",
+        inputs={"X": x},
+        attrs={"blocksize": blocksize},
+        outputs={"Out": out})
+    return out
+
+
 @templatedoc()
 def sequence_reverse(x, name=None):
    """ 

--- a/python/paddle/fluid/op.py
+++ b/python/paddle/fluid/op.py
@@ -108,6 +108,8 @@ class OpDescCreationMethod(object):
                    new_attr.i = user_defined_attr
                elif attr.type == framework_pb2.FLOAT:
                    new_attr.f = user_defined_attr
+                elif attr.type == framework_pb2.LONG:
+                    new_attr.l = user_defined_attr
                elif attr.type == framework_pb2.STRING:
                    new_attr.s = user_defined_attr
                elif attr.type == framework_pb2.BOOLEAN:

--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -248,6 +248,17 @@ class TestBook(unittest.TestCase):
            self.assertIsNotNone(layers.softmax(hid))
        print(str(program))

+    def test_space_to_depth(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(
+                name='data',
+                shape=[32, 9, 6, 6],
+                append_batch_size=False,
+                dtype='float32')
+            self.assertIsNotNone(layers.space_to_depth(data, 3))
+        print(str(program))
+
    def test_sequence_unsqueeze(self):
        program = Program()
        with program_guard(program):

--- a/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py
+++ b/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+from op_test import OpTest
+
+
+class TestSpaceToDepthOp(OpTest):
+    @staticmethod
+    def helper(in_, width, height, channel, batch, blocksize, forward, out_):
+        channel_out = channel // (blocksize * blocksize)
+        for b in range(batch):
+            for k in range(channel):
+                for j in range(height):
+                    for i in range(width):
+                        in_index = i + width * (j + height * (k + channel * b))
+                        channel2 = k % channel_out
+                        offset = k // channel_out
+                        width2 = i * blocksize + offset % blocksize
+                        height2 = j * blocksize + offset // blocksize
+                        out_index = width2 + width * blocksize * (
+                            height2 + height * blocksize *
+                            (channel2 + channel_out * b))
+                        if forward:
+                            out_[out_index] = in_[in_index]
+                        else:
+                            out_[in_index] = in_[out_index]
+
+    def setUp(self):
+        self.init_data()
+
+        self.op_type = "space_to_depth"
+        self.inputs = {"X": self.x}
+        self.helper(self.x_1d, self.x.shape[3], self.x.shape[2],
+                    self.x.shape[1], self.x.shape[0], self.blocksize,
+                    self.forward, self.out_1d)
+        self.out = np.reshape(self.out_1d, self.infered_shape)
+        self.attrs = {"blocksize": self.blocksize}
+        self.outputs = {"Out": self.out}
+
+    def init_data(self):
+        self.ori_shape = (32, 12, 6, 6)
+        self.infered_shape = (32, 48, 3, 3)
+        self.one_d_len = 32 * 48 * 3 * 3
+
+        self.blocksize = 2
+        self.x = np.random.random(self.ori_shape).astype('float32')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float32')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+        self.forward = 1
+
+    def test_check_output(self):
+        place = fluid.core.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.core.CPUPlace()
+        self.check_output_with_place(place, 1e-5, None, False)
+
+    def test_check_grad(self):
+        place = fluid.core.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.core.CPUPlace()
+        self.check_grad_with_place(place, ['X'], 'Out')
+
+
+class TestSpaceToDepthOpBasic(TestSpaceToDepthOp):
+    def init_data(self):
+        self.ori_shape = (32, 8, 6, 6)
+        self.infered_shape = (32, 32, 3, 3)
+        self.one_d_len = 32 * 32 * 3 * 3
+
+        self.blocksize = 2
+        self.x = np.random.random(self.ori_shape).astype('float32')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float32')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+        self.forward = 1
+
+
+class TestSpaceToDepthOpDoubleBasic(TestSpaceToDepthOp):
+    def init_data(self):
+        self.ori_shape = (32, 8, 6, 6)
+        self.infered_shape = (32, 32, 3, 3)
+        self.one_d_len = 32 * 32 * 3 * 3
+
+        self.blocksize = 2
+        self.x = np.random.random(self.ori_shape).astype('float64')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float64')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+        self.forward = 1
+
+
+class TestSpaceToDepthOpWithStride3(TestSpaceToDepthOp):
+    def init_data(self):
+        self.ori_shape = (32, 9, 6, 6)
+        self.infered_shape = (32, 81, 2, 2)
+        self.one_d_len = 32 * 81 * 2 * 2
+
+        self.blocksize = 3
+        self.x = np.random.random(self.ori_shape).astype('float32')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float32')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+        self.forward = 1
+
+
+class TestSpaceToDepthOpWithNotSquare(TestSpaceToDepthOp):
+    def init_data(self):
+        self.ori_shape = (32, 9, 9, 6)
+        self.infered_shape = (32, 81, 3, 2)
+        self.one_d_len = 32 * 81 * 3 * 2
+
+        self.blocksize = 3
+        self.x = np.random.random(self.ori_shape).astype('float32')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float32')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+        self.forward = 1
+
+
+if __name__ == '__main__':
+    unittest.main()