Support memory eager deletion on recurrent OP (#17710)

Tested PaddingRNN on a V100 GPU device. Test configuration: large model, padding mode (the mode that uses recurrentOp), one GPU. GPU memory (MiB): 6414 (this PR) vs. 6837 (without this PR). Speed (steps/s): 10.28 (this PR) vs. 9.89 (without this PR).

Support memory eager deletion on recurrent OP (#17710)
Tested PaddingRNN on a V100 GPU device. Test configuration: large model, padding mode (the mode that uses recurrentOp), one GPU. GPU memory (MiB): 6414 (this PR) vs. 6837 (without this PR). Speed (steps/s): 10.28 (this PR) vs. 9.89 (without this PR).
89bc3fd8 · Huihuang Zheng · GitHub · 0d8e6c9b · 89bc3fd8 · 89bc3fd8
18 changed files
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -193,7 +193,7 @@ else()
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
-target_link_libraries(executor while_op_helper executor_gc_helper)
+target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/trainer_factory.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/platform/place.h"
@@ -410,6 +411,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
    if (gc && ctx->prog_.Size() > 1) {
      operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_,
                                                                 ctx->ops_);
+      operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+          ctx->block_id_, ctx->ops_);
    }
  }

--- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
 cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base)
 cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
+cc_library(recurrent_op_eager_deletion_pass SRCS recurrent_op_eager_deletion_pass.cc DEPS recurrent_op_helper graph_helper pass computation_op_handle)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle var_handle)
 cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
@@ -14,7 +15,8 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_
 cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
-cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass reference_count_pass_helper)
+cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle
+    eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper)
 cc_library(record_skip_memory_opt_vars_pass SRCS record_skip_memory_opt_vars_pass.cc DEPS graph graph_helper)
 cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handle reference_count_pass_helper share_tensor_buffer_op_handle multi_devices_helper graph pass) 

--- a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
@@ -272,6 +272,10 @@ void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const {
  auto while_op_eager_deletion_pass =
      ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
  while_op_eager_deletion_pass->Apply(graph);
+  auto recurrent_op_eager_deletion_pass =
+      ir::PassRegistry::Instance().Get("recurrent_op_eager_deletion_pass");
+  recurrent_op_eager_deletion_pass->Apply(graph);
 }
 }  // namespace ir
@@ -285,3 +289,4 @@ REGISTER_PASS(eager_deletion_pass, paddle::framework::ir::EagerDeletionPass)
    .RequirePassAttr(paddle::framework::ir::kGarbageCollector);
 USE_PASS(while_op_eager_deletion_pass);
+USE_PASS(recurrent_op_eager_deletion_pass);
--- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h"
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/string/string_helper.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+using paddle::operators::OpVariant;
+using paddle::operators::OpVariantSet;
+using paddle::operators::OpAndGradOpPair;
+// Pass entry point: prepares safe eager deletion for the recurrent and
+// recurrent_grad ops of the graph, one device at a time.
+void RecurrentOpEagerDeletionPass::ApplyImpl(Graph *graph) const {
+  // Gather the recurrent / recurrent_grad ops present in the graph, grouped by
+  // device id. As noted by the original author, the graph only contains the
+  // ops of block 0.
+  auto ops_per_device = DeviceIdToRecurrentAndRecurrentGradOp(*graph);
+
+  // Every device gets its own preparation step because garbage collection may
+  // differ across devices.
+  for (auto &device_and_ops : ops_per_device) {
+    PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+        &device_and_ops.second);
+  }
+}
+// Builds a map from device id to the (recurrent ops, recurrent_grad ops) pair
+// found on that device. Op handles that are not ComputationOpHandle, or whose
+// name is neither "recurrent" nor "recurrent_grad", are ignored.
+std::unordered_map<size_t, OpAndGradOpPair>
+RecurrentOpEagerDeletionPass::DeviceIdToRecurrentAndRecurrentGradOp(
+    const Graph &graph) const {
+  std::unordered_map<size_t, OpAndGradOpPair> device_to_ops;
+  std::vector<details::OpHandleBase *> handles =
+      FilterByNodeWrapper<details::OpHandleBase>(graph);
+  for (auto *handle : handles) {
+    auto *compute_op = dynamic_cast<details::ComputationOpHandle *>(handle);
+    if (compute_op == nullptr) continue;
+
+    const bool is_forward = compute_op->Name() == "recurrent";
+    if (!is_forward && compute_op->Name() != "recurrent_grad") continue;
+
+    // GetScopeIdx() returns device/place id
+    OpAndGradOpPair &ops_on_device = device_to_ops[compute_op->GetScopeIdx()];
+    OpVariantSet &target =
+        is_forward ? ops_on_device.first : ops_on_device.second;
+    target.emplace(compute_op->GetOp());
+  }
+  return device_to_ops;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(recurrent_op_eager_deletion_pass,
+              paddle::framework::ir::RecurrentOpEagerDeletionPass);
--- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <unordered_map>
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/operators/controlflow/op_variant.h"
+#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+// Pass that sets the skip-eager-deletion variables of the recurrent and
+// recurrent_grad ops contained in a graph, so that eager deletion does not
+// free variables those ops still need.
+class RecurrentOpEagerDeletionPass : public Pass {
+ protected:
+  // Pass entry point. Groups the recurrent / recurrent_grad ops of `graph` by
+  // device id and prepares safe eager deletion for each group.
+  void ApplyImpl(Graph *graph) const override;
+ private:
+  // Returns a std::unordered_map mapping from the device id to the pair of
+  // (recurrent ops, recurrent_grad ops) found in `graph` for that device.
+  std::unordered_map<size_t, paddle::operators::OpAndGradOpPair>
+  DeviceIdToRecurrentAndRecurrentGradOp(const Graph &graph) const;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
 include(operators)
 register_operators(DEPS naive_executor)
-cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) 
+cc_library(op_variant SRCS op_variant.cc DEPS operator proto_desc)
+cc_library(recurrent_op_helper SRCS recurrent_op_helper.cc DEPS operator op_variant recurrent_op)
+cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator op_variant) 
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
--- a/paddle/fluid/operators/controlflow/op_variant.cc
+++ b/paddle/fluid/operators/controlflow/op_variant.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/controlflow/op_variant.h"
+namespace paddle {
+namespace operators {
+// The visitors below are implementation details of OpVariant and are used
+// only inside this translation unit. They are kept in an anonymous namespace
+// so that they have internal linkage and cannot clash (ODR) with identically
+// named helper types defined in other .cc files.
+namespace {
+
+// Returns a pointer to the Inputs() map of the visited op. Works for every
+// alternative of the variant because it only requires an Inputs() member.
+struct InputsVisitor
+    : public boost::static_visitor<const framework::VariableNameMap *> {
+  template <typename OpType>
+  const framework::VariableNameMap *operator()(const OpType *op) const {
+    return &(op->Inputs());
+  }
+};
+
+// Returns a pointer to the Outputs() map of the visited op.
+struct OutputsVisitor
+    : public boost::static_visitor<const framework::VariableNameMap *> {
+  template <typename OpType>
+  const framework::VariableNameMap *operator()(const OpType *op) const {
+    return &(op->Outputs());
+  }
+};
+
+// Returns a pointer to the attribute map of the visited op. The two
+// alternatives expose the map under different names (GetAttrMap() for OpDesc,
+// Attrs() for OperatorBase), hence the two explicit overloads.
+struct AttributeMapVisitor
+    : public boost::static_visitor<const framework::AttributeMap *> {
+  const framework::AttributeMap *operator()(const framework::OpDesc *op) const {
+    return &(op->GetAttrMap());
+  }
+  const framework::AttributeMap *operator()(
+      const framework::OperatorBase *op) const {
+    return &(op->Attrs());
+  }
+};
+
+// Returns the visited op pointer as an untyped pointer; used for identity
+// comparison and hashing of OpVariant.
+struct RawPointerVisitor : public boost::static_visitor<const void *> {
+  template <typename OpType>
+  const void *operator()(const OpType *op) const {
+    return op;
+  }
+};
+
+}  // namespace
+// Returns the Inputs() map of the wrapped op, whichever alternative is held.
+const framework::VariableNameMap &OpVariant::Inputs() const {
+  const framework::VariableNameMap *inputs =
+      boost::apply_visitor(InputsVisitor(), op_);
+  return *inputs;
+}
+// Returns the Outputs() map of the wrapped op, whichever alternative is held.
+const framework::VariableNameMap &OpVariant::Outputs() const {
+  const framework::VariableNameMap *outputs =
+      boost::apply_visitor(OutputsVisitor(), op_);
+  return *outputs;
+}
+// Returns the attribute map of the wrapped op, whichever alternative is held.
+const framework::AttributeMap &OpVariant::Attrs() const {
+  const framework::AttributeMap *attr_map =
+      boost::apply_visitor(AttributeMapVisitor(), op_);
+  return *attr_map;
+}
+// Returns the wrapped op pointer with its type erased.
+const void *OpVariant::RawPointer() const {
+  const void *raw = boost::apply_visitor(RawPointerVisitor(), op_);
+  return raw;
+}
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/op_variant.h
+++ b/paddle/fluid/operators/controlflow/op_variant.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/variant.h"
+namespace paddle {
+namespace operators {
+// OpVariant is a wrapper class of OpDesc and OperatorBase pointer
+// So that API would be the same.
+// The wrapper stores only the pointer it is given; it does not copy the op.
+class OpVariant {
+ public:
+  // Implicit conversions from either pointer type are intentional (hence the
+  // NOLINT markers) so that both kinds of op can be passed where an OpVariant
+  // is expected.
+  OpVariant(const framework::OperatorBase *op) : op_(op) {}  // NOLINT
+  OpVariant(const framework::OpDesc *op) : op_(op) {}  // NOLINT
+  // Accessors forwarding to the wrapped op.
+  const framework::VariableNameMap &Inputs() const;
+  const framework::VariableNameMap &Outputs() const;
+  const framework::AttributeMap &Attrs() const;
+  // Returns the wrapped pointer with its type erased.
+  const void *RawPointer() const;
+  // Returns the attribute called `name` as AttrType.
+  // Fails the PADDLE_ENFORCE check when no attribute with that name exists.
+  // The stored value is extracted with boost::get<AttrType>, so AttrType must
+  // match the type actually stored in the attribute.
+  template <typename AttrType>
+  const AttrType &Attr(const std::string &name) const {
+    auto &attrs = Attrs();
+    auto it = attrs.find(name);
+    PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name);
+    return boost::get<AttrType>(it->second);
+  }
+  // Two OpVariants are equal when they wrap the same pointer (identity, not
+  // structural equality).
+  bool operator==(const OpVariant &other) const {
+    return RawPointer() == other.RawPointer();
+  }
+  // Index of the alternative currently held by the underlying boost::variant.
+  int which() const { return static_cast<int>(op_.which()); }
+  // Hash functor consistent with operator==: hashes the wrapped pointer value.
+  struct Hasher {
+    size_t operator()(const OpVariant &op) const {
+      return reinterpret_cast<size_t>(op.RawPointer());
+    }
+  };
+ private:
+  // The wrapped op pointer; const, so it cannot be rebound after construction.
+  const boost::variant<const framework::OperatorBase *,
+                       const framework::OpDesc *>
+      op_;
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
+#include <algorithm>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/recurrent_op.h"
+namespace paddle {
+namespace operators {
+// Returns true when `grad_op` is the backward op of `fwd_op`, i.e. when the
+// grad op lists, among its inputs, the same variable names as the forward
+// op's inputs and the same variable names as the forward op's outputs.
+static bool IsMatchedRecurrentOpAndRecurrentGradOp(const OpVariant &fwd_op,
+                                                   const OpVariant &grad_op) {
+  // Compare the forward inputs first; outputs are only looked up on a match.
+  if (!(fwd_op.Inputs().at(RecurrentBase::kInputs) ==
+        grad_op.Inputs().at(RecurrentBase::kInputs))) {
+    return false;
+  }
+  return fwd_op.Outputs().at(RecurrentBase::kOutputs) ==
+         grad_op.Inputs().at(RecurrentBase::kOutputs);
+}
+// Tells whether the variable `name` must be skipped (kept alive) by the
+// forward recurrent op: this is the case for every non-empty variable name
+// that recurrent_grad uses but that is not declared in `grad_block`.
+static bool IsSkippableVar(const std::string &name,
+                           framework::BlockDesc *grad_block) {
+  if (name == framework::kEmptyVarName) {
+    return false;
+  }
+  return !grad_block->HasVar(name);
+}
+// Empties the skip-eager-deletion variable list stored in the attributes of
+// `op`. The attribute map is modified in place through a const_cast.
+static void ClearSkipVars(const OpVariant &op) {
+  auto &mutable_attrs = const_cast<framework::AttributeMap &>(op.Attrs());
+  auto &skip_attr = mutable_attrs[RecurrentBase::kSkipEagerDeletionVars];
+  boost::get<std::vector<std::string>>(skip_attr).clear();
+}
+// Appends every name in `skip_vars` to the skip-eager-deletion variable list
+// stored in the attributes of `op`. Existing entries are kept; the attribute
+// map is modified in place through a const_cast.
+template <class Container>
+static void AddSkipVars(const OpVariant &op, const Container &skip_vars) {
+  auto &mutable_attrs = const_cast<framework::AttributeMap &>(op.Attrs());
+  VLOG(2) << "Prepare to add " << skip_vars.size()
+          << " skip var(s): " << paddle::string::join_strings(skip_vars, ' ');
+  auto &skip_attr = mutable_attrs[RecurrentBase::kSkipEagerDeletionVars];
+  std::vector<std::string> &stored_names =
+      boost::get<std::vector<std::string>>(skip_attr);
+  stored_names.insert(stored_names.end(), skip_vars.cbegin(),
+                      skip_vars.cend());
+}
+// Find all ops and grad ops with given type name. The ops and grad ops
+// may locate in different blocks so we should traverse all blocks in the
+// program and find them out
+//
+// On entry `op_and_grad_op` holds the ops (first) and grad ops (second) that
+// the caller already collected; the ops found in the other blocks are added
+// to those sets. The program is obtained from the step-block attribute of one
+// of the already collected forward ops, so nothing is searched (and the
+// function returns early) when no forward op was passed in.
+// Both before and after the search it is enforced that there are at least as
+// many forward ops as grad ops.
+static void FindAllOpAndGradOp(OpAndGradOpPair *op_and_grad_op,
+                               const std::string &type_name,
+                               const std::string &backward_type_name) {
+  OpVariantSet &ops = op_and_grad_op->first;
+  OpVariantSet &grad_ops = op_and_grad_op->second;
+  PADDLE_ENFORCE_GE(ops.size(), grad_ops.size(),
+                    "There are extra grad ops in the graph or program");
+  if (ops.empty()) return;
+  // Any collected forward op can be used to reach the owning program.
+  const auto *program =
+      ops.begin()
+          ->Attr<framework::BlockDesc *>(RecurrentBase::kStepBlock)
+          ->Program();
+  // Block 0 is not visited here: the loop starts at block 1.
+  for (size_t i = 1; i < program->Size(); ++i) {
+    auto &block = program->Block(i);
+    for (size_t j = 0; j < block.OpSize(); ++j) {
+      auto *op = block.Op(j);
+      if (op->Type() == type_name) {
+        ops.emplace(op);
+      } else if (op->Type() == backward_type_name) {
+        grad_ops.emplace(op);
+      }
+    }
+  }
+  PADDLE_ENFORCE_GE(ops.size(), grad_ops.size(),
+                    "There are extra grad ops in the graph or program");
+}
+// Maps every name in `var_names` to its gradient variable name (as produced
+// by framework::GradVarName) and returns the results in the same order.
+static std::vector<std::string> GradVarLists(
+    const std::vector<std::string> &var_names) {
+  std::vector<std::string> grad_names;
+  grad_names.reserve(var_names.size());
+  for (const std::string &var_name : var_names) {
+    grad_names.push_back(framework::GradVarName(var_name));
+  }
+  return grad_names;
+}
+// Adds the memory (state) variables of the recurrent op `op` to its skip
+// variable list: the state and ex-state variables and, when
+// `set_grad_mem_vars` is true, their gradient variables as well.
+// Does nothing when the op has no states.
+static void AddOpMemVarsAsSkip(const OpVariant &op, bool set_grad_mem_vars) {
+  if (!op.Attr<bool>(RecurrentBase::kHasStates)) {
+    return;
+  }
+
+  // A set is used so that a name shared by several lists is added only once.
+  std::unordered_set<std::string> names_to_skip;
+
+  auto &states = op.Attr<std::vector<std::string>>(RecurrentBase::kStates);
+  names_to_skip.insert(states.begin(), states.end());
+
+  auto &ex_states =
+      op.Attr<std::vector<std::string>>(RecurrentBase::kExStates);
+  names_to_skip.insert(ex_states.begin(), ex_states.end());
+
+  if (set_grad_mem_vars) {
+    const std::vector<std::string> state_grads = GradVarLists(states);
+    names_to_skip.insert(state_grads.begin(), state_grads.end());
+
+    const std::vector<std::string> ex_state_grads = GradVarLists(ex_states);
+    names_to_skip.insert(ex_state_grads.begin(), ex_state_grads.end());
+  }
+
+  AddSkipVars(op, names_to_skip);
+}
+// Handles a forward recurrent op that has no matching recurrent_grad op:
+// its skip variable list is reset and then filled with its memory variables
+// followed by its output variables.
+static void SetRecurrentForwardOpOnlySkipVarAttr(const OpVariant &fwd_op) {
+  // Start from an empty list so that repeated preparation does not accumulate.
+  ClearSkipVars(fwd_op);
+  AddOpMemVarsAsSkip(fwd_op, /* set_grad_mem_vars = */ false);
+  AddSkipVars(fwd_op, fwd_op.Outputs().at(RecurrentBase::kOutputs));
+}
+// Set skip vars of matched recurrent op and recurrent_grad op
+//
+// Forward op: its skip list is reset, then filled with its memory variables
+// and with every variable that the ops of the grad step block read or write
+// but that the grad block itself does not declare.
+// Backward op: its skip list is reset, then filled with its memory variables
+// (including their gradients) and with the gradient variables of the forward
+// inputs and parameters.
+static void SetRecurrentOpAndRecurrentGradOpSkipVarAttr(
+    const OpVariant &fwd_op, const OpVariant &bwd_op) {
+  // Find all skippable variables in forward recurrent_op
+  ClearSkipVars(fwd_op);
+  AddOpMemVarsAsSkip(fwd_op, /* set_grad_mem_vars = */ false);
+  auto *grad_block =
+      bwd_op.Attr<framework::BlockDesc *>(RecurrentBase::kStepBlock);
+  // Collected in a set so that a name used by several ops is added only once.
+  std::unordered_set<std::string> fwd_skip_vars;
+  for (auto *op_desc : grad_block->AllOps()) {
+    for (auto &in_arg_name : op_desc->InputArgumentNames()) {
+      if (IsSkippableVar(in_arg_name, grad_block)) {
+        fwd_skip_vars.insert(in_arg_name);
+      }
+    }
+    for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
+      if (IsSkippableVar(out_arg_name, grad_block)) {
+        fwd_skip_vars.insert(out_arg_name);
+      }
+    }
+  }
+  AddSkipVars(fwd_op, fwd_skip_vars);
+  // Find all skippable variables in recurrent_grad_op
+  // The skippable variables are those which would be used across time steps
+  ClearSkipVars(bwd_op);
+  AddOpMemVarsAsSkip(bwd_op, /* set_grad_mem_vars = */ true);
+  std::unordered_set<std::string> bwd_skip_vars;
+  auto &fwd_input = fwd_op.Inputs().at(RecurrentBase::kInputs);
+  auto &in_grads =
+      bwd_op.Outputs().at(framework::GradVarName(RecurrentBase::kInputs));
+  // The two lists are walked by a common index below, so they must have the
+  // same length.
+  PADDLE_ENFORCE_EQ(
+      fwd_input.size(), in_grads.size(),
+      "Backward input gradient number does not match forward input number.");
+  for (size_t i = 0; i < in_grads.size(); ++i) {
+    // An empty name means this input has no gradient output; nothing to skip.
+    if (in_grads[i] == framework::kEmptyVarName) {
+      continue;
+    }
+    // Skip both the gradient output name and the gradient name derived from
+    // the forward input name.
+    bwd_skip_vars.insert(in_grads[i]);
+    bwd_skip_vars.insert(framework::GradVarName(fwd_input[i]));
+  }
+  // Same treatment for the parameters and their gradients.
+  auto &fwd_param = fwd_op.Inputs().at(RecurrentBase::kParameters);
+  auto &param_grads =
+      bwd_op.Outputs().at(framework::GradVarName(RecurrentBase::kParameters));
+  PADDLE_ENFORCE_EQ(fwd_param.size(), param_grads.size(),
+                    "Backward parameter gradient number does not match forward "
+                    "parameter number.");
+  for (size_t i = 0; i < fwd_param.size(); ++i) {
+    if (param_grads[i] == framework::kEmptyVarName) {
+      continue;
+    }
+    bwd_skip_vars.insert(param_grads[i]);
+    bwd_skip_vars.insert(framework::GradVarName(fwd_param[i]));
+  }
+  AddSkipVars(bwd_op, bwd_skip_vars);
+}
+// Executor-side entry point: collects the recurrent and recurrent_grad ops
+// among `all_ops` and prepares safe eager deletion for them.
+void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+    int block_id,
+    const std::vector<std::unique_ptr<paddle::framework::OperatorBase>>
+        &all_ops) {
+  // Only block 0 is handled here. All recurrent_ops and recurrent_grad_ops of
+  // the whole program are processed when block_id is 0 (i.e. when
+  // Executor::Run() runs or when ParallelExecutor is constructed), and they
+  // must be processed at that point: otherwise a recurrent_op could run first
+  // and erase variables needed by a recurrent_grad_op that has not been
+  // constructed yet.
+  if (block_id != 0) {
+    return;
+  }
+
+  OpAndGradOpPair fwd_and_grad_ops;
+  for (const auto &op_holder : all_ops) {
+    const bool is_forward = op_holder->Type() == "recurrent";
+    if (is_forward) {
+      fwd_and_grad_ops.first.emplace(op_holder.get());
+      continue;
+    }
+    if (op_holder->Type() == "recurrent_grad") {
+      fwd_and_grad_ops.second.emplace(op_holder.get());
+    }
+  }
+  PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(&fwd_and_grad_ops);
+}
+// Sets the skip-eager-deletion variables of every recurrent and
+// recurrent_grad op. `op_pair` holds the ops collected by the caller; the ops
+// located in the other blocks of the program are added to it first. Each grad
+// op must match exactly one forward op; forward ops left without a grad op
+// are handled as forward-only ops.
+void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+    OpAndGradOpPair *op_pair) {
+  // Extend the given sets with the ops found in all other blocks.
+  FindAllOpAndGradOp(op_pair, "recurrent", "recurrent_grad");
+
+  OpVariantSet &fwd_ops = op_pair->first;
+  OpVariantSet &grad_ops = op_pair->second;
+  VLOG(2) << "Found recurrent op num: " << fwd_ops.size()
+          << ", recurrent grad op num: " << grad_ops.size();
+  if (fwd_ops.empty()) {
+    return;
+  }
+
+  for (auto &grad_op : grad_ops) {
+    // Look for the single forward op this grad op belongs to.
+    const OpVariant *matched_fwd_op = nullptr;
+    for (auto &candidate : fwd_ops) {
+      if (!IsMatchedRecurrentOpAndRecurrentGradOp(candidate, grad_op)) {
+        continue;
+      }
+      PADDLE_ENFORCE(matched_fwd_op == nullptr,
+                     "Found multiple matched recurrent op");
+      matched_fwd_op = &candidate;
+    }
+    PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, "Cannot find matched forward op");
+    SetRecurrentOpAndRecurrentGradOpSkipVarAttr(*matched_fwd_op, grad_op);
+    // A matched forward op is done; remove it so only unmatched ones remain.
+    fwd_ops.erase(*matched_fwd_op);
+  }
+
+  // Whatever is left has no backward counterpart.
+  for (auto &unmatched_fwd_op : fwd_ops) {
+    SetRecurrentForwardOpOnlySkipVarAttr(unmatched_fwd_op);
+  }
+}
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.h
+++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/controlflow/op_variant.h"
+#include "paddle/fluid/operators/recurrent_op.h"
+#include "paddle/fluid/platform/variant.h"
+#include "paddle/fluid/string/string_helper.h"
+namespace paddle {
+namespace operators {
+// Set of ops (OpDesc or OperatorBase pointers wrapped in OpVariant), hashed
+// with OpVariant::Hasher.
+using OpVariantSet = std::unordered_set<OpVariant, OpVariant::Hasher>;
+// Pair of op sets: `first` holds forward ops, `second` holds their grad ops.
+using OpAndGradOpPair = std::pair<OpVariantSet, OpVariantSet>;
+// Set vars to skip eager deletion on input recurrent and recurrent_grad for
+// preparing safe eager deletion. Input contains all recurrent and
+// recurrent_grad ops at block 0 and the function will find all recurrent and
+// recurrent_grad ops across blocks.
+// `op_pair` is modified: the ops found in the other blocks are added to it.
+void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+    OpAndGradOpPair *op_pair);
+// Set vars to skip eager deletion on input recurrent and recurrent_grad for
+// preparing safe eager deletion. The input block_id must be 0 and caller can
+// input all ops in the block. The function will find all recurrent and
+// recurrent_grad ops across blocks.
+// When block_id is not 0 the call returns without doing anything.
+void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+    int block_id,
+    const std::vector<std::unique_ptr<paddle::framework::OperatorBase>>
+        &all_ops);
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -13,109 +13,18 @@
 // limitations under the License.
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include <string>
 #include <unordered_set>
 #include <utility>
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/controlflow/op_variant.h"
+#include "paddle/fluid/string/string_helper.h"
 namespace paddle {
 namespace operators {
-// OpVariant is a wrapper class of OpDesc and OperatorBase
-// So that API would be the same.
-class OpVariant {
-  struct InputsVisitor
-      : public boost::static_visitor<const framework::VariableNameMap *> {
-    template <typename OpType>
-    const framework::VariableNameMap *operator()(const OpType *op) const {
-      return &(op->Inputs());
-    }
-  };
-  struct OutputsVisitor
-      : public boost::static_visitor<const framework::VariableNameMap *> {
-    template <typename OpType>
-    const framework::VariableNameMap *operator()(const OpType *op) const {
-      return &(op->Outputs());
-    }
-  };
-  struct AttributeMapVisitor
-      : public boost::static_visitor<const framework::AttributeMap *> {
-    const framework::AttributeMap *operator()(
-        const framework::OpDesc *op) const {
-      return &(op->GetAttrMap());
-    }
-    const framework::AttributeMap *operator()(
-        const framework::OperatorBase *op) const {
-      return &(op->Attrs());
-    }
-  };
-  struct RawPointerVisitor : public boost::static_visitor<const void *> {
-    template <typename OpType>
-    const void *operator()(const OpType *op) const {
-      return op;
-    }
-  };
- public:
-  OpVariant(const framework::OperatorBase *op) : op_(op) {}  // NOLINT
-  OpVariant(const framework::OpDesc *op) : op_(op) {}  // NOLINT
-  const framework::VariableNameMap &Inputs() const {
-    return *boost::apply_visitor(InputsVisitor(), op_);
-  }
-  const framework::VariableNameMap &Outputs() const {
-    return *boost::apply_visitor(OutputsVisitor(), op_);
-  }
-  const framework::AttributeMap &Attrs() const {
-    return *boost::apply_visitor(AttributeMapVisitor(), op_);
-  }
-  template <typename AttrType>
-  const AttrType &Attr(const std::string &name) const {
-    auto &attrs = Attrs();
-    auto it = attrs.find(name);
-    PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name);
-    return boost::get<AttrType>(it->second);
-  }
-  bool operator==(const OpVariant &other) const {
-    return RawPointer() == other.RawPointer();
-  }
-  const void *RawPointer() const {
-    return boost::apply_visitor(RawPointerVisitor(), op_);
-  }
-  int which() const { return static_cast<int>(op_.which()); }
-  struct Hasher {
-    size_t operator()(const OpVariant &op) const {
-      return reinterpret_cast<size_t>(op.RawPointer());
-    }
-  };
- private:
-  const boost::variant<const framework::OperatorBase *,
-                       const framework::OpDesc *>
-      op_;
-};
-static std::string GetDebugString(const std::vector<std::string> &names) {
-  if (names.empty()) return "";
-  std::string ret = names[0];
-  for (size_t i = 1; i < names.size(); ++i) {
-    ret += (" " + names[i]);
-  }
-  return ret;
-}
 // Set skip variables of while_op and while_grad_op
 // These variables should be skipped when eager deletion enables.
 // It is because:
@@ -124,7 +33,7 @@ static std::string GetDebugString(const std::vector<std::string> &names) {
 static void SetSkipVars(const OpVariant &op, std::vector<std::string> attr) {
  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
  VLOG(2) << "Prepare to skip " << attr.size()
-          << " var(s): " << GetDebugString(attr);
+          << " var(s): " << string::join_strings(attr, ' ');
  attrs[kSkipEagerDeletionVars] = std::move(attr);
 }

--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
--- a/paddle/fluid/operators/recurrent_op.h
+++ b/paddle/fluid/operators/recurrent_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+// StepScopes manages the per-time-step scopes used inside an RNN.
+//    StepScopes::CurScope() returns the scope of the current time step.
+//    StepScopes::ExScope() returns the ex-scope (the previous step's scope).
+//    StepScopes::Next() advances to the next time step.
+//
+// If is_train is false:
+//   only two scopes are kept for the RNN and only forward is supported.
+// Otherwise:
+//   len(scopes) == seq_len.
+//
+// If is_backward is true:
+//   the scopes are accessed in reverse order.
+// Otherwise:
+//   the scopes are accessed from the first to the last.
+class StepScopes {
+ public:
+  StepScopes(const platform::DeviceContext &dev_ctx,
+             const framework::Scope &parent,
+             std::vector<framework::Scope *> *scopes, bool is_train,
+             size_t seq_len, bool is_backward = false);
+  framework::Scope &CurScope();
+  framework::Scope &ExScope();
+  void Next();
+ private:
+  framework::Scope &GetScope(size_t scope_id) const;
+  size_t counter_;  // presumably the current step index; confirm in the .cc
+  std::vector<framework::Scope *> *scopes_;
+  bool is_train_;
+  bool is_backward_;
+};
+// Common base class of RecurrentOp and RecurrentGradOp.
+//    Holds the shared string constants and protected tensor-linking helpers.
+class RecurrentBase : public framework::OperatorBase {
+ public:
+  static const char kInputs[];
+  static const char kInitialStates[];
+  static const char kParameters[];
+  static const char kOutputs[];
+  static const char kStepScopes[];
+  static const char kHasStates[];
+  static const char kExStates[];
+  static const char kStates[];
+  static const char kStepBlock[];
+  static const char kReverse[];
+  static const char kIsTrain[];
+  static const char kSkipEagerDeletionVars[];
+  static const char kInputGrads[];
+  static const char kOutputGrads[];
+  static const char kParamGrads[];
+  static const char kInitStateGrads[];
+  RecurrentBase(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs);
+ protected:
+  // Reads the sequence length from `scope`.
+  //   The length is taken from an input tensor whose dimensions should be
+  //   [SEQ_LEN, ..., ...]: the first dimension is SEQ_LEN, and the second
+  //   dimension could be the batch size or a nested sequence length.
+  //   (Only declared here; the implementation is not part of this header.)
+  int64_t GetSequenceLength(const framework::Scope &scope) const;
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.Var, dst_vars)):
+  //   dst_tensor.ShareDataWith(src_tensor)
+  static void LinkTensor(const framework::Scope &src_scope,
+                         const std::vector<std::string> &src_vars,
+                         framework::Scope *dst_scope,
+                         const std::vector<std::string> &dst_vars);
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.Var, dst_vars)):
+  //   callback(src_tensor, &dst_tensor)
+  template <typename Callback>
+  static void LinkTensorWithCallback(const framework::Scope &src_scope,
+                                     const std::vector<std::string> &src_vars,
+                                     framework::Scope *dst_scope,
+                                     const std::vector<std::string> &dst_vars,
+                                     Callback callback,
+                                     bool is_backward = false) {
+    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());  // paired by index
+    for (size_t i = 0; i < dst_vars.size(); ++i) {
+      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
+                   is_backward);
+    }
+  }
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.FindVar, dst_vars)):
+  //   callback(src_tensor, &dst_tensor)
+  template <typename Callback>
+  static void LinkTensorWithCallback(const framework::Scope &src_scope,
+                                     const std::vector<std::string> &src_vars,
+                                     const framework::Scope &dst_scope,
+                                     const std::vector<std::string> &dst_vars,
+                                     Callback callback,
+                                     bool is_backward = false) {
+    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());  // paired by index
+    for (size_t i = 0; i < dst_vars.size(); ++i) {
+      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
+                   is_backward);
+    }
+  }
+  // Returns [seq_len] + list(src): `src` with seq_len prepended.
+  static framework::DDim PrependDims(size_t seq_len,
+                                     const framework::DDim &src);
+ private:
+  template <typename Callback>
+  static void AccessTensor(const framework::Scope &src_scope,
+                           const std::string &src_var_name,
+                           framework::Scope *dst_scope,
+                           const std::string &dst_var_name, Callback callback,
+                           bool is_backward = false) {
+    auto *src_var = src_scope.FindVar(src_var_name);
+    if (is_backward && src_var == nullptr) {  // backward: skip missing source
+      return;
+    }
+    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
+    auto &src_tensor = src_var->Get<framework::LoDTensor>();
+    auto *dst_var = dst_scope->Var(dst_var_name);  // Var(), not FindVar()
+    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
+    callback(src_tensor, dst_tensor);
+  }
+  template <typename Callback>
+  static void AccessTensor(const framework::Scope &src_scope,
+                           const std::string &src_var_name,
+                           const framework::Scope &dst_scope,
+                           const std::string &dst_var_name, Callback callback,
+                           bool is_backward = false) {
+    auto *dst_var = dst_scope.FindVar(dst_var_name);  // checked non-null below
+    if (is_backward && dst_var == nullptr) {  // backward: skip missing dest
+      return;
+    }
+    auto *src_var = src_scope.FindVar(src_var_name);
+    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
+    auto &src_tensor = src_var->Get<framework::LoDTensor>();
+    PADDLE_ENFORCE(dst_var != nullptr, "%s is not found.", dst_var_name);
+    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
+    callback(src_tensor, dst_tensor);
+  }
+};
+class RecurrentOp : public RecurrentBase {  // shares RecurrentBase helpers
+ public:
+  RecurrentOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs);
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override;  // see .cc
+ private:  // NOTE(review): repeats the `private:` above; redundant
+  StepScopes CreateStepScopes(const platform::DeviceContext &dev_ctx,
+                              const framework::Scope &scope,
+                              size_t seq_len) const;  // declared only
+};
+class RecurrentGradOp : public RecurrentBase {  // shares RecurrentBase helpers
+ public:
+  RecurrentGradOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs);
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override;  // see .cc
+  StepScopes CreateStepScopes(const platform::DeviceContext &dev_ctx,
+                              const framework::Scope &scope,
+                              size_t seq_len) const;  // declared only
+  std::unordered_set<std::string> List2Set(
+      const std::vector<std::string> &list) const;  // vector in, set out
+  std::unordered_set<std::string> LocalVarNames(
+      const framework::Scope &scope) const;  // returns a set of strings
+  static std::vector<std::string> GradVarLists(
+      const std::vector<std::string> &var_names);  // static; names in/out
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -91,6 +91,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
    auto in_grad_var_name = Output(framework::GradVarName("X"));
    auto *in_grad_var = scope.FindVar(in_grad_var_name);
    PADDLE_ENFORCE(in_grad_var != nullptr,
                   "Cannot find in_grad_var in scope, name is %s",
                   in_grad_var_name);

--- a/paddle/fluid/string/string_helper.h
+++ b/paddle/fluid/string/string_helper.h
@@ -119,16 +119,18 @@ std::vector<T> split_string(const std::string& str) {
  return list;
 }
-template <class T>
+template <class Container>
-std::string join_strings(const std::vector<T>& strs, char delim) {
+std::string join_strings(const Container& strs, char delim) {
  std::string str;
-  for (size_t i = 0; i < strs.size(); i++) {
+  int i = 0;
+  for (auto& elem : strs) {
    if (i > 0) {
      str += delim;
    }
-    str += boost::lexical_cast<std::string>(strs[i]);
+    str += boost::lexical_cast<std::string>(elem);
+    ++i;
  }
  return str;

--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py