[Cherry-pick] Support memory eager deletion on recurrent OP (#19411)

* Support memory eager deletion on recurrent OP (#17710) Test PaddingRNN on V100 GPU device. Test configuration: large model, padding mode (which is the mode using recurrentOp), one GPU. GPU memory (MiB): 6414 (this PR) vs 6837 (without this PR) Speed (steps/s): 10.28 (this PR) vs 9.89 (without this PR) * Fix random test_recurrent_op failure (#18718) The change includes 3 things: 1. Set CPU_NUM to 1 in the tests because the ParallelExecutor prints a warning that CPU_NUM is not set and uses the default value 1. 2. The old tests compare two RNNs, a hand-written simple RNN and the same RNN built by Paddle, but initialized the RNN weights separately with numpy random and Paddle random. Fixed it by setting the weights and bias values. 3. Also set the numpy random seed in the tests. Now the diff between the two RNNs can be smaller (rtol from 0.1, 0.2 to 0.01) in the tests.

[Cherry-pick] Support memory eager deletion on recurrent OP (#19411)
* Support memory eager deletion on recurrent OP (#17710) Test PaddingRNN on V100 GPU device. Test configuration: large model, padding mode (which is the mode using recurrentOp), one GPU. GPU memory (MiB): 6414 (this PR) vs 6837 (without this PR) Speed (steps/s): 10.28 (this PR) vs 9.89 (without this PR) * Fix random test_recurrent_op failure (#18718) The change includes 3 things: 1. Set CPU_NUM to 1 in the tests because the ParallelExecutor prints a warning that CPU_NUM is not set and uses the default value 1. 2. The old tests compare two RNNs, a hand-written simple RNN and the same RNN built by Paddle, but initialized the RNN weights separately with numpy random and Paddle random. Fixed it by setting the weights and bias values. 3. Also set the numpy random seed in the tests. Now the diff between the two RNNs can be smaller (rtol from 0.1, 0.2 to 0.01) in the tests.
cb74dac3 · Huihuang Zheng · GitHub · a7a4b72b · cb74dac3 · cb74dac3
19 changed files
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -196,7 +196,7 @@ else()
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
-target_link_libraries(executor while_op_helper executor_gc_helper)
+target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/trainer_factory.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/platform/place.h"
@@ -410,6 +411,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
    if (gc && ctx->prog_.Size() > 1) {
      operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_,
                                                                 ctx->ops_);
+      operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+          ctx->block_id_, ctx->ops_);
    }
  }

--- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
 cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base)
 cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
+cc_library(recurrent_op_eager_deletion_pass SRCS recurrent_op_eager_deletion_pass.cc DEPS recurrent_op_helper graph_helper pass computation_op_handle)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle var_handle)
 cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
@@ -14,5 +15,6 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_
 cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
-cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass reference_count_pass_helper)
+cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle
+    eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper)
 cc_library(record_skip_memory_opt_vars_pass SRCS record_skip_memory_opt_vars_pass.cc DEPS graph graph_helper)
--- a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
@@ -266,6 +266,10 @@ void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const {
  auto while_op_eager_deletion_pass =
      ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
  while_op_eager_deletion_pass->Apply(graph);
+  auto recurrent_op_eager_deletion_pass =
+      ir::PassRegistry::Instance().Get("recurrent_op_eager_deletion_pass");
+  recurrent_op_eager_deletion_pass->Apply(graph);
 }
 }  // namespace ir
@@ -279,3 +283,4 @@ REGISTER_PASS(eager_deletion_pass, paddle::framework::ir::EagerDeletionPass)
    .RequirePassAttr(paddle::framework::ir::kGarbageCollector);
 USE_PASS(while_op_eager_deletion_pass);
+USE_PASS(recurrent_op_eager_deletion_pass);
--- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h"
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/string/string_helper.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+using paddle::operators::OpVariant;
+using paddle::operators::OpVariantSet;
+using paddle::operators::OpAndGradOpPair;
+void RecurrentOpEagerDeletionPass::ApplyImpl(Graph *graph) const {  // Pass entry point: prepares recurrent / recurrent_grad ops for safe eager deletion.
+  // Collect all recurrent_op and recurrent_grad_op nodes in the graph, grouped
+  // by device id. Note: the graph only holds the ops of block 0.
+  std::unordered_map<size_t, OpAndGradOpPair> target_ops =
+      DeviceIdToRecurrentAndRecurrentGradOp(*graph);
+  for (auto &entry : target_ops) {
+    // Prepare safe eager deletion separately for each device, because garbage
+    // collection may differ across devices.
+    OpAndGradOpPair &op_pair = entry.second;
+    PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(&op_pair);
+  }
+}
+// Groups the graph's recurrent ops (pair.first) and recurrent_grad ops
+// (pair.second) by device id and returns the resulting map.
+std::unordered_map<size_t, OpAndGradOpPair>
+RecurrentOpEagerDeletionPass::DeviceIdToRecurrentAndRecurrentGradOp(
+    const Graph &graph) const {
+  std::unordered_map<size_t, OpAndGradOpPair> ret;
+  std::vector<details::OpHandleBase *> all_ops =
+      FilterByNodeWrapper<details::OpHandleBase>(graph);
+  for (auto *op : all_ops) {
+    auto compute_op = dynamic_cast<details::ComputationOpHandle *>(op);
+    if (compute_op == nullptr) continue;  // not a ComputationOpHandle: ignore
+    if (compute_op->Name() == "recurrent") {
+      // Key by GetScopeIdx(), which returns the device/place id
+      ret[compute_op->GetScopeIdx()].first.emplace(compute_op->GetOp());
+    } else if (compute_op->Name() == "recurrent_grad") {
+      // Key by GetScopeIdx(), which returns the device/place id
+      ret[compute_op->GetScopeIdx()].second.emplace(compute_op->GetOp());
+    }
+  }
+  return ret;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(recurrent_op_eager_deletion_pass,
+              paddle::framework::ir::RecurrentOpEagerDeletionPass);
--- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <unordered_map>
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/operators/controlflow/op_variant.h"
+#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+// Pass that sets the skip-eager-deletion vars of recurrent and recurrent_grad ops
+class RecurrentOpEagerDeletionPass : public Pass {
+ protected:
+  void ApplyImpl(Graph *graph) const override;  // pass entry point
+ private:
+  // Returns a std::unordered_map from device id to the pair of (recurrent ops,
+  // recurrent_grad ops) found in `graph` for that device
+  std::unordered_map<size_t, paddle::operators::OpAndGradOpPair>
+  DeviceIdToRecurrentAndRecurrentGradOp(const Graph &graph) const;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
 include(operators)
 register_operators(DEPS naive_executor)
-cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) 
+cc_library(op_variant SRCS op_variant.cc DEPS operator proto_desc)
+cc_library(recurrent_op_helper SRCS recurrent_op_helper.cc DEPS operator op_variant recurrent_op)
+cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator op_variant) 
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
--- a/paddle/fluid/operators/controlflow/op_variant.cc
+++ b/paddle/fluid/operators/controlflow/op_variant.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/controlflow/op_variant.h"
+namespace paddle {
+namespace operators {
+struct InputsVisitor  // Variant visitor: yields the address of what the wrapped op's Inputs() returns.
+    : public boost::static_visitor<const framework::VariableNameMap *> {
+  template <typename OpType>  // one template serves every pointer type the variant can hold
+  const framework::VariableNameMap *operator()(const OpType *op) const {
+    return &(op->Inputs());
+  }
+};
+struct OutputsVisitor  // Variant visitor: yields the address of what the wrapped op's Outputs() returns.
+    : public boost::static_visitor<const framework::VariableNameMap *> {
+  template <typename OpType>  // one template serves every pointer type the variant can hold
+  const framework::VariableNameMap *operator()(const OpType *op) const {
+    return &(op->Outputs());
+  }
+};
+struct AttributeMapVisitor  // Variant visitor: yields the address of the wrapped op's attribute map.
+    : public boost::static_visitor<const framework::AttributeMap *> {
+  const framework::AttributeMap *operator()(const framework::OpDesc *op) const {  // OpDesc exposes the map via GetAttrMap()
+    return &(op->GetAttrMap());
+  }
+  const framework::AttributeMap *operator()(
+      const framework::OperatorBase *op) const {  // OperatorBase exposes the map via Attrs()
+    return &(op->Attrs());
+  }
+};
+struct RawPointerVisitor : public boost::static_visitor<const void *> {  // Variant visitor: yields the held op pointer as const void *.
+  template <typename OpType>
+  const void *operator()(const OpType *op) const {
+    return op;  // implicit conversion to const void *
+  }
+};
+const framework::VariableNameMap &OpVariant::Inputs() const {  // Input name map of the wrapped op, whichever pointer type is held.
+  return *boost::apply_visitor(InputsVisitor(), op_);
+}
+const framework::VariableNameMap &OpVariant::Outputs() const {  // Output name map of the wrapped op, whichever pointer type is held.
+  return *boost::apply_visitor(OutputsVisitor(), op_);
+}
+const framework::AttributeMap &OpVariant::Attrs() const {  // Attribute map of the wrapped op (GetAttrMap() for OpDesc, Attrs() for OperatorBase).
+  return *boost::apply_visitor(AttributeMapVisitor(), op_);
+}
+const void *OpVariant::RawPointer() const {  // The held op pointer as const void *; used by operator== and Hasher.
+  return boost::apply_visitor(RawPointerVisitor(), op_);
+}
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/op_variant.h
+++ b/paddle/fluid/operators/controlflow/op_variant.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/variant.h"
+namespace paddle {
+namespace operators {
+// OpVariant wraps either an OperatorBase pointer or an OpDesc pointer
+// so that both kinds of op can be queried through the same API.
+class OpVariant {
+ public:
+  OpVariant(const framework::OperatorBase *op) : op_(op) {}  // NOLINT
+  OpVariant(const framework::OpDesc *op) : op_(op) {}  // NOLINT
+  const framework::VariableNameMap &Inputs() const;
+  const framework::VariableNameMap &Outputs() const;
+  const framework::AttributeMap &Attrs() const;
+  const void *RawPointer() const;  // held pointer as const void *
+  template <typename AttrType>
+  const AttrType &Attr(const std::string &name) const {  // Returns attribute `name` as AttrType.
+    auto &attrs = Attrs();
+    auto it = attrs.find(name);
+    PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name);  // a missing attribute is an error
+    return boost::get<AttrType>(it->second);
+  }
+  bool operator==(const OpVariant &other) const {  // equal when both wrap the same pointer value
+    return RawPointer() == other.RawPointer();
+  }
+  int which() const { return static_cast<int>(op_.which()); }  // index of the pointer type currently held
+  struct Hasher {  // hash functor so OpVariant can be used in unordered containers
+    size_t operator()(const OpVariant &op) const {
+      return reinterpret_cast<size_t>(op.RawPointer());  // the pointer value itself is the hash
+    }
+  };
+ private:
+  const boost::variant<const framework::OperatorBase *,
+                       const framework::OpDesc *>
+      op_;
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
+#include <algorithm>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/recurrent_op.h"
+namespace paddle {
+namespace operators {
+static bool IsMatchedRecurrentOpAndRecurrentGradOp(const OpVariant &fwd_op,  // True when grad_op's inputs carry fwd_op's kInputs and kOutputs var names.
+                                                   const OpVariant &grad_op) {
+  return fwd_op.Inputs().at(RecurrentBase::kInputs) ==  // both ops list the same kInputs names
+             grad_op.Inputs().at(RecurrentBase::kInputs) &&
+         fwd_op.Outputs().at(RecurrentBase::kOutputs) ==  // forward kOutputs are found among the grad op's inputs
+             grad_op.Inputs().at(RecurrentBase::kOutputs);
+}
+// Returns whether the variable should be a skip var of the forward recurrent
+// op: a variable used in recurrent_grad is skippable when it is not the empty
+// var name and grad_block->HasVar(name) is false (it is not from grad_block).
+static bool IsSkippableVar(const std::string &name,
+                           framework::BlockDesc *grad_block) {
+  return name != framework::kEmptyVarName && !grad_block->HasVar(name);
+}
+static void ClearSkipVars(const OpVariant &op) {  // Empties the op's kSkipEagerDeletionVars attribute in place.
+  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());  // Attrs() is const; cast so the attribute can be edited in place
+  std::vector<std::string> &attr_skip_vars =
+      boost::get<std::vector<std::string>>(
+          attrs[RecurrentBase::kSkipEagerDeletionVars]);  // NOTE(review): operator[] inserts a default attribute if the key is absent - confirm the attribute always exists as a string vector
+  attr_skip_vars.clear();
+}
+// Appends the names in skip_vars to op's kSkipEagerDeletionVars attribute
+template <class Container>
+static void AddSkipVars(const OpVariant &op, const Container &skip_vars) {
+  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());  // Attrs() is const; cast so the attribute can be edited in place
+  VLOG(2) << "Prepare to add " << skip_vars.size()
+          << " skip var(s): " << paddle::string::join_strings(skip_vars, ' ');
+  std::vector<std::string> &attr_skip_vars =
+      boost::get<std::vector<std::string>>(
+          attrs[RecurrentBase::kSkipEagerDeletionVars]);
+  attr_skip_vars.insert(attr_skip_vars.end(), skip_vars.cbegin(),  // appended at the end; existing entries are kept
+                        skip_vars.cend());
+}
+// Finds all ops of type type_name and grad ops of type backward_type_name.
+// They may be located in different blocks, so every block of the program
+// except block 0 is traversed and the matches are added to op_and_grad_op.
+static void FindAllOpAndGradOp(OpAndGradOpPair *op_and_grad_op,
+                               const std::string &type_name,
+                               const std::string &backward_type_name) {
+  OpVariantSet &ops = op_and_grad_op->first;
+  OpVariantSet &grad_ops = op_and_grad_op->second;
+  PADDLE_ENFORCE_GE(ops.size(), grad_ops.size(),  // checked on the caller-supplied sets, before any early return
+                    "There are extra grad ops in the graph or program");
+  if (ops.empty()) return;  // no forward op to obtain the program from
+  const auto *program =  // the program is reached through the step block of the first op in the set
+      ops.begin()
+          ->Attr<framework::BlockDesc *>(RecurrentBase::kStepBlock)
+          ->Program();
+  for (size_t i = 1; i < program->Size(); ++i) {  // starts at 1: block 0 is not scanned here
+    auto &block = program->Block(i);
+    for (size_t j = 0; j < block.OpSize(); ++j) {
+      auto *op = block.Op(j);
+      if (op->Type() == type_name) {
+        ops.emplace(op);
+      } else if (op->Type() == backward_type_name) {
+        grad_ops.emplace(op);
+      }
+    }
+  }
+  PADDLE_ENFORCE_GE(ops.size(), grad_ops.size(),  // checked again after the sub-block ops were added
+                    "There are extra grad ops in the graph or program");
+}
+// Returns framework::GradVarName of each name in var_names, in the same order
+static std::vector<std::string> GradVarLists(
+    const std::vector<std::string> &var_names) {
+  std::vector<std::string> retv;
+  retv.reserve(var_names.size());  // one output name per input name
+  std::transform(var_names.begin(), var_names.end(), std::back_inserter(retv),
+                 framework::GradVarName);
+  return retv;
+}
+// Adds the op's kStates and kExStates vars as skip vars (only if kHasStates).
+static void AddOpMemVarsAsSkip(const OpVariant &op, bool set_grad_mem_vars) {
+  bool has_state = op.Attr<bool>(RecurrentBase::kHasStates);
+  if (has_state) {  // without states nothing is added
+    std::unordered_set<std::string> skip_vars;  // a set, so a name is collected once
+    auto &mem_vars = op.Attr<std::vector<std::string>>(RecurrentBase::kStates);
+    skip_vars.insert(mem_vars.begin(), mem_vars.end());
+    auto &pre_mem_vars =
+        op.Attr<std::vector<std::string>>(RecurrentBase::kExStates);
+    skip_vars.insert(pre_mem_vars.begin(), pre_mem_vars.end());
+    if (set_grad_mem_vars) {  // also skip the GradVarName of each state / ex-state var
+      auto mem_grad_vars = GradVarLists(mem_vars);
+      skip_vars.insert(mem_grad_vars.begin(), mem_grad_vars.end());
+      auto pre_mem_grad_vars = GradVarLists(pre_mem_vars);
+      skip_vars.insert(pre_mem_grad_vars.begin(), pre_mem_grad_vars.end());
+    }
+    AddSkipVars(op, skip_vars);
+  }
+}
+// Resets fwd_op's skip vars to its memory vars plus its kOutputs vars
+static void SetRecurrentForwardOpOnlySkipVarAttr(const OpVariant &fwd_op) {
+  ClearSkipVars(fwd_op);  // start from an empty skip list
+  AddOpMemVarsAsSkip(fwd_op, /* set_grad_mem_vars = */ false);
+  auto &output_vars = fwd_op.Outputs().at(RecurrentBase::kOutputs);
+  AddSkipVars(fwd_op, output_vars);
+}
+// Sets the skip vars of a matched recurrent op / recurrent_grad op pair
+static void SetRecurrentOpAndRecurrentGradOpSkipVarAttr(
+    const OpVariant &fwd_op, const OpVariant &bwd_op) {
+  // Forward recurrent_op: memory vars plus all skippable vars of the grad block
+  ClearSkipVars(fwd_op);
+  AddOpMemVarsAsSkip(fwd_op, /* set_grad_mem_vars = */ false);
+  auto *grad_block =
+      bwd_op.Attr<framework::BlockDesc *>(RecurrentBase::kStepBlock);  // step block of the grad op
+  std::unordered_set<std::string> fwd_skip_vars;
+  for (auto *op_desc : grad_block->AllOps()) {  // check both the inputs and the outputs of every op in the grad block
+    for (auto &in_arg_name : op_desc->InputArgumentNames()) {
+      if (IsSkippableVar(in_arg_name, grad_block)) {
+        fwd_skip_vars.insert(in_arg_name);
+      }
+    }
+    for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
+      if (IsSkippableVar(out_arg_name, grad_block)) {
+        fwd_skip_vars.insert(out_arg_name);
+      }
+    }
+  }
+  AddSkipVars(fwd_op, fwd_skip_vars);
+  // recurrent_grad_op: memory vars (with their gradients) plus the input and
+  // parameter gradients, i.e. the variables that are used across time steps
+  ClearSkipVars(bwd_op);
+  AddOpMemVarsAsSkip(bwd_op, /* set_grad_mem_vars = */ true);
+  std::unordered_set<std::string> bwd_skip_vars;
+  auto &fwd_input = fwd_op.Inputs().at(RecurrentBase::kInputs);
+  auto &in_grads =
+      bwd_op.Outputs().at(framework::GradVarName(RecurrentBase::kInputs));
+  PADDLE_ENFORCE_EQ(
+      fwd_input.size(), in_grads.size(),
+      "Backward input gradient number does not match forward input number.");
+  for (size_t i = 0; i < in_grads.size(); ++i) {
+    if (in_grads[i] == framework::kEmptyVarName) {  // this gradient slot holds the empty var name: nothing to keep
+      continue;
+    }
+    bwd_skip_vars.insert(in_grads[i]);
+    bwd_skip_vars.insert(framework::GradVarName(fwd_input[i]));  // also keep GradVarName of the forward input
+  }
+  auto &fwd_param = fwd_op.Inputs().at(RecurrentBase::kParameters);
+  auto &param_grads =
+      bwd_op.Outputs().at(framework::GradVarName(RecurrentBase::kParameters));
+  PADDLE_ENFORCE_EQ(fwd_param.size(), param_grads.size(),
+                    "Backward parameter gradient number does not match forward "
+                    "parameter number.");
+  for (size_t i = 0; i < fwd_param.size(); ++i) {
+    if (param_grads[i] == framework::kEmptyVarName) {  // this gradient slot holds the empty var name: nothing to keep
+      continue;
+    }
+    bwd_skip_vars.insert(param_grads[i]);
+    bwd_skip_vars.insert(framework::GradVarName(fwd_param[i]));  // also keep GradVarName of the forward parameter
+  }
+  AddSkipVars(bwd_op, bwd_skip_vars);
+}
+void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(  // Overload taking the ops of one block.
+    int block_id,
+    const std::vector<std::unique_ptr<paddle::framework::OperatorBase>>
+        &all_ops) {
+  // Do nothing unless block_id is 0.
+  // All recurrent_ops and recurrent_grad_ops in the whole program are
+  // processed when block_id is 0 (i.e. when Executor::Run() is called or
+  // ParallelExecutor is constructed).
+  // Moreover, they must all be processed at that point. Otherwise a
+  // recurrent_op may run first and erase variables
+  // that a recurrent_grad_op still uses,
+  // and at that moment the recurrent_grad_op may not even have been
+  // constructed yet.
+  if (block_id != 0) return;
+  OpAndGradOpPair op_pair;
+  for (auto &op : all_ops) {  // collect the recurrent / recurrent_grad ops of this block
+    if (op->Type() == "recurrent") {
+      op_pair.first.emplace(op.get());
+    } else if (op->Type() == "recurrent_grad") {
+      op_pair.second.emplace(op.get());
+    }
+  }
+  PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(&op_pair);  // the pair overload also scans the other blocks
+}
+void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(  // Overload taking the already-collected ops; sets the skip vars on each of them.
+    OpAndGradOpPair *op_pair) {
+  // Add the ops and grad ops found in the blocks other than block 0
+  FindAllOpAndGradOp(op_pair, "recurrent", "recurrent_grad");
+  OpVariantSet &recurrent_ops = op_pair->first;
+  OpVariantSet &recurrent_grad_ops = op_pair->second;
+  VLOG(2) << "Found recurrent op num: " << recurrent_ops.size()
+          << ", recurrent grad op num: " << recurrent_grad_ops.size();
+  if (recurrent_ops.empty()) {
+    return;
+  }
+  for (auto &bwd_op : recurrent_grad_ops) {  // pair every grad op with exactly one forward op
+    const OpVariant *matched_fwd_op = nullptr;
+    for (auto &fwd_op : recurrent_ops) {
+      if (IsMatchedRecurrentOpAndRecurrentGradOp(fwd_op, bwd_op)) {
+        PADDLE_ENFORCE(matched_fwd_op == nullptr,  // a second match for the same grad op is an error
+                       "Found multiple matched recurrent op");
+        matched_fwd_op = &fwd_op;
+      }
+    }
+    PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, "Cannot find matched forward op");
+    SetRecurrentOpAndRecurrentGradOpSkipVarAttr(*matched_fwd_op, bwd_op);
+    recurrent_ops.erase(*matched_fwd_op);  // matched forward ops are done; only unmatched ones remain in the set
+  }
+  for (auto &fwd_op : recurrent_ops) {  // the remaining forward ops have no grad op
+    SetRecurrentForwardOpOnlySkipVarAttr(fwd_op);
+  }
+}
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.h
+++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/controlflow/op_variant.h"
+#include "paddle/fluid/operators/recurrent_op.h"
+#include "paddle/fluid/platform/variant.h"
+#include "paddle/fluid/string/string_helper.h"
+namespace paddle {
+namespace operators {
+using OpVariantSet = std::unordered_set<OpVariant, OpVariant::Hasher>;  // set of wrapped ops, hashed with OpVariant::Hasher
+using OpAndGradOpPair = std::pair<OpVariantSet, OpVariantSet>;  // first: forward ops, second: grad ops
+// Sets the vars to skip during eager deletion on the given recurrent and
+// recurrent_grad ops, to prepare safe eager deletion. op_pair holds all
+// recurrent and recurrent_grad ops of block 0; the function also finds the
+// recurrent and recurrent_grad ops in the other blocks and adds them to it.
+void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+    OpAndGradOpPair *op_pair);
+// Sets the vars to skip during eager deletion on recurrent and recurrent_grad
+// ops, to prepare safe eager deletion. Does nothing unless block_id is 0; the
+// caller can pass all ops of the block. The function also finds the recurrent
+// and recurrent_grad ops in the other blocks.
+void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+    int block_id,
+    const std::vector<std::unique_ptr<paddle::framework::OperatorBase>>
+        &all_ops);
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -13,109 +13,18 @@
 // limitations under the License.
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include <string>
 #include <unordered_set>
 #include <utility>
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/controlflow/op_variant.h"
+#include "paddle/fluid/string/string_helper.h"
 namespace paddle {
 namespace operators {
-// OpVariant is a wrapper class of OpDesc and OperatorBase
-// So that API would be the same.
-class OpVariant {
-  struct InputsVisitor
-      : public boost::static_visitor<const framework::VariableNameMap *> {
-    template <typename OpType>
-    const framework::VariableNameMap *operator()(const OpType *op) const {
-      return &(op->Inputs());
-    }
-  };
-  struct OutputsVisitor
-      : public boost::static_visitor<const framework::VariableNameMap *> {
-    template <typename OpType>
-    const framework::VariableNameMap *operator()(const OpType *op) const {
-      return &(op->Outputs());
-    }
-  };
-  struct AttributeMapVisitor
-      : public boost::static_visitor<const framework::AttributeMap *> {
-    const framework::AttributeMap *operator()(
-        const framework::OpDesc *op) const {
-      return &(op->GetAttrMap());
-    }
-    const framework::AttributeMap *operator()(
-        const framework::OperatorBase *op) const {
-      return &(op->Attrs());
-    }
-  };
-  struct RawPointerVisitor : public boost::static_visitor<const void *> {
-    template <typename OpType>
-    const void *operator()(const OpType *op) const {
-      return op;
-    }
-  };
- public:
-  OpVariant(const framework::OperatorBase *op) : op_(op) {}  // NOLINT
-  OpVariant(const framework::OpDesc *op) : op_(op) {}  // NOLINT
-  const framework::VariableNameMap &Inputs() const {
-    return *boost::apply_visitor(InputsVisitor(), op_);
-  }
-  const framework::VariableNameMap &Outputs() const {
-    return *boost::apply_visitor(OutputsVisitor(), op_);
-  }
-  const framework::AttributeMap &Attrs() const {
-    return *boost::apply_visitor(AttributeMapVisitor(), op_);
-  }
-  template <typename AttrType>
-  const AttrType &Attr(const std::string &name) const {
-    auto &attrs = Attrs();
-    auto it = attrs.find(name);
-    PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name);
-    return boost::get<AttrType>(it->second);
-  }
-  bool operator==(const OpVariant &other) const {
-    return RawPointer() == other.RawPointer();
-  }
-  const void *RawPointer() const {
-    return boost::apply_visitor(RawPointerVisitor(), op_);
-  }
-  int which() const { return static_cast<int>(op_.which()); }
-  struct Hasher {
-    size_t operator()(const OpVariant &op) const {
-      return reinterpret_cast<size_t>(op.RawPointer());
-    }
-  };
- private:
-  const boost::variant<const framework::OperatorBase *,
-                       const framework::OpDesc *>
-      op_;
-};
-static std::string GetDebugString(const std::vector<std::string> &names) {
-  if (names.empty()) return "";
-  std::string ret = names[0];
-  for (size_t i = 1; i < names.size(); ++i) {
-    ret += (" " + names[i]);
-  }
-  return ret;
-}
 // Set skip variables of while_op and while_grad_op
 // These variables should be skipped when eager deletion enables.
 // It is because:
@@ -124,7 +33,7 @@ static std::string GetDebugString(const std::vector<std::string> &names) {
 static void SetSkipVars(const OpVariant &op, std::vector<std::string> attr) {
  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
  VLOG(2) << "Prepare to skip " << attr.size()
-          << " var(s): " << GetDebugString(attr);
+          << " var(s): " << string::join_strings(attr, ' ');
  attrs[kSkipEagerDeletionVars] = std::move(attr);
 }

--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
--- a/paddle/fluid/operators/recurrent_op.h
+++ b/paddle/fluid/operators/recurrent_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+// StepScopes manages the scopes used inside an RNN.
+//    StepScopes::CurScope() returns the current scope.
+//    StepScopes::ExScope() returns the ex-scope, i.e. the scope of the
+//    previous time step.
+//    StepScopes::Next() moves to the next time step.
+//
+// If is_train == false:
+//   there are only two scopes for the RNN, and only the forward pass is
+//   supported.
+// Otherwise:
+//   len(scopes) == seq_len.
+//
+// If is_backward == true:
+//   the scopes are accessed in reverse order.
+// Otherwise:
+//   the scopes are accessed from beginning to end.
+class StepScopes {
+ public:
+  StepScopes(const platform::DeviceContext &dev_ctx,
+             const framework::Scope &parent,
+             std::vector<framework::Scope *> *scopes, bool is_train,
+             size_t seq_len, bool is_backward = false);
+  framework::Scope &CurScope();
+  framework::Scope &ExScope();
+  void Next();
+ private:
+  framework::Scope &GetScope(size_t scope_id) const;
+  size_t counter_;
+  std::vector<framework::Scope *> *scopes_;
+  bool is_train_;
+  bool is_backward_;
+};
+// Base class for RecurrentOp and RecurrentGradOp.
+//    It holds the common protected functions shared by both operators.
+class RecurrentBase : public framework::OperatorBase {
+ public:
+  static const char kInputs[];
+  static const char kInitialStates[];
+  static const char kParameters[];
+  static const char kOutputs[];
+  static const char kStepScopes[];
+  static const char kHasStates[];
+  static const char kExStates[];
+  static const char kStates[];
+  static const char kStepBlock[];
+  static const char kReverse[];
+  static const char kIsTrain[];
+  static const char kSkipEagerDeletionVars[];
+  static const char kInputGrads[];
+  static const char kOutputGrads[];
+  static const char kParamGrads[];
+  static const char kInitStateGrads[];
+  RecurrentBase(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs);
+ protected:
+  // Get the sequence length from the scope.
+  //   The sequence length is read from the input tensor, whose dimensions
+  //   should be [SEQ_LEN, ..., ...]. The first dimension of the tensor's
+  //   shape is SEQ_LEN. The second dimension could be the batch size or a
+  //   nested sequence length.
+  int64_t GetSequenceLength(const framework::Scope &scope) const;
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.Var, dst_vars)):
+  //   dst_tensor.ShareDataWith(src_tensor)
+  static void LinkTensor(const framework::Scope &src_scope,
+                         const std::vector<std::string> &src_vars,
+                         framework::Scope *dst_scope,
+                         const std::vector<std::string> &dst_vars);
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.Var, dst_vars)):
+  //   callback(src_tensor, &dst_tensor)
+  template <typename Callback>
+  static void LinkTensorWithCallback(const framework::Scope &src_scope,
+                                     const std::vector<std::string> &src_vars,
+                                     framework::Scope *dst_scope,
+                                     const std::vector<std::string> &dst_vars,
+                                     Callback callback,
+                                     bool is_backward = false) {
+    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
+    for (size_t i = 0; i < dst_vars.size(); ++i) {
+      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
+                   is_backward);
+    }
+  }
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.FindVar, dst_vars)):
+  //   callback(src_tensor, &dst_tensor)
+  template <typename Callback>
+  static void LinkTensorWithCallback(const framework::Scope &src_scope,
+                                     const std::vector<std::string> &src_vars,
+                                     const framework::Scope &dst_scope,
+                                     const std::vector<std::string> &dst_vars,
+                                     Callback callback,
+                                     bool is_backward = false) {
+    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
+    for (size_t i = 0; i < dst_vars.size(); ++i) {
+      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
+                   is_backward);
+    }
+  }
+  // (seq_len, shape) -> return [seq_len] + list(shape)
+  static framework::DDim PrependDims(size_t seq_len,
+                                     const framework::DDim &src);
+ private:
+  template <typename Callback>
+  static void AccessTensor(const framework::Scope &src_scope,
+                           const std::string &src_var_name,
+                           framework::Scope *dst_scope,
+                           const std::string &dst_var_name, Callback callback,
+                           bool is_backward = false) {
+    auto *src_var = src_scope.FindVar(src_var_name);
+    if (is_backward && src_var == nullptr) {
+      return;
+    }
+    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
+    auto &src_tensor = src_var->Get<framework::LoDTensor>();
+    auto *dst_var = dst_scope->Var(dst_var_name);
+    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
+    callback(src_tensor, dst_tensor);
+  }
+  template <typename Callback>
+  static void AccessTensor(const framework::Scope &src_scope,
+                           const std::string &src_var_name,
+                           const framework::Scope &dst_scope,
+                           const std::string &dst_var_name, Callback callback,
+                           bool is_backward = false) {
+    auto *dst_var = dst_scope.FindVar(dst_var_name);
+    if (is_backward && dst_var == nullptr) {
+      return;
+    }
+    auto *src_var = src_scope.FindVar(src_var_name);
+    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
+    auto &src_tensor = src_var->Get<framework::LoDTensor>();
+    PADDLE_ENFORCE(dst_var != nullptr, "%s is not found.", dst_var_name);
+    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
+    callback(src_tensor, dst_tensor);
+  }
+};
+class RecurrentOp : public RecurrentBase {
+ public:
+  RecurrentOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs);
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override;
+ private:
+  StepScopes CreateStepScopes(const platform::DeviceContext &dev_ctx,
+                              const framework::Scope &scope,
+                              size_t seq_len) const;
+};
+class RecurrentGradOp : public RecurrentBase {
+ public:
+  RecurrentGradOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs);
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override;
+  StepScopes CreateStepScopes(const platform::DeviceContext &dev_ctx,
+                              const framework::Scope &scope,
+                              size_t seq_len) const;
+  std::unordered_set<std::string> List2Set(
+      const std::vector<std::string> &list) const;
+  std::unordered_set<std::string> LocalVarNames(
+      const framework::Scope &scope) const;
+  static std::vector<std::string> GradVarLists(
+      const std::vector<std::string> &var_names);
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -91,6 +91,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
    auto in_grad_var_name = Output(framework::GradVarName("X"));
    auto *in_grad_var = scope.FindVar(in_grad_var_name);
    PADDLE_ENFORCE(in_grad_var != nullptr,
                   "Cannot find in_grad_var in scope, name is %s",
                   in_grad_var_name);

--- a/paddle/fluid/string/string_helper.h
+++ b/paddle/fluid/string/string_helper.h
@@ -119,16 +119,18 @@ std::vector<T> split_string(const std::string& str) {
  return list;
 }
-template <class T>
+template <class Container>
-std::string join_strings(const std::vector<T>& strs, char delim) {
+std::string join_strings(const Container& strs, char delim) {
  std::string str;
-  for (size_t i = 0; i < strs.size(); i++) {
+  int i = 0;
+  for (auto& elem : strs) {
    if (i > 0) {
      str += delim;
    }
-    str += boost::lexical_cast<std::string>(strs[i]);
+    str += boost::lexical_cast<std::string>(elem);
+    ++i;
  }
  return str;

--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
--- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
@@ -17,11 +17,13 @@ from __future__ import print_function
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid import ParamAttr
 from paddle.fluid.framework import Program, grad_var_name
 from paddle.fluid.executor import Executor
 from paddle.fluid.backward import append_backward
-import numpy as np
-import paddle.fluid.core as core
 class PyRNNBase(object):
@@ -67,8 +69,8 @@ class PySimpleRNN2(PyRNNBase):
        super(PySimpleRNN2, self).__init__(input_shape, output_shape)
        seq_len, batch_size, input_dim = input_shape
-        self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.W = np.ones(shape=(input_dim, input_dim)).astype("float32")
-        self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.U = np.zeros(shape=(input_dim, input_dim)).astype("float32")
        self.h_boot = np.ones(shape=(batch_size, input_dim)).astype("float32")
        men_dim = (seq_len, batch_size, input_dim)
@@ -182,7 +184,7 @@ class RecurrentOpTest1(unittest.TestCase):
                       fetch_list=fetch_list,
                       return_numpy=False)
-    def test_backward(self, rtol=0.1):
+    def test_backward(self, rtol=0.01):
        self.check_forward()
        with fluid.program_guard(self.main_program, self.startup_program):
@@ -204,7 +206,7 @@ class RecurrentOpTest1(unittest.TestCase):
        pd_output = self.forward()
        py_output = self.py_rnn.forward()
        self.assertEqual(pd_output.shape, py_output.shape)
-        self.assertTrue(np.isclose(pd_output, py_output, rtol=0.1).all())
+        self.assertTrue(np.isclose(pd_output, py_output, rtol=0.01).all())
    def get_numerical_gradient(self, delta=0.005):
        dloss_dout = 1.0
@@ -274,13 +276,19 @@ class RecurrentOpTest2(RecurrentOpTest1):
            h_pre = rnn.memory(init=h_boot)
            x_t = rnn.step_input(x)
-            temp_l = layers.fc(input=x_t,
+            temp_l = layers.fc(
+                input=x_t,
                size=self.input_dim,
-                               param_attr='W',
+                param_attr=ParamAttr(
+                    name='W',
+                    initializer=fluid.initializer.ConstantInitializer(1.0)),
                bias_attr=False)
-            temp_r = layers.fc(input=h_pre,
+            temp_r = layers.fc(
+                input=h_pre,
                size=self.input_dim,
-                               param_attr='U',
+                param_attr=ParamAttr(
+                    name='U',
+                    initializer=fluid.initializer.ConstantInitializer(0.0)),
                bias_attr=False)
            h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r))
@@ -291,7 +299,7 @@ class RecurrentOpTest2(RecurrentOpTest1):
        return rnn()
    def test_backward(self):
-        super(RecurrentOpTest2, self).test_backward(rtol=0.2)
+        super(RecurrentOpTest2, self).test_backward(rtol=0.01)
 class RecurrentOpMultipleMemoryTest(RecurrentOpTest1):