[Cherry-pick] Support memory eager deletion on recurrent OP (#19411)

* Support memory eager deletion on recurrent OP (#17710) Test PaddingRNN on V100 GPU device. Test configuration: large model, padding mode (which is the mode using recurrentOp), one GPU. GPU memory (MiB): 6414 (this PR) vs 6837 (without this PR) Speed (steps/s): 10.28 (this PR) vs 9.89 (without this PR) * Fix random test_recurrent_op failure (#18718) The change includes 3 things: 1. Set CPU_NUM to 1 in the tests because the ParallelExecutor will print warning that CPU_NUM is not set and use default 1. 2. Old tests compare two RNNs, hand written simple RNN and same RNN built by Paddle, but initialized RNN weights in numpy random and Paddle random separately. Fixed it by setting weights and bias values. 3. Also set numpy random seed in the tests. Now the two RNNs diff can be smaller (rtol from 0.1, 0.2 to. 0.01) in the tests.

[Cherry-pick] Support memory eager deletion on recurrent OP (#19411)
* Support memory eager deletion on recurrent OP (#17710) Test PaddingRNN on V100 GPU device. Test configuration: large model, padding mode (which is the mode using recurrentOp), one GPU. GPU memory (MiB): 6414 (this PR) vs 6837 (without this PR) Speed (steps/s): 10.28 (this PR) vs 9.89 (without this PR) * Fix random test_recurrent_op failure (#18718) The change includes 3 things: 1. Set CPU_NUM to 1 in the tests because the ParallelExecutor will print warning that CPU_NUM is not set and use default 1. 2. Old tests compare two RNNs, hand written simple RNN and same RNN built by Paddle, but initialized RNN weights in numpy random and Paddle random separately. Fixed it by setting weights and bias values. 3. Also set numpy random seed in the tests. Now the two RNNs diff can be smaller (rtol from 0.1, 0.2 to. 0.01) in the tests.
cb74dac3 · Huihuang Zheng · GitHub · a7a4b72b · cb74dac3 · cb74dac3
19 changed file
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -196,7 +196,7 @@ else()
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()

-target_link_libraries(executor while_op_helper executor_gc_helper)
+target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper)

 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/trainer_factory.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/platform/place.h"
@@ -410,6 +411,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
    if (gc && ctx->prog_.Size() > 1) {
      operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_,
                                                                 ctx->ops_);
+      operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+          ctx->block_id_, ctx->ops_);
    }
  }


--- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
 cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base)
 cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
+cc_library(recurrent_op_eager_deletion_pass SRCS recurrent_op_eager_deletion_pass.cc DEPS recurrent_op_helper graph_helper pass computation_op_handle)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle var_handle)
 cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)

@@ -14,5 +15,6 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_

 cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)

-cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass reference_count_pass_helper)
+cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle
+    eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper)
 cc_library(record_skip_memory_opt_vars_pass SRCS record_skip_memory_opt_vars_pass.cc DEPS graph graph_helper)
--- a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
@@ -266,6 +266,10 @@ void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const {
  auto while_op_eager_deletion_pass =
      ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
  while_op_eager_deletion_pass->Apply(graph);
+
+  auto recurrent_op_eager_deletion_pass =
+      ir::PassRegistry::Instance().Get("recurrent_op_eager_deletion_pass");
+  recurrent_op_eager_deletion_pass->Apply(graph);
 }

 }  // namespace ir
@@ -279,3 +283,4 @@ REGISTER_PASS(eager_deletion_pass, paddle::framework::ir::EagerDeletionPass)
    .RequirePassAttr(paddle::framework::ir::kGarbageCollector);

 USE_PASS(while_op_eager_deletion_pass);
+USE_PASS(recurrent_op_eager_deletion_pass);
--- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h"
+
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/string/string_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+using paddle::operators::OpVariant;
+using paddle::operators::OpVariantSet;
+using paddle::operators::OpAndGradOpPair;
+
+void RecurrentOpEagerDeletionPass::ApplyImpl(Graph *graph) const {
+  // Find all recurrent_op and recurrent_grad_op in graph
+  // Note the graph only contains ops and block 0
+  std::unordered_map<size_t, OpAndGradOpPair> target_ops =
+      DeviceIdToRecurrentAndRecurrentGradOp(*graph);
+
+  for (auto &entry : target_ops) {
+    // Prepare safe eager deletion on different devices because the garbage
+    // collection may be different across devices
+    OpAndGradOpPair &op_pair = entry.second;
+    PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(&op_pair);
+  }
+}
+
+// Returns a std::unordered_map mapping from the device id to recurrent op and
+// grad op pair
+std::unordered_map<size_t, OpAndGradOpPair>
+RecurrentOpEagerDeletionPass::DeviceIdToRecurrentAndRecurrentGradOp(
+    const Graph &graph) const {
+  std::unordered_map<size_t, OpAndGradOpPair> ret;
+  std::vector<details::OpHandleBase *> all_ops =
+      FilterByNodeWrapper<details::OpHandleBase>(graph);
+
+  for (auto *op : all_ops) {
+    auto compute_op = dynamic_cast<details::ComputationOpHandle *>(op);
+    if (compute_op == nullptr) continue;
+
+    if (compute_op->Name() == "recurrent") {
+      // GetScopeIdx() returns device/place id
+      ret[compute_op->GetScopeIdx()].first.emplace(compute_op->GetOp());
+    } else if (compute_op->Name() == "recurrent_grad") {
+      // GetScopeIdx() returns device/place id
+      ret[compute_op->GetScopeIdx()].second.emplace(compute_op->GetOp());
+    }
+  }
+  return ret;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(recurrent_op_eager_deletion_pass,
+              paddle::framework::ir::RecurrentOpEagerDeletionPass);
--- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/operators/controlflow/op_variant.h"
+#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Pass class set skip eager deletion vars for recurrent ops
+class RecurrentOpEagerDeletionPass : public Pass {
+ protected:
+  void ApplyImpl(Graph *graph) const override;
+
+ private:
+  // Returns a std::unordered_map mapping from the device id to recurrent op and
+  // grad op pair
+  std::unordered_map<size_t, paddle::operators::OpAndGradOpPair>
+  DeviceIdToRecurrentAndRecurrentGradOp(const Graph &graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
 include(operators)
 register_operators(DEPS naive_executor)
-cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator) 
+cc_library(op_variant SRCS op_variant.cc DEPS operator proto_desc)
+cc_library(recurrent_op_helper SRCS recurrent_op_helper.cc DEPS operator op_variant recurrent_op)
+cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator op_variant) 

 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
--- a/paddle/fluid/operators/controlflow/op_variant.cc
+++ b/paddle/fluid/operators/controlflow/op_variant.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/controlflow/op_variant.h"
+
+namespace paddle {
+namespace operators {
+
+struct InputsVisitor
+    : public boost::static_visitor<const framework::VariableNameMap *> {
+  template <typename OpType>
+  const framework::VariableNameMap *operator()(const OpType *op) const {
+    return &(op->Inputs());
+  }
+};
+
+struct OutputsVisitor
+    : public boost::static_visitor<const framework::VariableNameMap *> {
+  template <typename OpType>
+  const framework::VariableNameMap *operator()(const OpType *op) const {
+    return &(op->Outputs());
+  }
+};
+
+struct AttributeMapVisitor
+    : public boost::static_visitor<const framework::AttributeMap *> {
+  const framework::AttributeMap *operator()(const framework::OpDesc *op) const {
+    return &(op->GetAttrMap());
+  }
+
+  const framework::AttributeMap *operator()(
+      const framework::OperatorBase *op) const {
+    return &(op->Attrs());
+  }
+};
+
+struct RawPointerVisitor : public boost::static_visitor<const void *> {
+  template <typename OpType>
+  const void *operator()(const OpType *op) const {
+    return op;
+  }
+};
+
+const framework::VariableNameMap &OpVariant::Inputs() const {
+  return *boost::apply_visitor(InputsVisitor(), op_);
+}
+
+const framework::VariableNameMap &OpVariant::Outputs() const {
+  return *boost::apply_visitor(OutputsVisitor(), op_);
+}
+
+const framework::AttributeMap &OpVariant::Attrs() const {
+  return *boost::apply_visitor(AttributeMapVisitor(), op_);
+}
+
+const void *OpVariant::RawPointer() const {
+  return boost::apply_visitor(RawPointerVisitor(), op_);
+}
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/op_variant.h
+++ b/paddle/fluid/operators/controlflow/op_variant.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace operators {
+
+// OpVariant is a wrapper class of OpDesc and OperatorBase pointer
+// So that API would be the same.
+class OpVariant {
+ public:
+  OpVariant(const framework::OperatorBase *op) : op_(op) {}  // NOLINT
+
+  OpVariant(const framework::OpDesc *op) : op_(op) {}  // NOLINT
+
+  const framework::VariableNameMap &Inputs() const;
+
+  const framework::VariableNameMap &Outputs() const;
+
+  const framework::AttributeMap &Attrs() const;
+
+  const void *RawPointer() const;
+
+  template <typename AttrType>
+  const AttrType &Attr(const std::string &name) const {
+    auto &attrs = Attrs();
+    auto it = attrs.find(name);
+    PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name);
+    return boost::get<AttrType>(it->second);
+  }
+
+  bool operator==(const OpVariant &other) const {
+    return RawPointer() == other.RawPointer();
+  }
+
+  int which() const { return static_cast<int>(op_.which()); }
+
+  struct Hasher {
+    size_t operator()(const OpVariant &op) const {
+      return reinterpret_cast<size_t>(op.RawPointer());
+    }
+  };
+
+ private:
+  const boost::variant<const framework::OperatorBase *,
+                       const framework::OpDesc *>
+      op_;
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
+
+#include <algorithm>
+#include <string>
+#include <unordered_set>
+#include <utility>
+
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/recurrent_op.h"
+
+namespace paddle {
+namespace operators {
+
+static bool IsMatchedRecurrentOpAndRecurrentGradOp(const OpVariant &fwd_op,
+                                                   const OpVariant &grad_op) {
+  return fwd_op.Inputs().at(RecurrentBase::kInputs) ==
+             grad_op.Inputs().at(RecurrentBase::kInputs) &&
+         fwd_op.Outputs().at(RecurrentBase::kOutputs) ==
+             grad_op.Inputs().at(RecurrentBase::kOutputs);
+}
+
+// Returns whether the variable is skippable in forward recurrent op
+// The variable is skippable in recurrent_op when the variable used in
+// recurrent_grad is not from grad_block.
+static bool IsSkippableVar(const std::string &name,
+                           framework::BlockDesc *grad_block) {
+  return name != framework::kEmptyVarName && !grad_block->HasVar(name);
+}
+
+static void ClearSkipVars(const OpVariant &op) {
+  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
+  std::vector<std::string> &attr_skip_vars =
+      boost::get<std::vector<std::string>>(
+          attrs[RecurrentBase::kSkipEagerDeletionVars]);
+  attr_skip_vars.clear();
+}
+
+// Add skip vars into op's attribute
+template <class Container>
+static void AddSkipVars(const OpVariant &op, const Container &skip_vars) {
+  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
+  VLOG(2) << "Prepare to add " << skip_vars.size()
+          << " skip var(s): " << paddle::string::join_strings(skip_vars, ' ');
+  std::vector<std::string> &attr_skip_vars =
+      boost::get<std::vector<std::string>>(
+          attrs[RecurrentBase::kSkipEagerDeletionVars]);
+  attr_skip_vars.insert(attr_skip_vars.end(), skip_vars.cbegin(),
+                        skip_vars.cend());
+}
+
+// Find all ops and grad ops with given type name. The ops and grad ops
+// may locate in different blocks so we should traverse all blocks in the
+// program and find them out
+static void FindAllOpAndGradOp(OpAndGradOpPair *op_and_grad_op,
+                               const std::string &type_name,
+                               const std::string &backward_type_name) {
+  OpVariantSet &ops = op_and_grad_op->first;
+  OpVariantSet &grad_ops = op_and_grad_op->second;
+
+  PADDLE_ENFORCE_GE(ops.size(), grad_ops.size(),
+                    "There are extra grad ops in the graph or program");
+
+  if (ops.empty()) return;
+
+  const auto *program =
+      ops.begin()
+          ->Attr<framework::BlockDesc *>(RecurrentBase::kStepBlock)
+          ->Program();
+  for (size_t i = 1; i < program->Size(); ++i) {
+    auto &block = program->Block(i);
+    for (size_t j = 0; j < block.OpSize(); ++j) {
+      auto *op = block.Op(j);
+      if (op->Type() == type_name) {
+        ops.emplace(op);
+      } else if (op->Type() == backward_type_name) {
+        grad_ops.emplace(op);
+      }
+    }
+  }
+
+  PADDLE_ENFORCE_GE(ops.size(), grad_ops.size(),
+                    "There are extra grad ops in the graph or program");
+}
+
+// Returns GradVarName of input var names
+static std::vector<std::string> GradVarLists(
+    const std::vector<std::string> &var_names) {
+  std::vector<std::string> retv;
+  retv.reserve(var_names.size());
+  std::transform(var_names.begin(), var_names.end(), std::back_inserter(retv),
+                 framework::GradVarName);
+  return retv;
+}
+
+// Add memory vars in recurrent op as skip vars.
+static void AddOpMemVarsAsSkip(const OpVariant &op, bool set_grad_mem_vars) {
+  bool has_state = op.Attr<bool>(RecurrentBase::kHasStates);
+  if (has_state) {
+    std::unordered_set<std::string> skip_vars;
+
+    auto &mem_vars = op.Attr<std::vector<std::string>>(RecurrentBase::kStates);
+    skip_vars.insert(mem_vars.begin(), mem_vars.end());
+
+    auto &pre_mem_vars =
+        op.Attr<std::vector<std::string>>(RecurrentBase::kExStates);
+    skip_vars.insert(pre_mem_vars.begin(), pre_mem_vars.end());
+
+    if (set_grad_mem_vars) {
+      auto mem_grad_vars = GradVarLists(mem_vars);
+      skip_vars.insert(mem_grad_vars.begin(), mem_grad_vars.end());
+      auto pre_mem_grad_vars = GradVarLists(pre_mem_vars);
+      skip_vars.insert(pre_mem_grad_vars.begin(), pre_mem_grad_vars.end());
+    }
+    AddSkipVars(op, skip_vars);
+  }
+}
+
+// Set outputs and memory vars of the input forward op as skip vars
+static void SetRecurrentForwardOpOnlySkipVarAttr(const OpVariant &fwd_op) {
+  ClearSkipVars(fwd_op);
+
+  AddOpMemVarsAsSkip(fwd_op, /* set_grad_mem_vars = */ false);
+  auto &output_vars = fwd_op.Outputs().at(RecurrentBase::kOutputs);
+  AddSkipVars(fwd_op, output_vars);
+}
+
+// Set skip vars of matched recurrent op and recurrent_grad op
+static void SetRecurrentOpAndRecurrentGradOpSkipVarAttr(
+    const OpVariant &fwd_op, const OpVariant &bwd_op) {
+  // Find all skippable variables in forward recurrent_op
+  ClearSkipVars(fwd_op);
+  AddOpMemVarsAsSkip(fwd_op, /* set_grad_mem_vars = */ false);
+
+  auto *grad_block =
+      bwd_op.Attr<framework::BlockDesc *>(RecurrentBase::kStepBlock);
+  std::unordered_set<std::string> fwd_skip_vars;
+  for (auto *op_desc : grad_block->AllOps()) {
+    for (auto &in_arg_name : op_desc->InputArgumentNames()) {
+      if (IsSkippableVar(in_arg_name, grad_block)) {
+        fwd_skip_vars.insert(in_arg_name);
+      }
+    }
+    for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
+      if (IsSkippableVar(out_arg_name, grad_block)) {
+        fwd_skip_vars.insert(out_arg_name);
+      }
+    }
+  }
+  AddSkipVars(fwd_op, fwd_skip_vars);
+
+  // Find all skippable variables in recurrent_grad_op
+  // The skippable variables are those which would be used across time steps
+  ClearSkipVars(bwd_op);
+  AddOpMemVarsAsSkip(bwd_op, /* set_grad_mem_vars = */ true);
+  std::unordered_set<std::string> bwd_skip_vars;
+
+  auto &fwd_input = fwd_op.Inputs().at(RecurrentBase::kInputs);
+  auto &in_grads =
+      bwd_op.Outputs().at(framework::GradVarName(RecurrentBase::kInputs));
+
+  PADDLE_ENFORCE_EQ(
+      fwd_input.size(), in_grads.size(),
+      "Backward input gradient number does not match forward input number.");
+  for (size_t i = 0; i < in_grads.size(); ++i) {
+    if (in_grads[i] == framework::kEmptyVarName) {
+      continue;
+    }
+    bwd_skip_vars.insert(in_grads[i]);
+    bwd_skip_vars.insert(framework::GradVarName(fwd_input[i]));
+  }
+
+  auto &fwd_param = fwd_op.Inputs().at(RecurrentBase::kParameters);
+  auto &param_grads =
+      bwd_op.Outputs().at(framework::GradVarName(RecurrentBase::kParameters));
+  PADDLE_ENFORCE_EQ(fwd_param.size(), param_grads.size(),
+                    "Backward parameter gradient number does not match forward "
+                    "parameter number.");
+  for (size_t i = 0; i < fwd_param.size(); ++i) {
+    if (param_grads[i] == framework::kEmptyVarName) {
+      continue;
+    }
+    bwd_skip_vars.insert(param_grads[i]);
+    bwd_skip_vars.insert(framework::GradVarName(fwd_param[i]));
+  }
+
+  AddSkipVars(bwd_op, bwd_skip_vars);
+}
+
+void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+    int block_id,
+    const std::vector<std::unique_ptr<paddle::framework::OperatorBase>>
+        &all_ops) {
+  // If block_id is not 0, returns
+  // This is because all recurrent_ops and recurrent_grad_ops in the whole
+  // program would be processed when block_id is 0 (i.e. when Executor::Run()
+  // or ParallelExecutor constructs).
+
+  // What's more, all recurrent_ops and recurrent_grad_ops must be processed
+  // when block_id is zero. If not, recurrent_op may run first and erase
+  // variables
+  // used in recurrent_grad_op, and in this moment, recurrent_grad_ops may be
+  // not constructed yet.
+  if (block_id != 0) return;
+
+  OpAndGradOpPair op_pair;
+  for (auto &op : all_ops) {
+    if (op->Type() == "recurrent") {
+      op_pair.first.emplace(op.get());
+    } else if (op->Type() == "recurrent_grad") {
+      op_pair.second.emplace(op.get());
+    }
+  }
+  PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(&op_pair);
+}
+
+void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+    OpAndGradOpPair *op_pair) {
+  // Find all ops and grad ops at all blocks
+  FindAllOpAndGradOp(op_pair, "recurrent", "recurrent_grad");
+
+  OpVariantSet &recurrent_ops = op_pair->first;
+  OpVariantSet &recurrent_grad_ops = op_pair->second;
+
+  VLOG(2) << "Found recurrent op num: " << recurrent_ops.size()
+          << ", recurrent grad op num: " << recurrent_grad_ops.size();
+
+  if (recurrent_ops.empty()) {
+    return;
+  }
+
+  for (auto &bwd_op : recurrent_grad_ops) {
+    const OpVariant *matched_fwd_op = nullptr;
+    for (auto &fwd_op : recurrent_ops) {
+      if (IsMatchedRecurrentOpAndRecurrentGradOp(fwd_op, bwd_op)) {
+        PADDLE_ENFORCE(matched_fwd_op == nullptr,
+                       "Found multiple matched recurrent op");
+        matched_fwd_op = &fwd_op;
+      }
+    }
+    PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, "Cannot find matched forward op");
+    SetRecurrentOpAndRecurrentGradOpSkipVarAttr(*matched_fwd_op, bwd_op);
+    recurrent_ops.erase(*matched_fwd_op);
+  }
+
+  for (auto &fwd_op : recurrent_ops) {
+    SetRecurrentForwardOpOnlySkipVarAttr(fwd_op);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.h
+++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/controlflow/op_variant.h"
+#include "paddle/fluid/operators/recurrent_op.h"
+#include "paddle/fluid/platform/variant.h"
+#include "paddle/fluid/string/string_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using OpVariantSet = std::unordered_set<OpVariant, OpVariant::Hasher>;
+using OpAndGradOpPair = std::pair<OpVariantSet, OpVariantSet>;
+
+// Set vars to skip eager deletion on input recurrent and recurrent_grad for
+// preparing safe eager deletion. Input contains all recurrent and
+// recurrent_grad ops at block 0 and the function will find all recurrent and
+// recurrent_grad ops across blocks.
+void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+    OpAndGradOpPair *op_pair);
+
+// Set vars to skip eager deletion on input recurrent and recurrent_grad for
+// preparing safe eager deletion. The input block_id must be 0 and caller can
+// input all ops in the block. The function will find all recurrent and
+// recurrent_grad ops across blocks.
+void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
+    int block_id,
+    const std::vector<std::unique_ptr<paddle::framework::OperatorBase>>
+        &all_ops);
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -13,109 +13,18 @@
 // limitations under the License.

 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
+
 #include <string>
 #include <unordered_set>
 #include <utility>
+
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/controlflow/op_variant.h"
+#include "paddle/fluid/string/string_helper.h"

 namespace paddle {
 namespace operators {

-// OpVariant is a wrapper class of OpDesc and OperatorBase
-// So that API would be the same.
-class OpVariant {
-  struct InputsVisitor
-      : public boost::static_visitor<const framework::VariableNameMap *> {
-    template <typename OpType>
-    const framework::VariableNameMap *operator()(const OpType *op) const {
-      return &(op->Inputs());
-    }
-  };
-
-  struct OutputsVisitor
-      : public boost::static_visitor<const framework::VariableNameMap *> {
-    template <typename OpType>
-    const framework::VariableNameMap *operator()(const OpType *op) const {
-      return &(op->Outputs());
-    }
-  };
-
-  struct AttributeMapVisitor
-      : public boost::static_visitor<const framework::AttributeMap *> {
-    const framework::AttributeMap *operator()(
-        const framework::OpDesc *op) const {
-      return &(op->GetAttrMap());
-    }
-
-    const framework::AttributeMap *operator()(
-        const framework::OperatorBase *op) const {
-      return &(op->Attrs());
-    }
-  };
-
-  struct RawPointerVisitor : public boost::static_visitor<const void *> {
-    template <typename OpType>
-    const void *operator()(const OpType *op) const {
-      return op;
-    }
-  };
-
- public:
-  OpVariant(const framework::OperatorBase *op) : op_(op) {}  // NOLINT
-
-  OpVariant(const framework::OpDesc *op) : op_(op) {}  // NOLINT
-
-  const framework::VariableNameMap &Inputs() const {
-    return *boost::apply_visitor(InputsVisitor(), op_);
-  }
-
-  const framework::VariableNameMap &Outputs() const {
-    return *boost::apply_visitor(OutputsVisitor(), op_);
-  }
-
-  const framework::AttributeMap &Attrs() const {
-    return *boost::apply_visitor(AttributeMapVisitor(), op_);
-  }
-
-  template <typename AttrType>
-  const AttrType &Attr(const std::string &name) const {
-    auto &attrs = Attrs();
-    auto it = attrs.find(name);
-    PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name);
-    return boost::get<AttrType>(it->second);
-  }
-
-  bool operator==(const OpVariant &other) const {
-    return RawPointer() == other.RawPointer();
-  }
-
-  const void *RawPointer() const {
-    return boost::apply_visitor(RawPointerVisitor(), op_);
-  }
-
-  int which() const { return static_cast<int>(op_.which()); }
-
-  struct Hasher {
-    size_t operator()(const OpVariant &op) const {
-      return reinterpret_cast<size_t>(op.RawPointer());
-    }
-  };
-
- private:
-  const boost::variant<const framework::OperatorBase *,
-                       const framework::OpDesc *>
-      op_;
-};
-
-static std::string GetDebugString(const std::vector<std::string> &names) {
-  if (names.empty()) return "";
-  std::string ret = names[0];
-  for (size_t i = 1; i < names.size(); ++i) {
-    ret += (" " + names[i]);
-  }
-  return ret;
-}
-
 // Set skip variables of while_op and while_grad_op
 // These variables should be skipped when eager deletion enables.
 // It is because:
@@ -124,7 +33,7 @@ static std::string GetDebugString(const std::vector<std::string> &names) {
 static void SetSkipVars(const OpVariant &op, std::vector<std::string> attr) {
  auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
  VLOG(2) << "Prepare to skip " << attr.size()
-          << " var(s): " << GetDebugString(attr);
+          << " var(s): " << string::join_strings(attr, ' ');
  attrs[kSkipEagerDeletionVars] = std::move(attr);
 }


--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -12,31 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <vector>
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/recurrent_op.h"
+
+#include <algorithm>
+#include "paddle/fluid/string/string_helper.h"

 namespace paddle {
 namespace operators {
-constexpr char kInputs[] = "inputs";
-constexpr char kInitialStates[] = "initial_states";
-constexpr char kParameters[] = "parameters";
-constexpr char kOutputs[] = "outputs";
-constexpr char kStepScopes[] = "step_scopes";
-constexpr char kHasStates[] = "has_states";
-constexpr char kExStates[] = "ex_states";
-constexpr char kStates[] = "states";
-constexpr char kStepBlock[] = "sub_block";
-constexpr char kReverse[] = "reverse";
-constexpr char kIsTrain[] = "is_train";
-#define GRAD_SUFFIX "@GRAD"
-constexpr char kInputGrads[] = "inputs" GRAD_SUFFIX;
-constexpr char kOutputGrads[] = "outputs" GRAD_SUFFIX;
-constexpr char kParamGrads[] = "parameters" GRAD_SUFFIX;
-constexpr char kInitStateGrads[] = "initial_states" GRAD_SUFFIX;

 using StepScopeVar = std::vector<framework::Scope *>;

+const char RecurrentBase::kInputs[] = "inputs";
+const char RecurrentBase::kInitialStates[] = "initial_states";
+const char RecurrentBase::kParameters[] = "parameters";
+const char RecurrentBase::kOutputs[] = "outputs";
+const char RecurrentBase::kStepScopes[] = "step_scopes";
+const char RecurrentBase::kHasStates[] = "has_states";
+const char RecurrentBase::kExStates[] = "ex_states";
+const char RecurrentBase::kStates[] = "states";
+const char RecurrentBase::kStepBlock[] = "sub_block";
+const char RecurrentBase::kReverse[] = "reverse";
+const char RecurrentBase::kIsTrain[] = "is_train";
+const char RecurrentBase::kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
+#define GRAD_SUFFIX "@GRAD"
+const char RecurrentBase::kInputGrads[] = "inputs" GRAD_SUFFIX;
+const char RecurrentBase::kOutputGrads[] = "outputs" GRAD_SUFFIX;
+const char RecurrentBase::kParamGrads[] = "parameters" GRAD_SUFFIX;
+const char RecurrentBase::kInitStateGrads[] = "initial_states" GRAD_SUFFIX;
+
 static void ClearStepScopes(const platform::DeviceContext &dev_ctx,
                            framework::Scope *parent_scope,
                            StepScopeVar *step_scopes) {
@@ -65,11 +68,9 @@ static void ClearStepScopes(const platform::DeviceContext &dev_ctx,
 //   reversely access scopes
 // else
 //   access scopes from begin to end.
-class StepScopes {
- public:
-  StepScopes(const platform::DeviceContext &dev_ctx,
+StepScopes::StepScopes(const platform::DeviceContext &dev_ctx,
                       const framework::Scope &parent, StepScopeVar *scopes,
-             bool is_train, size_t seq_len, bool is_backward = false)
+                       bool is_train, size_t seq_len, bool is_backward)
    : counter_(is_backward ? seq_len - 1 : 0UL),
      scopes_(scopes),
      is_train_(is_train),
@@ -84,55 +85,43 @@ class StepScopes {
      scopes->emplace_back(&parent.NewScope());
    }
  }
-  }
+}

-  framework::Scope &CurScope() { return GetScope(counter_); }
+framework::Scope &StepScopes::CurScope() { return GetScope(counter_); }

-  framework::Scope &ExScope() {
+framework::Scope &StepScopes::ExScope() {
  auto &scope = GetScope(is_backward_ ? counter_ + 1 : counter_ - 1);
  return scope;
-  }
+}

-  void Next() {
+void StepScopes::Next() {
  if (is_backward_) {
    --counter_;
  } else {
    ++counter_;
  }
-  }
+}

- private:
-  framework::Scope &GetScope(size_t scope_id) const {
+framework::Scope &StepScopes::GetScope(size_t scope_id) const {
  if (!is_train_) {
    scope_id %= 2;
  }
  PADDLE_ENFORCE_LT(scope_id, scopes_->size());
  return *(*scopes_)[scope_id];
-  }
-
-  size_t counter_;
-  StepScopeVar *scopes_;
-  bool is_train_;
-  bool is_backward_;
-};
+}

-// Base class for RecurrentOp/RecurrentGradOp
-//    Some common protected functions for RecurrentOp/RecurrentGradOp
-class RecurrentBase : public framework::OperatorBase {
- public:
-  RecurrentBase(const std::string &type,
+RecurrentBase::RecurrentBase(const std::string &type,
                             const framework::VariableNameMap &inputs,
                             const framework::VariableNameMap &outputs,
                             const framework::AttributeMap &attrs)
    : OperatorBase(type, inputs, outputs, attrs) {}

- protected:
-  // Get SequenceLength from Scope
-  //   The sequence length is got from input tensor. The input tensor's
-  //   dimension should be [SEQ_LEN, ..., ...]. The first of the tensor's shape
-  //   is SEQ_LEN. The second of the tensor's shape could be the batch size or
-  //   nested sequence length.
-  int64_t GetSequenceLength(const framework::Scope &scope) const {
+// Get SequenceLength from Scope
+//   The sequence length is got from input tensor. The input tensor's
+//   dimension should be [SEQ_LEN, ..., ...]. The first of the tensor's shape
+//   is SEQ_LEN. The second of the tensor's shape could be the batch size or
+//   nested sequence length.
+int64_t RecurrentBase::GetSequenceLength(const framework::Scope &scope) const {
  // Dim format SEQ_LEN, BATCH_SIZE, ...
  int64_t seq_len = -1;
  auto &all_inputs = Inputs(kInputs);
@@ -149,12 +138,12 @@ class RecurrentBase : public framework::OperatorBase {
    }
  }
  return seq_len;
-  }
+}

-  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
-  //                                   map(dst_scope.Var, dst_vars)):
-  //   dst_tensor.ShareDataWith(src_tensor)
-  static void LinkTensor(const framework::Scope &src_scope,
+// for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+//                                   map(dst_scope.Var, dst_vars)):
+//   dst_tensor.ShareDataWith(src_tensor)
+void RecurrentBase::LinkTensor(const framework::Scope &src_scope,
                               const std::vector<std::string> &src_vars,
                               framework::Scope *dst_scope,
                               const std::vector<std::string> &dst_vars) {
@@ -163,100 +152,24 @@ class RecurrentBase : public framework::OperatorBase {
      [&](const framework::Tensor &src, framework::Tensor *dst) {
        dst->ShareDataWith(src);
      });
-  }
-
-  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
-  //                                   map(dst_scope.Var, dst_vars)):
-  //   callback(src_tensor, &dst_tensor)
-  template <typename Callback>
-  static void LinkTensorWithCallback(const framework::Scope &src_scope,
-                                     const std::vector<std::string> &src_vars,
-                                     framework::Scope *dst_scope,
-                                     const std::vector<std::string> &dst_vars,
-                                     Callback callback,
-                                     bool is_backward = false) {
-    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
-    for (size_t i = 0; i < dst_vars.size(); ++i) {
-      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
-      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
-                   is_backward);
-    }
-  }
-
-  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
-  //                                   map(dst_scope.FindVar, dst_vars)):
-  //   callback(src_tensor, &dst_tensor)
-  template <typename Callback>
-  static void LinkTensorWithCallback(const framework::Scope &src_scope,
-                                     const std::vector<std::string> &src_vars,
-                                     const framework::Scope &dst_scope,
-                                     const std::vector<std::string> &dst_vars,
-                                     Callback callback,
-                                     bool is_backward = false) {
-    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
-    for (size_t i = 0; i < dst_vars.size(); ++i) {
-      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
-      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
-                   is_backward);
-    }
-  }
+}

-  // (seq_len, shape) -> return [seq_len] + list(shape)
-  static framework::DDim PrependDims(size_t seq_len,
+// (seq_len, shape) -> return [seq_len] + list(shape)
+framework::DDim RecurrentBase::PrependDims(size_t seq_len,
                                           const framework::DDim &src) {
  auto dims = framework::vectorize(src);
  dims.insert(dims.begin(), static_cast<int64_t>(seq_len));
  return framework::make_ddim(dims);
-  }
-
- private:
-  template <typename Callback>
-  static void AccessTensor(const framework::Scope &src_scope,
-                           const std::string &src_var_name,
-                           framework::Scope *dst_scope,
-                           const std::string &dst_var_name, Callback callback,
-                           bool is_backward = false) {
-    auto *src_var = src_scope.FindVar(src_var_name);
-    if (is_backward && src_var == nullptr) {
-      return;
-    }
-    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
-    auto &src_tensor = src_var->Get<framework::LoDTensor>();
-
-    auto *dst_var = dst_scope->Var(dst_var_name);
-    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
-    callback(src_tensor, dst_tensor);
-  }
-
-  template <typename Callback>
-  static void AccessTensor(const framework::Scope &src_scope,
-                           const std::string &src_var_name,
-                           const framework::Scope &dst_scope,
-                           const std::string &dst_var_name, Callback callback,
-                           bool is_backward = false) {
-    auto *dst_var = dst_scope.FindVar(dst_var_name);
-    if (is_backward && dst_var == nullptr) {
-      return;
-    }
-    auto *src_var = src_scope.FindVar(src_var_name);
-    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
-    auto &src_tensor = src_var->Get<framework::LoDTensor>();
-    PADDLE_ENFORCE(dst_var != nullptr, "%s is not found.", dst_var_name);
-    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
-    callback(src_tensor, dst_tensor);
-  }
-};
+}

-class RecurrentOp : public RecurrentBase {
- public:
-  RecurrentOp(const std::string &type, const framework::VariableNameMap &inputs,
+RecurrentOp::RecurrentOp(const std::string &type,
+                         const framework::VariableNameMap &inputs,
                         const framework::VariableNameMap &outputs,
                         const framework::AttributeMap &attrs)
    : RecurrentBase(type, inputs, outputs, attrs) {}

- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
+void RecurrentOp::RunImpl(const framework::Scope &scope,
+                          const platform::Place &place) const {
  bool has_state = Attr<bool>(kHasStates);
  auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));

@@ -273,8 +186,8 @@ class RecurrentOp : public RecurrentBase {

  auto *program = block->Program();
  auto ctx = executor.Prepare(
-        *program, block->ID(), std::vector<std::string>() /*skip_ref_cnt_vars*/,
-        true /*force_disable_gc*/);
+      *program, block->ID(), Attr<std::vector<std::string>>(
+                                 kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/);

  for (size_t i = 0; i < seq_len; ++i) {
    size_t seq_offset = reverse ? seq_len - i - 1 : i;
@@ -331,30 +244,25 @@ class RecurrentOp : public RecurrentBase {

    scopes.Next();
  }
-  }
+}

- private:
-  StepScopes CreateStepScopes(const platform::DeviceContext &dev_ctx,
+StepScopes RecurrentOp::CreateStepScopes(const platform::DeviceContext &dev_ctx,
                                         const framework::Scope &scope,
                                         size_t seq_len) const {
  auto *var = scope.FindVar(Output(kStepScopes));
  PADDLE_ENFORCE(var != nullptr);
  return StepScopes(dev_ctx, scope, var->GetMutable<StepScopeVar>(),
                    Attr<bool>(kIsTrain), seq_len);
-  }
-};
+}

-class RecurrentGradOp : public RecurrentBase {
- public:
-  RecurrentGradOp(const std::string &type,
+RecurrentGradOp::RecurrentGradOp(const std::string &type,
                                 const framework::VariableNameMap &inputs,
                                 const framework::VariableNameMap &outputs,
                                 const framework::AttributeMap &attrs)
    : RecurrentBase(type, inputs, outputs, attrs) {}

- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
+void RecurrentGradOp::RunImpl(const framework::Scope &scope,
+                              const platform::Place &place) const {
  bool has_state = Attr<bool>(kHasStates);
  const size_t seq_len = static_cast<size_t>(GetSequenceLength(scope));

@@ -369,8 +277,8 @@ class RecurrentGradOp : public RecurrentBase {
  auto *block = Attr<framework::BlockDesc *>(kStepBlock);
  auto *program = block->Program();
  auto ctx = executor.Prepare(
-        *program, block->ID(), std::vector<std::string>() /*skip_ref_cnt_vars*/,
-        true /*force_disable_gc*/);
+      *program, block->ID(), Attr<std::vector<std::string>>(
+                                 kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/);

  for (size_t step_id = 0; step_id < seq_len; ++step_id) {
    size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
@@ -456,8 +364,8 @@ class RecurrentGradOp : public RecurrentBase {

        // zero gradient variable in step 0
        if (step_id == 0) {
-            auto &inside_tensor = cur_scope.FindVar(inside_grad_name)
-                                      ->Get<framework::LoDTensor>();
+          auto &inside_tensor =
+              cur_scope.FindVar(inside_grad_name)->Get<framework::LoDTensor>();
          framework::AttributeMap attrs;
          attrs["dtype"] = inside_tensor.type();
          attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
@@ -487,8 +395,7 @@ class RecurrentGradOp : public RecurrentBase {
    //   outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad
    LinkTensorWithCallback(
        cur_scope, GradVarLists(Inputs(kInputs)), scope, Outputs(kInputGrads),
-          [&](const framework::LoDTensor &inside,
-              framework::LoDTensor *outside) {
+        [&](const framework::LoDTensor &inside, framework::LoDTensor *outside) {
          if (inside.memory_size() == 0) {  // IG is not created.
            return;
          }
@@ -507,9 +414,8 @@ class RecurrentGradOp : public RecurrentBase {
      if (step_id + 1 == seq_len) {  // at_end
        // copy initialize states gradient from inside to outside
        LinkTensorWithCallback(
-              cur_scope,
-              GradVarLists(Attr<std::vector<std::string>>(kExStates)), scope,
-              Outputs(kInitStateGrads),
+            cur_scope, GradVarLists(Attr<std::vector<std::string>>(kExStates)),
+            scope, Outputs(kInitStateGrads),
            [&](const framework::LoDTensor &inside,
                framework::LoDTensor *outside) {
              outside->Resize(inside.dims());
@@ -526,21 +432,19 @@ class RecurrentGradOp : public RecurrentBase {
  auto *var = scope.FindVar(Input(kStepScopes));
  PADDLE_ENFORCE(var != nullptr);
  auto *step_scopes = var->GetMutable<StepScopeVar>();
-    ClearStepScopes(dev_ctx, const_cast<framework::Scope *>(&scope),
-                    step_scopes);
-  }
+  ClearStepScopes(dev_ctx, const_cast<framework::Scope *>(&scope), step_scopes);
+}

- private:
-  StepScopes CreateStepScopes(const platform::DeviceContext &dev_ctx,
-                              const framework::Scope &scope,
+StepScopes RecurrentGradOp::CreateStepScopes(
+    const platform::DeviceContext &dev_ctx, const framework::Scope &scope,
    size_t seq_len) const {
  auto *var = scope.FindVar(Input(kStepScopes));
  PADDLE_ENFORCE(var != nullptr);
  return StepScopes(dev_ctx, scope, var->GetMutable<StepScopeVar>(),
                    Attr<bool>(kIsTrain), seq_len, true /*is_backward*/);
-  }
+}

-  std::unordered_set<std::string> List2Set(
+std::unordered_set<std::string> RecurrentGradOp::List2Set(
    const std::vector<std::string> &list) const {
  std::unordered_set<std::string> local_var_name_set;
  local_var_name_set.reserve(list.size());
@@ -548,51 +452,56 @@ class RecurrentGradOp : public RecurrentBase {
    local_var_name_set.insert(each);
  }
  return local_var_name_set;
-  }
+}

-  std::unordered_set<std::string> LocalVarNames(
+std::unordered_set<std::string> RecurrentGradOp::LocalVarNames(
    const framework::Scope &scope) const {
  return this->List2Set(scope.LocalVarNames());
-  }
-  static std::vector<std::string> GradVarLists(
+}
+std::vector<std::string> RecurrentGradOp::GradVarLists(
    const std::vector<std::string> &var_names) {
  std::vector<std::string> retv;
  retv.reserve(var_names.size());
  std::transform(var_names.begin(), var_names.end(), std::back_inserter(retv),
                 framework::GradVarName);
  return retv;
-  }
-};
+}

 class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput(kInputs, "rnn inputs").AsDuplicable();
-    AddInput(kInitialStates, "rnn initial states").AsDuplicable();
-    AddInput(kParameters,
+    AddInput(RecurrentBase::kInputs, "rnn inputs").AsDuplicable();
+    AddInput(RecurrentBase::kInitialStates, "rnn initial states")
+        .AsDuplicable();
+    AddInput(RecurrentBase::kParameters,
             "Parameters are used by step block as its input. However, the "
             "input is not a sequence tensor. Every time step, each operator "
             "in step block just use the parameter directly.")
        .AsDuplicable();
-    AddOutput(kOutputs,
+    AddOutput(RecurrentBase::kOutputs,
              "The output sequence of RNN. The sequence length must be same.")
        .AsDuplicable();
-    AddOutput(kStepScopes,
+    AddOutput(RecurrentBase::kStepScopes,
              "StepScopes contain all local variables in each time step.");
-    AddAttr<bool>(kHasStates, "Whether has states.").SetDefault(false);
-    AddAttr<std::vector<std::string>>(kExStates,
+    AddAttr<bool>(RecurrentBase::kHasStates, "Whether has states.")
+        .SetDefault(false);
+    AddAttr<std::vector<std::string>>(
+        RecurrentBase::kExStates,
        string::Sprintf(
            R"DOC(The ex-state variable names.
 The ex-state means the state value in the ex-timestep or the previous time step
 [%s, %s, %s] must be the same order)DOC",
-                                          kExStates, kStates, kInitStateGrads));
+            RecurrentBase::kExStates, RecurrentBase::kStates,
+            RecurrentBase::kInitStateGrads));
    AddAttr<std::vector<std::string>>(
-        kStates,
+        RecurrentBase::kStates,
        string::Sprintf(
            "The state variable names. [%s, %s, %s] must be the same order",
-            kExStates, kStates, kInitStateGrads));
-    AddAttr<framework::BlockDesc *>(kStepBlock, "The step block inside RNN");
-    AddAttr<bool>(kReverse, R"DOC(Calculate RNN reversely or not.
+            RecurrentBase::kExStates, RecurrentBase::kStates,
+            RecurrentBase::kInitStateGrads));
+    AddAttr<framework::BlockDesc *>(RecurrentBase::kStepBlock,
+                                    "The step block inside RNN");
+    AddAttr<bool>(RecurrentBase::kReverse, R"DOC(Calculate RNN reversely or not.
 By default reverse=False

 Assume the input data is [A, B, C, D]
@@ -617,7 +526,12 @@ if reverse is True
      v          v          v         v
      o          o          o         o
 )DOC").SetDefault(false);
-    AddAttr<bool>(kIsTrain, "").SetDefault(true);
+    AddAttr<bool>(RecurrentBase::kIsTrain, "").SetDefault(true);
+    AddAttr<std::vector<std::string>>(RecurrentBase::kSkipEagerDeletionVars,
+                                      "Vars that would skip eager deletion."
+                                      "Users should not set this manually.")
+        .SetDefault(std::vector<std::string>());
+
    AddComment(R"DOC(
 Static Length Recurrent Operator.

@@ -643,7 +557,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
    }

    for (auto &output_param : this->OutputNames()) {
-      if (output_param == kStepScopes) {
+      if (output_param == RecurrentBase::kStepScopes) {
        grad->SetInput(output_param, this->Output(output_param));
        grad->SetInput(framework::GradVarName(output_param),
                       this->Output(output_param));
@@ -654,7 +568,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
      }
    }
    grad->SetAttrMap(this->Attrs());
-    grad->SetBlockAttr(kStepBlock, grad_block_[0]);
+    grad->SetBlockAttr(RecurrentBase::kStepBlock, grad_block_[0]);

    return std::unique_ptr<framework::OpDesc>(grad);
  }
@@ -663,46 +577,55 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
 class RecurrentGradOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
-    std::vector<std::string> output{kOutputs};
+    std::vector<std::string> output{RecurrentBase::kOutputs};

    // In some case the kInitialStates is empty.
    // If the kInitialStates is empty, all the states should be empty.
-    if (!ctx->HasInputs(kInitialStates)) {
+    if (!ctx->HasInputs(RecurrentBase::kInitialStates)) {
      PADDLE_ENFORCE_EQ(
-          ctx->Attrs().Get<std::vector<std::string>>(kExStates).size(), 0,
-          "The Attr(%s) should be empty.", kExStates);
+          ctx->Attrs()
+              .Get<std::vector<std::string>>(RecurrentBase::kExStates)
+              .size(),
+          0, "The Attr(%s) should be empty.", RecurrentBase::kExStates);
      PADDLE_ENFORCE_EQ(
-          ctx->Attrs().Get<std::vector<std::string>>(kStates).size(), 0,
-          "The Attr(%s) should be empty.", kStates);
+          ctx->Attrs()
+              .Get<std::vector<std::string>>(RecurrentBase::kStates)
+              .size(),
+          0, "The Attr(%s) should be empty.", RecurrentBase::kStates);
    }

-    PADDLE_ENFORCE(ctx->HasInputs(kInputs),
-                   "The input(%s) should not be empty.", kInputs);
-    PADDLE_ENFORCE(ctx->HasInputs(kOutputs),
-                   "The input(%s) should not be empty.", kOutputs);
+    PADDLE_ENFORCE(ctx->HasInputs(RecurrentBase::kInputs),
+                   "The input(%s) should not be empty.",
+                   RecurrentBase::kInputs);
+    PADDLE_ENFORCE(ctx->HasInputs(RecurrentBase::kOutputs),
+                   "The input(%s) should not be empty.",
+                   RecurrentBase::kOutputs);

    // In some case the kInitialStates is empty.
-    if (ctx->HasInputs(kInitialStates)) {
-      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInitialStates)),
+    if (ctx->HasInputs(RecurrentBase::kInitialStates)) {
+      PADDLE_ENFORCE(ctx->HasOutputs(
+                         framework::GradVarName(RecurrentBase::kInitialStates)),
                     "The output of(%s) should not be empty.",
-                     framework::GradVarName(kInitialStates));
-      ctx->SetOutputsDim(framework::GradVarName(kInitialStates),
-                         ctx->GetInputsDim(kInitialStates));
+                     framework::GradVarName(RecurrentBase::kInitialStates));
+      ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kInitialStates),
+                         ctx->GetInputsDim(RecurrentBase::kInitialStates));
    }

-    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInputs)),
+    PADDLE_ENFORCE(
+        ctx->HasOutputs(framework::GradVarName(RecurrentBase::kInputs)),
        "The output of(%s) should not be empty.",
-                   framework::GradVarName(kInputs));
-    ctx->SetOutputsDim(framework::GradVarName(kInputs),
-                       ctx->GetInputsDim(kInputs));
+        framework::GradVarName(RecurrentBase::kInputs));
+    ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kInputs),
+                       ctx->GetInputsDim(RecurrentBase::kInputs));

    // In some case the kParameters is empty.
-    if (ctx->HasInputs(kParameters)) {
-      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)),
+    if (ctx->HasInputs(RecurrentBase::kParameters)) {
+      PADDLE_ENFORCE(
+          ctx->HasOutputs(framework::GradVarName(RecurrentBase::kParameters)),
          "The output of(%s) should not be empty.",
-                     framework::GradVarName(kParameters));
-      ctx->SetOutputsDim(framework::GradVarName(kParameters),
-                         ctx->GetInputsDim(kParameters));
+          framework::GradVarName(RecurrentBase::kParameters));
+      ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kParameters),
+                         ctx->GetInputsDim(RecurrentBase::kParameters));
    }
  }
 };

--- a/paddle/fluid/operators/recurrent_op.h
+++ b/paddle/fluid/operators/recurrent_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// StepScopes manages scopes inside RNN.
+//    StepScopes::CurScope() get the current scope
+//    StepScopes::ExScope() get the ex-scope, or scope in previous time step.
+//    StepScopes::Next() move to next time step.
+//
+// if is_train = False, then
+//   there are two scopes for the RNN and just support forward.
+// else
+//   the len(scopes) == seq_len
+//
+// if is_backward = True, then
+//   reversely access scopes
+// else
+//   access scopes from begin to end.
+class StepScopes {
+ public:
+  StepScopes(const platform::DeviceContext &dev_ctx,
+             const framework::Scope &parent,
+             std::vector<framework::Scope *> *scopes, bool is_train,
+             size_t seq_len, bool is_backward = false);
+
+  framework::Scope &CurScope();
+
+  framework::Scope &ExScope();
+
+  void Next();
+
+ private:
+  framework::Scope &GetScope(size_t scope_id) const;
+
+  size_t counter_;
+  std::vector<framework::Scope *> *scopes_;
+  bool is_train_;
+  bool is_backward_;
+};
+
+// Base class for RecurrentOp/RecurrentGradOp
+//    Some common protected functions for RecurrentOp/RecurrentGradOp
+class RecurrentBase : public framework::OperatorBase {
+ public:
+  static const char kInputs[];
+  static const char kInitialStates[];
+  static const char kParameters[];
+  static const char kOutputs[];
+  static const char kStepScopes[];
+  static const char kHasStates[];
+  static const char kExStates[];
+  static const char kStates[];
+  static const char kStepBlock[];
+  static const char kReverse[];
+  static const char kIsTrain[];
+  static const char kSkipEagerDeletionVars[];
+  static const char kInputGrads[];
+  static const char kOutputGrads[];
+  static const char kParamGrads[];
+  static const char kInitStateGrads[];
+
+  RecurrentBase(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs);
+
+ protected:
+  // Get SequenceLength from Scope
+  //   The sequence length is got from input tensor. The input tensor's
+  //   dimension should be [SEQ_LEN, ..., ...]. The first of the tensor's shape
+  //   is SEQ_LEN. The second of the tensor's shape could be the batch size or
+  //   nested sequence length.
+  int64_t GetSequenceLength(const framework::Scope &scope) const;
+
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.Var, dst_vars)):
+  //   dst_tensor.ShareDataWith(src_tensor)
+  static void LinkTensor(const framework::Scope &src_scope,
+                         const std::vector<std::string> &src_vars,
+                         framework::Scope *dst_scope,
+                         const std::vector<std::string> &dst_vars);
+
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.Var, dst_vars)):
+  //   callback(src_tensor, &dst_tensor)
+  template <typename Callback>
+  static void LinkTensorWithCallback(const framework::Scope &src_scope,
+                                     const std::vector<std::string> &src_vars,
+                                     framework::Scope *dst_scope,
+                                     const std::vector<std::string> &dst_vars,
+                                     Callback callback,
+                                     bool is_backward = false) {
+    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
+    for (size_t i = 0; i < dst_vars.size(); ++i) {
+      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
+                   is_backward);
+    }
+  }
+
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.FindVar, dst_vars)):
+  //   callback(src_tensor, &dst_tensor)
+  template <typename Callback>
+  static void LinkTensorWithCallback(const framework::Scope &src_scope,
+                                     const std::vector<std::string> &src_vars,
+                                     const framework::Scope &dst_scope,
+                                     const std::vector<std::string> &dst_vars,
+                                     Callback callback,
+                                     bool is_backward = false) {
+    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
+    for (size_t i = 0; i < dst_vars.size(); ++i) {
+      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
+                   is_backward);
+    }
+  }
+
+  // (seq_len, shape) -> return [seq_len] + list(shape)
+  static framework::DDim PrependDims(size_t seq_len,
+                                     const framework::DDim &src);
+
+ private:
+  template <typename Callback>
+  static void AccessTensor(const framework::Scope &src_scope,
+                           const std::string &src_var_name,
+                           framework::Scope *dst_scope,
+                           const std::string &dst_var_name, Callback callback,
+                           bool is_backward = false) {
+    auto *src_var = src_scope.FindVar(src_var_name);
+    if (is_backward && src_var == nullptr) {
+      return;
+    }
+    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
+    auto &src_tensor = src_var->Get<framework::LoDTensor>();
+
+    auto *dst_var = dst_scope->Var(dst_var_name);
+    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
+    callback(src_tensor, dst_tensor);
+  }
+
+  template <typename Callback>
+  static void AccessTensor(const framework::Scope &src_scope,
+                           const std::string &src_var_name,
+                           const framework::Scope &dst_scope,
+                           const std::string &dst_var_name, Callback callback,
+                           bool is_backward = false) {
+    auto *dst_var = dst_scope.FindVar(dst_var_name);
+    if (is_backward && dst_var == nullptr) {
+      return;
+    }
+    auto *src_var = src_scope.FindVar(src_var_name);
+    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
+    auto &src_tensor = src_var->Get<framework::LoDTensor>();
+    PADDLE_ENFORCE(dst_var != nullptr, "%s is not found.", dst_var_name);
+    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
+    callback(src_tensor, dst_tensor);
+  }
+};
+
+class RecurrentOp : public RecurrentBase {
+ public:
+  RecurrentOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs);
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override;
+
+ private:
+  StepScopes CreateStepScopes(const platform::DeviceContext &dev_ctx,
+                              const framework::Scope &scope,
+                              size_t seq_len) const;
+};
+
+class RecurrentGradOp : public RecurrentBase {
+ public:
+  RecurrentGradOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs);
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override;
+
+  StepScopes CreateStepScopes(const platform::DeviceContext &dev_ctx,
+                              const framework::Scope &scope,
+                              size_t seq_len) const;
+
+  std::unordered_set<std::string> List2Set(
+      const std::vector<std::string> &list) const;
+
+  std::unordered_set<std::string> LocalVarNames(
+      const framework::Scope &scope) const;
+
+  static std::vector<std::string> GradVarLists(
+      const std::vector<std::string> &var_names);
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -91,6 +91,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {

    auto in_grad_var_name = Output(framework::GradVarName("X"));
    auto *in_grad_var = scope.FindVar(in_grad_var_name);
+
    PADDLE_ENFORCE(in_grad_var != nullptr,
                   "Cannot find in_grad_var in scope, name is %s",
                   in_grad_var_name);

--- a/paddle/fluid/string/string_helper.h
+++ b/paddle/fluid/string/string_helper.h
@@ -119,16 +119,18 @@ std::vector<T> split_string(const std::string& str) {
  return list;
 }

-template <class T>
-std::string join_strings(const std::vector<T>& strs, char delim) {
+template <class Container>
+std::string join_strings(const Container& strs, char delim) {
  std::string str;

-  for (size_t i = 0; i < strs.size(); i++) {
+  int i = 0;
+  for (auto& elem : strs) {
    if (i > 0) {
      str += delim;
    }

-    str += boost::lexical_cast<std::string>(strs[i]);
+    str += boost::lexical_cast<std::string>(elem);
+    ++i;
  }

  return str;

--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.layers as layers
+import time
+import os
+
+from paddle.fluid import ParamAttr
+from paddle.fluid.contrib.layers import basic_lstm
+from paddle.fluid.executor import Executor
+from paddle.fluid.layers.control_flow import StaticRNN as PaddingRNN
+
+os.environ["CPU_NUM"] = "1"
+
+
+class RnnConfig(object):
+    def __init__(self, model_type, rnn_model):
+        self.model_type = model_type
+        self.rnn_model = rnn_model
+
+        self.vocab_size = 10000
+        if self.model_type == "test":
+            self.num_layers = 1
+            self.batch_size = 2
+            self.hidden_size = 10
+            self.num_steps = 3
+            self.init_scale = 0.1
+            self.max_grad_norm = 5.0
+            self.epoch_start_decay = 1
+            self.max_epoch = 1
+            self.dropout = 0.0
+            self.lr_decay = 0.5
+            self.base_learning_rate = 1.0
+        elif self.model_type == "small":
+            self.num_layers = 2
+            self.batch_size = 20
+            self.hidden_size = 200
+            self.num_steps = 20
+            self.init_scale = 0.1
+            self.max_grad_norm = 5.0
+            self.epoch_start_decay = 4
+            self.max_epoch = 13
+            self.dropout = 0.0
+            self.lr_decay = 0.5
+            self.base_learning_rate = 1.0
+        elif self.model_type == "medium":
+            self.num_layers = 2
+            self.batch_size = 20
+            self.hidden_size = 650
+            self.num_steps = 35
+            self.init_scale = 0.05
+            self.max_grad_norm = 5.0
+            self.epoch_start_decay = 6
+            self.max_epoch = 39
+            self.dropout = 0.5
+            self.lr_decay = 0.8
+            self.base_learning_rate = 1.0
+        elif self.model_type == "large":
+            self.num_layers = 2
+            self.batch_size = 20
+            self.hidden_size = 1500
+            self.num_steps = 35
+            self.init_scale = 0.04
+            self.max_grad_norm = 10.0
+            self.epoch_start_decay = 14
+            self.max_epoch = 55
+            self.dropout = 0.65
+            self.lr_decay = 1.0 / 1.15
+            self.base_learning_rate = 1.0
+        else:
+            raise ValueError('Unsupported model_type.')
+
+        if rnn_model not in ('static', 'padding', 'cudnn', 'basic_lstm'):
+            raise ValueError('Unsupported rnn_model.')
+
+        self.batch_size = 12
+        self.max_epoch = 3
+        self.random_seed = 123
+
+
+# Fake data reader for test
+class Reader(object):
+    def get_data_iter(self, rnn_config):
+        for i in range(rnn_config.max_epoch):
+            x = np.zeros(
+                shape=(rnn_config.batch_size, rnn_config.num_steps),
+                dtype='int64')
+            y = np.ones(
+                shape=(rnn_config.batch_size, rnn_config.num_steps),
+                dtype='int64')
+            yield (x, y)
+
+
+# Model from PaddleNLP/models/language_model/lm_model.py in Paddle Models repo
+def lm_model(hidden_size,
+             vocab_size,
+             batch_size,
+             num_layers=2,
+             num_steps=20,
+             init_scale=0.1,
+             dropout=None,
+             rnn_model='static',
+             use_py_reader=False):
+    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
+        weight_1_arr = []
+        weight_2_arr = []
+        bias_arr = []
+        hidden_array = []
+        cell_array = []
+        mask_array = []
+        for i in range(num_layers):
+            weight_1 = layers.create_parameter(
+                [hidden_size * 2, hidden_size * 4],
+                dtype="float32",
+                name="fc_weight1_" + str(i),
+                default_initializer=fluid.initializer.UniformInitializer(
+                    low=-init_scale, high=init_scale))
+            weight_1_arr.append(weight_1)
+            bias_1 = layers.create_parameter(
+                [hidden_size * 4],
+                dtype="float32",
+                name="fc_bias1_" + str(i),
+                default_initializer=fluid.initializer.Constant(0.0))
+            bias_arr.append(bias_1)
+
+            pre_hidden = layers.slice(
+                init_hidden, axes=[0], starts=[i], ends=[i + 1])
+            pre_cell = layers.slice(
+                init_cell, axes=[0], starts=[i], ends=[i + 1])
+            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
+            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
+            hidden_array.append(pre_hidden)
+            cell_array.append(pre_cell)
+
+        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
+        rnn = PaddingRNN()
+
+        with rnn.step():
+            input = rnn.step_input(input_embedding)
+            for k in range(num_layers):
+                pre_hidden = rnn.memory(init=hidden_array[k])
+                pre_cell = rnn.memory(init=cell_array[k])
+                weight_1 = weight_1_arr[k]
+                bias = bias_arr[k]
+
+                nn = layers.concat([input, pre_hidden], 1)
+                gate_input = layers.matmul(x=nn, y=weight_1)
+
+                gate_input = layers.elementwise_add(gate_input, bias)
+                i = layers.slice(
+                    gate_input, axes=[1], starts=[0], ends=[hidden_size])
+                j = layers.slice(
+                    gate_input,
+                    axes=[1],
+                    starts=[hidden_size],
+                    ends=[hidden_size * 2])
+                f = layers.slice(
+                    gate_input,
+                    axes=[1],
+                    starts=[hidden_size * 2],
+                    ends=[hidden_size * 3])
+                o = layers.slice(
+                    gate_input,
+                    axes=[1],
+                    starts=[hidden_size * 3],
+                    ends=[hidden_size * 4])
+
+                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
+                    i) * layers.tanh(j)
+                m = layers.tanh(c) * layers.sigmoid(o)
+
+                rnn.update_memory(pre_hidden, m)
+                rnn.update_memory(pre_cell, c)
+
+                rnn.step_output(m)
+                rnn.step_output(c)
+
+                input = m
+
+                if dropout != None and dropout > 0.0:
+                    input = layers.dropout(
+                        input,
+                        dropout_prob=dropout,
+                        dropout_implementation='upscale_in_train')
+
+            rnn.step_output(input)
+        rnnout = rnn()
+
+        last_hidden_array = []
+        last_cell_array = []
+        real_res = rnnout[-1]
+        for i in range(num_layers):
+            m = rnnout[i * 2]
+            c = rnnout[i * 2 + 1]
+            m.stop_gradient = True
+            c.stop_gradient = True
+            last_h = layers.slice(
+                m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
+            last_hidden_array.append(last_h)
+            last_c = layers.slice(
+                c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
+            last_cell_array.append(last_c)
+        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
+        last_hidden = layers.concat(last_hidden_array, 0)
+        last_cell = layers.concat(last_cell_array, 0)
+
+        return real_res, last_hidden, last_cell
+
+    def encoder_static(input_embedding, len=3, init_hidden=None,
+                       init_cell=None):
+
+        weight_1_arr = []
+        weight_2_arr = []
+        bias_arr = []
+        hidden_array = []
+        cell_array = []
+        mask_array = []
+        for i in range(num_layers):
+            weight_1 = layers.create_parameter(
+                [hidden_size * 2, hidden_size * 4],
+                dtype="float32",
+                name="fc_weight1_" + str(i),
+                default_initializer=fluid.initializer.UniformInitializer(
+                    low=-init_scale, high=init_scale))
+            weight_1_arr.append(weight_1)
+            bias_1 = layers.create_parameter(
+                [hidden_size * 4],
+                dtype="float32",
+                name="fc_bias1_" + str(i),
+                default_initializer=fluid.initializer.Constant(0.0))
+            bias_arr.append(bias_1)
+
+            pre_hidden = layers.slice(
+                init_hidden, axes=[0], starts=[i], ends=[i + 1])
+            pre_cell = layers.slice(
+                init_cell, axes=[0], starts=[i], ends=[i + 1])
+            pre_hidden = layers.reshape(
+                pre_hidden, shape=[-1, hidden_size], inplace=True)
+            pre_cell = layers.reshape(
+                pre_cell, shape=[-1, hidden_size], inplace=True)
+            hidden_array.append(pre_hidden)
+            cell_array.append(pre_cell)
+
+        res = []
+        sliced_inputs = layers.split(
+            input_embedding, num_or_sections=len, dim=1)
+
+        for index in range(len):
+            input = sliced_inputs[index]
+            input = layers.reshape(input, shape=[-1, hidden_size], inplace=True)
+            for k in range(num_layers):
+                pre_hidden = hidden_array[k]
+                pre_cell = cell_array[k]
+                weight_1 = weight_1_arr[k]
+                bias = bias_arr[k]
+
+                nn = layers.concat([input, pre_hidden], 1)
+                gate_input = layers.matmul(x=nn, y=weight_1)
+
+                gate_input = layers.elementwise_add(gate_input, bias)
+                i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
+
+                try:
+                    from paddle.fluid.contrib.layers import fused_elemwise_activation
+                    # fluid.contrib.layers.fused_elemwise_activation can do a fused
+                    # operation, like:
+                    # 1) x + sigmoid(y); x + tanh(y)
+                    # 2) tanh(x + y)
+                    # Now the unary operation supported in this fused op is limit, and
+                    # we will extent this operation to support more unary operations and
+                    # do this kind of fusion automitically in future version of paddle.fluid.
+                    # layers.sigmoid(i) * layers.tanh(j)
+                    tmp0 = fused_elemwise_activation(
+                        x=layers.tanh(j),
+                        y=i,
+                        functor_list=['elementwise_mul', 'sigmoid'],
+                        save_intermediate_out=False)
+                    # pre_cell * layers.sigmoid(f)
+                    tmp1 = fused_elemwise_activation(
+                        x=pre_cell,
+                        y=f,
+                        functor_list=['elementwise_mul', 'sigmoid'],
+                        save_intermediate_out=False)
+                    c = tmp0 + tmp1
+                    # layers.tanh(c) * layers.sigmoid(o)
+                    m = fused_elemwise_activation(
+                        x=layers.tanh(c),
+                        y=o,
+                        functor_list=['elementwise_mul', 'sigmoid'],
+                        save_intermediate_out=False)
+                except ImportError:
+                    c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
+                        i) * layers.tanh(j)
+                    m = layers.tanh(c) * layers.sigmoid(o)
+
+                hidden_array[k] = m
+                cell_array[k] = c
+                input = m
+
+                if dropout != None and dropout > 0.0:
+                    input = layers.dropout(
+                        input,
+                        dropout_prob=dropout,
+                        dropout_implementation='upscale_in_train')
+
+            res.append(input)
+
+        last_hidden = layers.concat(hidden_array, 1)
+        last_hidden = layers.reshape(
+            last_hidden, shape=[-1, num_layers, hidden_size], inplace=True)
+        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])
+
+        last_cell = layers.concat(cell_array, 1)
+        last_cell = layers.reshape(
+            last_cell, shape=[-1, num_layers, hidden_size])
+        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])
+
+        real_res = layers.concat(res, 0)
+        real_res = layers.reshape(
+            real_res, shape=[len, -1, hidden_size], inplace=True)
+        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
+
+        return real_res, last_hidden, last_cell
+
+    batch_size_each = batch_size
+    if use_py_reader:
+        feed_shapes = [[batch_size_each, num_steps, 1],
+                       [batch_size_each * num_steps, 1]]
+        py_reader = fluid.layers.py_reader(
+            capacity=16, shapes=feed_shapes, dtypes=['int64', 'int64'])
+        x, y = fluid.layers.read_file(py_reader)
+    else:
+        x = layers.data(
+            name="x",
+            shape=[batch_size_each, num_steps, 1],
+            dtype='int64',
+            append_batch_size=False)
+        y = layers.data(
+            name="y",
+            shape=[batch_size_each * num_steps, 1],
+            dtype='int64',
+            append_batch_size=False)
+
+    init_hidden = layers.data(
+        name="init_hidden",
+        shape=[num_layers, batch_size_each, hidden_size],
+        dtype='float32',
+        append_batch_size=False)
+    init_cell = layers.data(
+        name="init_cell",
+        shape=[num_layers, batch_size_each, hidden_size],
+        dtype='float32',
+        append_batch_size=False)
+
+    init_cell.persistable = True
+    init_hidden.persistable = True
+
+    init_hidden_reshape = layers.reshape(
+        init_hidden, shape=[num_layers, -1, hidden_size])
+    init_cell_reshape = layers.reshape(
+        init_cell, shape=[num_layers, -1, hidden_size])
+
+    x_emb = layers.embedding(
+        input=x,
+        size=[vocab_size, hidden_size],
+        dtype='float32',
+        is_sparse=False,
+        param_attr=fluid.ParamAttr(
+            name='embedding_para',
+            initializer=fluid.initializer.UniformInitializer(
+                low=-init_scale, high=init_scale)))
+
+    x_emb = layers.reshape(
+        x_emb, shape=[-1, num_steps, hidden_size], inplace=True)
+    if dropout != None and dropout > 0.0:
+        x_emb = layers.dropout(
+            x_emb,
+            dropout_prob=dropout,
+            dropout_implementation='upscale_in_train')
+
+    if rnn_model == "padding":
+        rnn_out, last_hidden, last_cell = padding_rnn(
+            x_emb,
+            len=num_steps,
+            init_hidden=init_hidden_reshape,
+            init_cell=init_cell_reshape)
+    elif rnn_model == "static":
+        rnn_out, last_hidden, last_cell = encoder_static(
+            x_emb,
+            len=num_steps,
+            init_hidden=init_hidden_reshape,
+            init_cell=init_cell_reshape)
+    elif rnn_model == "cudnn":
+        x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
+        rnn_out, last_hidden, last_cell = layers.lstm(
+            x_emb,
+            init_hidden_reshape,
+            init_cell_reshape,
+            num_steps,
+            hidden_size,
+            num_layers,
+            is_bidirec=False,
+            default_initializer=fluid.initializer.UniformInitializer(
+                low=-init_scale, high=init_scale))
+        rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2])
+    elif rnn_model == "basic_lstm":
+        rnn_out, last_hidden, last_cell = basic_lstm( x_emb, init_hidden, init_cell, hidden_size, \
+                num_layers=num_layers, batch_first=True, dropout_prob=dropout, \
+                param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) ), \
+                bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) ), \
+                forget_bias = 0.0)
+    else:
+        print("type not support")
+        return
+
+    rnn_out = layers.reshape(
+        rnn_out, shape=[-1, num_steps, hidden_size], inplace=True)
+
+    softmax_weight = layers.create_parameter(
+        [hidden_size, vocab_size],
+        dtype="float32",
+        name="softmax_weight",
+        default_initializer=fluid.initializer.UniformInitializer(
+            low=-init_scale, high=init_scale))
+    softmax_bias = layers.create_parameter(
+        [vocab_size],
+        dtype="float32",
+        name='softmax_bias',
+        default_initializer=fluid.initializer.UniformInitializer(
+            low=-init_scale, high=init_scale))
+
+    projection = layers.matmul(rnn_out, softmax_weight)
+    projection = layers.elementwise_add(projection, softmax_bias)
+    projection = layers.reshape(
+        projection, shape=[-1, vocab_size], inplace=True)
+
+    loss = layers.softmax_with_cross_entropy(
+        logits=projection, label=y, soft_label=False)
+
+    loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True)
+    loss = layers.reduce_mean(loss, dim=[0])
+    loss = layers.reduce_sum(loss)
+
+    loss.persistable = True
+    last_cell.persistable = True
+    last_hidden.persistable = True
+
+    # This will feed last_hidden, last_cell to init_hidden, init_cell, which
+    # can be used directly in next batch. This can avoid the fetching of
+    # last_hidden and last_cell and feeding of init_hidden and init_cell in
+    # each training step.
+    layers.assign(input=last_cell, output=init_cell)
+    layers.assign(input=last_hidden, output=init_hidden)
+
+    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
+    if use_py_reader:
+        return loss, last_hidden, last_cell, feeding_list, py_reader
+    else:
+        return loss, last_hidden, last_cell, feeding_list
+
+
+class EagerDeletionPaddingRnnTest(unittest.TestCase):
+    def setUp(self):
+        self.reader = Reader()
+
+    def prepare_program(self, config):
+        self.main_program = fluid.Program()
+        self.startup_program = fluid.Program()
+        self.startup_program.random_seed = config.random_seed
+        with fluid.program_guard(self.main_program, self.startup_program):
+            with fluid.unique_name.guard():
+                res_vars = lm_model(
+                    config.hidden_size,
+                    config.vocab_size,
+                    config.batch_size,
+                    num_layers=config.num_layers,
+                    num_steps=config.num_steps,
+                    init_scale=config.init_scale,
+                    dropout=config.dropout,
+                    rnn_model=config.rnn_model,
+                    use_py_reader=False)
+                self.loss, self.last_hidden, self.last_cell, self.feed_order = res_vars
+
+                fluid.clip.set_gradient_clip(
+                    clip=fluid.clip.GradientClipByGlobalNorm(
+                        clip_norm=config.max_grad_norm))
+
+                self.learning_rate = fluid.layers.create_global_var(
+                    name="learning_rate",
+                    shape=[1],
+                    value=1.0,
+                    dtype='float32',
+                    persistable=True)
+
+                optimizer = fluid.optimizer.SGD(
+                    learning_rate=self.learning_rate)
+                optimizer.minimize(self.loss)
+        self.exe = Executor(fluid.CPUPlace())
+        self.exe.run(self.startup_program)
+
+        self.device_count = 1
+        exec_strategy = fluid.ExecutionStrategy()
+        exec_strategy.num_threads = self.device_count
+        exec_strategy.num_iteration_per_drop_scope = 100
+
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.enable_inplace = True
+        build_strategy.memory_optimize = False
+        build_strategy.fuse_all_optimizer_ops = True
+
+        self.train_program = fluid.compiler.CompiledProgram(
+            self.main_program).with_data_parallel(
+                loss_name=self.loss.name,
+                build_strategy=build_strategy,
+                exec_strategy=exec_strategy)
+
+    def generate_init_data(self):
+        init_hidden = np.zeros(
+            (self.config.num_layers, self.config.batch_size,
+             self.config.hidden_size),
+            dtype='float32')
+        init_cell = np.zeros(
+            (self.config.num_layers, self.config.batch_size,
+             self.config.hidden_size),
+            dtype='float32')
+        return init_hidden, init_cell
+
+    def generate_new_lr(self, epoch_id=0, device_count=1):
+        new_lr = self.config.base_learning_rate * (self.config.lr_decay**max(
+            epoch_id + 1 - self.config.epoch_start_decay, 0.0))
+        lr = np.ones((self.device_count), dtype='float32') * new_lr
+        return lr
+
+    def prepare_input(self,
+                      batch,
+                      init_hidden=None,
+                      init_cell=None,
+                      epoch_id=0,
+                      with_lr=True,
+                      device_count=1):
+        x, y = batch
+        x = x.reshape((-1, self.config.num_steps, 1))
+        y = y.reshape((-1, 1))
+
+        res = {}
+        res['x'] = x
+        res['y'] = y
+        if init_hidden is not None:
+            res['init_hidden'] = init_hidden
+        if init_cell is not None:
+            res['init_cell'] = init_cell
+        if with_lr:
+            res['learning_rate'] = self.generate_new_lr(epoch_id, device_count)
+        return res
+
+    def train_an_epoch(self, epoch_id, batch_times):
+        train_data_iter = self.reader.get_data_iter(self.config)
+
+        total_loss = 0
+        iters = 0
+
+        init_hidden, init_cell = self.generate_init_data()
+        ppl = np.zeros(shape=(0))
+        for batch_id, batch in enumerate(train_data_iter):
+            input_data_feed = self.prepare_input(
+                batch,
+                init_hidden=init_hidden,
+                init_cell=init_cell,
+                epoch_id=epoch_id,
+                with_lr=True,
+                device_count=self.device_count)
+
+            batch_start_time = time.time()
+            fetch_outs = self.exe.run(self.train_program,
+                                      feed=input_data_feed,
+                                      fetch_list=[
+                                          self.loss.name, "learning_rate",
+                                          self.last_hidden.name,
+                                          self.last_cell.name
+                                      ],
+                                      use_program_cache=True)
+            batch_time = time.time() - batch_start_time
+            batch_times.append(batch_time)
+
+            cost_train = np.array(fetch_outs[0])
+            lr = np.array(fetch_outs[1])
+            init_hidden = np.array(fetch_outs[2])
+            init_cell = np.array(fetch_outs[3])
+
+            total_loss += cost_train
+            iters += self.config.num_steps
+
+            batch_ppl = np.exp(total_loss / iters)
+            ppl = np.append(ppl, batch_ppl)
+        return ppl
+
+    def train(self, config):
+        self.config = config
+        self.prepare_program(config)
+        total_time = 0.0
+        ppl = np.zeros(shape=(0, config.batch_size))
+        for epoch_id in range(config.max_epoch):
+            batch_times = []
+            epoch_start_time = time.time()
+            train_ppl = self.train_an_epoch(epoch_id, batch_times)
+            epoch_time = time.time() - epoch_start_time
+            total_time += epoch_time
+            ppl = np.append(ppl, train_ppl)
+        return ppl
+
+    def compare_padding_static_mode(self):
+        '''
+          Test that train ppl of padding mode is same to that of static mode 
+        '''
+        config = RnnConfig('test', 'padding')
+        with fluid.scope_guard(fluid.Scope()):
+            padding_rnn_ppl = self.train(config)
+        config = RnnConfig('test', 'static')
+        with fluid.scope_guard(fluid.Scope()):
+            static_rnn_ppl = self.train(config)
+        self.assertTrue(
+            np.isclose(
+                padding_rnn_ppl, static_rnn_ppl, rtol=0.001).all())
+
+    def test_padding_mode_no_eager_deletion(self):
+        '''
+           Test that train ppl of padding mode is same to that of static mode without eager deletion
+        '''
+        fluid.core._set_eager_deletion_mode(-1.0, 1.0, True)
+        self.compare_padding_static_mode()
+
+    def test_padding_mode_eager_deletion(self):
+        '''
+          Test that train ppl of padding mode is same to that of static mode under eager deletion
+        '''
+        fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
+        self.compare_padding_static_mode()
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.compiler as compiler
+import paddle.fluid.core as core
+import paddle.fluid.layers as layers
+import unittest
+
+from paddle.fluid import ParamAttr
+from paddle.fluid.framework import Program, grad_var_name
+from paddle.fluid.executor import Executor
+from paddle.fluid.backward import append_backward
+
+np.random.seed(123)
+os.environ["CPU_NUM"] = "1"
+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
+
+
+class PyRNNBase(object):
+    def __init__(self, input_shape, output_shape):
+        self.x = np.ones(shape=input_shape).astype("float32")
+        self.y = np.zeros(shape=output_shape).astype("float32")
+
+    def step(self, step_id, x):
+        raise NotImplementedError
+
+    def forward(self):
+        for step_id in range(self.x.shape[0]):
+            self.step(step_id, self.x[step_id])
+        return np.array([np.mean(self.y)])
+
+    def segment_inputs(self):
+        return [self.x[i] for i in range(self.x.shape[0])]
+
+
+class PySimpleRNN1(PyRNNBase):
+    def __init__(self, input_shape, output_shape):
+        super(PySimpleRNN1, self).__init__(input_shape, output_shape)
+
+        seq_len, batch_size, input_dim = input_shape
+        self.h_boot = np.random.normal(size=(batch_size,
+                                             input_dim)).astype("float32")
+
+        self.scale = 1.0 / 2.0
+        men_dim = (seq_len, batch_size, input_dim)
+        self.mems = np.zeros(shape=men_dim).astype("float32")
+
+    def step(self, step_id, x):
+        if step_id == 0:
+            pre_mem = self.h_boot
+        else:
+            pre_mem = self.mems[step_id - 1]
+        self.mems[step_id] = (pre_mem + x) * self.scale
+        self.y[step_id] = self.mems[step_id]
+
+
+class PySimpleRNN2(PyRNNBase):
+    def __init__(self, input_shape, output_shape):
+        super(PySimpleRNN2, self).__init__(input_shape, output_shape)
+
+        seq_len, batch_size, input_dim = input_shape
+        self.W = np.ones(shape=(input_dim, input_dim)).astype("float32")
+        self.U = np.zeros(shape=(input_dim, input_dim)).astype("float32")
+        self.h_boot = np.ones(shape=(batch_size, input_dim)).astype("float32")
+
+        men_dim = (seq_len, batch_size, input_dim)
+        self.mems = np.zeros(shape=men_dim).astype("float32")
+
+    def step(self, step_id, x):
+        if step_id > 0:
+            pre_mem = self.mems[step_id - 1]
+        else:
+            pre_mem = self.h_boot
+        xW = np.matmul(x, self.W).astype("float32")
+        hU = np.matmul(pre_mem, self.U).astype("float32")
+
+        def py_sigmoid(x):
+            return 1. / (1. + np.exp(-x))
+
+        self.mems[step_id] = py_sigmoid(xW + hU)
+        self.y[step_id] = self.mems[step_id]
+
+
+def create_tensor(np_data, place):
+    tensor = core.LoDTensor()
+    tensor.set(np_data, place)
+    return tensor
+
+
+class EagerDeletionRecurrentOpTest1(unittest.TestCase):
+    '''
+    Test RNNOp
+    equation:
+        h_t = ( x_t + h_{t-1} ) / scale
+    vars:
+        - x
+    memories:
+        - h
+    outputs:
+        - h
+    '''
+
+    input_dim = 2
+    batch_size = 1
+    sent_len = 1
+
+    def setup_program(self):
+        self.main_program = Program()
+        self.startup_program = Program()
+        self.place = core.CPUPlace()
+
+    def setUp(self):
+        self.setup_program()
+        self.data_field = {"x", "h_boot"}
+
+        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape)
+
+        with fluid.program_guard(self.main_program, self.startup_program):
+            self.output = layers.mean(self.create_rnn_op())
+
+    def create_rnn_op(self):
+        x = layers.data(
+            shape=[self.sent_len, self.batch_size, self.input_dim],
+            dtype='float32',
+            name='x',
+            append_batch_size=False)
+        x.stop_gradient = False
+        h_boot = layers.data(
+            shape=[self.input_dim], dtype='float32', name='h_boot')
+        h_boot.stop_gradient = False
+
+        rnn = layers.StaticRNN()
+        with rnn.step():
+            h_pre = rnn.memory(init=h_boot)
+            x_t = rnn.step_input(x)
+
+            h = layers.scale(
+                x=layers.elementwise_add(
+                    x=h_pre, y=x_t),
+                scale=self.py_rnn.scale)
+
+            rnn.update_memory(h_pre, h)
+            rnn.output(h)
+
+        return rnn()
+
+    def forward(self):
+        self.feed_map = {
+            x: create_tensor(getattr(self.py_rnn, x), self.place)
+            for x in self.data_field
+        }
+        exe = Executor(self.place)
+        out = exe.run(self.main_program,
+                      feed=self.feed_map,
+                      fetch_list=[self.output])
+
+        return out[0]
+
+    def backward(self):
+        self.feed_map = {
+            x: create_tensor(getattr(self.py_rnn, x), self.place)
+            for x in self.data_field
+        }
+        fetch_list = [
+            self.main_program.global_block().var(grad_var_name(x))
+            for x in self.data_field
+        ]
+
+        exe = Executor(self.place)
+        return exe.run(self.main_program,
+                       feed=self.feed_map,
+                       fetch_list=fetch_list,
+                       return_numpy=False)
+
+    def test_backward(self, rtol=0.01):
+        self.check_forward()
+
+        with fluid.program_guard(self.main_program, self.startup_program):
+            append_backward(self.output)
+
+        ana_grad = [np.array(x) for x in self.backward()]
+
+        num_grad = self.get_numerical_gradient()
+        for idx, name in enumerate(self.data_field):
+            self.assertEqual(num_grad[idx].shape, ana_grad[idx].shape)
+            self.assertTrue(
+                np.isclose(
+                    num_grad[idx], ana_grad[idx], rtol=rtol).all(),
+                "num_grad (" + name + ") has diff at " + str(self.place) +
+                "\nExpect " + str(num_grad[idx]) + "\n" + "But Got" +
+                str(ana_grad[idx]) + " in class " + self.__class__.__name__)
+
+    def check_forward(self):
+        pd_output = self.forward()
+        py_output = self.py_rnn.forward()
+        self.assertEqual(pd_output.shape, py_output.shape)
+        self.assertTrue(np.isclose(pd_output, py_output, rtol=0.01).all())
+
+    def get_numerical_gradient(self, delta=0.005):
+        dloss_dout = 1.0
+        feed_list = [getattr(self.py_rnn, x) for x in self.data_field]
+        grad_list = [np.zeros_like(x) for x in feed_list]
+        for feed, grad in zip(feed_list, grad_list):
+            for f, g in np.nditer([feed, grad], op_flags=['readwrite']):
+                o = float(f)
+                f[...] = o + delta
+                y_pos = self.forward()
+
+                f[...] = o - delta
+                y_neg = self.forward()
+
+                f[...] = o
+                dout_dfeed = (y_pos - y_neg) / (delta * 2)
+                g[...] = dout_dfeed[0]
+
+        return grad_list
+
+
+class EagerDeletionRecurrentOpTest2(EagerDeletionRecurrentOpTest1):
+    '''
+    Test RNNOp
+    equation:
+        h_t = \sigma (W x_t + U h_{t-1})
+    weights:
+        - W
+        - U
+    vars:
+        - x
+    memories:
+        - h
+    outputs:
+       - h
+    '''
+
+    input_dim = 2
+    batch_size = 10
+    sent_len = 2
+
+    def setUp(self):
+        self.setup_program()
+
+        self.data_field = {"x", "h_boot", "W", "U"}
+
+        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape)
+
+        with fluid.program_guard(self.main_program, self.startup_program):
+            self.output = layers.mean(self.create_rnn_op())
+
+    def create_rnn_op(self):
+        x = layers.data(
+            shape=[self.sent_len, self.batch_size, self.input_dim],
+            dtype='float32',
+            name='x',
+            append_batch_size=False)
+        x.stop_gradient = False
+        h_boot = layers.data(
+            shape=[self.input_dim], dtype='float32', name='h_boot')
+        h_boot.stop_gradient = False
+
+        rnn = layers.StaticRNN()
+        with rnn.step():
+            h_pre = rnn.memory(init=h_boot)
+            x_t = rnn.step_input(x)
+
+            temp_l = layers.fc(
+                input=x_t,
+                size=self.input_dim,
+                param_attr=ParamAttr(
+                    name='W',
+                    initializer=fluid.initializer.ConstantInitializer(1.0)),
+                bias_attr=False)
+            temp_r = layers.fc(
+                input=h_pre,
+                size=self.input_dim,
+                param_attr=ParamAttr(
+                    name='U',
+                    initializer=fluid.initializer.ConstantInitializer(0.0)),
+                bias_attr=False)
+
+            h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r))
+
+            rnn.update_memory(h_pre, h)
+            rnn.output(h)
+
+        return rnn()
+
+    def test_backward(self):
+        super(EagerDeletionRecurrentOpTest2, self).test_backward(rtol=0.01)
+
+
+class EagerDeletionRecurrentOpMultipleMemoryTest(EagerDeletionRecurrentOpTest1):
+    '''
+    Test RNNOp with two memories
+    equation:
+        h_1 = h_pre_1
+        h_2 = h_pre_2
+        y = h_1 + h_2
+    vars:
+        - x
+    memories:
+        - h_1, h_2
+    outputs:
+       - y
+    '''
+
+    class PySimpleRNN3(PyRNNBase):
+        def __init__(self, input_shape, output_shape):
+            super(EagerDeletionRecurrentOpMultipleMemoryTest.PySimpleRNN3,
+                  self).__init__(input_shape, output_shape)
+
+            seq_len, batch_size, input_dim = input_shape
+            self.h_boot1 = np.random.normal(size=(batch_size,
+                                                  input_dim)).astype("float32")
+            self.h_boot2 = np.random.normal(size=(batch_size,
+                                                  input_dim)).astype("float32")
+
+            men_dim = (seq_len, batch_size, input_dim)
+            self.mems1 = np.zeros(shape=men_dim).astype("float32")
+            self.mems2 = np.zeros(shape=men_dim).astype("float32")
+
+        def step(self, step_id, x):
+            if step_id == 0:
+                pre_mem1 = self.h_boot1
+                pre_mem2 = self.h_boot2
+            else:
+                pre_mem1 = self.mems1[step_id - 1]
+                pre_mem2 = self.mems2[step_id - 1]
+            self.mems1[step_id] = pre_mem1
+            self.mems2[step_id] = pre_mem2
+            self.y[step_id] = self.mems1[step_id] + self.mems2[step_id] + x
+
+    input_dim = 1
+    batch_size = 1
+    sent_len = 2
+
+    def setUp(self):
+        self.setup_program()
+
+        self.data_field = {"x", "h_boot1", "h_boot2"}
+
+        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.py_rnn = EagerDeletionRecurrentOpMultipleMemoryTest.PySimpleRNN3(
+            self.input_shape, self.output_shape)
+
+        with fluid.program_guard(self.main_program, self.startup_program):
+            self.output = layers.mean(self.create_rnn_op())
+
+    def create_rnn_op(self):
+        x = layers.data(
+            shape=[self.sent_len, self.batch_size, self.input_dim],
+            dtype='float32',
+            name='x',
+            append_batch_size=False)
+        x.stop_gradient = False
+        h_boot1 = layers.data(
+            shape=[self.batch_size, self.input_dim],
+            dtype='float32',
+            name='h_boot1',
+            append_batch_size=False)
+        h_boot1.stop_gradient = False
+        h_boot2 = layers.data(
+            shape=[self.batch_size, self.input_dim],
+            dtype='float32',
+            name='h_boot2',
+            append_batch_size=False)
+        h_boot2.stop_gradient = False
+
+        rnn = layers.StaticRNN()
+        with rnn.step():
+            h_pre1 = rnn.memory(init=h_boot1)
+            h_pre2 = rnn.memory(init=h_boot2)
+            x_t = rnn.step_input(x)
+
+            mem1 = layers.scale(x=h_pre1, scale=1.0)
+            mem2 = layers.scale(x=h_pre2, scale=1.0)
+            out = layers.sums(input=[mem1, x_t, mem2])
+
+            rnn.update_memory(h_pre1, mem1)
+            rnn.update_memory(h_pre2, mem2)
+            rnn.output(out)
+
+        return rnn()
+
+
+class EagerDeletionRecurrentOpNoMemBootTest(EagerDeletionRecurrentOpTest1):
+    '''
+    Test RNNOp without memory boot
+    equation:
+        mem = x + mem_pre
+        y = mem
+    vars:
+        - x
+    memories:
+        - mem
+    outputs:
+       - y
+    '''
+
+    class PySimpleRNN4(PyRNNBase):
+        def __init__(self, input_shape, output_shape):
+            super(EagerDeletionRecurrentOpNoMemBootTest.PySimpleRNN4,
+                  self).__init__(input_shape, output_shape)
+            men_dim = input_shape
+            self.mems = np.zeros(shape=men_dim).astype("float32")
+
+        def step(self, step_id, x):
+            if step_id == 0:
+                pre_mem = np.zeros_like(x)
+            else:
+                pre_mem = self.mems[step_id - 1]
+            self.mems[step_id] = pre_mem + x
+            self.y[step_id] = self.mems[step_id]
+
+    input_dim = 1
+    batch_size = 1
+    sent_len = 2
+
+    def setUp(self):
+        self.setup_program()
+
+        self.data_field = {"x"}
+
+        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.py_rnn = EagerDeletionRecurrentOpNoMemBootTest.PySimpleRNN4(
+            self.input_shape, self.output_shape)
+
+        with fluid.program_guard(self.main_program, self.startup_program):
+            self.output = layers.mean(self.create_rnn_op())
+
+    def create_rnn_op(self):
+        x = layers.data(
+            shape=[self.sent_len, self.batch_size, self.input_dim],
+            dtype='float32',
+            name='x',
+            append_batch_size=False)
+        x.stop_gradient = False
+
+        rnn = layers.StaticRNN()
+        with rnn.step():
+            mem_pre = rnn.memory(shape=[-1, self.input_dim], batch_ref=x)
+            x_t = rnn.step_input(x)
+            mem = layers.elementwise_add(x=mem_pre, y=x_t)
+            rnn.update_memory(mem_pre, mem)
+            rnn.output(mem)
+
+        return rnn()
+
+
+class EagerDeletionTwoRecurrentOpsTest(EagerDeletionRecurrentOpTest1):
+    '''
+    Test RNNOp with two recurrent ops
+    equation:
+        first_rnn:
+            mem_inside = x + mem_pre_inside
+            first_inside_out = mem_inside
+        second_rnn:
+            mem = x + reduce_sum(rnn_inside_out)
+            y = mem + mem_pre
+    vars:
+        - x
+    memories:
+        - mem_inside
+        - mem
+    outputs:
+       - y
+    '''
+
+    class PySimpleRNN5(PyRNNBase):
+        def __init__(self, input_shape, output_shape):
+            super(EagerDeletionTwoRecurrentOpsTest.PySimpleRNN5,
+                  self).__init__(input_shape, output_shape)
+            self.mem_0 = np.zeros(shape=input_shape).astype("float32")
+            self.mem_1 = np.zeros(shape=input_shape).astype("float32")
+            self.rnn_0_output = np.zeros(shape=input_shape).astype("float32")
+
+        def step(self, step_id, x):
+            # First Rnn
+            for step in range(self.x.shape[0]):
+                x_t = self.x[step]
+                pre_mem = np.zeros_like(x_t) if step == 0 else self.mem_0[step -
+                                                                          1]
+                self.mem_0[step] = x_t + pre_mem
+                self.rnn_0_output[step] = self.mem_0[step]
+            # Second RNN
+            pre_mem = np.zeros_like(x) if step_id == 0 else self.mem_1[step_id -
+                                                                       1]
+            self.mem_1[step_id] = x + np.sum(self.rnn_0_output)
+            self.y[step_id] = self.mem_1[step_id] + pre_mem
+
+    input_dim = 1
+    batch_size = 1
+    sent_len = 1
+
+    def setUp(self):
+        self.setup_program()
+
+        self.data_field = {"x"}
+
+        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.py_rnn = EagerDeletionTwoRecurrentOpsTest.PySimpleRNN5(
+            self.input_shape, self.output_shape)
+
+        with fluid.program_guard(self.main_program, self.startup_program):
+            self.output = layers.mean(self.create_rnn_op())
+
+    def create_rnn_op(self):
+        x = layers.data(
+            shape=[self.sent_len, self.batch_size, self.input_dim],
+            dtype='float32',
+            name='x',
+            append_batch_size=False)
+        x.stop_gradient = False
+
+        rnn_0 = layers.StaticRNN()
+        with rnn_0.step():
+            x_t = rnn_0.step_input(x)
+            mem_pre = rnn_0.memory(shape=[-1, self.input_dim], batch_ref=x)
+            mem = layers.elementwise_add(x=mem_pre, y=x_t)
+            rnn_0.update_memory(mem_pre, mem)
+            rnn_0.output(mem)
+
+        rnn_1 = layers.StaticRNN()
+        with rnn_1.step():
+            mem_pre = rnn_1.memory(shape=[-1, self.input_dim], batch_ref=x)
+            x_t = rnn_1.step_input(x)
+            last_rnn_output = rnn_0()
+            last_rnn_sum = fluid.layers.reduce_sum(last_rnn_output)
+            mem = layers.elementwise_add(x=x_t, y=last_rnn_sum)
+            y = layers.elementwise_add(x=mem_pre, y=mem)
+            rnn_1.update_memory(mem_pre, mem)
+            rnn_1.output(y)
+        return rnn_1()
+
+
+class EagerDeletionRecurrentOpParallelExecutorTest(
+        EagerDeletionRecurrentOpTest1):
+    '''
+    Test RNNOp with ParallelExecutor
+    equation:
+        h_t = ( x_t + h_{t-1} ) / scale
+    vars:
+        - x
+    memories:
+        - h
+    outputs:
+        - h
+    '''
+
+    def forward(self):
+        self.feed_map = {
+            x: create_tensor(getattr(self.py_rnn, x), self.place)
+            for x in self.data_field
+        }
+
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.enable_inplace = True
+        exec_strategy = fluid.ExecutionStrategy()
+        parallel_exe = fluid.ParallelExecutor(
+            use_cuda=False,
+            main_program=self.main_program,
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy)
+        out = parallel_exe.run(feed=self.feed_map, fetch_list=[self.output])
+        return out[0]
+
+    def backward(self):
+        self.feed_map = {
+            x: create_tensor(getattr(self.py_rnn, x), self.place)
+            for x in self.data_field
+        }
+        fetch_list = [
+            self.main_program.global_block().var(grad_var_name(x))
+            for x in self.data_field
+        ]
+
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.enable_inplace = True
+        exec_strategy = fluid.ExecutionStrategy()
+        parallel_exe = fluid.ParallelExecutor(
+            use_cuda=False,
+            main_program=self.main_program,
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy)
+        return parallel_exe.run(feed=self.feed_map,
+                                fetch_list=fetch_list,
+                                return_numpy=False)
+
+
+class EagerDeletionFarwardOnlyRnnAndBackwardRnnTest(
+        EagerDeletionRecurrentOpTest1):
+    '''
+      Test one forward only RNN and one backward RNN in one program
+    '''
+
+    def setUp(self):
+        self.setup_program()
+        self.data_field = {"x", "h_boot"}
+
+        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape)
+
+        with fluid.program_guard(self.main_program, self.startup_program):
+            x = layers.data(
+                shape=[self.sent_len, self.batch_size, self.input_dim],
+                dtype='float32',
+                name='x',
+                append_batch_size=False)
+            x.stop_gradient = False
+            h_boot = layers.data(
+                shape=[self.input_dim], dtype='float32', name='h_boot')
+            h_boot.stop_gradient = False
+
+            forward_only_rnn = layers.StaticRNN()
+            with forward_only_rnn.step():
+                h_pre = forward_only_rnn.memory(init=h_boot)
+                x_t = forward_only_rnn.step_input(x)
+
+                h = layers.scale(
+                    x=layers.elementwise_add(
+                        x=h_pre, y=x_t),
+                    scale=self.py_rnn.scale)
+
+                forward_only_rnn.update_memory(h_pre, h)
+                forward_only_rnn.output(h)
+            forward_only_output = forward_only_rnn()
+            forward_only_output.stop_gradient = True
+            self.forward_only_output = layers.mean(forward_only_output)
+
+            rnn = layers.StaticRNN()
+            with rnn.step():
+                h_pre = rnn.memory(init=h_boot)
+                x_t = rnn.step_input(x)
+
+                h = layers.scale(
+                    x=layers.elementwise_add(
+                        x=h_pre, y=x_t),
+                    scale=self.py_rnn.scale)
+
+                rnn.update_memory(h_pre, h)
+                rnn.output(h)
+
+            self.output = layers.mean(rnn())
+
+    def forward_two_rnn(self):
+        self.feed_map = {
+            x: create_tensor(getattr(self.py_rnn, x), self.place)
+            for x in self.data_field
+        }
+        exe = Executor(self.place)
+        out = exe.run(self.main_program,
+                      feed=self.feed_map,
+                      fetch_list=[self.forward_only_output, self.output])
+
+        return out[0], out[1]
+
+    def check_forward(self):
+        forward_only_output, pd_output = self.forward_two_rnn()
+        py_output = self.py_rnn.forward()
+        self.assertEqual(forward_only_output.shape, py_output.shape)
+        self.assertEqual(pd_output.shape, py_output.shape)
+        self.assertTrue(
+            np.isclose(
+                forward_only_output, py_output, rtol=0.01).all)
+        self.assertTrue(np.isclose(pd_output, py_output, rtol=0.01).all())
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
@@ -17,11 +17,13 @@ from __future__ import print_function
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
+import numpy as np
+import paddle.fluid.core as core
+
+from paddle.fluid import ParamAttr
 from paddle.fluid.framework import Program, grad_var_name
 from paddle.fluid.executor import Executor
 from paddle.fluid.backward import append_backward
-import numpy as np
-import paddle.fluid.core as core


 class PyRNNBase(object):
@@ -67,8 +69,8 @@ class PySimpleRNN2(PyRNNBase):
        super(PySimpleRNN2, self).__init__(input_shape, output_shape)

        seq_len, batch_size, input_dim = input_shape
-        self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32")
-        self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.W = np.ones(shape=(input_dim, input_dim)).astype("float32")
+        self.U = np.zeros(shape=(input_dim, input_dim)).astype("float32")
        self.h_boot = np.ones(shape=(batch_size, input_dim)).astype("float32")

        men_dim = (seq_len, batch_size, input_dim)
@@ -182,7 +184,7 @@ class RecurrentOpTest1(unittest.TestCase):
                       fetch_list=fetch_list,
                       return_numpy=False)

-    def test_backward(self, rtol=0.1):
+    def test_backward(self, rtol=0.01):
        self.check_forward()

        with fluid.program_guard(self.main_program, self.startup_program):
@@ -204,7 +206,7 @@ class RecurrentOpTest1(unittest.TestCase):
        pd_output = self.forward()
        py_output = self.py_rnn.forward()
        self.assertEqual(pd_output.shape, py_output.shape)
-        self.assertTrue(np.isclose(pd_output, py_output, rtol=0.1).all())
+        self.assertTrue(np.isclose(pd_output, py_output, rtol=0.01).all())

    def get_numerical_gradient(self, delta=0.005):
        dloss_dout = 1.0
@@ -274,13 +276,19 @@ class RecurrentOpTest2(RecurrentOpTest1):
            h_pre = rnn.memory(init=h_boot)
            x_t = rnn.step_input(x)

-            temp_l = layers.fc(input=x_t,
+            temp_l = layers.fc(
+                input=x_t,
                size=self.input_dim,
-                               param_attr='W',
+                param_attr=ParamAttr(
+                    name='W',
+                    initializer=fluid.initializer.ConstantInitializer(1.0)),
                bias_attr=False)
-            temp_r = layers.fc(input=h_pre,
+            temp_r = layers.fc(
+                input=h_pre,
                size=self.input_dim,
-                               param_attr='U',
+                param_attr=ParamAttr(
+                    name='U',
+                    initializer=fluid.initializer.ConstantInitializer(0.0)),
                bias_attr=False)

            h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r))
@@ -291,7 +299,7 @@ class RecurrentOpTest2(RecurrentOpTest1):
        return rnn()

    def test_backward(self):
-        super(RecurrentOpTest2, self).test_backward(rtol=0.2)
+        super(RecurrentOpTest2, self).test_backward(rtol=0.01)


 class RecurrentOpMultipleMemoryTest(RecurrentOpTest1):