diff --git a/doc/design/block.md b/doc/design/block.md
index 7cbf0d55b1faeb2093ee7cf234d1c2ad1905885b..4066122c0e8dfa33776796c3d205ba5aec9e0f52 100644
--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -189,7 +189,7 @@ OpDesc {
   inputs = {0} // the index of x in vars of BlockDesc above
   outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
   attrs {
-    "memories" : {1} // the index of h
+    "states" : {1} // the index of h
     "step_net" : <above step net>
   }
 };
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index fb552fe3448b3f17e97e1262b5c9a0842f68f8b9..1ae7fb60f01e4925ceb310f661171eb231eb6c96 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -21,6 +21,7 @@
 
 #include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/dynamic_recurrent_op.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
 
@@ -220,8 +221,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
     // process recurrent gradient op as a special operator.
     if (forwardOp.Type() == "recurrent") {
       // NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
-      // or
-      // this will result in infinite loop.
+      // or this will result in infinite loop.
       const auto& rnnop =
           *static_cast<const operators::RecurrentOp*>(&forwardOp);
       auto rnn_grad_op =
@@ -231,6 +231,18 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
       // create stepnet's gradient op
       rnn_grad_op->set_stepnet(
           BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
+    } else if (forwardOp.Type() == "dynamic_recurrent") {
+      // NOTE clean up cycle call somewhere (RNN's stepnet contains itself),
+      // or this will result in infinite loop.
+      const auto& rnnop =
+          *static_cast<const operators::DynamicRecurrentOp*>(&forwardOp);
+      auto rnn_grad_op =
+          static_cast<operators::DynamicRecurrentGradientOp*>(grad_op.get());
+      const auto& stepnet_op =
+          *static_cast<const OperatorBase*>(&rnnop.rnn.GetStepUnit());
+      // create stepnet's gradient op
+      rnn_grad_op->rnn.SetStepUnit(
+          BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
     }
 
     if (net->ops_.empty()) {  // Current no aux op is added to network
diff --git a/paddle/operators/dynamic_recurrent_op.cc b/paddle/operators/dynamic_recurrent_op.cc
index 62962be205c10458634411b060caa12890c5fdc9..dce8c8d835679595060f21b81301eb26defe7d04 100644
--- a/paddle/operators/dynamic_recurrent_op.cc
+++ b/paddle/operators/dynamic_recurrent_op.cc
@@ -23,6 +23,7 @@ using framework::Scope;
 using framework::TensorArray;
 using framework::LoDTensor;
 using framework::Variable;
+using framework::OperatorBase;
 using framework::DySeqMetaBatch;
 
 namespace detail {
@@ -43,10 +44,9 @@ inline void CreateVariables(Scope& scope,
  * be reordered, but the RNN op should not change the `boot_state` as an input
  * variable's content.
  */
-template <typename T>
-inline void ReorderBootState(const DySeqMetaBatch& metas,
-                             const LoDTensor& boot_state, LoDTensor* tensor,
-                             const platform::Place& dst_place) {
+inline void ReorderInitialState(const DySeqMetaBatch& metas,
+                                const LoDTensor& boot_state, LoDTensor* tensor,
+                                const platform::Place& dst_place) {
   for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) {
     auto slice = tensor->Slice(seq_id, seq_id + 1);
     auto boot_slice =
@@ -56,58 +56,60 @@ inline void ReorderBootState(const DySeqMetaBatch& metas,
   }
 }
 
-}  // namespace detail
-
-class DynamicRecurrentOpProtoAndCheckerMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto,
-                                         framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    const auto& name = DynamicRecurrentOp::kArgName;
-    // inputs and outputs stored in proto
-    AddInput(name.inlinks,
-             "the inputs that need to be segmented for each step.")
-        .AsDuplicable();
-    AddInput(name.boot_memories, "variables to initialize memories.")
-        .AsDuplicable();
-
-    AddOutput(name.outlinks, "the outputs that need to concated for all steps.")
-        .AsDuplicable();
-    AddOutput(name.step_scopes, "step scopes");
-
-    // Attributes stored in AttributeMap
-    AddAttr<std::vector<std::string>>(name.pre_memories,
-                                      "names of pre-memories");
-    AddAttr<std::vector<std::string>>(name.memories, "names of memories");
-
-    AddComment("This is a RNN operator for varience-length sequences.");
+inline void RestoreInitialState(const DySeqMetaBatch& metas,
+                                const LoDTensor& tensor, LoDTensor* boot_state,
+                                const platform::Place& dst_place) {
+  for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) {
+    auto slice = tensor.Slice(seq_id, seq_id + 1);
+    auto boot_slice =
+        boot_state->Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
+    boot_slice.CopyFrom(slice, dst_place, platform::CPUDeviceContext());
   }
-};
+}
 
-void DynamicRecurrentOp::Run(const Scope& scope,
-                             const platform::DeviceContext& dev_ctx) const {
-  cache_.Init(kArgName, *this, scope, &arg_);
+}  // namespace detail
+
+// Implementation for forward propagation.
+template <>
+void RNNAlgorithm::Run<RNNAlgorithm::ComputeMode::kForward>(
+    const framework::Scope& scope, const framework::OperatorBase& op,
+    const platform::DeviceContext& dev_ctx) {
+  SetComputeMode(ComputeMode::kForward);
+  cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_);
   SplitInputs();
   CreateScopes();
   WriteStepInputs();
   InitStates();
   WriteStepOutputs();
+  RunSteps();
+  ConcatOutputs();
+}
 
-  // call stepnet in all the time steps
-  for (size_t step = 0; step < cache_.num_steps; step++) {
-    auto& step_scope = cache_.GetScope(step);
-    stepnet_->Run(step_scope, dev_ctx);
+// Implementation for backward propagation.
+template <>
+void RNNAlgorithm::Run<RNNAlgorithm::ComputeMode::kBackward>(
+    const framework::Scope& scope, const framework::OperatorBase& op,
+    const platform::DeviceContext& dev_ctx) {
+  SetComputeMode(ComputeMode::kBackward);
+  cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_);
+  SplitInputs();
+  WriteStepInputs();
+  InitStates();
+  WriteStepOutputs();
+  RunSteps();
+  // copy boot-states' gradients back.
+  for (const auto& state : arg_.states) {
+    ExportInitialStateGradient(state);
   }
 
   ConcatOutputs();
 }
 
-void DynamicRecurrentOp::SplitInputs() const {
+void RNNAlgorithm::SplitInputs() {
   // TODO(superjom) make level a config
   // TODO(superjom) check all the inputs has the same LoD
   int level = 0;
-  for (const auto& item : cache_.inlinks) {
+  for (const auto& item : cache_.inputs) {
     const auto& var = item.second;
     const auto& tensor = var->Get<LoDTensor>();
     TensorArray& ta = step_inputs_[item.first];
@@ -124,8 +126,8 @@ void DynamicRecurrentOp::SplitInputs() const {
   }
 }
 
-void DynamicRecurrentOp::WriteStepInputs() const {
-  for (const auto& item : cache_.inlinks) {
+void RNNAlgorithm::WriteStepInputs() {
+  for (const auto& item : cache_.inputs) {
     auto ta_it = step_inputs_.find(item.first);
     PADDLE_ENFORCE(ta_it != step_inputs_.end(),
                    "step_inputs_ not compatible with memory set");
@@ -142,15 +144,15 @@ void DynamicRecurrentOp::WriteStepInputs() const {
   }
 }
 
-void DynamicRecurrentOp::WriteStepOutputs() const {
+void RNNAlgorithm::WriteStepOutputs() {
   // initialize step outputs
-  for (const auto& item : cache_.outlinks) {
+  for (const auto& item : cache_.outputs) {
     step_outputs_.emplace(item.first, TensorArray());
   }
   PADDLE_ENFORCE_GT(step_outputs_.size(), 0UL);
 }
 
-void DynamicRecurrentOp::CreateScopes() const {
+void RNNAlgorithm::CreateScopes() {
   PADDLE_ENFORCE_GT(cache_.num_steps, 0);
   // resize scopes
   size_t num_scopes_need_create = cache_.num_steps - cache_.scopes->size();
@@ -159,19 +161,19 @@ void DynamicRecurrentOp::CreateScopes() const {
   }
 
   // init temporary inputs
-  PADDLE_ENFORCE_NOT_NULL(stepnet_, "stepnet should be set first");
-  std::vector<std::string> memories;
-  std::vector<std::string> pre_memories;
-  std::vector<std::string> stepnet_outputs;
-  std::transform(arg_.memories.begin(), arg_.memories.end(),
-                 std::back_inserter(memories),
-                 [](const rnn::MemoryAttr& m) { return m.var; });
-  std::transform(arg_.memories.begin(), arg_.memories.end(),
-                 std::back_inserter(pre_memories),
-                 [](const rnn::MemoryAttr& m) { return m.pre_var; });
-  for (const auto& item : stepnet_->Outputs()) {
+  PADDLE_ENFORCE_NOT_NULL(step_unit_, "stepnet should be set first");
+  std::vector<std::string> states;
+  std::vector<std::string> ex_states;
+  std::vector<std::string> step_unit_outputs;
+  std::transform(arg_.states.begin(), arg_.states.end(),
+                 std::back_inserter(states),
+                 [](const rnn::StateAttr& m) { return m.var; });
+  std::transform(arg_.states.begin(), arg_.states.end(),
+                 std::back_inserter(ex_states),
+                 [](const rnn::StateAttr& m) { return m.pre_var; });
+  for (const auto& item : step_unit_->Outputs()) {
     for (const auto& var : item.second) {
-      stepnet_outputs.push_back(var);
+      step_unit_outputs.push_back(var);
     }
   }
 
@@ -179,13 +181,13 @@ void DynamicRecurrentOp::CreateScopes() const {
     auto& scope = cache_.GetScope(step);
     detail::CreateVariables(scope, arg_.inlinks);
     detail::CreateVariables(scope, arg_.outlinks);
-    detail::CreateVariables(scope, memories);
-    detail::CreateVariables(scope, pre_memories);
-    detail::CreateVariables(scope, stepnet_outputs);
+    detail::CreateVariables(scope, states);
+    detail::CreateVariables(scope, ex_states);
+    detail::CreateVariables(scope, step_unit_outputs);
   }
 }
 
-void DynamicRecurrentOp::ConcatOutputs() const {
+void RNNAlgorithm::ConcatOutputs() {
   // TODO(superjom) transform this to a config
   int level = 0;
   for (size_t step = 0; step < cache_.num_steps; step++) {
@@ -198,31 +200,45 @@ void DynamicRecurrentOp::ConcatOutputs() const {
       item.second.WriteShared(step, *tensor);
     }
   }
-  // the inlinks' lods should be the same, so randomly get one lod.
+  // the inputs' lods should be the same, so randomly get one lod.
   const auto& some_lod =
       cache_.scope->FindVar(arg_.inlinks.front())->Get<LoDTensor>().lod();
   const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
   for (auto& item : step_outputs_) {
     auto tensor = item.second.Pack(level, some_meta, some_lod);
-    auto* output = cache_.outlinks[item.first]->GetMutable<LoDTensor>();
+    auto* output = cache_.outputs[item.first]->GetMutable<LoDTensor>();
     const_cast<LoDTensor*>(output)->ShareDataWith(tensor);
   }
 }
 
-void DynamicRecurrentOp::InitStates() const {
+void RNNAlgorithm::RunSteps() {
+  if (IsBackward()) {
+    // call stepnet in all the time steps in reverse order
+    for (int step = cache_.num_steps - 1; step >= 0; step--) {
+      auto& step_scope = cache_.GetScope(step);
+      step_unit_->Run(step_scope, *cache_.dev_ctx);
+    }
+  } else {
+    for (size_t step = 0; step < cache_.num_steps; step++) {
+      auto& step_scope = cache_.GetScope(step);
+      step_unit_->Run(step_scope, *cache_.dev_ctx);
+    }
+  }
+}
+
+void RNNAlgorithm::InitStates() {
   for (size_t step = 0; step < cache_.num_steps; step++) {
-    for (const auto& memory : arg_.memories) {
-      CreateState(memory, step);
-      LinkState(memory, step);
+    for (const auto& state : arg_.states) {
+      CreateState(state, step);
+      LinkState(state, step);
     }
   }
 }
 
-void DynamicRecurrentOp::CreateState(const rnn::MemoryAttr& memory,
-                                     size_t step) const {
+void RNNAlgorithm::CreateState(const rnn::StateAttr& state_attr, size_t step) {
   auto& scope = cache_.GetScope(step);
-  auto& state = *cache_.GetTensor(scope, memory.var);
-  auto& boot_state = *cache_.GetTensor(*cache_.scope, memory.boot_var);
+  auto& state = *cache_.GetTensor(scope, state_attr.var);
+  auto& boot_state = *cache_.GetTensor(*cache_.scope, state_attr.boot_var);
 
   size_t num_instances =
       step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
@@ -231,56 +247,79 @@ void DynamicRecurrentOp::CreateState(const rnn::MemoryAttr& memory,
 
   state.Resize(dims);
   state.mutable_data<value_type>(platform::CPUPlace());
-  states_[memory.var].WriteShared(step, state);
+  states_[state_attr.var].WriteShared(step, state);
 }
 
-void DynamicRecurrentOp::LinkState(const rnn::MemoryAttr& memory,
-                                   size_t step) const {
+void RNNAlgorithm::LinkState(const rnn::StateAttr& state, size_t step) {
   auto& scope = cache_.GetScope(step);
-  auto& state_pre = *cache_.GetTensor(scope, memory.pre_var);
+  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
+
+  // Step 0 has no previous step-scope to share a state from. In forward mode
+  // its `pre-state` is linked to the boot-state instead. In backward mode
+  // nothing is linked here: Run<kBackward> copies the gradient of `pre-state`
+  // in the first time step to the gradient of `boot-state` afterwards, and
+  // falling through to `step - 1` would underflow size_t.
+  if (step == 0) {
+    if (IsForward()) LinkInitialState(state);
+  } else {
+    size_t num_instances =
+        step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
+    auto* pre_state = cache_.GetTensor(cache_.GetScope(step - 1), state.var);
+    // shrink and share from previous state
+    auto shrinked_pre_state = pre_state->Slice(0, num_instances);
+    state_pre.ShareDataWith(shrinked_pre_state);
+  }
+}
 
+void RNNAlgorithm::LinkInitialState(const rnn::StateAttr& state) {
   // all the step_inputs' metas should be the same, just randomly select one
   // and get the dyseq meta.
   const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
-  size_t num_instances =
-      step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
+  auto& scope = cache_.GetScope(0);
+  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
+  auto* pre_state = cache_.GetTensor(*cache_.scope, state.boot_var);
+  pre_state->mutable_data<float>(platform::CPUPlace());
+  // allocate state
+  state_pre.Resize(pre_state->dims());
+  state_pre.mutable_data<value_type>(platform::CPUPlace());
+  detail::ReorderInitialState(some_meta, *pre_state, &state_pre,
+                              pre_state->place());
+}
 
-  LoDTensor* pre_state{nullptr};
-  if (step == 0) {
-    pre_state = cache_.GetTensor(*cache_.scope, memory.boot_var);
-    pre_state->mutable_data<float>(platform::CPUPlace());
-    // allocate memory
-    state_pre.Resize(pre_state->dims());
-    state_pre.mutable_data<value_type>(platform::CPUPlace());
-    detail::ReorderBootState<value_type>(some_meta, *pre_state, &state_pre,
-                                         pre_state->place());
-  } else {
-    pre_state = cache_.GetTensor(cache_.GetScope(step - 1), memory.var);
-  }
+void RNNAlgorithm::ExportInitialStateGradient(const rnn::StateAttr& state) {
+  // all the step_inputs' metas should be the same, just randomly select one
+  // and get the dyseq meta.
+  const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
+  auto& scope = cache_.GetScope(0);
 
-  // shink and share from previous state
-  auto shrinked_pre_state = pre_state->Slice(0, num_instances);
-  state_pre.ShareDataWith(shrinked_pre_state);
+  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
+  auto& pre_state = *cache_.GetTensor(*cache_.scope, state.boot_var);
+  pre_state.Resize(state_pre.dims());
+  detail::RestoreInitialState(some_meta, state_pre, &pre_state,
+                              pre_state.place());
 }
 
-void DynamicRecurrentOp::ArgCache::Init(
-    const rnn::ArgumentName& name, const paddle::framework::OperatorBase& op,
-    const paddle::framework::Scope& scope, rnn::Argument* arg) {
+void RNNAlgorithm::ArgCache::Init(const rnn::ArgumentName& name,
+                                  const paddle::framework::OperatorBase& op,
+                                  const paddle::framework::Scope& scope,
+                                  platform::DeviceContext const* dev_ctx,
+                                  rnn::Argument* arg) {
   this->scope = &scope;
   InitArgument(name, op, arg);
   CacheScopes(scope, *arg);
   CacheInlinks(scope, arg->inlinks);
   CacheOutlinks(scope, arg->outlinks);
+  this->dev_ctx = dev_ctx;
 }
 
-void DynamicRecurrentOp::ArgCache::InitArgument(const rnn::ArgumentName& name,
-                                                const OperatorBase& op,
-                                                rnn::Argument* arg) {
+void RNNAlgorithm::ArgCache::InitArgument(const rnn::ArgumentName& name,
+                                          const OperatorBase& op,
+                                          rnn::Argument* arg) {
   rnn::InitArgument(name, arg, op, false /*is_grad*/);
 }
 
-void DynamicRecurrentOp::ArgCache::CacheScopes(const Scope& scope,
-                                               const rnn::Argument& arg) {
+void RNNAlgorithm::ArgCache::CacheScopes(const Scope& scope,
+                                         const rnn::Argument& arg) {
   auto scopes_var = scope.FindVar(arg.step_scopes);
   PADDLE_ENFORCE(scopes_var != nullptr,
                  "the step_scopes output argument [%s] should be created first "
@@ -289,45 +328,85 @@ void DynamicRecurrentOp::ArgCache::CacheScopes(const Scope& scope,
   this->scopes = scopes_var->GetMutable<std::vector<Scope*>>();
 }
 
-void DynamicRecurrentOp::ArgCache::CacheInlinks(
+void RNNAlgorithm::ArgCache::CacheInlinks(
     const Scope& scope, const std::vector<std::string>& names) {
   for (auto name : names) {
     auto* var = GetVariable(scope, name);
-    inlinks[name] = var;
+    inputs[name] = var;
   }
 }
 
-void DynamicRecurrentOp::ArgCache::CacheOutlinks(
+void RNNAlgorithm::ArgCache::CacheOutlinks(
     const Scope& scope, const std::vector<std::string>& names) {
   for (auto name : names) {
     auto* var = GetVariable(scope, name);
-    outlinks[name] = var;
+    outputs[name] = var;
   }
 }
 
-Variable* DynamicRecurrentOp::ArgCache::GetVariable(const Scope& scope,
-                                                    const std::string& name) {
+Variable* RNNAlgorithm::ArgCache::GetVariable(const Scope& scope,
+                                              const std::string& name) {
   auto* var = scope.FindVar(name);
   PADDLE_ENFORCE_NOT_NULL(var, "variable [%s] not exist in scope", name);
   return var;
 }
 
-LoDTensor* DynamicRecurrentOp::ArgCache::GetTensor(
-    const framework::Scope& scope, const std::string& name) {
+LoDTensor* RNNAlgorithm::ArgCache::GetTensor(const framework::Scope& scope,
+                                             const std::string& name) {
   auto* var = GetVariable(scope, name);
   return var->GetMutable<LoDTensor>();
 }
 
-const rnn::ArgumentName DynamicRecurrentOp::kArgName{
-    "step_net", "step_scopes",  "inlinks",      "outlinks",
-    "memories", "pre_memories", "boot_memories"};
+const std::array<rnn::ArgumentName, 2> RNNAlgorithm::kArgNames{
+    rnn::ArgumentName{"step_unit", "step_scopes", "inputs", "outputs", "states",
+                      "ex_states", "initial_states"},
+    rnn::ArgumentName{"step_unit", "step_scopes@GRAD", "outputs@GRAD",
+                      "inputs@GRAD", "states", "ex_states",
+                      "initial_states@GRAD"}};
+
+void DynamicRecurrentOp::Run(const framework::Scope& scope,
+                             const platform::DeviceContext& dev_ctx) const {
+  rnn.Run<RNNAlgorithm::ComputeMode::kForward>(
+      scope, *dynamic_cast<const OperatorBase*>(this), dev_ctx);
+}
 
 void DynamicRecurrentGradientOp::Run(
-    const Scope& scope, const platform::DeviceContext& dev_ctx) const {}
+    const Scope& scope, const platform::DeviceContext& dev_ctx) const {
+  rnn.Run<RNNAlgorithm::ComputeMode::kBackward>(
+      scope, *dynamic_cast<const OperatorBase*>(this), dev_ctx);
+}
+
+class DynamicRecurrentOpProtoAndCheckerMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto,
+                                         framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    const auto& name =
+        RNNAlgorithm::kArgNames[RNNAlgorithm::ComputeMode::kForward];
+    // inputs and outputs stored in proto
+    AddInput(name.inlinks,
+             "the inputs that need to be segmented for each step.")
+        .AsDuplicable();
+    AddInput(name.initial_states, "variables to initialize states.")
+        .AsDuplicable();
+
+    AddOutput(name.outlinks, "the outputs that need to concated for all steps.")
+        .AsDuplicable();
+    AddOutput(name.step_scopes, "step scopes");
+
+    // Attributes stored in AttributeMap
+    AddAttr<std::vector<std::string>>(name.ex_states, "names of ex_states");
+    AddAttr<std::vector<std::string>>(name.states, "names of states");
+
+    AddComment("This is a RNN operator for varience-length sequences.");
+  }
+};
 
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_WITHOUT_GRADIENT(
-    dynamic_recurrent, paddle::operators::DynamicRecurrentOp,
-    paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker);
+REGISTER_OP(dynamic_recurrent, paddle::operators::DynamicRecurrentOp,
+            paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker,
+            dynamic_recurrent_grad,
+            paddle::operators::DynamicRecurrentGradientOp);
diff --git a/paddle/operators/dynamic_recurrent_op.h b/paddle/operators/dynamic_recurrent_op.h
index ec80a1c90eee3a655febe0dd3d6c67c16ec6c64b..5b0548c3a44c9f58838ecc567ee41a587883c26a 100644
--- a/paddle/operators/dynamic_recurrent_op.h
+++ b/paddle/operators/dynamic_recurrent_op.h
@@ -27,47 +27,39 @@
 namespace paddle {
 namespace operators {
 
-class DynamicRecurrentOp : public framework::OperatorBase {
+class RNNAlgorithm {
  public:
-  static const rnn::ArgumentName kArgName;
+  enum ComputeMode { kForward = 0, kBackward = 1 };
+  static const std::array<rnn::ArgumentName, 2> kArgNames;
   using value_type = float;
 
-  DynamicRecurrentOp(const std::string& type,
-                     const framework::VariableNameMap& inputs,
-                     const framework::VariableNameMap& outputs,
-                     const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  DynamicRecurrentOp(const DynamicRecurrentOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    // TODO(yuyang18): Implement copy ctor well.
-    PADDLE_THROW("Not implemented");
-  }
-
-  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override;
-
+  /*
+   * Different `Run` method for forward and backward, `_` is just for template
+   * specialization.
+   */
+  template <ComputeMode _>
+  void Run(const framework::Scope& scope, const framework::OperatorBase& op,
+           const platform::DeviceContext& dev_ctx);
   /*
    * Split the inputs(LoDTensors) to segments for each time step.
    */
-  void SplitInputs() const;
+  void SplitInputs();
 
   /*
    * Create step-scopes to store temporary outputs in each time steps.
    */
-  void CreateScopes() const;
+  void CreateScopes();
 
   /*
    * Link TensorArray steps to the corresponding variables located in
    * step-scopes.
    */
-  void WriteStepInputs() const;
+  void WriteStepInputs();
 
   /*
    * Write output of each step to the corresponding TensorArray.
    */
-  void WriteStepOutputs() const;
+  void WriteStepOutputs();
 
   /*
    * Initialize the states, each state will have a corresponding pre-state,
@@ -75,54 +67,83 @@ class DynamicRecurrentOp : public framework::OperatorBase {
    * pre-state in the first time step will be initialized with an zero tensor or
    * a tensor in parent scope if is provided.
    */
-  void InitStates() const;
+  void InitStates();
 
   /*
    * Create state variables for each time step.
    */
-  void CreateState(const rnn::MemoryAttr& memory, size_t step) const;
+  void CreateState(const rnn::StateAttr& state, size_t step);
 
   /*
    * Link pre-state variable in current scope to the state variable in the
-   * previous time step (scope).
+   * previous time step (scope) by reference.
+   */
+  void LinkState(const rnn::StateAttr& state, size_t step);
+
+  /*
+   * Link the pre-state of the first time step to the `boot-state` in parent's
+   * scope.
+   */
+  void LinkInitialState(const rnn::StateAttr& state);
+
+  /*
+   * Copy the gradient from `pre-state` in the first step-scope to the
+   * `boot-state` in parent's scope.
+   */
+  void ExportInitialStateGradient(const rnn::StateAttr& state);
+
+  /*
+   * Calculate time steps.
    */
-  void LinkState(const rnn::MemoryAttr& memory, size_t step) const;
+  void RunSteps();
 
   /*
    * Concatenate outputs in each time step and generate a LoDTensor.
    */
-  void ConcatOutputs() const;
+  void ConcatOutputs();
+
+  void SetComputeMode(ComputeMode mode) { mode_ = mode; }
+  bool IsForward() const { return mode_ == ComputeMode::kForward; }
+  bool IsBackward() const { return mode_ == ComputeMode::kBackward; }
 
   /*
-   * set a stepnet that is created according to a RecurrentOp's stepnet.
+   * set a step unit that is created according to a RecurrentOp's step unit.
    */
-  void SetStepNet(std::unique_ptr<OperatorBase> net) {
-    PADDLE_ENFORCE_NOT_NULL(net);
-    stepnet_ = std::move(net);
+  void SetStepUnit(std::unique_ptr<framework::OperatorBase> step_unit) {
+    PADDLE_ENFORCE_NOT_NULL(step_unit);
+    step_unit_ = std::move(step_unit);
   }
-  const OperatorBase& GetStepNet() const { return *stepnet_; }
+  const framework::OperatorBase& GetStepUnit() const { return *step_unit_; }
 
   const framework::TensorArray& state(const std::string& name) const {
-    return states_[name];
+    auto it = states_.find(name);
+    PADDLE_ENFORCE(it != states_.end());
+    return it->second;
   }
   const framework::TensorArray& step_input(const std::string& name) const {
-    return step_inputs_[name];
+    auto it = step_inputs_.find(name);
+    PADDLE_ENFORCE(it != step_inputs_.end());
+    return it->second;
   }
   const framework::TensorArray& step_output(const std::string& name) const {
-    return step_outputs_[name];
+    auto it = step_outputs_.find(name);
+    PADDLE_ENFORCE(it != step_outputs_.end());
+    return it->second;
   }
 
  protected:
   struct ArgCache {
     framework::Scope const* scope;
     std::vector<framework::Scope*>* scopes;
-    std::map<std::string, framework::Variable*> inlinks;
-    std::map<std::string, framework::Variable*> outlinks;
+    std::map<std::string, framework::Variable*> inputs;
+    std::map<std::string, framework::Variable*> outputs;
+    platform::DeviceContext const* dev_ctx;
 
     size_t num_steps{0};
 
-    void Init(const rnn::ArgumentName& name, const OperatorBase& op,
-              const framework::Scope& scope, rnn::Argument* arg);
+    void Init(const rnn::ArgumentName& name, const framework::OperatorBase& op,
+              const framework::Scope& scope,
+              platform::DeviceContext const* dev_ctx, rnn::Argument* arg);
 
     framework::Scope& GetScope(size_t index) {
       PADDLE_ENFORCE_LT(index, num_steps);
@@ -133,8 +154,8 @@ class DynamicRecurrentOp : public framework::OperatorBase {
                                     const std::string& name);
 
    private:
-    void InitArgument(const rnn::ArgumentName& name, const OperatorBase& op,
-                      rnn::Argument* arg);
+    void InitArgument(const rnn::ArgumentName& name,
+                      const framework::OperatorBase& op, rnn::Argument* arg);
     void CacheScopes(const framework::Scope& scope, const rnn::Argument& arg);
     void CacheInlinks(const framework::Scope& scope,
                       const std::vector<std::string>& names);
@@ -145,27 +166,49 @@ class DynamicRecurrentOp : public framework::OperatorBase {
   };
 
  private:
-  std::unique_ptr<OperatorBase> stepnet_;
-  mutable std::map<std::string, framework::TensorArray> states_;
-  mutable std::map<std::string, framework::TensorArray> step_inputs_;
-  mutable std::map<std::string, framework::TensorArray> step_outputs_;
-  mutable std::map<std::string, std::vector<framework::DySeqMeta>>
-      dy_seq_metas_;
-  mutable rnn::Argument arg_;
-  mutable ArgCache cache_;
+  std::unique_ptr<framework::OperatorBase> step_unit_;
+  std::map<std::string, framework::TensorArray> states_;
+  std::map<std::string, framework::TensorArray> step_inputs_;
+  std::map<std::string, framework::TensorArray> step_outputs_;
+  std::map<std::string, std::vector<framework::DySeqMeta>> dy_seq_metas_;
+  rnn::Argument arg_;
+  ArgCache cache_;
+  ComputeMode mode_{ComputeMode::kForward};
 
 #ifdef PADDLE_WITH_TESTING
-  friend class DynamicRecurrentOpTestHelper;
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, SplitInputs);
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, CreateCache);
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, CreateScopes);
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, WriteStepInputs);
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, WriteStepOutputs);
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, InitStates);
-  FRIEND_TEST(DynamicRecurrentOpTestHelper, ConcatOutputs);
+  // test forward
+  friend class RNNAlgorithmTestHelper;
+  FRIEND_TEST(RNNAlgorithmTestHelper, SplitInputs);
+  FRIEND_TEST(RNNAlgorithmTestHelper, CreateCache);
+  FRIEND_TEST(RNNAlgorithmTestHelper, CreateScopes);
+  FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepInputs);
+  FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepOutputs);
+  FRIEND_TEST(RNNAlgorithmTestHelper, InitStates);
+  FRIEND_TEST(RNNAlgorithmTestHelper, ConcatOutputs);
+// TODO(superjom) test backward
 #endif
 };
 
+class DynamicRecurrentOp : public framework::OperatorBase {
+ public:
+  DynamicRecurrentOp(const std::string& type,
+                     const framework::VariableNameMap& inputs,
+                     const framework::VariableNameMap& outputs,
+                     const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  DynamicRecurrentOp(const DynamicRecurrentOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    PADDLE_THROW("Not implemented");
+  }
+
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override;
+
+  mutable RNNAlgorithm rnn;
+};
+
 class DynamicRecurrentGradientOp : public framework::OperatorBase {
  public:
   DynamicRecurrentGradientOp(const std::string& type,
@@ -174,8 +217,16 @@ class DynamicRecurrentGradientOp : public framework::OperatorBase {
                              const framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
+  DynamicRecurrentGradientOp(const DynamicRecurrentGradientOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    PADDLE_THROW("Not implemented");
+  }
+
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const override;
+
+  mutable RNNAlgorithm rnn;
 };
 
 }  // namespace operators
diff --git a/paddle/operators/dynamic_recurrent_op_test.cc b/paddle/operators/dynamic_recurrent_op_test.cc
index 36f405568d7e4ed9a469c3af7a80192b83142b7a..fff63efb24c70b7e864e2d5b011a22883c13dede 100644
--- a/paddle/operators/dynamic_recurrent_op_test.cc
+++ b/paddle/operators/dynamic_recurrent_op_test.cc
@@ -43,16 +43,16 @@ LoDTensor* CreateVar(Scope& scope, std::string name, framework::DDim dims,
   return tensor;
 }
 
-class DynamicRecurrentOpTestHelper : public ::testing::Test {
+class RNNAlgorithmTestHelper : public ::testing::Test {
  protected:
-  const rnn::ArgumentName argname = DynamicRecurrentOp::kArgName;
+  const rnn::ArgumentName argname = RNNAlgorithm::kArgNames[0];
 
   virtual void SetUp() override {
     CreateGlobalVariables();
 
     auto op_desc = CreateOpDesc();
     op = paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
-    dop = dynamic_cast<DynamicRecurrentOp*>(op.get());
+    dop = &(dynamic_cast<DynamicRecurrentOp*>(op.get())->rnn);
     InitCacheManually();
     InitStepNet();
   }
@@ -63,20 +63,20 @@ class DynamicRecurrentOpTestHelper : public ::testing::Test {
     op_desc.set_type("dynamic_recurrent");
 
     OpDescNewVar(argname.inlinks, {"in0"}, op_desc.add_inputs());
-    OpDescNewVar(argname.boot_memories, {"boot_mem"}, op_desc.add_inputs());
+    OpDescNewVar(argname.initial_states, {"boot_mem"}, op_desc.add_inputs());
     OpDescNewVar(argname.step_scopes, {"step_scopes"}, op_desc.add_outputs());
     OpDescNewVar(argname.outlinks, {"out0"}, op_desc.add_outputs());
 
-    // set pre-memories
+    // set pre-states
     auto pre_memories = op_desc.mutable_attrs()->Add();
-    pre_memories->set_name(argname.pre_memories);
+    pre_memories->set_name(argname.ex_states);
     pre_memories->set_type(paddle::framework::AttrType::STRINGS);
     auto pre_memories_item = pre_memories->add_strings();
     *pre_memories_item = "mem@pre";
 
-    // set memories
+    // set states
     auto memories = op_desc.mutable_attrs()->Add();
-    memories->set_name(argname.memories);
+    memories->set_name(argname.states);
     memories->set_type(paddle::framework::AttrType::STRINGS);
     auto memories_item = memories->add_strings();
     *memories_item = "mem";
@@ -113,32 +113,33 @@ class DynamicRecurrentOpTestHelper : public ::testing::Test {
   }
 
   void InitCacheManually() {
-    dop->cache_.Init(DynamicRecurrentOp::kArgName, *dop, scope, &dop->arg_);
+    dop->cache_.Init(RNNAlgorithm::kArgNames[0], *op, scope, &device_context,
+                     &dop->arg_);
   }
 
   void InitStepNet() {
     std::unique_ptr<framework::OperatorBase> stepnet{new NetOp};
     dynamic_cast<NetOp*>(stepnet.get())
         ->AppendOp(std::unique_ptr<TestOp>(new TestOp(
-            "test", {{"inlinks", {"in0"}}, {"boot_memories", {"boot_mem"}}},
-            {{"outlinks", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {})));
-    dop->SetStepNet(std::move(stepnet));
+            "test", {{"inputs", {"in0"}}, {"initial_states", {"boot_mem"}}},
+            {{"outputs", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {})));
+    dop->SetStepUnit(std::move(stepnet));
   }
 
  protected:
-  DynamicRecurrentOp* dop;
+  RNNAlgorithm* dop;
   std::unique_ptr<framework::OperatorBase> op;
   paddle::platform::CPUDeviceContext device_context;
   paddle::framework::Scope scope;
 };
 
-TEST_F(DynamicRecurrentOpTestHelper, CreateCache) {
+TEST_F(RNNAlgorithmTestHelper, CreateCache) {
   const rnn::Argument& arg = dop->arg_;
   ASSERT_EQ(arg.inlinks.size(), 1UL);
   ASSERT_EQ(arg.outlinks.size(), 1UL);
 }
 
-TEST_F(DynamicRecurrentOpTestHelper, SplitInputs) {
+TEST_F(RNNAlgorithmTestHelper, SplitInputs) {
   dop->SplitInputs();
   auto& in0_ta = dop->step_inputs_["in0"];
   ASSERT_EQ(in0_ta.size(), 4UL);
@@ -153,14 +154,14 @@ TEST_F(DynamicRecurrentOpTestHelper, SplitInputs) {
   EXPECT_EQ(batch3.dims()[0], 1);
 }
 
-TEST_F(DynamicRecurrentOpTestHelper, CreateScopes) {
+TEST_F(RNNAlgorithmTestHelper, CreateScopes) {
   dop->SplitInputs();
   dop->CreateScopes();
   ASSERT_EQ(dop->cache_.num_steps, 4UL);
   ASSERT_EQ(dop->cache_.scopes->size(), 4UL);
 }
 
-TEST_F(DynamicRecurrentOpTestHelper, WriteStepInputs) {
+TEST_F(RNNAlgorithmTestHelper, WriteStepInputs) {
   dop->SplitInputs();
   dop->CreateScopes();
   dop->WriteStepInputs();
@@ -173,7 +174,7 @@ TEST_F(DynamicRecurrentOpTestHelper, WriteStepInputs) {
   }
 }
 
-TEST_F(DynamicRecurrentOpTestHelper, WriteStepOutputs) {
+TEST_F(RNNAlgorithmTestHelper, WriteStepOutputs) {
   dop->SplitInputs();
   dop->CreateScopes();
   dop->WriteStepInputs();
@@ -187,11 +188,12 @@ TEST_F(DynamicRecurrentOpTestHelper, WriteStepOutputs) {
   }
 }
 
-TEST_F(DynamicRecurrentOpTestHelper, ConcatOutputs) {
+TEST_F(RNNAlgorithmTestHelper, ConcatOutputs) {
   // Let's leave this test to python unittest.
 }
 
-TEST_F(DynamicRecurrentOpTestHelper, InitStates) {
+TEST_F(RNNAlgorithmTestHelper, InitStates) {
+  dop->SetComputeMode(RNNAlgorithm::ComputeMode::kForward);
   dop->SplitInputs();
   dop->CreateScopes();
   dop->WriteStepInputs();
@@ -208,12 +210,6 @@ TEST_F(DynamicRecurrentOpTestHelper, InitStates) {
 
     auto* boot_state = scope.FindVar("boot_mem");
     ASSERT_TRUE(boot_state != nullptr);
-
-    if (step == 0) {
-      // check pre_state is a reference of boot_state
-      ASSERT_EQ(boot_state->Get<LoDTensor>().data<float>(),
-                pre_state->Get<LoDTensor>().data<float>());
-    }
   }
 }
 
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index dcc90e5d87c9d54df520fcee1b48198bcd953eb1..40303e3adf4db7e8336ed72667fe69afa56c3f69 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -42,7 +42,7 @@ void RecurrentAlgorithm::Run(const Scope& scope,
 
   for (size_t step_id = 0; step_id < seq_len; step_id++) {
     if (step_id > 0) {
-      rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1);
+      rnn::LinkMemories(step_scopes, arg_->states, step_id, -1);
     }
     (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
@@ -59,7 +59,8 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope,
 
   // Now all variables in scope must be created outside of op.
   PADDLE_ENFORCE_NOT_NULL(stepnet_);
-  PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "stepnet_ op has no outputs");
+  PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(),
+                 "step_unit_ op has no outputs");
 
   if (seq_len > step_scopes->size()) {
     for (size_t i = step_scopes->size(); i < seq_len; ++i) {
@@ -86,7 +87,7 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope,
 }
 
 void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
-  for (auto& attr : arg_->memories) {
+  for (auto& attr : arg_->states) {
     auto* pre_mem = step_scope->Var(attr.pre_var)->GetMutable<LoDTensor>();
     PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                    "memory [%s]'s boot variable [%s] not exists", attr.var,
@@ -100,12 +101,12 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
 }
 
 const rnn::ArgumentName RecurrentOp::kArgName{
-    "step_net", "step_scopes",  "inlinks",      "outlinks",
-    "memories", "pre_memories", "boot_memories"};
+    "step_net", "step_scopes", "inputs",        "outputs",
+    "states",   "ex_states",   "initial_states"};
 
 const rnn::ArgumentName RecurrentGradientOp::kArgName{
-    "step_net", "step_scopes@GRAD", "outlinks@GRAD",     "inlinks@GRAD",
-    "memories", "pre_memories",     "boot_memories@GRAD"};
+    "step_net", "step_scopes@GRAD", "outputs@GRAD",       "inputs@GRAD",
+    "states",   "ex_states",        "initial_states@GRAD"};
 
 RecurrentOp::RecurrentOp(const std::string& type,
                          const framework::VariableNameMap& inputs,
@@ -127,7 +128,7 @@ class RecurrentAlgorithmProtoAndCheckerMaker
     AddInput(name.inlinks,
              "the inputs that need to be segmented for each step.")
         .AsDuplicable();
-    AddInput(name.boot_memories, "variables to initialize memories.")
+    AddInput(name.initial_states, "variables to initialize states.")
         .AsDuplicable();
 
     AddOutput(name.outlinks, "the outputs that need to concated for all steps.")
@@ -135,9 +136,8 @@ class RecurrentAlgorithmProtoAndCheckerMaker
     AddOutput(name.step_scopes, "step scopes");
 
     // Attributes stored in AttributeMap
-    AddAttr<std::vector<std::string>>(name.pre_memories,
-                                      "names of pre-memories");
-    AddAttr<std::vector<std::string>>(name.memories, "names of memories");
+    AddAttr<std::vector<std::string>>(name.ex_states, "names of pre-states");
+    AddAttr<std::vector<std::string>>(name.states, "names of states");
 
     AddComment("This is a recurrent group operator.");
   }
@@ -152,7 +152,7 @@ void RecurrentGradientAlgorithm::Run(
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
   for (int step_id = seq_len - 1; step_id >= 0; --step_id) {
     if (static_cast<size_t>(step_id) != seq_len - 1) {
-      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
+      rnn::LinkMemories(step_scopes, arg_->states, step_id, 1);
     }
     (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
@@ -162,7 +162,7 @@ void RecurrentGradientAlgorithm::Run(
 
 void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
     Scope* step_scope) const {
-  for (auto& attr : arg_->memories) {
+  for (auto& attr : arg_->states) {
     PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr,
                    "memory variable [%s] does not exists", attr.var);
     PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
index d0725f50230f70e927fd2bf55b5932dfd2347d6a..ee61ea300c33722471189d06eb09f67a083d2a4d 100644
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -36,7 +36,7 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
     LoDTensor* input = input_var->GetMutable<LoDTensor>();
     f::DDim dims = input->dims();
     PADDLE_ENFORCE_EQ(static_cast<size_t>(dims[0]), seq_len,
-                      "all the inlinks be the same length");
+                      "all the inputs be the same length");
     f::DDim step_dims = slice_ddim(dims, 1, dims.size());
     for (size_t j = 0; j < seq_len; j++) {
       Tensor* step_input =
@@ -78,7 +78,7 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
 }
 
 void LinkMemories(const std::vector<Scope*>& scopes,
-                  const std::vector<rnn::MemoryAttr>& memories,
+                  const std::vector<rnn::StateAttr>& memories,
                   const size_t step_id, const int offset) {
   PADDLE_ENFORCE_LT(step_id, scopes.size(),
                     "step [%d] is out of range of step scopes' size [%d]",
@@ -106,26 +106,26 @@ void InitArgument(const ArgumentName& name, Argument* arg,
   arg->inlinks = op.Inputs(name.inlinks);
   arg->outlinks = op.Outputs(name.outlinks);
 
-  auto& boot_memories =
-      is_grad ? op.Outputs(name.boot_memories) : op.Inputs(name.boot_memories);
+  auto& boot_memories = is_grad ? op.Outputs(name.initial_states)
+                                : op.Inputs(name.initial_states);
   // attributes
-  auto& memories = op.Attr<std::vector<std::string>>(name.memories);
-  auto& pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories);
+  auto& memories = op.Attr<std::vector<std::string>>(name.states);
+  auto& pre_memories = op.Attr<std::vector<std::string>>(name.ex_states);
 
   PADDLE_ENFORCE(memories.size() == boot_memories.size(),
-                 "the size of memories, boot_memories don't match:%d,%d",
+                 "the size of states, initial_states don't match:%d,%d",
                  memories.size(), boot_memories.size());
   PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(),
-                 "the size of pre_memories, boot_memories don't match:%d,%d",
+                 "the size of ex_states, initial_states don't match:%d,%d",
                  pre_memories.size(), boot_memories.size());
-  PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set");
+  PADDLE_ENFORCE(memories.size() > 0, "more than 1 states should be set");
 
   for (size_t i = 0; i < memories.size(); ++i) {
-    rnn::MemoryAttr mem_attr;
+    rnn::StateAttr mem_attr;
     mem_attr.var = memories[i];
     mem_attr.pre_var = pre_memories[i];
     mem_attr.boot_var = boot_memories[i];
-    (arg->memories).push_back(mem_attr);
+    (arg->states).push_back(mem_attr);
   }
 }
 
diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h
index fe173edb24ad015b9546546565027358f9b93476..fb0e158e07745d58c6211d33e385b324e492b95e 100644
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
@@ -31,7 +31,7 @@ using Scope = framework::Scope;
  * boot memories in father scope. Other attributes are copied from Op's proto
  * attributes.
  */
-struct MemoryAttr {
+struct StateAttr {
   // name of current state variable
   std::string var;
   // name of previous step's state variable
@@ -46,7 +46,7 @@ struct Argument {
   std::string step_scopes;
   std::vector<std::string> inlinks;
   std::vector<std::string> outlinks;
-  std::vector<rnn::MemoryAttr> memories;
+  std::vector<rnn::StateAttr> states;
 };
 
 struct ArgumentName {
@@ -54,9 +54,9 @@ struct ArgumentName {
   std::string step_scopes;
   std::string inlinks;
   std::string outlinks;
-  std::string memories;       // the memory name
-  std::string pre_memories;   // the previous memory name
-  std::string boot_memories;  // the boot memory name
+  std::string states;          // the state name
+  std::string ex_states;       // the previous state name
+  std::string initial_states;  // the initial (boot) state name
 };
 
 /**
@@ -74,7 +74,7 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const size_t seq_len, const platform::DeviceContext& ctx);
 
 void LinkMemories(const std::vector<Scope*>& step_scopes,
-                  const std::vector<MemoryAttr>& memories, const size_t step_id,
+                  const std::vector<StateAttr>& memories, const size_t step_id,
                   const int offset);
 
 void InitArgument(const ArgumentName& name, Argument* arg,
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 9ef47b88fd08b29ad0c917966c499e8d44f1e7af..e5ddc14587623905dbf52b4c1690236ffeb069a1 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -413,18 +413,18 @@ All parameter, weight, gradient are variables in Paddle.
                     return static_cast<operators::DynamicRecurrentOp *>(
                         rnn_op.release());
                   })
-      .def("set_stepnet",
+      .def("set_step_unit",
            [](operators::DynamicRecurrentOp &self, const operators::NetOp &net)
-               -> void { self.SetStepNet(net.Clone()); })
+               -> void { self.rnn.SetStepUnit(net.Clone()); })
       .def("get_state",
            [](operators::DynamicRecurrentOp &self, const std::string &name)
-               -> const TensorArray & { return self.state(name); })
+               -> const TensorArray & { return self.rnn.state(name); })
       .def("get_step_input",
            [](operators::DynamicRecurrentOp &self, const std::string &name)
-               -> const TensorArray & { return self.step_input(name); })
+               -> const TensorArray & { return self.rnn.step_input(name); })
       .def("get_step_output",
            [](operators::DynamicRecurrentOp &self, const std::string &name)
-               -> const TensorArray & { return self.step_output(name); });
+               -> const TensorArray & { return self.rnn.step_output(name); });
 
   // cond_op
   py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
diff --git a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
index 2b01e43454e70c12b423db9925837cf336f79935..fa2ccd0c3b74a2ee8b8fd9eb8986cb79ff07c98e 100644
--- a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
@@ -4,6 +4,12 @@ import unittest
 from paddle.v2.framework.op import Operator, DynamicRecurrentOp
 import numpy as np
 
+# for simplicity, just one level LoD
+lod_py = [[0, 4, 7, 9, 10]]
+input_dim = 30
+num_sents = len(lod_py[0]) - 1
+weight_dim = 15
+
 
 def create_tensor(scope, name, shape, np_data):
     tensor = scope.var(name).get_tensor()
@@ -12,6 +18,17 @@ def create_tensor(scope, name, shape, np_data):
     return tensor
 
 
+class PyRNNStep(object):
+    def __init__(self):
+
+        self.x = np.random.normal(size=(lod_py[0][-1],
+                                        input_dim)).astype("float32")
+        self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.h_boot = np.random.normal(size=(num_sents,
+                                             input_dim)).astype("float32")
+
+
 class DynamicRecurrentOpTest(unittest.TestCase):
     '''
     Test RNNOp
@@ -23,17 +40,13 @@ class DynamicRecurrentOpTest(unittest.TestCase):
         - U
     vars:
         - x
-    memories:
+    states:
         - h
     outputs:
        - h
     '''
 
-    # for siplicity, just one level LoD
-    lod_py = [[0, 4, 7, 9, 10]]
-    input_dim = 30
-    num_sents = len(lod_py[0]) - 1
-    weight_dim = 15
+    py = PyRNNStep()
 
     def forward(self):
         self.scope = core.Scope()
@@ -42,64 +55,55 @@ class DynamicRecurrentOpTest(unittest.TestCase):
         self.create_step_net()
         ctx = core.DeviceContext.create(core.CPUPlace())
         self.rnnop.run(self.scope, ctx)
-        state = self.rnnop.get_state("h@mem")
+        state = self.rnnop.get_state("h@state")
         print 'state size: ', state.size()
 
         step_inputs = self.rnnop.get_step_input("x")
         print "x size ", step_inputs.size()
         for i in range(step_inputs.size()):
             print "x %d" % i, np.array(step_inputs.read(i).get_dims())
-        step_outputs = self.rnnop.get_step_output('h@mem')
+        step_outputs = self.rnnop.get_step_output('h@state')
         print 'step_outputs.size ', step_outputs.size()
-        output = self.scope.find_var("h@mem").get_tensor()
-
+        output = self.scope.find_var("h@state").get_tensor()
         print 'output', np.array(output).shape
 
     def create_global_variables(self):
-        x = np.random.normal(size=(self.lod_py[0][-1],
-                                   self.input_dim)).astype("float32")
-        W = np.random.normal(size=(self.input_dim,
-                                   self.input_dim)).astype("float32")
-        U = np.random.normal(size=(self.input_dim,
-                                   self.input_dim)).astype("float32")
-        h_boot = np.random.normal(size=(self.num_sents,
-                                        self.input_dim)).astype("float32")
         # create inlink
-        x_tensor = create_tensor(self.scope, "x",
-                                 [self.num_sents, self.input_dim], x)
-        x_tensor.set_lod(self.lod_py)
-        create_tensor(self.scope, "W", [self.input_dim, self.input_dim], W)
-        create_tensor(self.scope, "U", [self.input_dim, self.input_dim], U)
-        create_tensor(self.scope, "h_boot", [self.num_sents, self.input_dim],
-                      h_boot)
+        x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim],
+                                 self.py.x)
+        x_tensor.set_lod(lod_py)
+        create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W)
+        create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U)
+        create_tensor(self.scope, "h_boot", [num_sents, input_dim],
+                      self.py.h_boot)
         self.scope.var("step_scopes")
-        self.scope.var("h@mem")
+        self.scope.var("h@state")
 
     def create_rnn_op(self):
         # create RNNOp
         self.rnnop = DynamicRecurrentOp(
             # inputs
-            inlinks=["x"],
-            boot_memories=["h_boot"],
-            step_net="stepnet",
+            inputs=["x"],
+            initial_states=["h_boot"],
+            step_net="step_unit",
             # outputs
-            outlinks=["h@mem"],
+            outputs=["h@state"],
             step_scopes="step_scopes",
             # attributes
-            pre_memories=["h@pre"],
-            memories=["h@mem"])
+            ex_states=["h@pre"],
+            states=["h@state"])
 
     def create_step_net(self):
-        stepnet = core.Net.create()
+        step_unit = core.Net.create()
         x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
         h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
         sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
-        sig_op = Operator("sigmoid", X="sum", Y="h@mem")
+        sig_op = Operator("sigmoid", X="sum", Y="h@state")
 
         for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
-            stepnet.append_op(op)
-        stepnet.complete_add_op(True)
-        self.rnnop.set_stepnet(stepnet)
+            step_unit.append_op(op)
+        step_unit.complete_add_op(True)
+        self.rnnop.set_step_unit(step_unit)
 
     def test_forward(self):
         print 'test recurrent op forward'
@@ -107,5 +111,58 @@ class DynamicRecurrentOpTest(unittest.TestCase):
         print 'pd_output', pd_output
 
 
+class RecurrentGradientOpTest(unittest.TestCase):
+    py = PyRNNStep()
+
+    def create_forward_op(self):
+        # create RNNOp
+        self.forward_op = DynamicRecurrentOp(
+            # inputs
+            inputs=["x"],
+            initial_states=["h_boot"],
+            step_net="step_unit",
+            # outputs
+            outputs=["h@state"],
+            step_scopes="step_scopes",
+            # attributes
+            ex_states=["h@pre"],
+            states=["h@state"])
+
+    def create_gradient_op(self):
+        a = set()
+        backward_op = core.DynamicRecurrentOp.backward(self.forward_op, a)
+
+    def create_step_net(self):
+        step_unit = core.Net.create()
+        x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
+        h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
+        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
+        sig_op = Operator("sigmoid", X="sum", Y="h@state")
+
+        for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
+            step_unit.append_op(op)
+        step_unit.complete_add_op(True)
+        self.forward_op.set_step_unit(step_unit)
+
+    def create_global_variables(self):
+        # create inlink
+        x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim],
+                                 self.py.x)
+        x_tensor.set_lod(lod_py)
+        create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W)
+        create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U)
+        create_tensor(self.scope, "h_boot", [num_sents, input_dim],
+                      self.py.h_boot)
+        self.scope.var("step_scopes")
+        self.scope.var("h@state")
+
+    def test_grad(self):
+        self.scope = core.Scope()
+        self.create_forward_op()
+        self.create_global_variables()
+        self.create_step_net()
+        self.create_gradient_op()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
index 191ce0b0c8d5fb6c4d8037a6c1bfda57c394489e..cc4008c0d8e73a3f7d9a9be2a4aacfd120ecd522 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -132,15 +132,15 @@ class RecurrentOpTest(unittest.TestCase):
         # create RNNOp
         self.rnnop = RecurrentOp(
             # inputs
-            inlinks=["x"],
-            boot_memories=["h_boot"],
+            inputs=["x"],
+            initial_states=["h_boot"],
             step_net="stepnet",
             # outputs
-            outlinks=["h@mem"],
+            outputs=["h@mem"],
             step_scopes="step_scopes",
             # attributes
-            pre_memories=["h@pre"],
-            memories=["h@mem"])
+            ex_states=["h@pre"],
+            states=["h@mem"])
 
     def create_step_net(self):
         stepnet = core.Net.create()
@@ -169,15 +169,15 @@ class RecurrentGradientOpTest(unittest.TestCase):
     def create_forward_op(self):
         self.forward_op = RecurrentOp(
             # inputs
-            inlinks=["x"],
-            boot_memories=["h_boot"],
+            inputs=["x"],
+            initial_states=["h_boot"],
             step_net="stepnet",
             # outputs
-            outlinks=["h"],
+            outputs=["h"],
             step_scopes="step_scopes",
             # attributes
-            pre_memories=["h@pre"],
-            memories=["h@alias"])
+            ex_states=["h@pre"],
+            states=["h@alias"])
 
         # create a stepnet for RNN
         stepnet = core.Net.create()