From 085260f3deee7a0250181baee1fdea4d6758110c Mon Sep 17 00:00:00 2001 From: Jack Zhou Date: Fri, 27 Nov 2020 16:01:29 +0800 Subject: [PATCH] Add eigen gru and fix the dropout bug in the rnn Add eigen gru and fix the dropout bug in the rnn --- .../operators/math/detail/gru_cpu_kernel.h | 178 +++++++-- paddle/fluid/operators/math/gru_compute.cc | 54 ++- paddle/fluid/operators/math/gru_compute.h | 2 +- paddle/fluid/operators/rnn_op.h | 365 +++++++----------- .../fluid/tests/unittests/rnn/rnn_numpy.py | 8 +- .../fluid/tests/unittests/test_rnn_op.py | 6 +- 6 files changed, 346 insertions(+), 267 deletions(-) diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h index e05a5190e80..611daff7309 100644 --- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once #include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/gru_compute.h" @@ -21,6 +23,10 @@ namespace paddle { namespace operators { namespace math { namespace detail { +using Array1 = Eigen::DSizes; +template +using EigenVector = framework::EigenVector; #ifndef __NVCC__ @@ -242,23 +248,46 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, #endif } +template +inline void forward_reset_outputV2(const platform::CPUDeviceContext &context, + GRUMetaValue value, int frame_size) { + auto &place = *context.eigen_device(); + auto value_reset_gate = + typename EigenVector::Type(value.gate_value, Array1(frame_size)); + auto value_update_gate = typename EigenVector::Type( + value.gate_value + frame_size, Array1(frame_size)); + auto value_reset_output = typename EigenVector::Type( + value.reset_output_value, Array1(frame_size)); + auto value_reset_bias = + typename EigenVector::ConstType(value.reset_bias, Array1(frame_size)); + SigmoidFunctor()(place, value_reset_gate, value_reset_gate); + SigmoidFunctor()(place, value_update_gate, value_update_gate); + value_reset_output.device(place) = + (value_reset_output + value_reset_bias) * value_reset_gate; +} + template -inline void forward_reset_output(OpResetOutput op_reset_output, - GRUMetaValue value, int frame_size, - int batch_size, ActivationType active_gate, - bool old_version = true) { +inline void forward_reset_output( + OpResetOutput op_reset_output, GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_gate, bool old_version = true, + const platform::CPUDeviceContext *context = nullptr) { for (int b = 0; b < batch_size; b++) { - if (OpResetOutput::avx && (frame_size > static_cast(8 - 1)) && - (sizeof(T) == 4)) { - hl_avx_gru_forward_reset_output( - op_reset_output, value.gate_value, value.reset_output_value, - value.prev_out_value, frame_size, active_gate, old_version, - value.reset_bias); + if (!old_version) { + // use eigen + forward_reset_outputV2(*context, value, frame_size); } else { - hl_naive_gru_forward_reset_output( - op_reset_output, value.gate_value, value.reset_output_value, - value.prev_out_value, frame_size, active_gate, old_version, - value.reset_bias); + if (OpResetOutput::avx && (frame_size & static_cast(8 - 1)) && + (sizeof(T) == 4)) { + hl_avx_gru_forward_reset_output( + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate, old_version, + 
value.reset_bias); + } else { + hl_naive_gru_forward_reset_output( + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate, old_version, + value.reset_bias); + } } value.gate_value += frame_size * 3; value.reset_output_value += frame_size; @@ -268,25 +297,51 @@ inline void forward_reset_output(OpResetOutput op_reset_output, } } +template +inline void forward_final_outputV2(const platform::CPUDeviceContext &context, + GRUMetaValue value, int frame_size) { + auto &place = *context.eigen_device(); + auto value_update_gate = typename EigenVector::Type( + value.gate_value + frame_size, Array1(frame_size)); + auto value_frame_state = typename EigenVector::Type( + value.gate_value + 2 * frame_size, Array1(frame_size)); + auto value_output = + typename EigenVector::Type(value.output_value, Array1(frame_size)); + TanhFunctor()(place, value_frame_state, value_frame_state); + value_output.device(place) = + (static_cast(1.0) - value_update_gate) * value_frame_state; + if (value.prev_out_value) { + auto value_prev_out = typename EigenVector::ConstType( + value.prev_out_value, Array1(frame_size)); + value_output.device(place) = + value_output + value_update_gate * value_prev_out; + } +} + template -inline void forward_final_output(OpFinalOutput op_final_output, - GRUMetaValue value, int frame_size, - int batch_size, ActivationType active_node, - bool origin_mode, bool old_version = true) { +inline void forward_final_output( + OpFinalOutput op_final_output, GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_node, bool origin_mode, + bool old_version = true, + const platform::CPUDeviceContext *context = nullptr) { for (int b = 0; b < batch_size; b++) { - if (OpFinalOutput::avx && (frame_size > static_cast(8 - 1)) && - (sizeof(T) == 4)) { - hl_avx_gru_forward_final_output(op_final_output, value.gate_value, - value.prev_out_value, value.output_value, - frame_size, active_node, origin_mode, - old_version); + if (!old_version) { + // eigen + forward_final_outputV2(*context, value, frame_size); } else { - hl_naive_gru_forward_final_output(op_final_output, value.gate_value, + if (OpFinalOutput::avx && (frame_size & static_cast(8 - 1)) && + (sizeof(T) == 4)) { + hl_avx_gru_forward_final_output(op_final_output, value.gate_value, value.prev_out_value, value.output_value, frame_size, active_node, origin_mode, old_version); + } else { + hl_naive_gru_forward_final_output( + op_final_output, value.gate_value, value.prev_out_value, + value.output_value, frame_size, active_node, origin_mode, + old_version); + } } - value.gate_value += frame_size * 3; value.output_value += frame_size; if (value.prev_out_value) { @@ -664,23 +719,70 @@ inline void backward_reset_grad(OpResetGrad op_reset_grad, } } +template +inline void gru_backward(const platform::CPUDeviceContext &context, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size) { + auto &place = *context.eigen_device(); + + auto value_reset_gate = + typename EigenVector::Type(value.gate_value, Array1(frame_size)); + auto grad_reset_gate = + typename EigenVector::Type(grad.gate_grad, Array1(frame_size)); + auto value_update_gate = typename EigenVector::Type( + value.gate_value + frame_size, Array1(frame_size)); + auto grad_update_gate = typename EigenVector::Type( + grad.gate_grad + frame_size, Array1(frame_size)); + auto value_frame_state = typename EigenVector::Type( + value.gate_value + frame_size * 2, Array1(frame_size)); + auto grad_frame_state = typename EigenVector::Type( + 
grad.gate_grad + frame_size * 2, Array1(frame_size)); + + auto grad_output = + typename EigenVector::Type(grad.output_grad, Array1(frame_size)); + auto value_reset_output = typename EigenVector::Type( + value.reset_output_value, Array1(frame_size)); + auto grad_reset_output = + typename EigenVector::Type(grad.reset_output_grad, Array1(frame_size)); + + if (value.prev_out_value) { + auto value_prev_out = typename EigenVector::ConstType( + value.prev_out_value, Array1(frame_size)); + SigmoidGradFunctor()(place, 1 /*useless*/, value_update_gate, + (value_prev_out - value_frame_state) * grad_output, + grad_update_gate); + } else { + SigmoidGradFunctor()( + place, 1 /*useless*/, value_update_gate, + static_cast(-1) * value_frame_state * grad_output, grad_update_gate); + } + if (grad.prev_out_grad) { + auto grad_prev_out = + typename EigenVector::Type(grad.prev_out_grad, Array1(frame_size)); + grad_prev_out.device(place) = + grad_prev_out + grad_output * value_update_gate; + } + TanhGradFunctor()(place, 1 /*useless*/, value_frame_state, + grad_output * (static_cast(1.0) - value_update_gate), + grad_frame_state); + SigmoidGradFunctor()( + place, 1 /*useless*/, value_reset_gate, + value_reset_output / value_reset_gate * grad_frame_state, + grad_reset_gate); + if (value.prev_out_value && grad.prev_out_grad) { + grad_reset_output.device(place) = value_reset_gate * grad_frame_state; + } +} + template -inline void cpu_gru_backward(OpGruGrad op_gru_grad, GRUMetaValue value, +inline void cpu_gru_backward(const platform::CPUDeviceContext &context, + OpGruGrad op_gru_grad, GRUMetaValue value, GRUMetaGrad grad, int frame_size, int batch_size, ActivationType active_node, ActivationType active_gate) { for (int b = 0; b < batch_size; ++b) { - if (OpGruGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_backward( - op_gru_grad, value.gate_value, grad.gate_grad, value.prev_out_value, - grad.prev_out_grad, value.reset_output_value, grad.reset_output_grad, - grad.output_grad, frame_size, active_node, active_gate); - } else { - hl_naive_gru_backward( - op_gru_grad, value.gate_value, grad.gate_grad, value.prev_out_value, - grad.prev_out_grad, value.reset_output_value, grad.reset_output_grad, - grad.output_grad, frame_size, active_node, active_gate); - } + // eigen + gru_backward(context, value, grad, frame_size); value.gate_value += frame_size * 3; value.reset_output_value += frame_size; diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc index aa726118def..34dd06040d3 100644 --- a/paddle/fluid/operators/math/gru_compute.cc +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -42,7 +42,8 @@ struct GRUUnitFunctor { } detail::forward_reset_output(detail::forward::gru_resetOutput(), value, - frame_size, batch_size, active_gate); + frame_size, batch_size, active_gate, true, + &context); if (value.prev_out_value) { blas.GEMM(false, false, batch_size, frame_size, frame_size, 1, @@ -53,7 +54,7 @@ struct GRUUnitFunctor { detail::forward_final_output(detail::forward::gru_finalOutput(), value, frame_size, batch_size, active_node, - origin_mode); + origin_mode, &context); #endif } }; @@ -116,7 +117,8 @@ struct GRUUnitFunctorV2 { value.reset_output_value); } detail::forward_reset_output(detail::forward::gru_resetOutput(), value, - frame_size, batch_size, active_gate, false); + frame_size, batch_size, active_gate, false, + &context); T *cell_state_value = value.gate_value + 2 * frame_size; T *reset_output_value = value.reset_output_value; @@ -129,7 +131,7 @@ 
struct GRUUnitFunctorV2 { detail::forward_final_output(detail::forward::gru_finalOutput(), value, frame_size, batch_size, active_node, true, - false); + false, &context); #endif } }; @@ -144,8 +146,50 @@ struct GRUUnitGradFunctorV2 { #ifndef __NVCC__ // calculate grad_update_gate, grad_frame_state, // grad_reset_output, grad_reset_gate - detail::cpu_gru_backward(detail::backward::gru(), value, grad, + detail::cpu_gru_backward(context, detail::backward::gru(), value, grad, frame_size, batch_size, active_node, active_gate); + auto blas = math::GetBlas(context); + if (grad.prev_out_grad && value.prev_out_value) { + // update prev_out_grad + blas.GEMM(false, false, batch_size, frame_size, frame_size, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, frame_size, + 1, grad.prev_out_grad, frame_size); + blas.GEMM(false, false, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size, frame_size * 3, + value.gate_weight + frame_size * frame_size, frame_size, 1, + grad.prev_out_grad, frame_size); + blas.GEMM(false, false, batch_size, frame_size, frame_size, 1, + grad.reset_output_grad, frame_size, value.state_weight, + frame_size, 1, grad.prev_out_grad, frame_size); + // update weight_hh_grad + if (grad.gate_weight_grad) { + // reset gate + blas.GEMM(true, false, frame_size, frame_size, batch_size, 1, + grad.gate_grad, frame_size * 3, value.prev_out_value, + frame_size, 1, grad.gate_weight_grad, frame_size); + // update gate + blas.GEMM(true, false, frame_size, frame_size, batch_size, 1, + grad.gate_grad + frame_size, frame_size * 3, + value.prev_out_value, frame_size, 1, + grad.gate_weight_grad + frame_size * frame_size, frame_size); + // cell state + blas.GEMM(true, false, frame_size, frame_size, batch_size, 1, + grad.reset_output_grad, frame_size, value.prev_out_value, + frame_size, 1, grad.state_weight_grad, frame_size); + } + } + // update bias_hh_grad + T *gate_grad = grad.gate_grad; + T *bias_hh_grad = grad.bias_hh_grad; + T *state_bias_grad = grad.bias_hh_grad + 2 * frame_size; + T *reset_output_grad = grad.reset_output_grad; + for (int b = 0; b < batch_size; ++b) { + blas.VADD(2 * frame_size, bias_hh_grad, gate_grad, bias_hh_grad); + blas.VADD(frame_size, state_bias_grad, reset_output_grad, + state_bias_grad); + gate_grad += 3 * frame_size; + reset_output_grad += frame_size; + } #endif } }; diff --git a/paddle/fluid/operators/math/gru_compute.h b/paddle/fluid/operators/math/gru_compute.h index cd713d19297..70cbfecefc8 100644 --- a/paddle/fluid/operators/math/gru_compute.h +++ b/paddle/fluid/operators/math/gru_compute.h @@ -38,7 +38,7 @@ struct GRUMetaGrad { T *reset_output_grad; T *output_grad; T *prev_out_grad; - T *state_bias_grad; + T *bias_hh_grad; }; template diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index 599cb31dea2..253765bb419 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -210,66 +210,58 @@ struct LSTMCell : Cell { } }; +template +void dropout_helper(const framework::ExecutionContext& context, Tensor* x, + Tensor* y, const Tensor* mask, const float& dropout_prob) { + auto& place = *context.template device_context() + .eigen_device(); + auto dropout_mask = EigenVector::Flatten(*mask); + auto in = EigenVector::Flatten(*x); + auto out = EigenVector::Flatten(*y); + if (dropout_prob == 1.0f) { + out.device(place) = static_cast(0) * in; + } else { + out.device(place) = + in * dropout_mask.cast() / static_cast(1.0f - dropout_prob); + } +} + template void dropout_cpu_function_inplace(const 
framework::ExecutionContext& context, - Tensor* x, Tensor* mask, + Tensor* x, Tensor* y, Tensor* mask, const float& dropout_prob, const int& seed_number, const bool& is_test, bool* is_has_reset) { if (is_test) { return; } - auto* x_data = x->data(); size_t size = framework::product(x->dims()); auto* mask_data = mask->data(); if (!(*is_has_reset)) { // Special case when dropout_prob is 1.0 if (dropout_prob == 1.0f) { - std::fill(x_data, x_data + size, static_cast(0)); - std::fill(mask_data, mask_data + size, static_cast(0)); - *is_has_reset = true; - return; - } - auto engine = framework::GetCPURandomEngine(seed_number); - std::uniform_real_distribution dist(0, 1); - for (size_t i = 0; i < size; ++i) { - if (dist(*engine) < dropout_prob) { - mask_data[i] = 0; - x_data[i] = static_cast(0); - } else { - mask_data[i] = 1; - x_data[i] /= static_cast(1.0f - dropout_prob); + std::fill(mask_data, mask_data + size, static_cast(0)); + } else { + auto engine = framework::GetCPURandomEngine(seed_number); + std::uniform_real_distribution dist(0, 1); + for (size_t i = 0; i < size; ++i) { + if (dist(*engine) < dropout_prob) { + mask_data[i] = 0; + } else { + mask_data[i] = 1; + } } } *is_has_reset = true; - } else { - if (dropout_prob == 1.0f) { - std::fill(x_data, x_data + size, static_cast(0)); - return; - } - for (size_t i = 0; i < size; ++i) { - if (mask_data[i] == 0) { - x_data[i] = static_cast(0); - } else { - x_data[i] /= static_cast(1.0f - dropout_prob); - } - } } + dropout_helper(context, x, y, mask, dropout_prob); } template void dropout_cpu_grad_function_inplace( const framework::ExecutionContext& context, Tensor* grad_x, const Tensor* mask, const float& dropout_prob) { - auto& place = *context.template device_context() - .eigen_device(); - auto M = EigenVector::Flatten(*mask); - auto dX = EigenVector::Flatten(*grad_x); - if (dropout_prob == 1.0f) { - dX.device(place) = static_cast(0) * dX; - } else { - dX.device(place) = dX * M.cast() / static_cast(1.0f - dropout_prob); - } + dropout_helper(context, grad_x, grad_x, mask, dropout_prob); } template @@ -298,14 +290,13 @@ struct Layer { blas.MatMul(*input, mat_dim_a, weight, mat_dim_b, static_cast(1.0), cache_input, static_cast(0)); - auto eigen_in = framework::EigenMatrix::Reshape( + auto in = framework::EigenMatrix::Reshape( *cache_input, cache_input->dims().size() - 1); - auto eigen_bias_ih = framework::EigenMatrix::From( + auto bias_ih_tmp = framework::EigenMatrix::From( bias_ih, framework::make_ddim({1, bias_ih.dims()[0]})); const int& row_num = framework::product(cache_input->dims()) / cache_input->dims()[2]; - eigen_in = - eigen_in + eigen_bias_ih.broadcast(Eigen::DSizes(row_num, 1)); + in = in + bias_ih_tmp.broadcast(Eigen::DSizes(row_num, 1)); if (is_gru(context)) { // reset_gate update_gate cell_gate = [1, 1, 0] Tensor bias_hh_tmp; @@ -317,15 +308,13 @@ struct Layer { math::SetConstant zero; zero(dev_ctx, &bias_hh_tmp_unbind[2], static_cast(0.0)); - auto eigen_bias_hh_tmp = framework::EigenMatrix::From( + auto bias_hh_after_mask = framework::EigenMatrix::From( bias_hh_tmp, framework::make_ddim({1, bias_hh.dims()[0]})); - eigen_in = eigen_in + - eigen_bias_hh_tmp.broadcast(Eigen::DSizes(row_num, 1)); + in = in + bias_hh_after_mask.broadcast(Eigen::DSizes(row_num, 1)); } else { - auto eigen_bias_hh = framework::EigenMatrix::From( + auto bias_hh_no_mask = framework::EigenMatrix::From( bias_hh, framework::make_ddim({1, bias_hh.dims()[0]})); - eigen_in = - eigen_in + eigen_bias_hh.broadcast(Eigen::DSizes(row_num, 1)); + in = in + 
bias_hh_no_mask.broadcast(Eigen::DSizes(row_num, 1)); } } @@ -335,27 +324,26 @@ struct Layer { // in the output, if mask flag is 0, we will retun the zero data auto& place = *context.template device_context() .eigen_device(); - auto eigen_output = + auto out = framework::EigenMatrix::Reshape(*output, output->dims().size() - 1); - auto eigen_mask = framework::EigenMatrix::From( + auto mask = framework::EigenMatrix::From( mask_tensor, framework::make_ddim({mask_tensor.dims()[1], 1})); - auto eigen_init_h = + auto pre_h = framework::EigenMatrix::Reshape(*init_h, init_h->dims().size() - 1); - auto eigen_last_h = + auto curr_h = framework::EigenMatrix::Reshape(*last_h, last_h->dims().size() - 1); - auto eigen_mask_broadcast = - eigen_mask.broadcast(Eigen::DSizes(1, output->dims()[2])); - eigen_last_h.device(place) = eigen_output * eigen_mask_broadcast + - eigen_init_h * (1 - eigen_mask_broadcast); - eigen_output.device(place) = eigen_output * eigen_mask_broadcast; + auto mask_broadcast = + mask.broadcast(Eigen::DSizes(1, output->dims()[2])); + curr_h.device(place) = out * mask_broadcast + pre_h * (1 - mask_broadcast); + out.device(place) = out * mask_broadcast; if (is_lstm(context)) { - auto eigen_init_c = framework::EigenMatrix::Reshape( + auto pre_c = framework::EigenMatrix::Reshape( *init_c, init_c->dims().size() - 1); - auto eigen_last_c = framework::EigenMatrix::Reshape( + auto curr_c = framework::EigenMatrix::Reshape( *last_c, last_c->dims().size() - 1); - eigen_last_c.device(place) = eigen_last_c * eigen_mask_broadcast + - eigen_init_c * (1 - eigen_mask_broadcast); + curr_c.device(place) = + curr_c * mask_broadcast + pre_c * (1 - mask_broadcast); } } @@ -910,16 +898,18 @@ void RnnFunc(const framework::ExecutionContext& ctx, const Tensor* input, } if (!is_test) { prev_hidden_data = hidden_data.Slice(i - 1, i); - input_holder = &prev_hidden_data; input_holder->Resize(output->dims()); + if (dropout_prob != 0) { + dropout_cpu_function_inplace(ctx, &prev_hidden_data, input_holder, + dropout_mask, dropout_prob, seed, + is_test, &has_dropout_reset); + } else { + input_holder = &prev_hidden_data; + input_holder->Resize(output->dims()); + } } else { SwapPoniter(&output_holder, &input_holder); } - if (dropout_prob != 0 && (!is_test)) { - dropout_cpu_function_inplace(ctx, input_holder, dropout_mask, - dropout_prob, seed, is_test, - &has_dropout_reset); - } } const Tensor* input_temp_holder = input; if (i > 0) { @@ -1040,53 +1030,6 @@ void create_tensor_by_list(const framework::ExecutionContext& context, } } -template -void make_grad_gate_buf(const framework::ExecutionContext& context, - Tensor* grad_gate, Tensor* grad_gate_buf, - Tensor* reset_output_grad = nullptr) { - int dim_size = grad_gate->dims().size(); - int batch_size = grad_gate->dims()[dim_size - 2]; - int frame_size = grad_gate->dims()[dim_size - 1]; - - Tensor grad_gate_mask; - create_tensor_by_list(context, &grad_gate_mask, {1, 1, 0}); - - auto& place = *context.template device_context() - .eigen_device(); - auto eigen_grad_gate_mask = framework::EigenMatrix::From( - grad_gate_mask, framework::make_ddim({3, 1})); - auto eigen_grad_gate_mask_broadcast = - eigen_grad_gate_mask.broadcast(Eigen::DSizes(1, frame_size / 3)) - .reshape(Eigen::DSizes(frame_size)) - .broadcast(Eigen::DSizes(batch_size, 1)); - auto eigen_grad_gate_buf = framework::EigenMatrix::From( - *grad_gate_buf, framework::make_ddim({batch_size, frame_size})); - auto eigen_grad_gate = framework::EigenMatrix::From( - *grad_gate, framework::make_ddim({batch_size, 
frame_size})); - eigen_grad_gate_buf.device(place) = - eigen_grad_gate * eigen_grad_gate_mask_broadcast; - - if (reset_output_grad) { - Tensor grad_reset_output_mask; - create_tensor_by_list(context, &grad_reset_output_mask, {0, 0, 1}); - auto eigen_grad_reset_output_mask = framework::EigenMatrix::From( - grad_reset_output_mask, framework::make_ddim({3, 1})); - auto eigen_grad_reset_output_mask_broadcast = - eigen_grad_reset_output_mask - .broadcast(Eigen::DSizes(1, frame_size / 3)) - .reshape(Eigen::DSizes(frame_size)) - .broadcast(Eigen::DSizes(batch_size, 1)); - auto eigen_grad_reset_output = - framework::EigenMatrix::Reshape(*reset_output_grad, - reset_output_grad->dims().size() - 1) - .broadcast(Eigen::DSizes(1, 3, 1)) - .reshape(Eigen::DSizes(batch_size, frame_size)); - eigen_grad_gate_buf.device(place) = - eigen_grad_gate_buf + - eigen_grad_reset_output_mask_broadcast * eigen_grad_reset_output; - } -} - template struct GradLayer { explicit GradLayer(const GradCellType& cell) : cell_(cell) {} @@ -1196,12 +1139,10 @@ struct GradLayer { Tensor* pre_hidden = nullptr; Tensor* pre_state = nullptr; Tensor* hidden = nullptr; - Tensor grad_gate_buf; - TensorList grad_gate_buf_unbind; if (is_gru(context)) { - grad_gate_buf.Resize(layer_grad_gate_tensor->dims()); - grad_gate_buf.mutable_data(context.GetPlace()); - grad_gate_buf_unbind = Unbind(grad_gate_buf); + zero(device_ctx, + &((*weight_list_grad)[layer_idx][current_reverse_idx * 4 + 3]), + static_cast(0.0)); } for (int i = time_step - 1; i >= 0; --i) { if (has_sequence_length) { @@ -1232,7 +1173,7 @@ struct GradLayer { &(parameter_lists[layer_idx][current_reverse_idx * 4 + 1]), pre_hidden, pre_state, dynamic_grad_last_h, dynamic_grad_last_c, &(*layer_grad_gate_tensor_unbind)[i], weight_grad, dynamic_grad_pre_h, - dynamic_grad_pre_c, &grad_gate_buf_unbind[i], + dynamic_grad_pre_c, &((*weight_list_grad)[layer_idx][current_reverse_idx * 4 + 3]), mask_tensor_list[i], has_sequence_length); SwapPoniter(&dynamic_grad_last_h, &dynamic_grad_pre_h); @@ -1241,8 +1182,7 @@ struct GradLayer { // postproces for gradient for w_hi, X, bias_hi, bias_hh this->postprocess(context, *layer_grad_gate_tensor, *input, input_grad, parameter_lists[layer_idx], - &((*weight_list_grad)[layer_idx]), &grad_gate_buf, - is_reverse); + &((*weight_list_grad)[layer_idx]), is_reverse); // copy the gradient to init_c init_h if ((*init_h_grad_unbind).size() > 0 && time_step % 2 == 0) { @@ -1268,16 +1208,17 @@ struct GradLayer { TensorList* init_h_grad_unbind, TensorList* init_c_grad_unbind, const std::vector& weight_list_grad, const int& layer_idx, const int& gate_num) {} + void preprocess(const framework::ExecutionContext& context, const Tensor* grad_output, Tensor* grad_last_h) { auto& place = *context.template device_context() .eigen_device(); - auto eigen_grad_output = framework::EigenMatrix::Reshape( + auto output_grad = framework::EigenMatrix::Reshape( *grad_output, grad_output->dims().size() - 1); - auto eigen_grad_last_h = framework::EigenMatrix::Reshape( + auto last_h_grad = framework::EigenMatrix::Reshape( *grad_last_h, grad_last_h->dims().size() - 1); // the output gradient contribute the gradient to last_h - eigen_grad_last_h.device(place) = eigen_grad_last_h + eigen_grad_output; + last_h_grad.device(place) = last_h_grad + output_grad; } void mask_preprocess(const framework::ExecutionContext& context, @@ -1286,40 +1227,35 @@ struct GradLayer { Tensor* grad_pre_c, const Tensor& mask_tensor) { auto& place = *context.template device_context() .eigen_device(); - auto 
eigen_mask = framework::EigenMatrix::From( + auto mask = framework::EigenMatrix::From( mask_tensor, framework::make_ddim({mask_tensor.dims()[1], 1})); - auto eigen_mask_broadcast = - eigen_mask.broadcast(Eigen::DSizes(1, grad_output->dims()[2])); + auto mask_broadcast = + mask.broadcast(Eigen::DSizes(1, grad_output->dims()[2])); - auto eigen_grad_last_h = framework::EigenMatrix::Reshape( + auto last_h_grad = framework::EigenMatrix::Reshape( *grad_last_h, grad_last_h->dims().size() - 1); - auto eigen_grad_pre_h = framework::EigenMatrix::Reshape( + auto pre_h_grad = framework::EigenMatrix::Reshape( *grad_pre_h, grad_pre_h->dims().size() - 1); - auto eigen_grad_output = framework::EigenMatrix::Reshape( + auto output_grad = framework::EigenMatrix::Reshape( *grad_output, grad_output->dims().size() - 1); - eigen_grad_last_h.device(place) = - eigen_grad_last_h + eigen_grad_output * eigen_mask_broadcast; - eigen_grad_pre_h.device(place) = - (1 - eigen_mask_broadcast) * eigen_grad_last_h; - eigen_grad_last_h.device(place) = eigen_mask_broadcast * eigen_grad_last_h; + last_h_grad.device(place) = last_h_grad + output_grad * mask_broadcast; + pre_h_grad.device(place) = (1 - mask_broadcast) * last_h_grad; + last_h_grad.device(place) = mask_broadcast * last_h_grad; if (grad_last_c && grad_pre_c && is_lstm(context)) { - auto eigen_grad_last_c = framework::EigenMatrix::Reshape( + auto last_c_grad = framework::EigenMatrix::Reshape( *grad_last_c, grad_last_c->dims().size() - 1); - auto eigen_grad_pre_c = framework::EigenMatrix::Reshape( + auto pre_c_grad = framework::EigenMatrix::Reshape( *grad_pre_c, grad_pre_c->dims().size() - 1); - eigen_grad_pre_c.device(place) = - (1 - eigen_mask_broadcast) * eigen_grad_last_c; - eigen_grad_last_c.device(place) = - eigen_mask_broadcast * eigen_grad_last_c; + pre_c_grad.device(place) = (1 - mask_broadcast) * last_c_grad; + last_c_grad.device(place) = mask_broadcast * last_c_grad; } } void postprocess(const framework::ExecutionContext& context, const Tensor& grad_gate, const Tensor& input, Tensor* input_grad, const TensorList& parameters, - TensorList* grad_parameters, Tensor* grad_gate_buf, - const int& is_reverse) { + TensorList* grad_parameters, const int& is_reverse) { // we get the grad_gate step by step, and need to bradocast the grad to the // grad_w_hi, grad_bias_hi, grad_bias_hh int begin_idx = 0; @@ -1360,10 +1296,7 @@ struct GradLayer { {grad_gate.dims()[0] * grad_gate.dims()[1], grad_gate.dims()[2]}); col_sum(device_ctx, tmp_grad_gate, &((*grad_parameters)[begin_idx + 2])); // Bias_hh - if (is_gru(context)) { - grad_gate_buf->Resize(tmp_grad_gate.dims()); - col_sum(device_ctx, *grad_gate_buf, &((*grad_parameters)[begin_idx + 3])); - } else { + if (!is_gru(context)) { col_sum(device_ctx, tmp_grad_gate, &((*grad_parameters)[begin_idx + 3])); } } @@ -1600,64 +1533,69 @@ struct GradCell { Tensor* pre_state, Tensor* grad_hidden, Tensor* grad_state, Tensor* grad_gate, Tensor* grad_weight_hh, Tensor* grad_pre_hidden, - Tensor* grad_pre_state, Tensor* grad_gate_buf, - Tensor* grad_bias_hh, const Tensor& mask_tensor, + Tensor* grad_pre_state, Tensor* grad_bias_hh, + const Tensor& mask_tensor, bool has_sequence_length) const {} + + void postprocess_pre_hidden_grad(const framework::ExecutionContext& context, + Tensor* grad_pre_hidden, + Tensor* grad_pre_hidden_bak, + Tensor* grad_pre_state, + Tensor* grad_pre_state_bak, + const Tensor& mask_tensor, + bool has_sequence_length) const { + if (has_sequence_length) { + auto& place = + *context.template device_context() + 
.eigen_device(); + auto mask = framework::EigenMatrix::From( + mask_tensor, framework::make_ddim({mask_tensor.dims()[1], 1})); + auto mask_broadcast = + mask.broadcast(Eigen::DSizes(1, grad_pre_hidden->dims()[2])); + auto pre_hidden_grad = framework::EigenMatrix::Reshape( + *grad_pre_hidden, grad_pre_hidden->dims().size() - 1); + auto pre_hidden_bak_grad = framework::EigenMatrix::Reshape( + *grad_pre_hidden_bak, grad_pre_hidden_bak->dims().size() - 1); + pre_hidden_grad.device(place) = + (1 - mask_broadcast) * pre_hidden_bak_grad + + pre_hidden_grad * mask_broadcast; + if (grad_pre_state) { + auto pre_state_grad = framework::EigenMatrix::Reshape( + *grad_pre_state, grad_pre_state->dims().size() - 1); + auto pre_state_bak_grad = framework::EigenMatrix::Reshape( + *grad_pre_state_bak, grad_pre_state_bak->dims().size() - 1); + pre_state_grad.device(place) = + (1 - mask_broadcast) * pre_state_bak_grad + + pre_state_grad * mask_broadcast; + } + } + } + virtual void update_pre_hidden_grad( const framework::ExecutionContext& context, Tensor* grad_gate, const Tensor* weight_hh, Tensor* grad_pre_hidden, Tensor* grad_pre_hidden_bak, Tensor* grad_pre_state, - Tensor* grad_pre_state_bak, Tensor* grad_gate_buf, - const Tensor& mask_tensor, bool has_sequence_length) const { + Tensor* grad_pre_state_bak, const Tensor& mask_tensor, + bool has_sequence_length) const { auto& device_ctx = context.template device_context(); auto blas = math::GetBlas(device_ctx); - T beta = 0; Tensor* grad_gate_tmp = grad_gate; - if (is_gru(context)) { - beta = 1.0; - grad_gate_tmp = grad_gate_buf; - } - auto mat_dim_a = math::CreateMatrixDescriptor(grad_gate_tmp->dims(), 0, false); mat_dim_a.height_ *= mat_dim_a.batch_size_; mat_dim_a.batch_size_ = 0; auto mat_dim_b = math::CreateMatrixDescriptor(weight_hh->dims(), 0, false); blas.MatMul(*grad_gate_tmp, mat_dim_a, *weight_hh, mat_dim_b, - static_cast(1.0), grad_pre_hidden, beta); - - if (has_sequence_length) { - auto& place = - *context.template device_context() - .eigen_device(); - auto eigen_mask = framework::EigenMatrix::From( - mask_tensor, framework::make_ddim({mask_tensor.dims()[1], 1})); - auto eigen_mask_broadcast = eigen_mask.broadcast( - Eigen::DSizes(1, grad_pre_hidden->dims()[2])); - auto eigen_grad_pre_hidden = framework::EigenMatrix::Reshape( - *grad_pre_hidden, grad_pre_hidden->dims().size() - 1); - auto eigen_grad_pre_hidden_bak = framework::EigenMatrix::Reshape( - *grad_pre_hidden_bak, grad_pre_hidden_bak->dims().size() - 1); - eigen_grad_pre_hidden.device(place) = - (1 - eigen_mask_broadcast) * eigen_grad_pre_hidden_bak + - eigen_grad_pre_hidden * eigen_mask_broadcast; - if (grad_pre_state) { - auto eigen_grad_pre_state = framework::EigenMatrix::Reshape( - *grad_pre_state, grad_pre_state->dims().size() - 1); - auto eigen_grad_pre_state_bak = framework::EigenMatrix::Reshape( - *grad_pre_state_bak, grad_pre_state_bak->dims().size() - 1); - eigen_grad_pre_state.device(place) = - (1 - eigen_mask_broadcast) * eigen_grad_pre_state_bak + - eigen_grad_pre_state * eigen_mask_broadcast; - } - } + static_cast(1.0), grad_pre_hidden, 0); + postprocess_pre_hidden_grad(context, grad_pre_hidden, grad_pre_hidden_bak, + grad_pre_state, grad_pre_state_bak, mask_tensor, + has_sequence_length); } virtual void update_weight_hh_grad(const framework::ExecutionContext& context, Tensor* grad_gate, Tensor* pre_hidden, - Tensor* grad_weight_hh, - Tensor* grad_gate_buf) const { + Tensor* grad_weight_hh) const { auto& device_ctx = context.template device_context(); auto blas = 
math::GetBlas(device_ctx); @@ -1667,11 +1605,7 @@ struct GradCell { auto mat_dim_d = math::CreateMatrixDescriptor(pre_hidden->dims(), 0, false); mat_dim_d.height_ *= mat_dim_d.batch_size_; mat_dim_d.batch_size_ = 0; - Tensor* grad_gate_tmp = grad_gate; - if (is_gru(context)) { - grad_gate_tmp = grad_gate_buf; - } - blas.MatMul(*grad_gate_tmp, mat_dim_c, *pre_hidden, mat_dim_d, + blas.MatMul(*grad_gate, mat_dim_c, *pre_hidden, mat_dim_d, static_cast(1.0), grad_weight_hh, static_cast(1.0)); } }; @@ -1685,8 +1619,7 @@ struct SimpleRNNGradCell : GradCell { Tensor* pre_state, Tensor* grad_hidden, Tensor* grad_state, Tensor* grad_gate, Tensor* grad_weight_hh, Tensor* grad_pre_hidden, Tensor* grad_pre_state, - Tensor* grad_gate_buf, Tensor* grad_bias_hh, - const Tensor& mask_tensor, + Tensor* grad_bias_hh, const Tensor& mask_tensor, bool has_sequence_length) const override { auto& device_ctx = context.template device_context(); @@ -1711,11 +1644,10 @@ struct SimpleRNNGradCell : GradCell { functor(*place, z, h, dh, dz); // update grad_weight_hh, grad_pre_hidden - this->update_pre_hidden_grad( - context, grad_gate, weight_hh, grad_pre_hidden, &grad_pre_hidden_bak, - nullptr, nullptr, grad_gate_buf, mask_tensor, has_sequence_length); - this->update_weight_hh_grad(context, grad_gate, pre_hidden, grad_weight_hh, - grad_gate_buf); + this->update_pre_hidden_grad(context, grad_gate, weight_hh, grad_pre_hidden, + &grad_pre_hidden_bak, nullptr, nullptr, + mask_tensor, has_sequence_length); + this->update_weight_hh_grad(context, grad_gate, pre_hidden, grad_weight_hh); } }; @@ -1728,8 +1660,7 @@ struct GRUGradCell : GradCell { Tensor* pre_state, Tensor* grad_hidden, Tensor* grad_state, Tensor* grad_gate, Tensor* grad_weight_hh, Tensor* grad_pre_hidden, Tensor* grad_pre_state, - Tensor* grad_gate_buf, Tensor* grad_bias_hh, - const Tensor& mask_tensor, + Tensor* grad_bias_hh, const Tensor& mask_tensor, bool has_sequence_length) const override { auto& device_ctx = context.template device_context(); @@ -1747,6 +1678,8 @@ struct GRUGradCell : GradCell { gru_value.gate_value = gate_tensor->data(); gru_value.prev_out_value = pre_hidden->data(); gru_value.reset_output_value = state_tensor->data(); + gru_value.state_weight = weight_hh->data() + 2 * frame_size * frame_size; + gru_value.gate_weight = weight_hh->data(); gru_grad.gate_grad = grad_gate->data(); gru_grad.reset_output_grad = grad_state->data(); @@ -1755,7 +1688,7 @@ struct GRUGradCell : GradCell { gru_grad.gate_weight_grad = grad_weight_hh->data(); gru_grad.state_weight_grad = grad_weight_hh->data() + 2 * frame_size * frame_size; - gru_grad.state_bias_grad = grad_bias_hh->data() + 2 * frame_size; + gru_grad.bias_hh_grad = grad_bias_hh->data(); auto act_gate = math::detail::GetActivationType("sigmoid_v2"); auto act_node = math::detail::GetActivationType("tanh_v2"); @@ -1763,13 +1696,9 @@ struct GRUGradCell : GradCell { device_ctx, gru_value, gru_grad, frame_size, batch_size, act_node, act_gate); - make_grad_gate_buf(context, grad_gate, grad_gate_buf, grad_state); - - this->update_pre_hidden_grad( - context, grad_gate, weight_hh, grad_pre_hidden, &grad_pre_hidden_bak, - nullptr, nullptr, grad_gate_buf, mask_tensor, has_sequence_length); - this->update_weight_hh_grad(context, grad_gate, pre_hidden, grad_weight_hh, - grad_gate_buf); + this->postprocess_pre_hidden_grad(context, grad_pre_hidden, + &grad_pre_hidden_bak, nullptr, nullptr, + mask_tensor, has_sequence_length); } }; @@ -1782,8 +1711,7 @@ struct LSTMGradCell : GradCell { Tensor* pre_state, Tensor* 
grad_hidden, Tensor* grad_state, Tensor* grad_gate, Tensor* grad_weight_hh, Tensor* grad_pre_hidden, Tensor* grad_pre_state, - Tensor* grad_gate_buf, Tensor* grad_bias_hh, - const Tensor& mask_tensor, + Tensor* grad_bias_hh, const Tensor& mask_tensor, bool has_sequence_length) const override { auto& device_ctx = context.template device_context(); @@ -1822,12 +1750,10 @@ struct LSTMGradCell : GradCell { math::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, batch_size, cell_clip, gate_act, state_act, cand_act, false); - this->update_pre_hidden_grad(context, grad_gate, weight_hh, grad_pre_hidden, - &grad_pre_hidden_bak, grad_pre_state, - &grad_pre_state_bak, grad_gate_buf, - mask_tensor, has_sequence_length); - this->update_weight_hh_grad(context, grad_gate, pre_hidden, grad_weight_hh, - grad_gate_buf); + this->update_pre_hidden_grad( + context, grad_gate, weight_hh, grad_pre_hidden, &grad_pre_hidden_bak, + grad_pre_state, &grad_pre_state_bak, mask_tensor, has_sequence_length); + this->update_weight_hh_grad(context, grad_gate, pre_hidden, grad_weight_hh); } }; @@ -2001,7 +1927,12 @@ void RnnGradFunc(const framework::ExecutionContext& context, for (int i = num_layers - 1; i >= 0; --i) { // the layer input output had saved, just use the data if (i > 0) { - layer_input.ShareDataWith(hidden_tensor_unbind[i - 1]); + if (layer_input.numel() == 0) { + layer_input.Resize(hidden_tensor_unbind[i - 1].dims()); + layer_input.mutable_data(context.GetPlace()); + } + dropout_helper(context, &hidden_tensor_unbind[i - 1], &layer_input, + dropout_state, dropout_prob); } else { layer_input.ShareDataWith(*input); } diff --git a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py index d9149b06287..bfaf6430f27 100644 --- a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py +++ b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py @@ -294,7 +294,6 @@ def unstack(array, axis=0): def dropout(array, p=0.5): if p == 0.0: return array - mask = (np.random.uniform(size=array.shape) < (1 - p)).astype(array.dtype) return array * (mask / (1 - p)) @@ -390,11 +389,12 @@ class RNNMixin(LayerListMixin): states = split_states(initial_states, self.num_directions == 2, self.state_components) final_states = [] - + input_temp = inputs for i, rnn_layer in enumerate(self): if i > 0: - inputs = dropout(inputs, self.dropout) - outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + input_temp = dropout(inputs, self.dropout) + outputs, final_state = rnn_layer(input_temp, states[i], + sequence_length) final_states.append(final_state) inputs = outputs diff --git a/python/paddle/fluid/tests/unittests/test_rnn_op.py b/python/paddle/fluid/tests/unittests/test_rnn_op.py index af3add34d7f..5ad2ffec982 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_op.py @@ -53,6 +53,7 @@ class TestRNNOp(OpTest): self.is_bidirec = False self.mode = "LSTM" self.is_test = False + self.dropout = 0.0 self.set_attrs() self.direction_num = 2 if self.is_bidirec else 1 @@ -76,7 +77,8 @@ class TestRNNOp(OpTest): hidden_size, num_layers=self.num_layers, time_major=True, - direction=direction) + direction=direction, + dropout=self.dropout) flat_w = get_params_for_net(rnn1) output, (last_hidden, last_cell) = rnn1( @@ -101,7 +103,7 @@ class TestRNNOp(OpTest): 'PreState': [('init_h', init_h), ('init_c', init_c)], } self.attrs = { - 'dropout_prob': 0.0, + 'dropout_prob': self.dropout, 'is_bidirec': 
self.is_bidirec, 'input_size': input_size, 'hidden_size': hidden_size, -- GitLab
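
The new forward_reset_outputV2 and forward_final_outputV2 kernels implement the cuDNN-compatible GRU variant (the "sigmoid_v2"/"tanh_v2" path), keeping the reset output around because the backward pass needs it. A minimal NumPy sketch of the per-step math, with assumed names and shapes and the [reset, update, cell] gate layout used by gate_value (illustrative only, not the kernel code):

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def gru_cell_v2(x, h_prev, w_ih, b_ih, w_hh, b_hh):
        # Assumed shapes: w_ih [3*hidden, input_size], w_hh [3*hidden, hidden],
        # gates ordered [reset, update, cell] along the last axis like gate_value.
        wx = np.matmul(x, w_ih.T) + b_ih          # input projection
        wh = np.matmul(h_prev, w_hh.T) + b_hh     # hidden projection (b_hc folded in)
        x_r, x_u, x_c = np.split(wx, 3, axis=-1)
        h_r, h_u, h_c = np.split(wh, 3, axis=-1)
        r = sigmoid(x_r + h_r)                    # forward_reset_outputV2: reset gate
        u = sigmoid(x_u + h_u)                    # forward_reset_outputV2: update gate
        reset_output = r * h_c                    # r * (h_prev @ W_hc + b_hc), saved for backward
        c = np.tanh(x_c + reset_output)           # forward_final_outputV2: candidate state
        h = u * h_prev + (1.0 - u) * c            # forward_final_outputV2: new hidden state
        return h, reset_output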
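
The Eigen-based gru_backward computes the pre-activation gate gradients for that same variant. A sketch of the formulas under the same naming assumptions; the division by r mirrors the value_reset_output / value_reset_gate term in the kernel and recovers h_prev @ W_hc + b_hc:

    import numpy as np

    def gru_cell_v2_backward(dh, h_prev, r, u, c, reset_output):
        # dh: gradient w.r.t. the cell output; r, u, c: forward activations;
        # reset_output = r * (h_prev @ W_hc + b_hc) saved by the forward pass.
        d_u_pre = dh * (h_prev - c) * u * (1.0 - u)              # update gate (sigmoid grad)
        d_c_pre = dh * (1.0 - u) * (1.0 - c * c)                 # candidate state (tanh grad)
        d_r_pre = d_c_pre * (reset_output / r) * r * (1.0 - r)   # reset gate (sigmoid grad)
        d_reset_output = r * d_c_pre                             # feeds state_weight / b_hc grads
        dh_prev_partial = dh * u                                 # direct u * h_prev path; the GEMMs
                                                                 # against weight_hh add the rest
        return d_u_pre, d_c_pre, d_r_pre, d_reset_output, dh_prev_partial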
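
GRUUnitGradFunctorV2 then finishes the step with GEMMs for prev_out_grad and weight_hh_grad and a per-batch VADD loop for bias_hh_grad; the candidate-bias slice comes from reset_output_grad because b_hc only enters the cell through the reset output. A sketch of the per-step contributions that the functor accumulates (+=) into the gradient tensors, with assumed layouts:

    import numpy as np

    def gru_param_grads(gate_grad, reset_output_grad, h_prev, w_hh):
        # Assumed shapes: gate_grad [batch, 3*frame] (pre-activation grads, [reset, update, cell]),
        # reset_output_grad [batch, frame], h_prev [batch, frame], w_hh [3*frame, frame].
        frame = reset_output_grad.shape[-1]
        d_ru = gate_grad[:, :2 * frame]
        # the three GEMMs that update prev_out_grad
        dh_prev = d_ru @ w_hh[:2 * frame] + reset_output_grad @ w_hh[2 * frame:]
        # gate_weight_grad and state_weight_grad
        d_w_hh = np.concatenate([d_ru.T @ h_prev, reset_output_grad.T @ h_prev], axis=0)
        # bias_hh_grad: the VADD loop sums over the batch; the candidate slice uses
        # reset_output_grad because b_hc enters only through reset_output
        d_b_hh = np.concatenate([d_ru.sum(axis=0), reset_output_grad.sum(axis=0)])
        return dh_prev, d_w_hh, d_b_hh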
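
The dropout fix separates mask generation from mask application: dropout_cpu_function_inplace now only draws the mask (once, guarded by has_dropout_reset), and dropout_helper applies it out of place, so the forward pass between layers, the backward pass, and the layer-input reconstruction in RnnGradFunc all reuse the same mask. Roughly, assuming a float mask (sketch only):

    import numpy as np

    def make_dropout_mask(shape, p, seed=0):
        # Drawn once per forward pass and reused at every layer boundary,
        # in the backward pass, and when RnnGradFunc rebuilds the layer input.
        rng = np.random.RandomState(seed)
        return (rng.uniform(size=shape) >= p).astype('float32')

    def apply_dropout(x, mask, p):
        # Mirrors dropout_helper: out of place, inverted-dropout scaling by 1/(1-p).
        if p == 1.0:
            return np.zeros_like(x)
        return x * mask / (1.0 - p)

    # layer_input = apply_dropout(prev_hidden, mask, p)   # forward, between layers
    # grad_input  = apply_dropout(grad_hidden, mask, p)   # backward, same mask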