Unverified commit 085260f3, authored by Jack Zhou, committed by GitHub

Add eigen gru and fix the dropout bug in the rnn

Parent 545df287
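For reference, the element-wise math implemented by the new Eigen-based GRU forward path in this diff (forward_reset_outputV2 and forward_final_outputV2 below) can be summarized as follows. This is a transcription of the code, not additional behavior: sigma is the sigmoid, "⊙" the element-wise product, g_r, g_u, g_c are the gate pre-activations produced elsewhere by the BLAS GEMMs in the GRU unit functors, b_r is reset_bias, and r^out is the reset_output_value buffer.

```latex
\begin{aligned}
r_t &= \sigma(g_r), \qquad u_t = \sigma(g_u) \\
r^{\mathrm{out}}_t &= (r^{\mathrm{out}}_t + b_r) \odot r_t \\
\tilde{h}_t &= \tanh(g_c) \\
h_t &= (1 - u_t) \odot \tilde{h}_t + u_t \odot h_{t-1}
\end{aligned}
```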
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/gru_compute.h"
......@@ -21,6 +23,10 @@ namespace paddle {
namespace operators {
namespace math {
namespace detail {
using Array1 = Eigen::DSizes<int64_t, 1>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
#ifndef __NVCC__
......@@ -242,23 +248,46 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
#endif
}
template <typename T>
inline void forward_reset_outputV2(const platform::CPUDeviceContext &context,
GRUMetaValue<T> value, int frame_size) {
auto &place = *context.eigen_device();
auto value_reset_gate =
typename EigenVector<T>::Type(value.gate_value, Array1(frame_size));
auto value_update_gate = typename EigenVector<T>::Type(
value.gate_value + frame_size, Array1(frame_size));
auto value_reset_output = typename EigenVector<T>::Type(
value.reset_output_value, Array1(frame_size));
auto value_reset_bias =
typename EigenVector<T>::ConstType(value.reset_bias, Array1(frame_size));
SigmoidFunctor<T>()(place, value_reset_gate, value_reset_gate);
SigmoidFunctor<T>()(place, value_update_gate, value_update_gate);
value_reset_output.device(place) =
(value_reset_output + value_reset_bias) * value_reset_gate;
}
template <class OpResetOutput, typename T>
inline void forward_reset_output(OpResetOutput op_reset_output,
GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_gate,
bool old_version = true) {
inline void forward_reset_output(
OpResetOutput op_reset_output, GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_gate, bool old_version = true,
const platform::CPUDeviceContext *context = nullptr) {
for (int b = 0; b < batch_size; b++) {
if (OpResetOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
(sizeof(T) == 4)) {
hl_avx_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate, old_version,
value.reset_bias);
if (!old_version) {
// use the Eigen-based element-wise implementation
forward_reset_outputV2(*context, value, frame_size);
} else {
hl_naive_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate, old_version,
value.reset_bias);
if (OpResetOutput::avx && (frame_size & static_cast<int>(8 - 1)) &&
(sizeof(T) == 4)) {
hl_avx_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate, old_version,
value.reset_bias);
} else {
hl_naive_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate, old_version,
value.reset_bias);
}
}
value.gate_value += frame_size * 3;
value.reset_output_value += frame_size;
......@@ -268,25 +297,51 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
}
}
template <typename T>
inline void forward_final_outputV2(const platform::CPUDeviceContext &context,
GRUMetaValue<T> value, int frame_size) {
auto &place = *context.eigen_device();
auto value_update_gate = typename EigenVector<T>::Type(
value.gate_value + frame_size, Array1(frame_size));
auto value_frame_state = typename EigenVector<T>::Type(
value.gate_value + 2 * frame_size, Array1(frame_size));
auto value_output =
typename EigenVector<T>::Type(value.output_value, Array1(frame_size));
TanhFunctor<T>()(place, value_frame_state, value_frame_state);
value_output.device(place) =
(static_cast<T>(1.0) - value_update_gate) * value_frame_state;
if (value.prev_out_value) {
auto value_prev_out = typename EigenVector<T>::ConstType(
value.prev_out_value, Array1(frame_size));
value_output.device(place) =
value_output + value_update_gate * value_prev_out;
}
}
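As a standalone illustration of the pattern the two Eigen helpers above rely on -- mapping raw frame-sized buffers into 1-D Eigen tensors and writing the gate math as element-wise tensor expressions -- a minimal sketch could look like the following. This is hypothetical example code, not the Paddle kernel: the function name and the assumed [r | u | c] layout of the gate buffer are illustrative, and the GEMM/VADD steps that run between the two stages in the real GRUUnitFunctorV2 are omitted.

```cpp
// Hypothetical sketch of the element-wise GRU math using Eigen tensor maps.
#include <unsupported/Eigen/CXX11/Tensor>

void gru_frame_forward_sketch(float* gate,          // [3 * n], assumed layout: r | u | c
                              float* reset_output,  // [n], read-modify-write
                              float* reset_bias,    // [n], read only
                              float* prev_out,      // [n] or nullptr, read only
                              float* out,           // [n], written
                              int n) {
  using Vec = Eigen::TensorMap<Eigen::Tensor<float, 1, Eigen::RowMajor>>;
  Vec r(gate, n), u(gate + n, n), c(gate + 2 * n, n);
  Vec r_out(reset_output, n), b(reset_bias, n), h(out, n);

  // Gate activations, in place: sigmoid(x) = 1 / (1 + exp(-x)).
  r = r.constant(1.0f) / (r.constant(1.0f) + (-r).exp());
  u = u.constant(1.0f) / (u.constant(1.0f) + (-u).exp());
  // Reset output, as in forward_reset_outputV2: (r_out + b_r) * r.
  r_out = (r_out + b) * r;
  // Candidate state and final output, as in forward_final_outputV2.
  c = c.tanh();
  h = (u.constant(1.0f) - u) * c;
  if (prev_out != nullptr) {
    Vec hp(prev_out, n);
    h = h + u * hp;
  }
}
```

In the actual kernels these expressions are evaluated with .device(place) on the CPU Eigen device; the sketch relies on Eigen's default device for brevity.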
template <class OpFinalOutput, typename T>
inline void forward_final_output(OpFinalOutput op_final_output,
GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node,
bool origin_mode, bool old_version = true) {
inline void forward_final_output(
OpFinalOutput op_final_output, GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node, bool origin_mode,
bool old_version = true,
const platform::CPUDeviceContext *context = nullptr) {
for (int b = 0; b < batch_size; b++) {
if (OpFinalOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
(sizeof(T) == 4)) {
hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
value.prev_out_value, value.output_value,
frame_size, active_node, origin_mode,
old_version);
if (!old_version) {
// use the Eigen-based element-wise implementation
forward_final_outputV2(*context, value, frame_size);
} else {
hl_naive_gru_forward_final_output(op_final_output, value.gate_value,
if (OpFinalOutput::avx && (frame_size & static_cast<int>(8 - 1)) &&
(sizeof(T) == 4)) {
hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
value.prev_out_value,
value.output_value, frame_size,
active_node, origin_mode, old_version);
} else {
hl_naive_gru_forward_final_output(
op_final_output, value.gate_value, value.prev_out_value,
value.output_value, frame_size, active_node, origin_mode,
old_version);
}
}
value.gate_value += frame_size * 3;
value.output_value += frame_size;
if (value.prev_out_value) {
......@@ -664,23 +719,70 @@ inline void backward_reset_grad(OpResetGrad op_reset_grad,
}
}
template <typename T>
inline void gru_backward(const platform::CPUDeviceContext &context,
GRUMetaValue<T> value, GRUMetaGrad<T> grad,
int frame_size) {
auto &place = *context.eigen_device();
auto value_reset_gate =
typename EigenVector<T>::Type(value.gate_value, Array1(frame_size));
auto grad_reset_gate =
typename EigenVector<T>::Type(grad.gate_grad, Array1(frame_size));
auto value_update_gate = typename EigenVector<T>::Type(
value.gate_value + frame_size, Array1(frame_size));
auto grad_update_gate = typename EigenVector<T>::Type(
grad.gate_grad + frame_size, Array1(frame_size));
auto value_frame_state = typename EigenVector<T>::Type(
value.gate_value + frame_size * 2, Array1(frame_size));
auto grad_frame_state = typename EigenVector<T>::Type(
grad.gate_grad + frame_size * 2, Array1(frame_size));
auto grad_output =
typename EigenVector<T>::Type(grad.output_grad, Array1(frame_size));
auto value_reset_output = typename EigenVector<T>::Type(
value.reset_output_value, Array1(frame_size));
auto grad_reset_output =
typename EigenVector<T>::Type(grad.reset_output_grad, Array1(frame_size));
if (value.prev_out_value) {
auto value_prev_out = typename EigenVector<T>::ConstType(
value.prev_out_value, Array1(frame_size));
SigmoidGradFunctor<T>()(place, 1 /*useless*/, value_update_gate,
(value_prev_out - value_frame_state) * grad_output,
grad_update_gate);
} else {
SigmoidGradFunctor<T>()(
place, 1 /*useless*/, value_update_gate,
static_cast<T>(-1) * value_frame_state * grad_output, grad_update_gate);
}
if (grad.prev_out_grad) {
auto grad_prev_out =
typename EigenVector<T>::Type(grad.prev_out_grad, Array1(frame_size));
grad_prev_out.device(place) =
grad_prev_out + grad_output * value_update_gate;
}
TanhGradFunctor<T>()(place, 1 /*useless*/, value_frame_state,
grad_output * (static_cast<T>(1.0) - value_update_gate),
grad_frame_state);
SigmoidGradFunctor<T>()(
place, 1 /*useless*/, value_reset_gate,
value_reset_output / value_reset_gate * grad_frame_state,
grad_reset_gate);
if (value.prev_out_value && grad.prev_out_grad) {
grad_reset_output.device(place) = value_reset_gate * grad_frame_state;
}
}
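Written out, the per-frame gradients that the Eigen-based gru_backward above computes are roughly the following (a transcription of the code, with h_{t-1} = 0 when there is no previous step, and with dh_{t-1} receiving further contributions from the GEMMs in GRUUnitGradFunctorV2 below; "⊘" is element-wise division, used by the code to recover the pre-gated candidate input as reset_output / reset_gate):

```latex
\begin{aligned}
dg_u &= (h_{t-1} - \tilde{h}_t) \odot dh_t \odot u_t (1 - u_t) \\
dh_{t-1} &\mathrel{+}= u_t \odot dh_t \\
dg_c &= (1 - u_t) \odot dh_t \odot (1 - \tilde{h}_t^{2}) \\
dg_r &= \bigl(r^{\mathrm{out}}_t \oslash r_t\bigr) \odot dg_c \odot r_t (1 - r_t) \\
dr^{\mathrm{out}}_t &= r_t \odot dg_c
\end{aligned}
```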
template <class OpGruGrad, typename T>
inline void cpu_gru_backward(OpGruGrad op_gru_grad, GRUMetaValue<T> value,
inline void cpu_gru_backward(const platform::CPUDeviceContext &context,
OpGruGrad op_gru_grad, GRUMetaValue<T> value,
GRUMetaGrad<T> grad, int frame_size,
int batch_size, ActivationType active_node,
ActivationType active_gate) {
for (int b = 0; b < batch_size; ++b) {
if (OpGruGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward(
op_gru_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
grad.prev_out_grad, value.reset_output_value, grad.reset_output_grad,
grad.output_grad, frame_size, active_node, active_gate);
} else {
hl_naive_gru_backward(
op_gru_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
grad.prev_out_grad, value.reset_output_value, grad.reset_output_grad,
grad.output_grad, frame_size, active_node, active_gate);
}
// compute the element-wise gate gradients with the Eigen-based gru_backward
gru_backward(context, value, grad, frame_size);
value.gate_value += frame_size * 3;
value.reset_output_value += frame_size;
......
......@@ -42,7 +42,8 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
}
detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
frame_size, batch_size, active_gate);
frame_size, batch_size, active_gate, true,
&context);
if (value.prev_out_value) {
blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
......@@ -53,7 +54,7 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
frame_size, batch_size, active_node,
origin_mode);
origin_mode, &context);
#endif
}
};
......@@ -116,7 +117,8 @@ struct GRUUnitFunctorV2<platform::CPUDeviceContext, T> {
value.reset_output_value);
}
detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
frame_size, batch_size, active_gate, false);
frame_size, batch_size, active_gate, false,
&context);
T *cell_state_value = value.gate_value + 2 * frame_size;
T *reset_output_value = value.reset_output_value;
......@@ -129,7 +131,7 @@ struct GRUUnitFunctorV2<platform::CPUDeviceContext, T> {
detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
frame_size, batch_size, active_node, true,
false);
false, &context);
#endif
}
};
......@@ -144,8 +146,50 @@ struct GRUUnitGradFunctorV2<platform::CPUDeviceContext, T> {
#ifndef __NVCC__
// calculate grad_update_gate, grad_frame_state,
// grad_reset_output, grad_reset_gate
detail::cpu_gru_backward(detail::backward::gru<T>(), value, grad,
detail::cpu_gru_backward(context, detail::backward::gru<T>(), value, grad,
frame_size, batch_size, active_node, active_gate);
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
if (grad.prev_out_grad && value.prev_out_value) {
// update prev_out_grad
blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
grad.gate_grad, frame_size * 3, value.gate_weight, frame_size,
1, grad.prev_out_grad, frame_size);
blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
grad.gate_grad + frame_size, frame_size * 3,
value.gate_weight + frame_size * frame_size, frame_size, 1,
grad.prev_out_grad, frame_size);
blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
grad.reset_output_grad, frame_size, value.state_weight,
frame_size, 1, grad.prev_out_grad, frame_size);
// update weight_hh_grad
if (grad.gate_weight_grad) {
// reset gate
blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
grad.gate_grad, frame_size * 3, value.prev_out_value,
frame_size, 1, grad.gate_weight_grad, frame_size);
// update gate
blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
grad.gate_grad + frame_size, frame_size * 3,
value.prev_out_value, frame_size, 1,
grad.gate_weight_grad + frame_size * frame_size, frame_size);
// cell state
blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
grad.reset_output_grad, frame_size, value.prev_out_value,
frame_size, 1, grad.state_weight_grad, frame_size);
}
}
// update bias_hh_grad
T *gate_grad = grad.gate_grad;
T *bias_hh_grad = grad.bias_hh_grad;
T *state_bias_grad = grad.bias_hh_grad + 2 * frame_size;
T *reset_output_grad = grad.reset_output_grad;
for (int b = 0; b < batch_size; ++b) {
blas.VADD(2 * frame_size, bias_hh_grad, gate_grad, bias_hh_grad);
blas.VADD(frame_size, state_bias_grad, reset_output_grad,
state_bias_grad);
gate_grad += 3 * frame_size;
reset_output_grad += frame_size;
}
#endif
}
};
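In matrix form, the BLAS calls added above correspond roughly to the following, where dG_r, dG_u, dR^out are the batch-stacked gate gradients (B x F), H_{t-1} is the batch of previous hidden states, and W_{hr}, W_{hu}, W_{hc} are the F x F blocks stored in weight_hh (exact transposes depend on the storage layout the kernel assumes):

```latex
\begin{aligned}
dH_{t-1} &\mathrel{+}= dG_r W_{hr} + dG_u W_{hu} + dR^{\mathrm{out}} W_{hc} \\
dW_{hr} &\mathrel{+}= dG_r^{\top} H_{t-1}, \quad
dW_{hu} \mathrel{+}= dG_u^{\top} H_{t-1}, \quad
dW_{hc} \mathrel{+}= (dR^{\mathrm{out}})^{\top} H_{t-1} \\
db_{hh,[r,u]} &\mathrel{+}= \sum_{b=1}^{B} dG_{[r,u]}, \qquad
db_{hh,c} \mathrel{+}= \sum_{b=1}^{B} dR^{\mathrm{out}}
\end{aligned}
```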
......
......@@ -38,7 +38,7 @@ struct GRUMetaGrad {
T *reset_output_grad;
T *output_grad;
T *prev_out_grad;
T *state_bias_grad;
T *bias_hh_grad;
};
template <typename DeviceContext, typename T>
......
......@@ -210,66 +210,58 @@ struct LSTMCell : Cell<T> {
}
};
template <typename T>
void dropout_helper(const framework::ExecutionContext& context, Tensor* x,
Tensor* y, const Tensor* mask, const float& dropout_prob) {
auto& place = *context.template device_context<platform::CPUDeviceContext>()
.eigen_device();
auto dropout_mask = EigenVector<uint8_t>::Flatten(*mask);
auto in = EigenVector<T>::Flatten(*x);
auto out = EigenVector<T>::Flatten(*y);
if (dropout_prob == 1.0f) {
out.device(place) = static_cast<T>(0) * in;
} else {
out.device(place) =
in * dropout_mask.cast<T>() / static_cast<T>(1.0f - dropout_prob);
}
}
template <typename T>
void dropout_cpu_function_inplace(const framework::ExecutionContext& context,
Tensor* x, Tensor* mask,
Tensor* x, Tensor* y, Tensor* mask,
const float& dropout_prob,
const int& seed_number, const bool& is_test,
bool* is_has_reset) {
if (is_test) {
return;
}
auto* x_data = x->data<T>();
size_t size = framework::product(x->dims());
auto* mask_data = mask->data<uint8_t>();
if (!(*is_has_reset)) {
// Special case when dropout_prob is 1.0
if (dropout_prob == 1.0f) {
std::fill(x_data, x_data + size, static_cast<T>(0));
std::fill(mask_data, mask_data + size, static_cast<T>(0));
*is_has_reset = true;
return;
}
auto engine = framework::GetCPURandomEngine(seed_number);
std::uniform_real_distribution<float> dist(0, 1);
for (size_t i = 0; i < size; ++i) {
if (dist(*engine) < dropout_prob) {
mask_data[i] = 0;
x_data[i] = static_cast<T>(0);
} else {
mask_data[i] = 1;
x_data[i] /= static_cast<T>(1.0f - dropout_prob);
std::fill(mask_data, mask_data + size, static_cast<uint8_t>(0));
} else {
auto engine = framework::GetCPURandomEngine(seed_number);
std::uniform_real_distribution<float> dist(0, 1);
for (size_t i = 0; i < size; ++i) {
if (dist(*engine) < dropout_prob) {
mask_data[i] = 0;
} else {
mask_data[i] = 1;
}
}
}
*is_has_reset = true;
} else {
if (dropout_prob == 1.0f) {
std::fill(x_data, x_data + size, static_cast<T>(0));
return;
}
for (size_t i = 0; i < size; ++i) {
if (mask_data[i] == 0) {
x_data[i] = static_cast<T>(0);
} else {
x_data[i] /= static_cast<T>(1.0f - dropout_prob);
}
}
}
dropout_helper<T>(context, x, y, mask, dropout_prob);
}
template <typename T>
void dropout_cpu_grad_function_inplace(
const framework::ExecutionContext& context, Tensor* grad_x,
const Tensor* mask, const float& dropout_prob) {
auto& place = *context.template device_context<platform::CPUDeviceContext>()
.eigen_device();
auto M = EigenVector<uint8_t>::Flatten(*mask);
auto dX = EigenVector<T>::Flatten(*grad_x);
if (dropout_prob == 1.0f) {
dX.device(place) = static_cast<T>(0) * dX;
} else {
dX.device(place) = dX * M.cast<T>() / static_cast<T>(1.0f - dropout_prob);
}
dropout_helper<T>(context, grad_x, grad_x, mask, dropout_prob);
}
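The reworked dropout path above draws the mask once per RNN call and then applies the same inverted-dropout transform out of place through dropout_helper, both when producing the next layer's input and (in RnnGradFunc below) when back-propagating. The transform itself is standard inverted dropout:

```latex
m_i \sim \mathrm{Bernoulli}(1 - p), \qquad
y_i = \frac{x_i \, m_i}{1 - p}, \qquad
\frac{\partial L}{\partial x_i} = \frac{\partial L}{\partial y_i} \cdot \frac{m_i}{1 - p},
\qquad y_i = 0 \ \text{when } p = 1
```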
template <typename T, typename CellType>
......@@ -298,14 +290,13 @@ struct Layer {
blas.MatMul(*input, mat_dim_a, weight, mat_dim_b, static_cast<T>(1.0),
cache_input, static_cast<T>(0));
auto eigen_in = framework::EigenMatrix<T>::Reshape(
auto in = framework::EigenMatrix<T>::Reshape(
*cache_input, cache_input->dims().size() - 1);
auto eigen_bias_ih = framework::EigenMatrix<T>::From(
auto bias_ih_tmp = framework::EigenMatrix<T>::From(
bias_ih, framework::make_ddim({1, bias_ih.dims()[0]}));
const int& row_num =
framework::product(cache_input->dims()) / cache_input->dims()[2];
eigen_in =
eigen_in + eigen_bias_ih.broadcast(Eigen::DSizes<int, 2>(row_num, 1));
in = in + bias_ih_tmp.broadcast(Eigen::DSizes<int, 2>(row_num, 1));
if (is_gru(context)) {
// mask over [reset_gate, update_gate, cell_gate] = [1, 1, 0]: zero the cell-gate slice of bias_hh
Tensor bias_hh_tmp;
......@@ -317,15 +308,13 @@ struct Layer {
math::SetConstant<platform::CPUDeviceContext, T> zero;
zero(dev_ctx, &bias_hh_tmp_unbind[2], static_cast<T>(0.0));
auto eigen_bias_hh_tmp = framework::EigenMatrix<T>::From(
auto bias_hh_after_mask = framework::EigenMatrix<T>::From(
bias_hh_tmp, framework::make_ddim({1, bias_hh.dims()[0]}));
eigen_in = eigen_in +
eigen_bias_hh_tmp.broadcast(Eigen::DSizes<int, 2>(row_num, 1));
in = in + bias_hh_after_mask.broadcast(Eigen::DSizes<int, 2>(row_num, 1));
} else {
auto eigen_bias_hh = framework::EigenMatrix<T>::From(
auto bias_hh_no_mask = framework::EigenMatrix<T>::From(
bias_hh, framework::make_ddim({1, bias_hh.dims()[0]}));
eigen_in =
eigen_in + eigen_bias_hh.broadcast(Eigen::DSizes<int, 2>(row_num, 1));
in = in + bias_hh_no_mask.broadcast(Eigen::DSizes<int, 2>(row_num, 1));
}
}
......@@ -335,27 +324,26 @@ struct Layer {
// in the output, if the mask flag is 0, we return zero data
auto& place = *context.template device_context<platform::CPUDeviceContext>()
.eigen_device();
auto eigen_output =
auto out =
framework::EigenMatrix<T>::Reshape(*output, output->dims().size() - 1);
auto eigen_mask = framework::EigenMatrix<T>::From(
auto mask = framework::EigenMatrix<T>::From(
mask_tensor, framework::make_ddim({mask_tensor.dims()[1], 1}));
auto eigen_init_h =
auto pre_h =
framework::EigenMatrix<T>::Reshape(*init_h, init_h->dims().size() - 1);
auto eigen_last_h =
auto curr_h =
framework::EigenMatrix<T>::Reshape(*last_h, last_h->dims().size() - 1);
auto eigen_mask_broadcast =
eigen_mask.broadcast(Eigen::DSizes<int, 2>(1, output->dims()[2]));
eigen_last_h.device(place) = eigen_output * eigen_mask_broadcast +
eigen_init_h * (1 - eigen_mask_broadcast);
eigen_output.device(place) = eigen_output * eigen_mask_broadcast;
auto mask_broadcast =
mask.broadcast(Eigen::DSizes<int, 2>(1, output->dims()[2]));
curr_h.device(place) = out * mask_broadcast + pre_h * (1 - mask_broadcast);
out.device(place) = out * mask_broadcast;
if (is_lstm(context)) {
auto eigen_init_c = framework::EigenMatrix<T>::Reshape(
auto pre_c = framework::EigenMatrix<T>::Reshape(
*init_c, init_c->dims().size() - 1);
auto eigen_last_c = framework::EigenMatrix<T>::Reshape(
auto curr_c = framework::EigenMatrix<T>::Reshape(
*last_c, last_c->dims().size() - 1);
eigen_last_c.device(place) = eigen_last_c * eigen_mask_broadcast +
eigen_init_c * (1 - eigen_mask_broadcast);
curr_c.device(place) =
curr_c * mask_broadcast + pre_c * (1 - mask_broadcast);
}
}
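With m the per-sample sequence mask broadcast over the hidden dimension, the masking applied at the end of Layer above amounts to the following, where hat{h}_t and hat{c}_t are the raw cell outputs for step t:

```latex
h_t = m \odot \hat{h}_t + (1 - m) \odot h_{t-1}, \qquad
\mathrm{out}_t = m \odot \hat{h}_t, \qquad
c_t = m \odot \hat{c}_t + (1 - m) \odot c_{t-1} \ \ (\text{LSTM only})
```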
......@@ -910,16 +898,18 @@ void RnnFunc(const framework::ExecutionContext& ctx, const Tensor* input,
}
if (!is_test) {
prev_hidden_data = hidden_data.Slice(i - 1, i);
input_holder = &prev_hidden_data;
input_holder->Resize(output->dims());
if (dropout_prob != 0) {
dropout_cpu_function_inplace<T>(ctx, &prev_hidden_data, input_holder,
dropout_mask, dropout_prob, seed,
is_test, &has_dropout_reset);
} else {
input_holder = &prev_hidden_data;
input_holder->Resize(output->dims());
}
} else {
SwapPoniter(&output_holder, &input_holder);
}
if (dropout_prob != 0 && (!is_test)) {
dropout_cpu_function_inplace<T>(ctx, input_holder, dropout_mask,
dropout_prob, seed, is_test,
&has_dropout_reset);
}
}
const Tensor* input_temp_holder = input;
if (i > 0) {
......@@ -1040,53 +1030,6 @@ void create_tensor_by_list(const framework::ExecutionContext& context,
}
}
template <typename T>
void make_grad_gate_buf(const framework::ExecutionContext& context,
Tensor* grad_gate, Tensor* grad_gate_buf,
Tensor* reset_output_grad = nullptr) {
int dim_size = grad_gate->dims().size();
int batch_size = grad_gate->dims()[dim_size - 2];
int frame_size = grad_gate->dims()[dim_size - 1];
Tensor grad_gate_mask;
create_tensor_by_list<T>(context, &grad_gate_mask, {1, 1, 0});
auto& place = *context.template device_context<platform::CPUDeviceContext>()
.eigen_device();
auto eigen_grad_gate_mask = framework::EigenMatrix<T>::From(
grad_gate_mask, framework::make_ddim({3, 1}));
auto eigen_grad_gate_mask_broadcast =
eigen_grad_gate_mask.broadcast(Eigen::DSizes<int, 2>(1, frame_size / 3))
.reshape(Eigen::DSizes<int, 1>(frame_size))
.broadcast(Eigen::DSizes<int, 2>(batch_size, 1));
auto eigen_grad_gate_buf = framework::EigenMatrix<T>::From(
*grad_gate_buf, framework::make_ddim({batch_size, frame_size}));
auto eigen_grad_gate = framework::EigenMatrix<T>::From(
*grad_gate, framework::make_ddim({batch_size, frame_size}));
eigen_grad_gate_buf.device(place) =
eigen_grad_gate * eigen_grad_gate_mask_broadcast;
if (reset_output_grad) {
Tensor grad_reset_output_mask;
create_tensor_by_list<T>(context, &grad_reset_output_mask, {0, 0, 1});
auto eigen_grad_reset_output_mask = framework::EigenMatrix<T>::From(
grad_reset_output_mask, framework::make_ddim({3, 1}));
auto eigen_grad_reset_output_mask_broadcast =
eigen_grad_reset_output_mask
.broadcast(Eigen::DSizes<int, 2>(1, frame_size / 3))
.reshape(Eigen::DSizes<int, 1>(frame_size))
.broadcast(Eigen::DSizes<int, 2>(batch_size, 1));
auto eigen_grad_reset_output =
framework::EigenMatrix<T>::Reshape(*reset_output_grad,
reset_output_grad->dims().size() - 1)
.broadcast(Eigen::DSizes<int, 3>(1, 3, 1))
.reshape(Eigen::DSizes<int, 2>(batch_size, frame_size));
eigen_grad_gate_buf.device(place) =
eigen_grad_gate_buf +
eigen_grad_reset_output_mask_broadcast * eigen_grad_reset_output;
}
}
template <typename T, typename GradCellType>
struct GradLayer {
explicit GradLayer(const GradCellType& cell) : cell_(cell) {}
......@@ -1196,12 +1139,10 @@ struct GradLayer {
Tensor* pre_hidden = nullptr;
Tensor* pre_state = nullptr;
Tensor* hidden = nullptr;
Tensor grad_gate_buf;
TensorList grad_gate_buf_unbind;
if (is_gru(context)) {
grad_gate_buf.Resize(layer_grad_gate_tensor->dims());
grad_gate_buf.mutable_data<T>(context.GetPlace());
grad_gate_buf_unbind = Unbind(grad_gate_buf);
zero(device_ctx,
&((*weight_list_grad)[layer_idx][current_reverse_idx * 4 + 3]),
static_cast<T>(0.0));
}
for (int i = time_step - 1; i >= 0; --i) {
if (has_sequence_length) {
......@@ -1232,7 +1173,7 @@ struct GradLayer {
&(parameter_lists[layer_idx][current_reverse_idx * 4 + 1]),
pre_hidden, pre_state, dynamic_grad_last_h, dynamic_grad_last_c,
&(*layer_grad_gate_tensor_unbind)[i], weight_grad, dynamic_grad_pre_h,
dynamic_grad_pre_c, &grad_gate_buf_unbind[i],
dynamic_grad_pre_c,
&((*weight_list_grad)[layer_idx][current_reverse_idx * 4 + 3]),
mask_tensor_list[i], has_sequence_length);
SwapPoniter(&dynamic_grad_last_h, &dynamic_grad_pre_h);
......@@ -1241,8 +1182,7 @@ struct GradLayer {
// postprocess the gradients for w_hi, X, bias_hi, bias_hh
this->postprocess(context, *layer_grad_gate_tensor, *input, input_grad,
parameter_lists[layer_idx],
&((*weight_list_grad)[layer_idx]), &grad_gate_buf,
is_reverse);
&((*weight_list_grad)[layer_idx]), is_reverse);
// copy the gradient to init_c init_h
if ((*init_h_grad_unbind).size() > 0 && time_step % 2 == 0) {
......@@ -1268,16 +1208,17 @@ struct GradLayer {
TensorList* init_h_grad_unbind, TensorList* init_c_grad_unbind,
const std::vector<TensorList>& weight_list_grad, const int& layer_idx,
const int& gate_num) {}
void preprocess(const framework::ExecutionContext& context,
const Tensor* grad_output, Tensor* grad_last_h) {
auto& place = *context.template device_context<platform::CPUDeviceContext>()
.eigen_device();
auto eigen_grad_output = framework::EigenMatrix<T>::Reshape(
auto output_grad = framework::EigenMatrix<T>::Reshape(
*grad_output, grad_output->dims().size() - 1);
auto eigen_grad_last_h = framework::EigenMatrix<T>::Reshape(
auto last_h_grad = framework::EigenMatrix<T>::Reshape(
*grad_last_h, grad_last_h->dims().size() - 1);
// the output gradient contributes to the gradient of last_h
eigen_grad_last_h.device(place) = eigen_grad_last_h + eigen_grad_output;
last_h_grad.device(place) = last_h_grad + output_grad;
}
void mask_preprocess(const framework::ExecutionContext& context,
......@@ -1286,40 +1227,35 @@ struct GradLayer {
Tensor* grad_pre_c, const Tensor& mask_tensor) {
auto& place = *context.template device_context<platform::CPUDeviceContext>()
.eigen_device();
auto eigen_mask = framework::EigenMatrix<T>::From(
auto mask = framework::EigenMatrix<T>::From(
mask_tensor, framework::make_ddim({mask_tensor.dims()[1], 1}));
auto eigen_mask_broadcast =
eigen_mask.broadcast(Eigen::DSizes<int, 2>(1, grad_output->dims()[2]));
auto mask_broadcast =
mask.broadcast(Eigen::DSizes<int, 2>(1, grad_output->dims()[2]));
auto eigen_grad_last_h = framework::EigenMatrix<T>::Reshape(
auto last_h_grad = framework::EigenMatrix<T>::Reshape(
*grad_last_h, grad_last_h->dims().size() - 1);
auto eigen_grad_pre_h = framework::EigenMatrix<T>::Reshape(
auto pre_h_grad = framework::EigenMatrix<T>::Reshape(
*grad_pre_h, grad_pre_h->dims().size() - 1);
auto eigen_grad_output = framework::EigenMatrix<T>::Reshape(
auto output_grad = framework::EigenMatrix<T>::Reshape(
*grad_output, grad_output->dims().size() - 1);
eigen_grad_last_h.device(place) =
eigen_grad_last_h + eigen_grad_output * eigen_mask_broadcast;
eigen_grad_pre_h.device(place) =
(1 - eigen_mask_broadcast) * eigen_grad_last_h;
eigen_grad_last_h.device(place) = eigen_mask_broadcast * eigen_grad_last_h;
last_h_grad.device(place) = last_h_grad + output_grad * mask_broadcast;
pre_h_grad.device(place) = (1 - mask_broadcast) * last_h_grad;
last_h_grad.device(place) = mask_broadcast * last_h_grad;
if (grad_last_c && grad_pre_c && is_lstm(context)) {
auto eigen_grad_last_c = framework::EigenMatrix<T>::Reshape(
auto last_c_grad = framework::EigenMatrix<T>::Reshape(
*grad_last_c, grad_last_c->dims().size() - 1);
auto eigen_grad_pre_c = framework::EigenMatrix<T>::Reshape(
auto pre_c_grad = framework::EigenMatrix<T>::Reshape(
*grad_pre_c, grad_pre_c->dims().size() - 1);
eigen_grad_pre_c.device(place) =
(1 - eigen_mask_broadcast) * eigen_grad_last_c;
eigen_grad_last_c.device(place) =
eigen_mask_broadcast * eigen_grad_last_c;
pre_c_grad.device(place) = (1 - mask_broadcast) * last_c_grad;
last_c_grad.device(place) = mask_broadcast * last_c_grad;
}
}
void postprocess(const framework::ExecutionContext& context,
const Tensor& grad_gate, const Tensor& input,
Tensor* input_grad, const TensorList& parameters,
TensorList* grad_parameters, Tensor* grad_gate_buf,
const int& is_reverse) {
TensorList* grad_parameters, const int& is_reverse) {
// we get the grad_gate step by step, and need to broadcast the grad to the
// grad_w_hi, grad_bias_hi, grad_bias_hh
int begin_idx = 0;
......@@ -1360,10 +1296,7 @@ struct GradLayer {
{grad_gate.dims()[0] * grad_gate.dims()[1], grad_gate.dims()[2]});
col_sum(device_ctx, tmp_grad_gate, &((*grad_parameters)[begin_idx + 2]));
// Bias_hh
if (is_gru(context)) {
grad_gate_buf->Resize(tmp_grad_gate.dims());
col_sum(device_ctx, *grad_gate_buf, &((*grad_parameters)[begin_idx + 3]));
} else {
if (!is_gru(context)) {
col_sum(device_ctx, tmp_grad_gate, &((*grad_parameters)[begin_idx + 3]));
}
}
......@@ -1600,64 +1533,69 @@ struct GradCell {
Tensor* pre_state, Tensor* grad_hidden,
Tensor* grad_state, Tensor* grad_gate,
Tensor* grad_weight_hh, Tensor* grad_pre_hidden,
Tensor* grad_pre_state, Tensor* grad_gate_buf,
Tensor* grad_bias_hh, const Tensor& mask_tensor,
Tensor* grad_pre_state, Tensor* grad_bias_hh,
const Tensor& mask_tensor,
bool has_sequence_length) const {}
void postprocess_pre_hidden_grad(const framework::ExecutionContext& context,
Tensor* grad_pre_hidden,
Tensor* grad_pre_hidden_bak,
Tensor* grad_pre_state,
Tensor* grad_pre_state_bak,
const Tensor& mask_tensor,
bool has_sequence_length) const {
if (has_sequence_length) {
auto& place =
*context.template device_context<platform::CPUDeviceContext>()
.eigen_device();
auto mask = framework::EigenMatrix<T>::From(
mask_tensor, framework::make_ddim({mask_tensor.dims()[1], 1}));
auto mask_broadcast =
mask.broadcast(Eigen::DSizes<int, 2>(1, grad_pre_hidden->dims()[2]));
auto pre_hidden_grad = framework::EigenMatrix<T>::Reshape(
*grad_pre_hidden, grad_pre_hidden->dims().size() - 1);
auto pre_hidden_bak_grad = framework::EigenMatrix<T>::Reshape(
*grad_pre_hidden_bak, grad_pre_hidden_bak->dims().size() - 1);
pre_hidden_grad.device(place) =
(1 - mask_broadcast) * pre_hidden_bak_grad +
pre_hidden_grad * mask_broadcast;
if (grad_pre_state) {
auto pre_state_grad = framework::EigenMatrix<T>::Reshape(
*grad_pre_state, grad_pre_state->dims().size() - 1);
auto pre_state_bak_grad = framework::EigenMatrix<T>::Reshape(
*grad_pre_state_bak, grad_pre_state_bak->dims().size() - 1);
pre_state_grad.device(place) =
(1 - mask_broadcast) * pre_state_bak_grad +
pre_state_grad * mask_broadcast;
}
}
}
virtual void update_pre_hidden_grad(
const framework::ExecutionContext& context, Tensor* grad_gate,
const Tensor* weight_hh, Tensor* grad_pre_hidden,
Tensor* grad_pre_hidden_bak, Tensor* grad_pre_state,
Tensor* grad_pre_state_bak, Tensor* grad_gate_buf,
const Tensor& mask_tensor, bool has_sequence_length) const {
Tensor* grad_pre_state_bak, const Tensor& mask_tensor,
bool has_sequence_length) const {
auto& device_ctx =
context.template device_context<platform::CPUDeviceContext>();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(device_ctx);
T beta = 0;
Tensor* grad_gate_tmp = grad_gate;
if (is_gru(context)) {
beta = 1.0;
grad_gate_tmp = grad_gate_buf;
}
auto mat_dim_a =
math::CreateMatrixDescriptor(grad_gate_tmp->dims(), 0, false);
mat_dim_a.height_ *= mat_dim_a.batch_size_;
mat_dim_a.batch_size_ = 0;
auto mat_dim_b = math::CreateMatrixDescriptor(weight_hh->dims(), 0, false);
blas.MatMul(*grad_gate_tmp, mat_dim_a, *weight_hh, mat_dim_b,
static_cast<T>(1.0), grad_pre_hidden, beta);
if (has_sequence_length) {
auto& place =
*context.template device_context<platform::CPUDeviceContext>()
.eigen_device();
auto eigen_mask = framework::EigenMatrix<T>::From(
mask_tensor, framework::make_ddim({mask_tensor.dims()[1], 1}));
auto eigen_mask_broadcast = eigen_mask.broadcast(
Eigen::DSizes<int, 2>(1, grad_pre_hidden->dims()[2]));
auto eigen_grad_pre_hidden = framework::EigenMatrix<T>::Reshape(
*grad_pre_hidden, grad_pre_hidden->dims().size() - 1);
auto eigen_grad_pre_hidden_bak = framework::EigenMatrix<T>::Reshape(
*grad_pre_hidden_bak, grad_pre_hidden_bak->dims().size() - 1);
eigen_grad_pre_hidden.device(place) =
(1 - eigen_mask_broadcast) * eigen_grad_pre_hidden_bak +
eigen_grad_pre_hidden * eigen_mask_broadcast;
if (grad_pre_state) {
auto eigen_grad_pre_state = framework::EigenMatrix<T>::Reshape(
*grad_pre_state, grad_pre_state->dims().size() - 1);
auto eigen_grad_pre_state_bak = framework::EigenMatrix<T>::Reshape(
*grad_pre_state_bak, grad_pre_state_bak->dims().size() - 1);
eigen_grad_pre_state.device(place) =
(1 - eigen_mask_broadcast) * eigen_grad_pre_state_bak +
eigen_grad_pre_state * eigen_mask_broadcast;
}
}
static_cast<T>(1.0), grad_pre_hidden, 0);
postprocess_pre_hidden_grad(context, grad_pre_hidden, grad_pre_hidden_bak,
grad_pre_state, grad_pre_state_bak, mask_tensor,
has_sequence_length);
}
virtual void update_weight_hh_grad(const framework::ExecutionContext& context,
Tensor* grad_gate, Tensor* pre_hidden,
Tensor* grad_weight_hh,
Tensor* grad_gate_buf) const {
Tensor* grad_weight_hh) const {
auto& device_ctx =
context.template device_context<platform::CPUDeviceContext>();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(device_ctx);
......@@ -1667,11 +1605,7 @@ struct GradCell {
auto mat_dim_d = math::CreateMatrixDescriptor(pre_hidden->dims(), 0, false);
mat_dim_d.height_ *= mat_dim_d.batch_size_;
mat_dim_d.batch_size_ = 0;
Tensor* grad_gate_tmp = grad_gate;
if (is_gru(context)) {
grad_gate_tmp = grad_gate_buf;
}
blas.MatMul(*grad_gate_tmp, mat_dim_c, *pre_hidden, mat_dim_d,
blas.MatMul(*grad_gate, mat_dim_c, *pre_hidden, mat_dim_d,
static_cast<T>(1.0), grad_weight_hh, static_cast<T>(1.0));
}
};
......@@ -1685,8 +1619,7 @@ struct SimpleRNNGradCell : GradCell<T> {
Tensor* pre_state, Tensor* grad_hidden, Tensor* grad_state,
Tensor* grad_gate, Tensor* grad_weight_hh,
Tensor* grad_pre_hidden, Tensor* grad_pre_state,
Tensor* grad_gate_buf, Tensor* grad_bias_hh,
const Tensor& mask_tensor,
Tensor* grad_bias_hh, const Tensor& mask_tensor,
bool has_sequence_length) const override {
auto& device_ctx =
context.template device_context<platform::CPUDeviceContext>();
......@@ -1711,11 +1644,10 @@ struct SimpleRNNGradCell : GradCell<T> {
functor(*place, z, h, dh, dz);
// update grad_weight_hh, grad_pre_hidden
this->update_pre_hidden_grad(
context, grad_gate, weight_hh, grad_pre_hidden, &grad_pre_hidden_bak,
nullptr, nullptr, grad_gate_buf, mask_tensor, has_sequence_length);
this->update_weight_hh_grad(context, grad_gate, pre_hidden, grad_weight_hh,
grad_gate_buf);
this->update_pre_hidden_grad(context, grad_gate, weight_hh, grad_pre_hidden,
&grad_pre_hidden_bak, nullptr, nullptr,
mask_tensor, has_sequence_length);
this->update_weight_hh_grad(context, grad_gate, pre_hidden, grad_weight_hh);
}
};
......@@ -1728,8 +1660,7 @@ struct GRUGradCell : GradCell<T> {
Tensor* pre_state, Tensor* grad_hidden, Tensor* grad_state,
Tensor* grad_gate, Tensor* grad_weight_hh,
Tensor* grad_pre_hidden, Tensor* grad_pre_state,
Tensor* grad_gate_buf, Tensor* grad_bias_hh,
const Tensor& mask_tensor,
Tensor* grad_bias_hh, const Tensor& mask_tensor,
bool has_sequence_length) const override {
auto& device_ctx =
context.template device_context<platform::CPUDeviceContext>();
......@@ -1747,6 +1678,8 @@ struct GRUGradCell : GradCell<T> {
gru_value.gate_value = gate_tensor->data<T>();
gru_value.prev_out_value = pre_hidden->data<T>();
gru_value.reset_output_value = state_tensor->data<T>();
gru_value.state_weight = weight_hh->data<T>() + 2 * frame_size * frame_size;
gru_value.gate_weight = weight_hh->data<T>();
gru_grad.gate_grad = grad_gate->data<T>();
gru_grad.reset_output_grad = grad_state->data<T>();
......@@ -1755,7 +1688,7 @@ struct GRUGradCell : GradCell<T> {
gru_grad.gate_weight_grad = grad_weight_hh->data<T>();
gru_grad.state_weight_grad =
grad_weight_hh->data<T>() + 2 * frame_size * frame_size;
gru_grad.state_bias_grad = grad_bias_hh->data<T>() + 2 * frame_size;
gru_grad.bias_hh_grad = grad_bias_hh->data<T>();
auto act_gate = math::detail::GetActivationType("sigmoid_v2");
auto act_node = math::detail::GetActivationType("tanh_v2");
......@@ -1763,13 +1696,9 @@ struct GRUGradCell : GradCell<T> {
device_ctx, gru_value, gru_grad, frame_size, batch_size, act_node,
act_gate);
make_grad_gate_buf<T>(context, grad_gate, grad_gate_buf, grad_state);
this->update_pre_hidden_grad(
context, grad_gate, weight_hh, grad_pre_hidden, &grad_pre_hidden_bak,
nullptr, nullptr, grad_gate_buf, mask_tensor, has_sequence_length);
this->update_weight_hh_grad(context, grad_gate, pre_hidden, grad_weight_hh,
grad_gate_buf);
this->postprocess_pre_hidden_grad(context, grad_pre_hidden,
&grad_pre_hidden_bak, nullptr, nullptr,
mask_tensor, has_sequence_length);
}
};
......@@ -1782,8 +1711,7 @@ struct LSTMGradCell : GradCell<T> {
Tensor* pre_state, Tensor* grad_hidden, Tensor* grad_state,
Tensor* grad_gate, Tensor* grad_weight_hh,
Tensor* grad_pre_hidden, Tensor* grad_pre_state,
Tensor* grad_gate_buf, Tensor* grad_bias_hh,
const Tensor& mask_tensor,
Tensor* grad_bias_hh, const Tensor& mask_tensor,
bool has_sequence_length) const override {
auto& device_ctx =
context.template device_context<platform::CPUDeviceContext>();
......@@ -1822,12 +1750,10 @@ struct LSTMGradCell : GradCell<T> {
math::LstmUnitGradFunctor<platform::CPUDeviceContext, T>::compute(
device_ctx, lstm_value, lstm_grad, frame_size, batch_size, cell_clip,
gate_act, state_act, cand_act, false);
this->update_pre_hidden_grad(context, grad_gate, weight_hh, grad_pre_hidden,
&grad_pre_hidden_bak, grad_pre_state,
&grad_pre_state_bak, grad_gate_buf,
mask_tensor, has_sequence_length);
this->update_weight_hh_grad(context, grad_gate, pre_hidden, grad_weight_hh,
grad_gate_buf);
this->update_pre_hidden_grad(
context, grad_gate, weight_hh, grad_pre_hidden, &grad_pre_hidden_bak,
grad_pre_state, &grad_pre_state_bak, mask_tensor, has_sequence_length);
this->update_weight_hh_grad(context, grad_gate, pre_hidden, grad_weight_hh);
}
};
......@@ -2001,7 +1927,12 @@ void RnnGradFunc(const framework::ExecutionContext& context,
for (int i = num_layers - 1; i >= 0; --i) {
// the layer input output had saved, just use the data
if (i > 0) {
layer_input.ShareDataWith(hidden_tensor_unbind[i - 1]);
if (layer_input.numel() == 0) {
layer_input.Resize(hidden_tensor_unbind[i - 1].dims());
layer_input.mutable_data<T>(context.GetPlace());
}
dropout_helper<T>(context, &hidden_tensor_unbind[i - 1], &layer_input,
dropout_state, dropout_prob);
} else {
layer_input.ShareDataWith(*input);
}
......
......@@ -294,7 +294,6 @@ def unstack(array, axis=0):
def dropout(array, p=0.5):
if p == 0.0:
return array
mask = (np.random.uniform(size=array.shape) < (1 - p)).astype(array.dtype)
return array * (mask / (1 - p))
......@@ -390,11 +389,12 @@ class RNNMixin(LayerListMixin):
states = split_states(initial_states, self.num_directions == 2,
self.state_components)
final_states = []
input_temp = inputs
for i, rnn_layer in enumerate(self):
if i > 0:
inputs = dropout(inputs, self.dropout)
outputs, final_state = rnn_layer(inputs, states[i], sequence_length)
input_temp = dropout(inputs, self.dropout)
outputs, final_state = rnn_layer(input_temp, states[i],
sequence_length)
final_states.append(final_state)
inputs = outputs
......
......@@ -53,6 +53,7 @@ class TestRNNOp(OpTest):
self.is_bidirec = False
self.mode = "LSTM"
self.is_test = False
self.dropout = 0.0
self.set_attrs()
self.direction_num = 2 if self.is_bidirec else 1
......@@ -76,7 +77,8 @@ class TestRNNOp(OpTest):
hidden_size,
num_layers=self.num_layers,
time_major=True,
direction=direction)
direction=direction,
dropout=self.dropout)
flat_w = get_params_for_net(rnn1)
output, (last_hidden, last_cell) = rnn1(
......@@ -101,7 +103,7 @@ class TestRNNOp(OpTest):
'PreState': [('init_h', init_h), ('init_c', init_c)],
}
self.attrs = {
'dropout_prob': 0.0,
'dropout_prob': self.dropout,
'is_bidirec': self.is_bidirec,
'input_size': input_size,
'hidden_size': hidden_size,
......