Unverified commit 085260f3 authored by Jack Zhou, committed by GitHub

Add eigen gru and fix the dropout bug in the rnn

Parent 545df287
@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/gru_compute.h"
@@ -21,6 +23,10 @@ namespace paddle {
namespace operators {
namespace math {
namespace detail {
using Array1 = Eigen::DSizes<int64_t, 1>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
#ifndef __NVCC__
@@ -242,23 +248,46 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
#endif
}
template <typename T>
inline void forward_reset_outputV2(const platform::CPUDeviceContext &context,
GRUMetaValue<T> value, int frame_size) {
auto &place = *context.eigen_device();
auto value_reset_gate =
typename EigenVector<T>::Type(value.gate_value, Array1(frame_size));
auto value_update_gate = typename EigenVector<T>::Type(
value.gate_value + frame_size, Array1(frame_size));
auto value_reset_output = typename EigenVector<T>::Type(
value.reset_output_value, Array1(frame_size));
auto value_reset_bias =
typename EigenVector<T>::ConstType(value.reset_bias, Array1(frame_size));
SigmoidFunctor<T>()(place, value_reset_gate, value_reset_gate);
SigmoidFunctor<T>()(place, value_update_gate, value_update_gate);
value_reset_output.device(place) =
(value_reset_output + value_reset_bias) * value_reset_gate;
}
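As a reading aid: forward_reset_outputV2 evaluates, per frame (my notation, not from the source — g_r and g_z are the pre-activation reset/update gate slices of value.gate_value, \hat{o} is value.reset_output_value and b_{hh,c} is value.reset_bias),

\[
r = \sigma(g_r), \qquad z = \sigma(g_z), \qquad \hat{o} \leftarrow (\hat{o} + b_{hh,c}) \odot r .
\]

Both gates are overwritten in place with their activated values, which is what lets the Eigen backward pass further below read r and z straight out of value.gate_value.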
template <class OpResetOutput, typename T>
inline void forward_reset_output(OpResetOutput op_reset_output,
GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_gate,
bool old_version = true) {
inline void forward_reset_output(
OpResetOutput op_reset_output, GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_gate, bool old_version = true,
const platform::CPUDeviceContext *context = nullptr) {
for (int b = 0; b < batch_size; b++) {
if (OpResetOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
(sizeof(T) == 4)) {
hl_avx_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate, old_version,
value.reset_bias);
if (!old_version) {
// use eigen
forward_reset_outputV2(*context, value, frame_size);
} else {
hl_naive_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate, old_version,
value.reset_bias);
if (OpResetOutput::avx && (frame_size & static_cast<int>(8 - 1)) &&
(sizeof(T) == 4)) {
hl_avx_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate, old_version,
value.reset_bias);
} else {
hl_naive_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate, old_version,
value.reset_bias);
}
}
value.gate_value += frame_size * 3;
value.reset_output_value += frame_size;
@@ -268,25 +297,51 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
}
}
template <typename T>
inline void forward_final_outputV2(const platform::CPUDeviceContext &context,
GRUMetaValue<T> value, int frame_size) {
auto &place = *context.eigen_device();
auto value_update_gate = typename EigenVector<T>::Type(
value.gate_value + frame_size, Array1(frame_size));
auto value_frame_state = typename EigenVector<T>::Type(
value.gate_value + 2 * frame_size, Array1(frame_size));
auto value_output =
typename EigenVector<T>::Type(value.output_value, Array1(frame_size));
TanhFunctor<T>()(place, value_frame_state, value_frame_state);
value_output.device(place) =
(static_cast<T>(1.0) - value_update_gate) * value_frame_state;
if (value.prev_out_value) {
auto value_prev_out = typename EigenVector<T>::ConstType(
value.prev_out_value, Array1(frame_size));
value_output.device(place) =
value_output + value_update_gate * value_prev_out;
}
}
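forward_final_outputV2 then finishes the cell (same assumed notation; c is the frame state / candidate and h_{t-1} is value.prev_out_value):

\[
c = \tanh(g_c), \qquad h_t = (1 - z) \odot c + z \odot h_{t-1},
\]

with the z \odot h_{t-1} term skipped when value.prev_out_value is null, i.e. at the first time step.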
template <class OpFinalOutput, typename T>
inline void forward_final_output(OpFinalOutput op_final_output,
GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node,
bool origin_mode, bool old_version = true) {
inline void forward_final_output(
OpFinalOutput op_final_output, GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node, bool origin_mode,
bool old_version = true,
const platform::CPUDeviceContext *context = nullptr) {
for (int b = 0; b < batch_size; b++) {
if (OpFinalOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
(sizeof(T) == 4)) {
hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
value.prev_out_value, value.output_value,
frame_size, active_node, origin_mode,
old_version);
if (!old_version) {
// eigen
forward_final_outputV2(*context, value, frame_size);
} else {
hl_naive_gru_forward_final_output(op_final_output, value.gate_value,
if (OpFinalOutput::avx && (frame_size & static_cast<int>(8 - 1)) &&
(sizeof(T) == 4)) {
hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
value.prev_out_value,
value.output_value, frame_size,
active_node, origin_mode, old_version);
} else {
hl_naive_gru_forward_final_output(
op_final_output, value.gate_value, value.prev_out_value,
value.output_value, frame_size, active_node, origin_mode,
old_version);
}
}
value.gate_value += frame_size * 3;
value.output_value += frame_size;
if (value.prev_out_value) {
@@ -664,23 +719,70 @@ inline void backward_reset_grad(OpResetGrad op_reset_grad,
}
}
template <typename T>
inline void gru_backward(const platform::CPUDeviceContext &context,
GRUMetaValue<T> value, GRUMetaGrad<T> grad,
int frame_size) {
auto &place = *context.eigen_device();
auto value_reset_gate =
typename EigenVector<T>::Type(value.gate_value, Array1(frame_size));
auto grad_reset_gate =
typename EigenVector<T>::Type(grad.gate_grad, Array1(frame_size));
auto value_update_gate = typename EigenVector<T>::Type(
value.gate_value + frame_size, Array1(frame_size));
auto grad_update_gate = typename EigenVector<T>::Type(
grad.gate_grad + frame_size, Array1(frame_size));
auto value_frame_state = typename EigenVector<T>::Type(
value.gate_value + frame_size * 2, Array1(frame_size));
auto grad_frame_state = typename EigenVector<T>::Type(
grad.gate_grad + frame_size * 2, Array1(frame_size));
auto grad_output =
typename EigenVector<T>::Type(grad.output_grad, Array1(frame_size));
auto value_reset_output = typename EigenVector<T>::Type(
value.reset_output_value, Array1(frame_size));
auto grad_reset_output =
typename EigenVector<T>::Type(grad.reset_output_grad, Array1(frame_size));
if (value.prev_out_value) {
auto value_prev_out = typename EigenVector<T>::ConstType(
value.prev_out_value, Array1(frame_size));
SigmoidGradFunctor<T>()(place, 1 /*useless*/, value_update_gate,
(value_prev_out - value_frame_state) * grad_output,
grad_update_gate);
} else {
SigmoidGradFunctor<T>()(
place, 1 /*useless*/, value_update_gate,
static_cast<T>(-1) * value_frame_state * grad_output, grad_update_gate);
}
if (grad.prev_out_grad) {
auto grad_prev_out =
typename EigenVector<T>::Type(grad.prev_out_grad, Array1(frame_size));
grad_prev_out.device(place) =
grad_prev_out + grad_output * value_update_gate;
}
TanhGradFunctor<T>()(place, 1 /*useless*/, value_frame_state,
grad_output * (static_cast<T>(1.0) - value_update_gate),
grad_frame_state);
SigmoidGradFunctor<T>()(
place, 1 /*useless*/, value_reset_gate,
value_reset_output / value_reset_gate * grad_frame_state,
grad_reset_gate);
if (value.prev_out_value && grad.prev_out_grad) {
grad_reset_output.device(place) = value_reset_gate * grad_frame_state;
}
}
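The Eigen backward above differentiates those two formulas directly (again my notation; dh = grad.output_grad, d\hat{o} = grad.reset_output_grad, and the Sigmoid/Tanh grad functors are evaluated at the stored activated values):

\[
dz = z(1 - z)\,(h_{t-1} - c) \odot dh, \qquad dh_{t-1} \mathrel{+}= z \odot dh,
\]
\[
dc = (1 - c^2)\,(1 - z) \odot dh, \qquad dr = r(1 - r)\,\frac{\hat{o}}{r} \odot dc, \qquad d\hat{o} = r \odot dc,
\]

where \hat{o}/r recovers the biased reset output before the gate was applied (the forward pass stored only (\hat{o} + b_{hh,c}) \odot r), h_{t-1} is taken as zero when there is no previous step, and d\hat{o} is written only when both prev_out_value and prev_out_grad exist.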
template <class OpGruGrad, typename T>
inline void cpu_gru_backward(OpGruGrad op_gru_grad, GRUMetaValue<T> value,
inline void cpu_gru_backward(const platform::CPUDeviceContext &context,
OpGruGrad op_gru_grad, GRUMetaValue<T> value,
GRUMetaGrad<T> grad, int frame_size,
int batch_size, ActivationType active_node,
ActivationType active_gate) {
for (int b = 0; b < batch_size; ++b) {
if (OpGruGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward(
op_gru_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
grad.prev_out_grad, value.reset_output_value, grad.reset_output_grad,
grad.output_grad, frame_size, active_node, active_gate);
} else {
hl_naive_gru_backward(
op_gru_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
grad.prev_out_grad, value.reset_output_value, grad.reset_output_grad,
grad.output_grad, frame_size, active_node, active_gate);
}
// eigen
gru_backward(context, value, grad, frame_size);
value.gate_value += frame_size * 3;
value.reset_output_value += frame_size;
......
@@ -42,7 +42,8 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
}
detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
frame_size, batch_size, active_gate);
frame_size, batch_size, active_gate, true,
&context);
if (value.prev_out_value) {
blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
@@ -53,7 +54,7 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
frame_size, batch_size, active_node,
origin_mode);
origin_mode, &context);
#endif
}
};
@@ -116,7 +117,8 @@ struct GRUUnitFunctorV2<platform::CPUDeviceContext, T> {
value.reset_output_value);
}
detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
frame_size, batch_size, active_gate, false);
frame_size, batch_size, active_gate, false,
&context);
T *cell_state_value = value.gate_value + 2 * frame_size;
T *reset_output_value = value.reset_output_value;
@@ -129,7 +131,7 @@ struct GRUUnitFunctorV2<platform::CPUDeviceContext, T> {
detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
frame_size, batch_size, active_node, true,
false);
false, &context);
#endif
}
};
@@ -144,8 +146,50 @@ struct GRUUnitGradFunctorV2<platform::CPUDeviceContext, T> {
#ifndef __NVCC__
// calculate grad_update_gate, grad_frame_state,
// grad_reset_output, grad_reset_gate
detail::cpu_gru_backward(detail::backward::gru<T>(), value, grad,
detail::cpu_gru_backward(context, detail::backward::gru<T>(), value, grad,
frame_size, batch_size, active_node, active_gate);
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
if (grad.prev_out_grad && value.prev_out_value) {
// update prev_out_grad
blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
grad.gate_grad, frame_size * 3, value.gate_weight, frame_size,
1, grad.prev_out_grad, frame_size);
blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
grad.gate_grad + frame_size, frame_size * 3,
value.gate_weight + frame_size * frame_size, frame_size, 1,
grad.prev_out_grad, frame_size);
blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
grad.reset_output_grad, frame_size, value.state_weight,
frame_size, 1, grad.prev_out_grad, frame_size);
// update weight_hh_grad
if (grad.gate_weight_grad) {
// reset gate
blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
grad.gate_grad, frame_size * 3, value.prev_out_value,
frame_size, 1, grad.gate_weight_grad, frame_size);
// update gate
blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
grad.gate_grad + frame_size, frame_size * 3,
value.prev_out_value, frame_size, 1,
grad.gate_weight_grad + frame_size * frame_size, frame_size);
// cell state
blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
grad.reset_output_grad, frame_size, value.prev_out_value,
frame_size, 1, grad.state_weight_grad, frame_size);
}
}
// update bias_hh_grad
T *gate_grad = grad.gate_grad;
T *bias_hh_grad = grad.bias_hh_grad;
T *state_bias_grad = grad.bias_hh_grad + 2 * frame_size;
T *reset_output_grad = grad.reset_output_grad;
for (int b = 0; b < batch_size; ++b) {
blas.VADD(2 * frame_size, bias_hh_grad, gate_grad, bias_hh_grad);
blas.VADD(frame_size, state_bias_grad, reset_output_grad,
state_bias_grad);
gate_grad += 3 * frame_size;
reset_output_grad += frame_size;
}
#endif
}
};
......
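For reference, the three GEMMs into grad.prev_out_grad and the three weight-gradient GEMMs above compute (assumed layout from the pointer arithmetic: value.gate_weight holds W_{hr} followed by W_{hz}, value.state_weight holds W_{hc}):

\[
dh_{t-1} \mathrel{+}= dg_r W_{hr} + dg_z W_{hz} + d\hat{o}\, W_{hc},
\]
\[
dW_{hr} \mathrel{+}= dg_r^{\top} h_{t-1}, \qquad dW_{hz} \mathrel{+}= dg_z^{\top} h_{t-1}, \qquad dW_{hc} \mathrel{+}= d\hat{o}^{\top} h_{t-1},
\]

and the final VADD loop reduces dg_r, dg_z over the batch into the first 2·frame_size entries of bias_hh_grad and d\hat{o} into its last frame_size entries.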
@@ -38,7 +38,7 @@ struct GRUMetaGrad {
T *reset_output_grad;
T *output_grad;
T *prev_out_grad;
T *state_bias_grad;
T *bias_hh_grad;
};
template <typename DeviceContext, typename T>
......
This diff has been collapsed.
@@ -294,7 +294,6 @@ def unstack(array, axis=0):
def dropout(array, p=0.5):
if p == 0.0:
return array
mask = (np.random.uniform(size=array.shape) < (1 - p)).astype(array.dtype)
return array * (mask / (1 - p))
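The reference dropout above uses inverted scaling (divide by 1 - p at train time), so the expected value of the output equals the input. A standalone sanity-check sketch of that property, reusing the same function (not part of the test utilities):

import numpy as np

def dropout(array, p=0.5):
    if p == 0.0:
        return array
    # keep each element with probability 1 - p, then rescale so E[output] == array
    mask = (np.random.uniform(size=array.shape) < (1 - p)).astype(array.dtype)
    return array * (mask / (1 - p))

x = np.ones((100000,), dtype="float64")
# the mean stays near 1.0 because surviving elements are scaled up by 1 / (1 - p)
print(abs(dropout(x, p=0.3).mean() - 1.0) < 0.01)  # True with overwhelming probability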
@@ -390,11 +389,12 @@ class RNNMixin(LayerListMixin):
states = split_states(initial_states, self.num_directions == 2,
self.state_components)
final_states = []
input_temp = inputs
for i, rnn_layer in enumerate(self):
if i > 0:
inputs = dropout(inputs, self.dropout)
outputs, final_state = rnn_layer(inputs, states[i], sequence_length)
input_temp = dropout(inputs, self.dropout)
outputs, final_state = rnn_layer(input_temp, states[i],
sequence_length)
final_states.append(final_state)
inputs = outputs
......
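A minimal sketch of the per-layer dropout pattern the RNNMixin fix settles on (hypothetical stand-ins rnn_layers and dropout_fn; only the data flow is the point — each layer after the first consumes a freshly dropped-out copy of the previous layer's output, while the undropped outputs are what get stored and returned):

def stacked_rnn_forward(rnn_layers, inputs, dropout_p, dropout_fn):
    for i, layer in enumerate(rnn_layers):
        # apply dropout to a temporary so the dropped tensor is never stored back into `inputs`
        layer_in = inputs if i == 0 else dropout_fn(inputs, dropout_p)
        outputs = layer(layer_in)
        inputs = outputs
    return outputs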
@@ -53,6 +53,7 @@ class TestRNNOp(OpTest):
self.is_bidirec = False
self.mode = "LSTM"
self.is_test = False
self.dropout = 0.0
self.set_attrs()
self.direction_num = 2 if self.is_bidirec else 1
@@ -76,7 +77,8 @@
hidden_size,
num_layers=self.num_layers,
time_major=True,
direction=direction)
direction=direction,
dropout=self.dropout)
flat_w = get_params_for_net(rnn1)
output, (last_hidden, last_cell) = rnn1(
@@ -101,7 +103,7 @@
'PreState': [('init_h', init_h), ('init_c', init_c)],
}
self.attrs = {
'dropout_prob': 0.0,
'dropout_prob': self.dropout,
'is_bidirec': self.is_bidirec,
'input_size': input_size,
'hidden_size': hidden_size,
......