From 3e1b914fcb00befcb6ef5f337235f3acf9d374b5 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Mon, 7 Jan 2019 09:55:36 +0800
Subject: [PATCH] update gru op forward kernel

---
 paddle/fluid/operators/gru_op.cc              |  9 ++++--
 paddle/fluid/operators/gru_op.cu.cc           |  3 +-
 paddle/fluid/operators/gru_op.h               |  3 +-
 .../operators/math/detail/gru_cpu_kernel.h    | 19 +++++++-----
 .../operators/math/detail/gru_gpu_kernel.h    |  5 ++--
 .../fluid/operators/math/detail/gru_kernel.h  | 29 ++++++++++++++-----
 paddle/fluid/operators/math/gru_compute.cc    |  9 ++++--
 paddle/fluid/operators/math/gru_compute.cu    |  7 +++--
 paddle/fluid/operators/math/gru_compute.h     |  6 ++--
 9 files changed, 61 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
index 087f903a8..752d706cb 100644
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -137,6 +137,10 @@ class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, defalut: False) "
                   "whether to compute reversed GRU.")
         .SetDefault(false);
+    AddAttr<bool>("origin_mode",
+                  "bool"
+                  "use origin mode in article https://arxiv.org/abs/1412.3555")
+        .SetDefault(false);
     AddComment(R"DOC(
 GRU Operator implements part calculations of the complete GRU as following:
 
@@ -221,6 +225,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
     using DeviceContext = paddle::platform::CPUDeviceContext;
+    bool origin_mode = context.Attr<bool>("origin_mode");
     auto* input = context.Input<LoDTensor>("Input");
     auto* h0 = context.Input<Tensor>("H0");
     auto* weight = context.Input<Tensor>("Weight");
@@ -327,7 +332,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
 
         math::detail::forward_final_output(
             math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
-            cur_batch_size, active_node);
+            cur_batch_size, active_node, origin_mode);
 
         gru_value.prev_out_value = gru_value.output_value;
       }
@@ -351,7 +356,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
 
       math::GRUUnitFunctor<DeviceContext, T>::compute(
           dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-          active_gate);
+          active_gate, origin_mode);
 
       gru_value.prev_out_value = gru_value.output_value;
     }
diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc
index 55721c283..ba918b3de 100644
--- a/paddle/fluid/operators/gru_op.cu.cc
+++ b/paddle/fluid/operators/gru_op.cu.cc
@@ -21,6 +21,7 @@ template <typename DeviceContext, typename T>
 class GRUKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
+    bool origin_mode = context.Attr<bool>("origin_mode");
     auto* input = context.Input<LoDTensor>("Input");
     auto* h0 = context.Input<Tensor>("H0");
     auto* weight = context.Input<Tensor>("Weight");
@@ -87,7 +88,7 @@ class GRUKernel : public framework::OpKernel<T> {
       gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
       math::GRUUnitFunctor<DeviceContext, T>::compute(
           dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-          active_gate);
+          active_gate, origin_mode);
 
       gru_value.prev_out_value = gru_value.output_value;
     }
diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h
index 0b551e804..45c769ee3 100644
--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
@@ -41,6 +41,7 @@ template <typename DeviceContext, typename T>
 class GRUGradKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
+    bool origin_mode = context.Attr<bool>("origin_mode");
     auto* h0 = context.Input<Tensor>("H0");
     auto* weight = context.Input<Tensor>("Weight");
     const T* weight_data = weight->data<T>();
@@ -146,7 +147,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
 
       math::GRUUnitGradFunctor<DeviceContext, T>::compute(
           dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node,
-          active_gate);
+          active_gate, origin_mode);
     }
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
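
For reference, the attribute added above switches between two GRU output
conventions. Reading gru_finalOutput (gru_kernel.h, later in this patch),
with update gate u_t, candidate state \tilde{h}_t and previous hidden state
h_{t-1}, the two blends are (a summary of the patched code, not part of the
diff itself):

    origin_mode == true  (arXiv:1412.3555):  h_t = u_t \odot h_{t-1} + (1 - u_t) \odot \tilde{h}_t
    origin_mode == false (Paddle default):   h_t = (1 - u_t) \odot h_{t-1} + u_t \odot \tilde{h}_t

The two conventions swap the role of the update gate, so weights trained
under one convention do not produce the same outputs under the other.
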
diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
index 47c771f7c..6e74e124f 100644
--- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
@@ -56,7 +56,8 @@ template <class OpFinalOutput, typename T>
 void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
                                        T *gate_value, T *prev_output_value,
                                        T *output_value, int frame_size,
-                                       ActivationType active_node) {
+                                       ActivationType active_node,
+                                       bool origin_mode) {
   T r_value_update_gate;
   T r_value_frame_state;
   T r_prev_out = 0;
@@ -72,7 +73,7 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
     }
 
     op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
-                    &r_output, active_node);
+                    &r_output, active_node, origin_mode);
 
     frame_state[i] = r_value_frame_state;
     output_value[i] = r_output;
@@ -146,7 +147,8 @@ template <class OpFinalOutput, typename T>
 void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
                                      T *gate_value, T *prev_output_value,
                                      T *output_value, int frame_size,
-                                     ActivationType active_node) {
+                                     ActivationType active_node,
+                                     bool origin_mode) {
 #ifdef __AVX__
   __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f);
   __m256 r_value_frame_state, r_value_frame_state_last = _mm256_set1_ps(0.0f);
@@ -180,7 +182,7 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
     }
 
     op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
-                    &r_output, active_node);
+                    &r_output, active_node, origin_mode);
 
     _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
                      r_value_frame_state);
@@ -190,7 +192,7 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
   if (rest > 0) {
     i = n - block;
     op_final_output(&r_value_update_gate_last, &r_value_frame_state_last,
-                    &r_prev_out_last, &r_output, active_node);
+                    &r_prev_out_last, &r_output, active_node, origin_mode);
 
     _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
                      r_value_frame_state_last);
@@ -227,17 +229,18 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
 template <class OpFinalOutput, typename T>
 inline void forward_final_output(OpFinalOutput op_final_output,
                                  GRUMetaValue<T> value, int frame_size,
-                                 int batch_size, ActivationType active_node) {
+                                 int batch_size, ActivationType active_node,
+                                 bool origin_mode) {
   for (int b = 0; b < batch_size; b++) {
     if (OpFinalOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
         (sizeof(T) == 4)) {
       hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
                                       value.prev_out_value, value.output_value,
-                                      frame_size, active_node);
+                                      frame_size, active_node, origin_mode);
     } else {
       hl_naive_gru_forward_final_output(
           op_final_output, value.gate_value, value.prev_out_value,
-          value.output_value, frame_size, active_node);
+          value.output_value, frame_size, active_node, origin_mode);
     }
 
     value.gate_value += frame_size * 3;
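
The naive CPU path above only threads origin_mode through to the gate functor,
so the per-element arithmetic is easiest to see in isolation. The following is
a minimal standalone sketch of what the naive path computes for each frame
element, with the activation left as identity for brevity; the gate layout
mirrors the kernel, but the function and names are illustrative, not Paddle's:

    #include <cstdio>
    #include <vector>

    // Sketch of hl_naive_gru_forward_final_output's per-element math.
    // Per-frame gate layout, as in the kernel above:
    //   [update_gate | reset_gate | frame_state (candidate)]
    void naive_final_output(const float* gate, const float* prev_out,
                            float* out, int frame_size, bool origin_mode) {
      const float* update_gate = gate;
      const float* frame_state = gate + frame_size * 2;
      for (int i = 0; i < frame_size; ++i) {
        float u = update_gate[i];
        float c = frame_state[i];
        float p = prev_out ? prev_out[i] : 0.0f;  // kernel defaults prev to 0
        // origin_mode: h = u * h_prev + (1 - u) * c   (arXiv:1412.3555)
        // default:     h = (1 - u) * h_prev + u * c
        out[i] = origin_mode ? u * p + c - u * c : p - u * p + u * c;
      }
    }

    int main() {
      const int frame_size = 2;
      std::vector<float> gate = {0.25f, 0.75f,   // update gate
                                 0.0f,  0.0f,    // reset gate (unused here)
                                 1.0f,  -1.0f};  // candidate frame state
      std::vector<float> prev = {0.5f, 0.5f}, out(frame_size);
      naive_final_output(gate.data(), prev.data(), out.data(), frame_size, true);
      std::printf("origin:  %.3f %.3f\n", out[0], out[1]);  // 0.875 0.125
      naive_final_output(gate.data(), prev.data(), out.data(), frame_size, false);
      std::printf("default: %.3f %.3f\n", out[0], out[1]);  // 0.625 -0.625
    }
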
diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
index 813d69f6a..8d133f532 100644
--- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
@@ -72,7 +72,8 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
                                         T *gate_value, T *prev_output_value,
                                         T *output_value, int frame_size,
                                         int batch_size,
-                                        ActivationType active_node) {
+                                        ActivationType active_node,
+                                        bool origin_mode) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (frame_idx >= frame_size) return;
   int batch_idx = 0;
@@ -94,7 +95,7 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
   }
 
   op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
-                  &r_output, active_node);
+                  &r_output, active_node, origin_mode);
 
   gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
   output_value[frame_idx] = r_output;
diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/fluid/operators/math/detail/gru_kernel.h
index f6d192358..d978bd95c 100644
--- a/paddle/fluid/operators/math/detail/gru_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_kernel.h
@@ -57,10 +57,16 @@ class gru_finalOutput {
  public:
   HOSTDEVICE void operator()(T *value_update_gate, T *value_frame_state,
                              T *prev_out, T *value_output,
-                             ActivationType act_input) {
+                             ActivationType act_input, bool origin_mode) {
     *value_frame_state = activation(*value_frame_state, act_input);
-    *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
-                    ((*value_update_gate) * (*value_frame_state));
+    if (origin_mode) {
+      *value_output = ((*value_update_gate) * (*prev_out)) +
+                      *value_frame_state -
+                      ((*value_update_gate) * (*value_frame_state));
+    } else {
+      *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
+                      ((*value_update_gate) * (*value_frame_state));
+    }
   }
 #ifndef __NVCC__
 #ifndef __AVX__
@@ -69,11 +75,20 @@ class gru_finalOutput {
   static const bool avx = true;
   HOSTDEVICE void operator()(__m256 *value_update_gate,
                              __m256 *value_frame_state, __m256 *prev_out,
-                             __m256 *value_output, ActivationType act_input) {
+                             __m256 *value_output, ActivationType act_input,
+                             bool origin_mode) {
     *value_frame_state = activation(*value_frame_state, act_input);
-    *value_output = _mm256_add_ps(
-        _mm256_sub_ps(*prev_out, _mm256_mul_ps(*value_update_gate, *prev_out)),
-        _mm256_mul_ps(*value_update_gate, *value_frame_state));
+    if (origin_mode) {
+      *value_output = _mm256_sub_ps(
+          _mm256_add_ps(_mm256_mul_ps(*value_update_gate, *prev_out),
+                        *value_frame_state),
+          _mm256_mul_ps(*value_update_gate, *value_frame_state));
+    } else {
+      *value_output = _mm256_add_ps(
+          _mm256_sub_ps(*prev_out,
+                        _mm256_mul_ps(*value_update_gate, *prev_out)),
+          _mm256_mul_ps(*value_update_gate, *value_frame_state));
+    }
   }
 #endif
 #endif
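
Since the _mm256 expressions above are only re-associations of the scalar
formulas, they can be sanity-checked off-line. A small sketch (compile with
-mavx; the identity activation and hand-picked lane values are assumptions;
this is not Paddle code) comparing the origin_mode intrinsic expression with
the scalar u * prev + (1 - u) * state, lane by lane:

    #include <immintrin.h>
    #include <cmath>
    #include <cstdio>

    int main() {
      float u[8], prev[8], state[8], out[8];
      for (int i = 0; i < 8; ++i) {
        u[i] = 0.1f * (i + 1);        // update gate in (0, 1)
        prev[i] = 1.0f - 0.1f * i;    // previous hidden state
        state[i] = 0.05f * i - 0.2f;  // activated candidate state
      }
      __m256 vu = _mm256_loadu_ps(u);
      __m256 vp = _mm256_loadu_ps(prev);
      __m256 vs = _mm256_loadu_ps(state);
      // (u * prev + state) - u * state, as in the origin_mode branch above.
      __m256 vo = _mm256_sub_ps(_mm256_add_ps(_mm256_mul_ps(vu, vp), vs),
                                _mm256_mul_ps(vu, vs));
      _mm256_storeu_ps(out, vo);
      for (int i = 0; i < 8; ++i) {
        float ref = u[i] * prev[i] + (1.0f - u[i]) * state[i];
        std::printf("lane %d: avx=%g ref=%g diff=%g\n", i, out[i], ref,
                    std::fabs(out[i] - ref));
      }
    }

Any nonzero diff comes only from the different rounding order of the two
algebraically identical expressions.
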
diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc
index 0e15b81de..295b75356 100644
--- a/paddle/fluid/operators/math/gru_compute.cc
+++ b/paddle/fluid/operators/math/gru_compute.cc
@@ -23,7 +23,8 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
   static void compute(const platform::CPUDeviceContext &context,
                       GRUMetaValue<T> value, int frame_size, int batch_size,
                       const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate) {
+                      const detail::ActivationType active_gate,
+                      bool origin_mode) {
 #ifndef __NVCC__
     auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
     if (value.prev_out_value) {
@@ -43,7 +44,8 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
     }
 
     detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
-                                 frame_size, batch_size, active_node);
+                                 frame_size, batch_size, active_node,
+                                 origin_mode);
 #endif
   }
 };
@@ -54,7 +56,8 @@ struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
                       GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                       int frame_size, int batch_size,
                       const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate) {
+                      const detail::ActivationType active_gate,
+                      bool origin_mode) {
 #ifndef __NVCC__
     detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
                                 grad, frame_size, batch_size, active_node);
diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu
index 1327d9149..e2c40b739 100644
--- a/paddle/fluid/operators/math/gru_compute.cu
+++ b/paddle/fluid/operators/math/gru_compute.cu
@@ -24,7 +24,8 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
   static void compute(const platform::CUDADeviceContext &context,
                       GRUMetaValue<T> value, int frame_size, int batch_size,
                       const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate) {
+                      const detail::ActivationType active_gate,
+                      bool origin_mode) {
     auto stream = context.stream();
     dim3 threads;
     dim3 grid;
@@ -73,14 +74,14 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
           T><<<grid, threads, 0, stream>>>(
           detail::forward::gru_finalOutput<T>(), value.gate_value,
           value.prev_out_value, value.output_value, frame_size, batch_size,
-          active_node);
+          active_node, origin_mode);
     } else {
       detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
                                       /* is_batch= */ true,
                                       T><<<grid, threads, 0, stream>>>(
           detail::forward::gru_finalOutput<T>(), value.gate_value,
           value.prev_out_value, value.output_value, frame_size, batch_size,
-          active_node);
+          active_node, origin_mode);
     }
   }
 };
diff --git a/paddle/fluid/operators/math/gru_compute.h b/paddle/fluid/operators/math/gru_compute.h
index c5816b16c..f5ddec0aa 100644
--- a/paddle/fluid/operators/math/gru_compute.h
+++ b/paddle/fluid/operators/math/gru_compute.h
@@ -44,7 +44,8 @@ struct GRUUnitFunctor {
   static void compute(const DeviceContext &context, GRUMetaValue<T> value,
                       int frame_size, int batch_size,
                       const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate);
+                      const detail::ActivationType active_gate,
+                      bool origin_mode);
 };
 
 template <typename DeviceContext, typename T>
@@ -52,7 +53,8 @@ struct GRUUnitGradFunctor {
   static void compute(const DeviceContext &context, GRUMetaValue<T> value,
                       GRUMetaGrad<T> grad, int frame_size, int batch_size,
                       const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate);
+                      const detail::ActivationType active_gate,
+                      bool origin_mode);
 };
 
 }  // namespace math
--
GitLab
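
Taken together, the patch leaves the gate and candidate computation untouched
and only makes the final blend configurable. A compact single-unit GRU step in
plain C++ showing where origin_mode enters; the scalar helper, names and toy
weights are illustrative, not Paddle's batched BLAS implementation:

    #include <cmath>
    #include <cstdio>

    float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

    // One hidden unit, one timestep. xu/xr/xc are input projections for the
    // update gate, reset gate and candidate; wu/wr/wc are recurrent weights.
    float gru_step(float h_prev, float xu, float xr, float xc, float wu,
                   float wr, float wc, bool origin_mode) {
      float u = sigmoid(xu + wu * h_prev);          // update gate
      float r = sigmoid(xr + wr * h_prev);          // reset gate
      float c = std::tanh(xc + wc * (r * h_prev));  // candidate (frame state)
      return origin_mode ? u * h_prev + (1.0f - u) * c   // arXiv:1412.3555
                         : (1.0f - u) * h_prev + u * c;  // Paddle default
    }

    int main() {
      float h = 0.0f, g = 0.0f;
      for (int t = 0; t < 3; ++t) {
        h = gru_step(h, 0.5f, 0.5f, 1.0f, 0.1f, 0.1f, 0.1f, true);
        g = gru_step(g, 0.5f, 0.5f, 1.0f, 0.1f, 0.1f, 0.1f, false);
        std::printf("t=%d  origin=%f  default=%f\n", t, h, g);
      }
    }

With origin_mode on, an update gate near 1 keeps the previous state; under the
default convention it does the opposite, which is why the attribute defaults
to false and preserves the behavior of existing Paddle models.
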