From 3462c29940ccf4e60f56f430757655d9c9676200 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 24 Aug 2018 14:53:45 +0800
Subject: [PATCH] refine add bias with avx

---
 paddle/fluid/operators/attention_lstm_op.cc | 30 +++-------
 paddle/fluid/operators/math/cpu_vec.h       | 66 +++++++++++++++++++--
 2 files changed, 69 insertions(+), 27 deletions(-)

diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index 1cb65346e..a73ea09f1 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -232,40 +232,28 @@ use lstm_x_t as input and compute as standard LSTM.
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
   if (bias) {
-    for (int i = 0; i < n; ++i) {
-      y[i] = x[i] + bias[0];
-    }
-    math::vec_relu<T>(n, y, y);
+    math::vec_add_bias<T, platform::jit::avx>(n, *bias, x, y);
+    math::vec_relu<T, platform::jit::avx>(n, y, y);
   } else {
-    math::vec_relu<T>(n, x, y);
+    math::vec_relu<T, platform::jit::avx>(n, x, y);
   }
 }
 
-template <typename DeviceContext, typename T>
-inline void vec_softmax(const math::BlasT<DeviceContext, T>& blas, const int n,
-                        const T* x, T* y) {
+template <typename T>
+inline void vec_softmax(const int n, const T* x, T* y) {
   T scalar = x[0];
   // max
   for (int i = 1; i < n; ++i) {
     scalar = scalar < x[i] ? x[i] : scalar;
   }
-
-  // sub
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] - scalar;
-  }
-
-  // exp
-  blas.VEXP(n, y, y);
-
+  math::vec_add_bias<T, platform::jit::avx>(n, -scalar, x, y);  // sub
+  math::vec_exp<T>(n, y, y);                                    // exp
   // sum
   scalar = T(0);
   for (int i = 0; i < n; ++i) {
     scalar += y[i];
   }
-
-  // scale
-  blas.SCAL(n, static_cast<T>(1) / scalar, y);
+  math::vec_scal<T>(n, static_cast<T>(1) / scalar, y);  // scale
 }
 
 template <typename T>
@@ -363,7 +351,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
                                           fc_out_data);
       }
       // 1d. softmax
-      vec_softmax<DeviceContext, T>(blas, seq_len, fc_out_data, fc_out_data);
+      vec_softmax<T>(seq_len, fc_out_data, fc_out_data);
       // mul x(seq_len*M) and sum pool
       math::FCCompute<DeviceContext, T>(blas, 1, M, seq_len, fc_out_data,
                                         cur_x_data, lstm_x_data);
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index d5f247e7e..0bae926e9 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -87,7 +87,7 @@ inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
                                                 const float* x, float* y) {
 #ifdef __AVX__
   constexpr int block = AVX_FLOAT_BLOCK;
-  if (n < block * 4) {  // use larger threshold, since small ones has no boost
+  if (n < block) {
     vec_scal<float, platform::jit::isa_any>(n, a, x, y);
     return;
   }
@@ -131,6 +131,62 @@ inline void vec_scal<float, platform::jit::avx512_common>(const int n,
   vec_scal<float, platform::jit::avx2>(n, a, x, y);
 }
 
+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
+                                                    const float* x, float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(a);
+  __m256 tmp;
+#define MOVE_ONE_STEP             \
+  tmp = _mm256_loadu_ps(x + i);   \
+  tmp = _mm256_add_ps(tmp, bias); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+#else
+  vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+#endif
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
+                                                     const float* x, float* y) {
+  vec_add_bias<float, platform::jit::avx>(n, a, x, y);
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx512_common>(const int n,
+                                                              const float a,
+                                                              const float* x,
+                                                              float* y) {
+  // TODO(TJ): enable me
+  vec_add_bias<float, platform::jit::avx2>(n, a, x, y);
+}
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_identity(const int n, const T* x, T* y) {
   // do nothing
@@ -229,11 +285,10 @@ inline void vec_tanh(const int n, const T* x, T* y) {
   vec_scal<T, isa>(n, static_cast<T>(2), x, y);
   vec_sigmoid<T, isa>(n, y, y);
   vec_scal<T>(n, static_cast<T>(2), y);
-  for (int i = 0; i < n; ++i) {
-    y[i] = y[i] - static_cast<T>(1);
-  }
+  vec_add_bias<T, isa>(n, static_cast<T>(-1), y, y);
 }
 
+// TODO(TJ): make relu clip
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_relu(const int n, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
@@ -246,7 +301,7 @@ inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
                                                 float* y) {
 #ifdef __AVX__
   constexpr int block = AVX_FLOAT_BLOCK;
-  if (n < block) {
+  if (n < block * 4) {
     vec_relu<float, platform::jit::isa_any>(n, x, y);
     return;
   }
@@ -288,7 +343,6 @@ inline void vec_relu<float, platform::jit::avx512_common>(const int n,
   // TODO(TJ): enable me
   vec_relu<float, platform::jit::avx2>(n, x, y);
 }
-// TODO(TJ): add vec add bias, make relu clip
 
 // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
 
-- 
GitLab
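Note on the tail handling in the new vec_add_bias<float, avx> specialization: the main loop covers n - (n % block) elements with 256-bit loads and stores, and the remainder falls back to a scalar loop because a final overlapping vector step would re-read elements already written when x and y alias, which is the in-place case the patch comment warns about. Below is a minimal free-standing sketch of the same pattern, not the patch's code; the name add_bias_avx and the hard-coded block of 8 floats are illustrative assumptions. Compile with -mavx.

#include <immintrin.h>
#include <cstdio>

// Hypothetical standalone version of the AVX add-bias pattern:
// 8 floats per __m256 step, scalar loop for the tail.
void add_bias_avx(const int n, const float a, const float* x, float* y) {
  const int block = 8;  // floats per 256-bit register (AVX_FLOAT_BLOCK)
  if (n < block) {
    for (int i = 0; i < n; ++i) y[i] = x[i] + a;
    return;
  }
  const int rest = n % block;
  const int end = n - rest;
  const __m256 bias = _mm256_set1_ps(a);
  for (int i = 0; i < end; i += block) {
    __m256 tmp = _mm256_loadu_ps(x + i);          // read x first
    _mm256_storeu_ps(y + i, _mm256_add_ps(tmp, bias));
  }
  // Scalar tail: an overlapping vector step at n - block would add the
  // bias twice to already-written elements when x == y, hence the fallback.
  for (int i = end; i < n; ++i) {
    y[i] = x[i] + a;
  }
}

int main() {
  float v[11];
  for (int i = 0; i < 11; ++i) v[i] = static_cast<float>(i);
  add_bias_avx(11, 0.5f, v, v);  // in place; n = 11 exercises the tail path
  for (int i = 0; i < 11; ++i) printf("%.1f ", v[i]);
  printf("\n");
  return 0;
}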
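The rewritten vec_softmax is the standard max-subtraction softmax expressed through the cpu_vec.h primitives: max reduce, vec_add_bias with -max, vec_exp, sum reduce, vec_scal with 1/sum. A scalar reference sketch of the same pipeline follows; the name softmax_ref is hypothetical. It shows why subtracting the max first matters: every exp() argument becomes <= 0, so large inputs cannot overflow.

#include <cmath>
#include <cstdio>

// Hypothetical scalar reference for the softmax pipeline the patch
// composes from vectorized primitives.
void softmax_ref(const int n, const float* x, float* y) {
  float scalar = x[0];
  for (int i = 1; i < n; ++i) scalar = scalar < x[i] ? x[i] : scalar;  // max
  for (int i = 0; i < n; ++i) y[i] = x[i] - scalar;                    // sub
  for (int i = 0; i < n; ++i) y[i] = std::exp(y[i]);                   // exp
  scalar = 0.f;
  for (int i = 0; i < n; ++i) scalar += y[i];                          // sum
  for (int i = 0; i < n; ++i) y[i] *= 1.f / scalar;                    // scale
}

int main() {
  const float x[4] = {1000.f, 1001.f, 1002.f, 1003.f};  // naive exp overflows
  float y[4];
  softmax_ref(4, x, y);
  for (int i = 0; i < 4; ++i) printf("%f ", y[i]);  // finite, sums to 1
  printf("\n");
  return 0;
}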
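The vec_tanh change relies on the identity tanh(x) = 2 * sigmoid(2x) - 1, with the trailing "- 1" now handled by the new vec_add_bias instead of a hand-written loop. A quick check of that identity, using a hypothetical helper name tanh_via_sigmoid:

#include <cmath>
#include <cstdio>
#include <initializer_list>

// Composes tanh the way vec_tanh does: scale by 2, sigmoid,
// scale by 2, add bias -1.
float tanh_via_sigmoid(float x) {
  float s = 1.f / (1.f + std::exp(-2.f * x));  // sigmoid(2x)
  return 2.f * s - 1.f;                        // vec_scal(2) then vec_add_bias(-1)
}

int main() {
  for (float x : {-2.f, -0.5f, 0.f, 1.f, 3.f}) {
    printf("x=%.1f  tanh=%f  composed=%f\n", x, std::tanh(x),
           tanh_via_sigmoid(x));
  }
  return 0;
}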