refine add bias with avx

3462c299 · tensor-tang · bb9f98e1 · 3462c299 · 3462c299
显示空白变更内容
内联并排

Showing with 69 additions and 27 deletions

paddle/fluid/operators/attention_lstm_op.cc paddle/fluid/operators/attention_lstm_op.cc +9 -21

paddle/fluid/operators/math/cpu_vec.h paddle/fluid/operators/math/cpu_vec.h +60 -6

未找到文件。
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -232,40 +232,28 @@ use lstm_x_t as input and compute as standard LSTM.
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
  if (bias) {
-    for (int i = 0; i < n; ++i) {
-      y[i] = x[i] + bias[0];
-    }
-    math::vec_relu<T>(n, y, y);
+    math::vec_add_bias<T, platform::jit::avx>(n, *bias, x, y);  // y = x + bias[0]; one scalar bias for all n elements
+    math::vec_relu<T, platform::jit::avx>(n, y, y);  // then vec_relu over y in place
  } else {
-    math::vec_relu<T>(n, x, y);
+    math::vec_relu<T, platform::jit::avx>(n, x, y);  // null bias: vec_relu straight from x into y
  }
 }

-template <typename DeviceContext, typename T>
-inline void vec_softmax(const math::BlasT<DeviceContext, T>& blas, const int n,
-                        const T* x, T* y) {
+template <typename T>
+inline void vec_softmax(const int n, const T* x, T* y) {  // reads x[0] unconditionally, so n >= 1 is assumed
  T scalar = x[0];
  // max
  for (int i = 1; i < n; ++i) {
    scalar = scalar < x[i] ? x[i] : scalar;
  }
-
-  // sub
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] - scalar;
-  }
-
-  // exp
-  blas.VEXP(n, y, y);
-
+  math::vec_add_bias<T, platform::jit::avx>(n, -scalar, x, y);  // sub: y = x - max(x)
+  math::vec_exp<T>(n, y, y);                                    // exp, in place on y
  // sum
  scalar = T(0);
  for (int i = 0; i < n; ++i) {
    scalar += y[i];
  }
-
-  // scale
-  blas.SCAL(n, static_cast<T>(1) / scalar, y);
+  math::vec_scal<T>(n, static_cast<T>(1) / scalar, y);  // scale: y by 1 / sum
 }

 template <typename T>
@@ -363,7 +351,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
                       fc_out_data);
        }
        // 1d. softmax
-        vec_softmax<DeviceContext, T>(blas, seq_len, fc_out_data, fc_out_data);
+        vec_softmax<T>(seq_len, fc_out_data, fc_out_data);
        // mul x(seq_len*M) and sum pool
        math::FCCompute<DeviceContext, T>(blas, 1, M, seq_len, fc_out_data,
                                          cur_x_data, lstm_x_data);

--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -87,7 +87,7 @@ inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
                                                const float* x, float* y) {
 #ifdef __AVX__
  constexpr int block = AVX_FLOAT_BLOCK;
-  if (n < block * 4) {  // use larger threshold, since small ones has no boost
+  if (n < block) {
    vec_scal<float, platform::jit::isa_any>(n, a, x, y);
    return;
  }
@@ -131,6 +131,62 @@ inline void vec_scal<float, platform::jit::avx512_common>(const int n,
  vec_scal<float, platform::jit::avx2>(n, a, x, y);
 }

+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_add_bias(const int n, const T a, const T* x, T* y) {  // generic version: y[i] = x[i] + a for i in [0, n)
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] + a;  // each element is read before it is written, so x == y (in place) works
+  }
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
+                                                    const float* x, float* y) {  // float specialization: y[i] = x[i] + a
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;  // elements per vector step; defined outside this view (presumably 8 floats per __m256)
+  if (n < block) {  // not even one full block: use the scalar loop
+    vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+    return;
+  }
+  const int rest = n % block;  // elements left over after the last full block
+  const int end = n - rest;    // first index the vector loop does not cover
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(a);  // a copied into every lane
+  __m256 tmp;
+#define MOVE_ONE_STEP             \
+  tmp = _mm256_loadu_ps(x + i);   \
+  tmp = _mm256_add_ps(tmp, bias); \
+  _mm256_storeu_ps(y + i, tmp)  // one block: unaligned load, add bias, unaligned store
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // scalar tail: an overlapping final vector step would add a twice to some elements when x == y (in place)
+  for (i = n - rest; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+#else
+  vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);  // built without __AVX__: scalar loop
+#endif
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
+                                                     const float* x, float* y) {
+  vec_add_bias<float, platform::jit::avx>(n, a, x, y);  // no avx2-specific code; forwards to the avx specialization
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx512_common>(const int n,
+                                                              const float a,
+                                                              const float* x,
+                                                              float* y) {
+  // TODO(TJ): add an AVX-512 implementation; until then this forwards to the avx2 specialization
+  vec_add_bias<float, platform::jit::avx2>(n, a, x, y);
+}
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_identity(const int n, const T* x, T* y) {
  // do nothing
@@ -229,11 +285,10 @@ inline void vec_tanh(const int n, const T* x, T* y) {
  vec_scal<T, isa>(n, static_cast<T>(2), x, y);
  vec_sigmoid<T, isa>(n, y, y);
  vec_scal<T>(n, static_cast<T>(2), y);
-  for (int i = 0; i < n; ++i) {
-    y[i] = y[i] - static_cast<T>(1);
-  }
+  vec_add_bias<T, isa>(n, static_cast<T>(-1), y, y);
 }

+// TODO(TJ): make relu clip
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_relu(const int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
@@ -246,7 +301,7 @@ inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
                                                float* y) {
 #ifdef __AVX__
  constexpr int block = AVX_FLOAT_BLOCK;
-  if (n < block) {
+  if (n < block * 4) {
    vec_relu<float, platform::jit::isa_any>(n, x, y);
    return;
  }
@@ -288,7 +343,6 @@ inline void vec_relu<float, platform::jit::avx512_common>(const int n,
  // TODO(TJ): enable me
  vec_relu<float, platform::jit::avx2>(n, x, y);
 }
-// TODO(TJ): add vec add bias, make relu clip

 // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary