optimize relu with avx and avx512

e3bb98eb · tensor-tang · 25976fe7 · e3bb98eb
隐藏空白更改
内联并排

Showing with 74 addition and 9 deletion

paddle/fluid/operators/math/cpu_vec.h paddle/fluid/operators/math/cpu_vec.h +74 -9

未找到文件。
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -31,6 +31,13 @@ namespace math {
 #define SIGMOID_THRESHOLD_MIN -40.0
 #define SIGMOID_THRESHOLD_MAX 13.0

+#define AVX_FLOAT_BLOCK 8
+#define AVX_DOUBLE_BLOCK 4
+#define AVX2_FLOAT_BLOCK 8
+#define AVX2_DOUBLE_BLOCK 4
+#define AVX512_FLOAT_BLOCK 16
+#define AVX512_DOUBLE_BLOCK 8
+
 template <typename T>
 inline void vec_exp(const int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
@@ -88,24 +95,82 @@ inline void vec_relu(const int n, const T* x, T* y) {
  }
 }

+template <>
+inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
+                                                float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_relu<float, platform::jit::isa_any>(n, x, y);
+    return;
+  }
+
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 zeros = _mm256_setzero_ps();
+  __m256 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm256_loadu_ps(x + i);    \
+  tmp = _mm256_max_ps(tmp, zeros); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+  if (rest == 0) {
+    return;
+  }
+  i = n - block;
+  MOVE_ONE_STEP;
+#undef MOVE_ONE_STEP
+
+#else
+  vec_relu<float, platform::jit::isa_any>(n, x, y);
+#endif
+}
+
 template <>
 inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
                                                 float* y) {
-  // TODO(TJ): complete me
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
-  }
+  vec_relu<float, platform::jit::avx>(n, x, y);
 }

 template <>
-inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
-                                                float* y) {
-  // TODO(TJ): complete me
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
+inline void vec_relu<float, platform::jit::avx512_common>(const int n,
+                                                          const float* x,
+                                                          float* y) {
+#ifdef __AVX512F__
+  // test me
+  constexpr int block = AVX512_FLOAT_BLOCK;
+  if (n < block) {
+    vec_relu<float, platform::jit::avx2>(n, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m512 zeros = _mm512_setzero_ps();
+  __m512 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm512_loadu_ps(x + i);    \
+  tmp = _mm512_max_ps(tmp, zeros); \
+  _mm512_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
  }
+  if (rest == 0) {
+    return;
+  }
+  i = n - block;
+  MOVE_ONE_STEP;
+#undef MOVE_ONE_STEP
+#else
+  vec_relu<float, platform::jit::avx2>(n, x, y);
+#endif
 }

+// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 class VecActivations {
 public: