diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 5693761e9ffd96b40040223b5498b63b0274bf0f..bf6f552ad3ca5128f8ac850edf54e416acf0f04a 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -132,6 +132,62 @@ inline void vec_scal(const int n, vec_scal(n, a, x, y); } +template +inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = a - x[i]; + } +} + +template <> +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block) { + vec_bias_sub(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = _mm256_set1_ps(a); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_sub_ps(bias, tmp); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = a - x[i]; + } +#else + vec_bias_sub(n, a, x, y); +#endif +} + +template <> +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { + vec_bias_sub(n, a, x, y); +} + +template <> +inline void vec_bias_sub(const int n, + const float a, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_bias_sub(n, a, x, y); +} + template inline void vec_add_bias(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) {