提交 e3bb98eb 编写于 作者: T tensor-tang

optimize relu with avx and avx512

上级 25976fe7
......@@ -31,6 +31,13 @@ namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define AVX_FLOAT_BLOCK 8
#define AVX_DOUBLE_BLOCK 4
#define AVX2_FLOAT_BLOCK 8
#define AVX2_DOUBLE_BLOCK 4
#define AVX512_FLOAT_BLOCK 16
#define AVX512_DOUBLE_BLOCK 8
template <typename T>
inline void vec_exp(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
......@@ -89,23 +96,81 @@ inline void vec_relu(const int n, const T* x, T* y) {
}
template <>
inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
float* y) {
// TODO(TJ): complete me
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
#ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK;
if (n < block) {
vec_relu<float, platform::jit::isa_any>(n, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 zeros = _mm256_setzero_ps();
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
if (rest == 0) {
return;
}
i = n - block;
MOVE_ONE_STEP;
#undef MOVE_ONE_STEP
#else
vec_relu<float, platform::jit::isa_any>(n, x, y);
#endif
}
template <>
inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
float* y) {
// TODO(TJ): complete me
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
vec_relu<float, platform::jit::avx>(n, x, y);
}
template <>
inline void vec_relu<float, platform::jit::avx512_common>(const int n,
const float* x,
float* y) {
#ifdef __AVX512F__
// test me
constexpr int block = AVX512_FLOAT_BLOCK;
if (n < block) {
vec_relu<float, platform::jit::avx2>(n, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m512 zeros = _mm512_setzero_ps();
__m512 tmp;
#define MOVE_ONE_STEP \
tmp = _mm512_loadu_ps(x + i); \
tmp = _mm512_max_ps(tmp, zeros); \
_mm512_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
if (rest == 0) {
return;
}
i = n - block;
MOVE_ONE_STEP;
#undef MOVE_ONE_STEP
#else
vec_relu<float, platform::jit::avx2>(n, x, y);
#endif
}
// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
class VecActivations {
public:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册