diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index 1cb65346ee2b755b48f8dd8f1456a32861c3a0b6..a73ea09f1e120356793a6fd198ce54e68d162cc8 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -232,40 +232,28 @@ use lstm_x_t as input and compute as standard LSTM.
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
   if (bias) {
-    for (int i = 0; i < n; ++i) {
-      y[i] = x[i] + bias[0];
-    }
-    math::vec_relu<T>(n, y, y);
+    math::vec_add_bias<T, platform::jit::avx>(n, *bias, x, y);
+    math::vec_relu<T, platform::jit::avx>(n, y, y);
   } else {
-    math::vec_relu<T>(n, x, y);
+    math::vec_relu<T, platform::jit::avx>(n, x, y);
   }
 }
 
-template <typename DeviceContext, typename T>
-inline void vec_softmax(const math::BlasT<DeviceContext, T>& blas, const int n,
-                        const T* x, T* y) {
+template <typename T>
+inline void vec_softmax(const int n, const T* x, T* y) {
   T scalar = x[0];
   // max
   for (int i = 1; i < n; ++i) {
     scalar = scalar < x[i] ? x[i] : scalar;
   }
-
-  // sub
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] - scalar;
-  }
-
-  // exp
-  blas.VEXP(n, y, y);
-
+  math::vec_add_bias<T, platform::jit::avx>(n, -scalar, x, y);  // sub
+  math::vec_exp<T>(n, y, y);                                    // exp
   // sum
   scalar = T(0);
   for (int i = 0; i < n; ++i) {
     scalar += y[i];
   }
-
-  // scale
-  blas.SCAL(n, static_cast<T>(1) / scalar, y);
+  math::vec_scal<T>(n, static_cast<T>(1) / scalar, y);  // scale
 }
 
 template <typename T>
@@ -363,7 +351,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
                                             fc_out_data);
         }
         // 1d. softmax
-        vec_softmax<DeviceContext, T>(blas, seq_len, fc_out_data, fc_out_data);
+        vec_softmax<T>(seq_len, fc_out_data, fc_out_data);
         // mul x(seq_len*M) and sum pool
         math::FCCompute<DeviceContext, T>(blas, 1, M, seq_len, fc_out_data,
                                           cur_x_data, lstm_x_data);
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index d5f247e7ef6d267115dd085c6aca4c733eb4dc05..0bae926e9892986b59c6dfc7fa9b8778da1dfcb7 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -87,7 +87,7 @@ inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
                                                 const float* x, float* y) {
 #ifdef __AVX__
   constexpr int block = AVX_FLOAT_BLOCK;
-  if (n < block * 4) {  // use larger threshold, since small ones has no boost
+  if (n < block) {
     vec_scal<float, platform::jit::isa_any>(n, a, x, y);
     return;
   }
@@ -131,6 +131,62 @@ inline void vec_scal<float, platform::jit::avx512_common>(const int n,
   vec_scal<float, platform::jit::avx2>(n, a, x, y);
 }
 
+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
+                                                    const float* x, float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(a);
+  __m256 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm256_loadu_ps(x + i);    \
+  tmp = _mm256_add_ps(tmp, bias);  \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+#else
+  vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+#endif
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
+                                                     const float* x, float* y) {
+  vec_add_bias<float, platform::jit::avx>(n, a, x, y);
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx512_common>(const int n,
+                                                              const float a,
+                                                              const float* x,
+                                                              float* y) {
+  // TODO(TJ): enable me
+  vec_add_bias<float, platform::jit::avx2>(n, a, x, y);
+}
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_identity(const int n, const T* x, T* y) {
   // do nothing
@@ -229,11 +285,10 @@ inline void vec_tanh(const int n, const T* x, T* y) {
   vec_scal<T, isa>(n, static_cast<T>(2), x, y);
   vec_sigmoid<T, isa>(n, y, y);
   vec_scal<T>(n, static_cast<T>(2), y);
-  for (int i = 0; i < n; ++i) {
-    y[i] = y[i] - static_cast<T>(1);
-  }
+  vec_add_bias<T, isa>(n, static_cast<T>(-1), y, y);
 }
 
+// TODO(TJ): make relu clip
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_relu(const int n, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
@@ -246,7 +301,7 @@ inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
                                                 float* y) {
 #ifdef __AVX__
   constexpr int block = AVX_FLOAT_BLOCK;
-  if (n < block) {
+  if (n < block * 4) {
     vec_relu<float, platform::jit::isa_any>(n, x, y);
     return;
   }
@@ -288,7 +343,6 @@ inline void vec_relu<float, platform::jit::avx512_common>(const int n,
   // TODO(TJ): enable me
   vec_relu<float, platform::jit::avx2>(n, x, y);
 }
 
-// TODO(TJ): add vec add bias, make relu clip
 // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
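
For context on the refactor, here is a minimal standalone sketch of the call sequence the new `vec_softmax` follows (max, `vec_add_bias` with `-max`, `vec_exp`, sum, then an in-place `vec_scal` by `1/sum`). The function bodies below are plain scalar stand-ins, not the actual `math::` kernels from `cpu_vec.h`, which dispatch to AVX (and MKL where available); only the composition mirrors the diff.

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Scalar stand-ins for math::vec_add_bias / math::vec_exp / math::vec_scal.
void vec_add_bias(int n, float a, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] = x[i] + a;
}
void vec_exp(int n, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] = std::exp(x[i]);
}
void vec_scal(int n, float a, float* x) {  // in-place scale, as in the diff
  for (int i = 0; i < n; ++i) x[i] = a * x[i];
}

// Same max -> sub -> exp -> sum -> scale sequence as the new vec_softmax.
void vec_softmax(int n, const float* x, float* y) {
  float scalar = x[0];
  for (int i = 1; i < n; ++i) scalar = scalar < x[i] ? x[i] : scalar;  // max
  vec_add_bias(n, -scalar, x, y);                                      // sub
  vec_exp(n, y, y);                                                    // exp
  scalar = 0.f;
  for (int i = 0; i < n; ++i) scalar += y[i];                          // sum
  vec_scal(n, 1.f / scalar, y);                                        // scale
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f}, y(x.size());
  vec_softmax(static_cast<int>(x.size()), x.data(), y.data());
  for (float v : y) std::printf("%.6f ", v);  // prints values summing to 1
  std::printf("\n");
  return 0;
}
```

One note on the AVX `vec_add_bias` specialization above: the remainder is handled with a scalar tail loop instead of one final unaligned full-block step, which is what the `// can not continue move step if src and dst are inplace` comment refers to; when `x == y`, re-reading elements that were already written would add the bias twice.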