Commit b0d36c4c authored by: T tensor-tang

add cross vec to speedup gru

Parent 038c16ee
@@ -172,19 +172,19 @@ class FusionGRUKernel : public framework::OpKernel<T> {
     bool is_reverse = ctx.Attr<bool>("is_reverse");
     std::function<void(const int, const T *, T *)> act_gate, act_state;
-    std::function<void(const int, const T, const T*, T*)> bias_sub;
+    std::function<void(const int, const T*, const T*, const T*, T*)> cross;
     auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
     auto& act_state_str = ctx.Attr<std::string>("activation");
     if (platform::jit::MayIUse(platform::jit::avx)) {
       math::VecActivations<T, platform::jit::avx> act_functor;
       act_gate = act_functor(act_gate_str);
       act_state = act_functor(act_state_str);
-      bias_sub = math::vec_bias_sub<T, platform::jit::avx>;
+      cross = math::vec_cross<T, platform::jit::avx>;
     } else {
       math::VecActivations<T, platform::jit::isa_any> act_functor;
       act_gate = act_functor(act_gate_str);
       act_state = act_functor(act_state_str);
-      bias_sub = math::vec_bias_sub<T, platform::jit::isa_any>;
+      cross = math::vec_cross<T, platform::jit::isa_any>;
     }
     const T* x_data = x->data<T>();
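
The hunk above keeps the kernel's existing runtime dispatch: platform::jit::MayIUse(platform::jit::avx) is checked once and the matching specialization of math::vec_cross is bound into a std::function, so the batch loop never branches on the ISA. Below is a minimal standalone sketch of that pattern, with placeholder names (vec_cross_any, vec_cross_avx, has_avx, pick_cross) that are not the Paddle API.

#include <functional>

// Portable fallback with the same contract as the generic vec_cross template:
// out = x*y + (1-x)*z, element-wise over n values.
template <typename T>
void vec_cross_any(const int n, const T* x, const T* y, const T* z, T* out) {
  for (int i = 0; i < n; ++i) {
    out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
  }
}

// Stand-in for the AVX specialization (math::vec_cross<T, jit::avx> in the
// real code); here it just forwards so the sketch stays portable.
template <typename T>
void vec_cross_avx(const int n, const T* x, const T* y, const T* z, T* out) {
  vec_cross_any(n, x, y, z, out);
}

// Hypothetical probe standing in for platform::jit::MayIUse(platform::jit::avx).
inline bool has_avx() {
#if (defined(__GNUC__) || defined(__clang__)) && \
    (defined(__x86_64__) || defined(__i386__))
  return __builtin_cpu_supports("avx");
#else
  return false;
#endif
}

// Bind the best implementation once; callers stay ISA-agnostic.
template <typename T>
std::function<void(const int, const T*, const T*, const T*, T*)> pick_cross() {
  if (has_avx()) {
    return vec_cross_avx<T>;
  }
  return vec_cross_any<T>;
}
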
@@ -288,15 +288,9 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       for (int i = 0; i < cur_bs; ++i) {
         // ht~ = act_state(...)
         act_state(D, cur_batched_data + D2, cur_batched_data + D2);
-        // ht~~ = zt*ht~ inplace result
-        blas.VMUL(D, cur_batched_data, cur_batched_data + D2,
-                  cur_batched_data + D2);
-        // zt = 1 - zt inplace result
-        bias_sub(D, static_cast<T>(1), cur_batched_data, cur_batched_data);
-        // zt = ht_1 * zt
-        blas.VMUL(D, cur_prev_hidden_data, cur_batched_data, cur_batched_data);
-        // out = zt + ht~~
-        blas.VADD(D, cur_batched_data, cur_batched_data + D2, cur_out_data);
+        // out = zt*ht~ + (1-zt)*ht_1
+        cross(D, cur_batched_data, cur_batched_data + D2, cur_prev_hidden_data,
+              cur_out_data);
         cur_batched_data += D3;
         cur_prev_hidden_data += D;
...
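
The fused call replaces four vector passes (two blas.VMUL, one vec_bias_sub, one blas.VADD) with a single sweep computing out = zt*ht~ + (1-zt)*ht_1, avoiding the in-place intermediates on the gate buffer. A scalar sketch (plain C++, not the Paddle BLAS path) checking that the fused formula matches the old step-by-step sequence:

#include <cassert>
#include <cmath>
#include <vector>

int main() {
  const int D = 4;
  std::vector<float> zt = {0.1f, 0.4f, 0.7f, 0.9f};        // update gate
  std::vector<float> ht_tilde = {0.5f, -0.2f, 0.3f, 1.f};  // candidate state
  std::vector<float> ht_1 = {1.f, 2.f, -1.f, 0.25f};       // previous hidden

  // New path: one pass per element.
  std::vector<float> fused(D);
  for (int i = 0; i < D; ++i) {
    fused[i] = zt[i] * ht_tilde[i] + (1.f - zt[i]) * ht_1[i];
  }

  // Old path, reproduced step by step on a copy of the gate buffer.
  std::vector<float> tmp = zt, htt(D), stepwise(D);
  for (int i = 0; i < D; ++i) htt[i] = tmp[i] * ht_tilde[i];   // ht~~ = zt*ht~
  for (int i = 0; i < D; ++i) tmp[i] = 1.f - tmp[i];           // zt = 1 - zt
  for (int i = 0; i < D; ++i) tmp[i] = ht_1[i] * tmp[i];       // zt = ht_1*zt
  for (int i = 0; i < D; ++i) stepwise[i] = tmp[i] + htt[i];   // out = zt + ht~~

  for (int i = 0; i < D; ++i) assert(std::fabs(fused[i] - stepwise[i]) < 1e-6f);
  return 0;
}
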
@@ -188,6 +188,65 @@ inline void vec_bias_sub<float, platform::jit::avx512_common>(const int n,
   vec_bias_sub<float, platform::jit::avx2>(n, a, x, y);
 }
 
+// out = x*y + (1-x)*z
+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
+  for (int i = 0; i < n; ++i) {
+    out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
+  }
+}
+
+template <>
+inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
+                                                 const float* y, const float* z,
+                                                 float* out) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(1.f);
+  __m256 tmpx, tmpy, tmpz;
+  for (i = 0; i < end; i += block) {
+    tmpx = _mm256_loadu_ps(x + i);
+    tmpy = _mm256_loadu_ps(y + i);
+    tmpz = _mm256_loadu_ps(z + i);
+    tmpy = _mm256_mul_ps(tmpx, tmpy);
+    tmpx = _mm256_sub_ps(bias, tmpx);
+    tmpz = _mm256_mul_ps(tmpx, tmpz);
+    tmpz = _mm256_add_ps(tmpy, tmpz);
+    _mm256_storeu_ps(out + i, tmpz);
+  }
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
+  }
+#else
+  vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
+#endif
+}
+
+template <>
+inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
+                                                  const float* y,
+                                                  const float* z, float* out) {
+  vec_cross<float, platform::jit::avx>(n, x, y, z, out);
+}
+
+template <>
+inline void vec_cross<float, platform::jit::avx512_common>(
+    const int n, const float* x, const float* y, const float* z, float* out) {
+  // TODO(TJ): enable me
+  vec_cross<float, platform::jit::avx>(n, x, y, z, out);
+}
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
...
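
For completeness, a standalone check (not part of the commit) of the blocked-plus-tail strategy used by the AVX specialization: full 8-float blocks (AVX_FLOAT_BLOCK) go through the intrinsics, and the remainder falls back to the scalar formula, which also keeps in-place calls (out aliasing x or y) correct. The cross_ref and cross_avx_like helpers are local to this sketch; compile with AVX enabled (e.g. -mavx) to exercise the intrinsic path.

#include <cassert>
#include <cmath>
#include <vector>
#ifdef __AVX__
#include <immintrin.h>
#endif

// Portable reference, same formula as the generic vec_cross template.
static void cross_ref(int n, const float* x, const float* y, const float* z,
                      float* out) {
  for (int i = 0; i < n; ++i) out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
}

// Blocked variant mirroring the AVX specialization: 8-wide blocks, scalar tail.
static void cross_avx_like(int n, const float* x, const float* y,
                           const float* z, float* out) {
#ifdef __AVX__
  const int block = 8;  // AVX_FLOAT_BLOCK
  const int end = n - n % block;
  const __m256 one = _mm256_set1_ps(1.f);
  for (int i = 0; i < end; i += block) {
    __m256 vx = _mm256_loadu_ps(x + i);
    __m256 vy = _mm256_mul_ps(vx, _mm256_loadu_ps(y + i));
    __m256 vz = _mm256_mul_ps(_mm256_sub_ps(one, vx), _mm256_loadu_ps(z + i));
    _mm256_storeu_ps(out + i, _mm256_add_ps(vy, vz));
  }
  // Scalar tail: stepping back to redo one more full vector block would
  // re-read already-written output when out aliases x or y (in-place use).
  for (int i = end; i < n; ++i) out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
#else
  cross_ref(n, x, y, z, out);
#endif
}

int main() {
  const int n = 19;  // not a multiple of 8, so the tail loop runs
  std::vector<float> x(n), y(n), z(n), a(n), b(n);
  for (int i = 0; i < n; ++i) {
    x[i] = 0.05f * i;
    y[i] = 1.f - 0.03f * i;
    z[i] = 0.5f + 0.02f * i;
  }
  cross_ref(n, x.data(), y.data(), z.data(), a.data());
  cross_avx_like(n, x.data(), y.data(), z.data(), b.data());
  for (int i = 0; i < n; ++i) assert(std::fabs(a[i] - b[i]) < 1e-6f);
  return 0;
}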