From 159be8cc63dd2add903c2bb11771fced30e38da9 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 23 Oct 2018 15:57:35 +0800 Subject: [PATCH] optimize fusion gru kernel at size 8 --- paddle/fluid/operators/math/jit_kernel_rnn.cc | 172 ++++++++++++------ .../tests/unittests/test_fusion_gru_op.py | 6 + 2 files changed, 123 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index a340cdfaf6..c0847f0bee 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -136,6 +136,21 @@ static std::shared_ptr> GetActKernel( return nullptr; } +template +static std::unique_ptr GetAVXAct(const std::string& type) { + if (type == "sigmoid") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "relu") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "tanh") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "identity" || type == "") { + return std::unique_ptr(new AVXActImpl()); + } + PADDLE_THROW("Not support type: %s", type); + return nullptr; +} + /* LSTM JitKernel */ template class LSTMKernelImpl : public LSTMKernel { @@ -192,61 +207,49 @@ class LSTMKernelImpl : public LSTMKernel { #endif }; -#define INTRI8_FLOAT(isa) \ - template <> \ - LSTMKernelImpl::LSTMKernelImpl( \ - const std::string& act_gate, const std::string& act_cand, \ - const std::string& act_cell, int d) \ - : LSTMKernel() { \ - auto GetAVXAct = [&](const std::string& type) -> std::unique_ptr { \ - if (type == "sigmoid") { \ - return std::unique_ptr(new AVXActImpl()); \ - } else if (type == "relu") { \ - return std::unique_ptr(new AVXActImpl()); \ - } else if (type == "tanh") { \ - return std::unique_ptr(new AVXActImpl()); \ - } else if (type == "identity" || type == "") { \ - return std::unique_ptr(new AVXActImpl()); \ - } \ - PADDLE_THROW("Not support type: %s", type); \ - }; \ - avx_act_gate_ = GetAVXAct(act_gate); \ - avx_act_cand_ = GetAVXAct(act_cand); \ - avx_act_cell_ = GetAVXAct(act_cell); \ - } \ - template <> \ - void LSTMKernelImpl::ComputeCtHt( \ - float* gates, const float* ct_1, float* ct, float* ht, \ - const float* wp_data, float* checked) const { \ - /* gates: W_ch, W_ih, W_fh, W_oh */ \ - __m256 c, i, f, o; \ - c = _mm256_loadu_ps(gates); \ - i = _mm256_loadu_ps(gates + 8); \ - f = _mm256_loadu_ps(gates + 16); \ - o = _mm256_loadu_ps(gates + 24); \ - /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ - c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ - i = _mm256_loadu_ps(ct_1); \ - f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ - f = _mm256_add_ps(c, f); \ - _mm256_storeu_ps(ct, f); \ - /* H_t = act_cell(C_t) * ogated */ \ - o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ - _mm256_storeu_ps(ht, o); \ - } \ - template <> \ - void LSTMKernelImpl::ComputeC1H1( \ - float* gates, float* ct, float* ht, const float* wp_data) const { \ - __m256 c, i, o; \ - c = _mm256_loadu_ps(gates); \ - i = _mm256_loadu_ps(gates + 8); \ - o = _mm256_loadu_ps(gates + 24); \ - /* C_t = igated * cgated*/ \ - c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \ - _mm256_storeu_ps(ct, c); \ - /* H_t = act_cell(C_t) * ogated */ \ - o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \ - _mm256_storeu_ps(ht, o); \ +#define INTRI8_FLOAT(isa) \ + template <> \ + LSTMKernelImpl::LSTMKernelImpl( \ + const std::string& act_gate, const std::string& act_cand, \ + const std::string& act_cell, int d) \ + : LSTMKernel() { \ + avx_act_gate_ = GetAVXAct(act_gate); \ + avx_act_cand_ = GetAVXAct(act_cand); \ + avx_act_cell_ = GetAVXAct(act_cell); \ + } \ + template <> \ + void LSTMKernelImpl::ComputeCtHt( \ + float* gates, const float* ct_1, float* ct, float* ht, \ + const float* wp_data, float* checked) const { \ + /* gates: W_ch, W_ih, W_fh, W_oh */ \ + __m256 c, i, f, o; \ + c = _mm256_loadu_ps(gates); \ + i = _mm256_loadu_ps(gates + 8); \ + f = _mm256_loadu_ps(gates + 16); \ + o = _mm256_loadu_ps(gates + 24); \ + /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ + c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ + i = _mm256_loadu_ps(ct_1); \ + f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ + f = _mm256_add_ps(c, f); \ + _mm256_storeu_ps(ct, f); \ + /* H_t = act_cell(C_t) * ogated */ \ + o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ + _mm256_storeu_ps(ht, o); \ + } \ + template <> \ + void LSTMKernelImpl::ComputeC1H1( \ + float* gates, float* ct, float* ht, const float* wp_data) const { \ + __m256 c, i, o; \ + c = _mm256_loadu_ps(gates); \ + i = _mm256_loadu_ps(gates + 8); \ + o = _mm256_loadu_ps(gates + 24); \ + /* C_t = igated * cgated*/ \ + c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \ + _mm256_storeu_ps(ct, c); \ + /* H_t = act_cell(C_t) * ogated */ \ + o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \ + _mm256_storeu_ps(ht, o); \ } // TODO(TJ): optimize keq16 @@ -375,6 +378,7 @@ class GRUKernelImpl : public GRUKernel { act_state_d_->Compute(gates + d2_, gates + d2_); vmul_d_->Compute(gates, gates + d2_, ht); } + void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override { // W: {W_update, W_reset; W_state} act_gate_d2_->Compute(gates, gates); @@ -394,8 +398,65 @@ class GRUKernelImpl : public GRUKernel { int d_, d2_; std::shared_ptr> act_gate_d2_, act_gate_d_, act_state_d_; std::shared_ptr> vmul_d_; +#ifdef __AVX__ + std::unique_ptr avx_act_gate_, avx_act_state_; +#endif }; +#define INTRI8_FLOAT(isa) \ + template <> \ + GRUKernelImpl::GRUKernelImpl( \ + const std::string& act_gate, const std::string& act_state, int d) \ + : GRUKernel() { \ + avx_act_gate_ = GetAVXAct(act_gate); \ + avx_act_state_ = GetAVXAct(act_state); \ + } \ + template <> \ + void GRUKernelImpl::ComputeH1(float* gates, float* ht) \ + const { \ + __m256 u, s; \ + /* W: {W_update, W_reset; W_state} */ \ + u = _mm256_loadu_ps(gates); \ + s = _mm256_loadu_ps(gates + 16); \ + s = _mm256_mul_ps(avx_act_gate_->Compute(u), avx_act_state_->Compute(s)); \ + _mm256_storeu_ps(ht, s); \ + } \ + template <> \ + void GRUKernelImpl::ComputeHtPart1( \ + float* gates, const float* ht_1, float* ht) const { \ + /* not exactly equal the any implementation */ \ + __m256 r, ht0; \ + r = _mm256_loadu_ps(gates + 8); \ + ht0 = _mm256_loadu_ps(ht_1); \ + r = _mm256_mul_ps(avx_act_gate_->Compute(r), ht0); \ + _mm256_storeu_ps(ht, r); \ + } \ + template <> \ + void GRUKernelImpl::ComputeHtPart2( \ + float* gates, const float* ht_1, float* ht) const { \ + /* not exactly equal the any implementation */ \ + __m256 u, s, ht0; \ + u = _mm256_loadu_ps(gates); \ + s = _mm256_loadu_ps(gates + 16); \ + ht0 = _mm256_loadu_ps(ht_1); \ + u = avx_act_gate_->Compute(u); \ + s = _mm256_mul_ps(u, avx_act_state_->Compute(s)); \ + u = _mm256_sub_ps(_mm256_set1_ps(1.f), u); \ + u = _mm256_mul_ps(u, ht0); \ + u = _mm256_add_ps(s, u); \ + _mm256_storeu_ps(ht, u); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif + #define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \ template <> \ std::shared_ptr> KernelPool::Get< \ @@ -412,6 +473,7 @@ class GRUKernelImpl : public GRUKernel { REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DECLARE_GRU, JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL); +#undef INTRI8_FLOAT #undef JITKERNEL_NEW_GRU_IMPL #undef JITKERNEL_KEY_GRU #undef JITKERNEL_DECLARE_GRU diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py index 36ebc8fb6e..377454e780 100644 --- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py @@ -125,6 +125,12 @@ class TestFusionGRUOpMD2(TestFusionGRUOp): self.D = 8 +class TestFusionGRUOpMD3(TestFusionGRUOp): + def set_confs(self): + self.M = 17 + self.D = 15 + + class TestFusionGRUOpBS1(TestFusionGRUOp): def set_confs(self): self.lod = [[3]] -- GitLab