diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 9e2cc18c7a5e396be40b2336382f68a17f8a2bf9..9375ca20670858b6b92ff7057c159d80cc500f0d 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -66,6 +66,42 @@ void VMulJitCode::generate() { ret(); } +bool VAddJitCode::init(int d) { return MayIUse(avx); } + +void VAddJitCode::generate() { + int offset = 0; + for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) { + vmovups(ymm_src1, ptr[param1 + offset]); + vmovups(ymm_src2, ptr[param2 + offset]); + vaddps(ymm_dst, ymm_src1, ymm_src2); + vmovups(ptr[param3 + offset], ymm_dst); + offset += sizeof(float) * AVX_FLOAT_BLOCK; + } + int rest = num_ % AVX_FLOAT_BLOCK; + if (rest >= 4) { + vmovups(xmm_src1, ptr[param1 + offset]); + vmovups(xmm_src2, ptr[param2 + offset]); + vaddps(xmm_dst, xmm_src1, xmm_src2); + vmovups(ptr[param3 + offset], xmm_dst); + offset += sizeof(float) * 4; + rest -= 4; + } + if (rest >= 2) { + vmovq(xmm_src1, ptr[param1 + offset]); + vmovq(xmm_src2, ptr[param2 + offset]); + vaddps(xmm_dst, xmm_src1, xmm_src2); + vmovq(ptr[param3 + offset], xmm_dst); + offset += sizeof(float) * 2; + rest -= 2; + } + if (rest > 0) { + vmovss(xmm_src1, ptr[param1 + offset]); + vmovss(xmm_src2, ptr[param2 + offset]); + vaddss(xmm_dst, xmm_src1, xmm_src2); + vmovss(ptr[param3 + offset], xmm_dst); + } + ret(); +} } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 6007b290815de0ceaa2226962c5273ae7da72e7e..0c4b75d0309bb0e90298150919983c04ece925c2 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -53,6 +53,30 @@ class VMulJitCode : public JitCode { ymm_t ymm_dst = ymm_t(2); }; +class VAddJitCode : public JitCode { + public: + DECLARE_JIT_CODE(VAddJitCode); + explicit VAddJitCode(int d, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), num_(d) {} + static bool init(int d); + void generate() override; + + private: + int num_; + reg64_t param1{abi_param1}; + reg64_t param2{abi_param2}; + reg64_t param3{abi_param3}; + + xmm_t xmm_src1 = xmm_t(0); + xmm_t xmm_src2 = xmm_t(1); + xmm_t xmm_dst = xmm_t(2); + + ymm_t ymm_src1 = ymm_t(0); + ymm_t ymm_src2 = ymm_t(1); + ymm_t ymm_dst = ymm_t(2); +}; + } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 7b6027aa267803ff8ff830deabda536b1b27fec8..7c3fb5de9bd43b2f01b22fa90b02a530e2180399 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -71,7 +71,7 @@ class VMulKernel : public Kernel { template class VAddKernel : public Kernel { public: - virtual void Compute(const T *x, const T *y, T *z) const = 0; + void (*Compute)(const T *, const T *, T *, int); }; template diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index 7d38d511723ab3e6edfd4aa853bd7f2521ec98e2..16eab62dda75ff8379d87483160714c2869cf141 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -39,6 +39,13 @@ void VMulRefer(const T* x, const T* y, T* z, int n) { } } +template +void VAddRefer(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + #ifdef PADDLE_WITH_MKLML template void VMulMKL(const T* x, const T* y, T* z, int n); @@ -47,22 +54,38 @@ template <> void VMulMKL(const float* x, const float* y, float* z, int n) { platform::dynload::vsMul(n, x, y, z); } + template <> void VMulMKL(const double* x, const double* y, double* z, int n) { platform::dynload::vdMul(n, x, y, z); } + +template +void VAddMKL(const T* x, const T* y, T* z, int n); + +template <> +void VAddMKL(const float* x, const float* y, float* z, int n) { + platform::dynload::vsAdd(n, x, y, z); +} + +template <> +void VAddMKL(const double* x, const double* y, double* z, int n) { + platform::dynload::vdAdd(n, x, y, z); +} #endif +#define DECLARE_STATIC_FUNC \ + static inline std::string name(int d) { \ + PADDLE_THROW("DType should be either float or double"); \ + } \ + static inline bool useJIT(int d) { return false; } \ + static inline bool useMKL(int d) { return false; } + /* VMUL JitKernel */ template class VMulKernelImpl : public VMulKernel { public: - static inline std::string name(int d) { - PADDLE_THROW("DType should be either float or double"); - } - static inline bool useJIT(int d) { return false; } - static inline bool useMKL(int d) { return false; } - + DECLARE_STATIC_FUNC; explicit VMulKernelImpl(int d) : VMulKernel() { if (useJIT(d)) { // roughly estimate the size of code @@ -100,63 +123,51 @@ bool VMulKernelImpl::useMKL(int d) { return true; } -REGISTER_JITKERNEL(vmul, VMulKernel); - -/* VADD JitKernel */ -template +/* VAdd JitKernel */ +template class VAddKernelImpl : public VAddKernel { public: - explicit VAddKernelImpl(int d) : VAddKernel() { this->num_ = d; } - void Compute(const T* x, const T* y, T* z) const override { - for (int i = 0; i < this->num_; ++i) { - z[i] = x[i] + y[i]; + DECLARE_STATIC_FUNC; + explicit VAddKernelImpl(int d) : VAddKernel() { + if (useJIT(d)) { + size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; + jitcode_.reset(new gen::VAddJitCode(d, sz > 4096 ? sz : 4096)); + this->Compute = + jitcode_->getCode(); + return; } +#ifdef PADDLE_WITH_MKLML + if (useMKL(d)) { + this->Compute = VAddMKL; + return; + } +#endif + this->Compute = VAddRefer; } + + private: + std::unique_ptr jitcode_{nullptr}; }; -#ifdef PADDLE_WITH_MKLML -#define MKL_FLOAT(isa, block) \ - template <> \ - void VAddKernelImpl::Compute( \ - const float* x, const float* y, float* z) const { \ - platform::dynload::vsAdd(this->num_, x, y, z); \ - } +template <> +bool VAddKernelImpl::useJIT(int d) { + return gen::VAddJitCode::init(d); +} -#define MKL_DOUBLE(isa, block) \ - template <> \ - void VAddKernelImpl::Compute( \ - const double* x, const double* y, double* z) const { \ - platform::dynload::vdAdd(this->num_, x, y, z); \ - } +template <> +bool VAddKernelImpl::useMKL(int d) { + return d > 512; +} -FOR_EACH_ISA(MKL_FLOAT, kGT16); -FOR_EACH_ISA_BLOCK(MKL_DOUBLE); -#endif +template <> +bool VAddKernelImpl::useMKL(int d) { + return true; +} -#define INTRI8_FLOAT(isa) \ - template <> \ - void VAddKernelImpl::Compute( \ - const float* x, const float* y, float* z) const { \ - __m256 tmpx, tmpy; \ - tmpx = _mm256_loadu_ps(x); \ - tmpy = _mm256_loadu_ps(y); \ - tmpx = _mm256_add_ps(tmpx, tmpy); \ - _mm256_storeu_ps(z, tmpx); \ - } -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f); -#endif -// TODO(TJ): eq16 test and complete avx512 +#undef DECLARE_STATIC_FUNC -#undef INTRI8_FLOAT -#undef MKL_FLOAT -#undef MKL_DOUBLE +REGISTER_JITKERNEL(vmul, VMulKernel); +REGISTER_JITKERNEL(vadd, VAddKernel); /* VSCAL JitKernel */ template @@ -480,7 +491,6 @@ INTRI_COMMON_FLOAT(jit::avx512f, kGT16); #undef INTRI16_FLOAT #undef INTRI_COMMON_FLOAT -REGISTER_JITKERNEL_DEPRECATED(vadd, VAddKernel); REGISTER_JITKERNEL_DEPRECATED(vscal, VScalKernel); REGISTER_JITKERNEL_DEPRECATED(vaddb, VAddBiasKernel); REGISTER_JITKERNEL_DEPRECATED(vrelu, VReluKernel); diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index d0932a37bb85bbc41f662a106c8ef5693a72efeb..ba3e917377cf12192a068a9d71238442e12d5e5e 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -181,7 +181,7 @@ class LSTMKernelImpl : public LSTMKernel { act_cand_d_->Compute(gates, gates); vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); - vadd_d_->Compute(gates + d_, gates + d2_, ct); + vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); /* H_t = act_cell(C_t) * ogated */ act_cell_d_->Compute(ct, gates + d2_); @@ -291,16 +291,16 @@ class PeepholeKernelImpl : public LSTMKernel { /* get fgated and igated*/ vmul_d_->Compute(wp_data, ct_1, checked, d_); vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_); - vadd_d2_->Compute(checked, gates + d_, gates + d_); + vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_); act_gate_d2_->Compute(gates + d_, gates + d_); /* C_t = C_t-1 * fgated + cand_gated * igated*/ act_cand_d_->Compute(gates, gates); vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); - vadd_d_->Compute(gates + d_, gates + d2_, ct); + vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); /* get ogated*/ vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); - vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); + vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); act_gate_d_->Compute(gates + d3_, gates + d3_); /* H_t = act_cell(C_t) * ogated */ act_cell_d_->Compute(ct, gates + d2_); @@ -314,7 +314,7 @@ class PeepholeKernelImpl : public LSTMKernel { vmul_d_->Compute(gates, gates + d_, ct, d_); /* get outgated, put W_oc * C_t on igated */ vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); - vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); + vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); /* H_t = act_cell(C_t) * ogated */ act_gate_d_->Compute(gates + d3_, gates + d3_); act_cell_d_->Compute(ct, gates + d2_); diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 667a95fe1a247cf9d3d63dae74f7e0fa9c2309ca..f9064d8b2f5a4000d816b7c563aad0eab15c1566 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -371,7 +371,7 @@ void lstm_ctht_better( vtanh_d->Compute(gates, gates); vmul_d->Compute(gates, gates + d, gates + d, d); vmul_d->Compute(ct_1, gates + d2, gates + d2, d); - vadd_d->Compute(gates + d, gates + d2, ct); + vadd_d->Compute(gates + d, gates + d2, ct, d); /* H_t = act_cell(C_t) * ogated */ vtanh_d->Compute(ct, gates + d2); vmul_d->Compute(gates + d2, gates + d * 3, ht, d); @@ -695,7 +695,7 @@ TEST(JitKernel, vadd) { auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(x_data, y_data, ztgt_data); + ker->Compute(x_data, y_data, ztgt_data, d); } auto ttgte = GetCurrentUS(); @@ -723,8 +723,8 @@ void vaddrelu_better( const paddle::operators::math::jitkernel::VAddKernel>& vadd, const std::shared_ptr< const paddle::operators::math::jitkernel::VReluKernel>& vrelu, - const float* x, const float* y, float* z) { - vadd->Compute(x, y, z); + const float* x, const float* y, float* z, int d) { + vadd->Compute(x, y, z, d); vrelu->Compute(z, z); } @@ -752,7 +752,7 @@ TEST(JitKernel, vaddrelu) { auto trefe = GetCurrentUS(); auto tmkls = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vaddrelu_better(vadd, vrelu, x_data, y_data, zref_data); + vaddrelu_better(vadd, vrelu, x_data, y_data, zref_data, d); } auto tmkle = GetCurrentUS(); auto ttgts = GetCurrentUS();