diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 0aed253c80fc28560716cbcfa70f74ef9c84f9b6..7d81aee596934308763002d440f52400f45b5f20 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -33,11 +33,11 @@ namespace math { #define SIGMOID_THRESHOLD_MIN -40.0 #define SIGMOID_THRESHOLD_MAX 13.0 -#define AVX_FLOAT_BLOCK 8 +#define YMM_FLOAT_BLOCK 8 #define AVX_DOUBLE_BLOCK 4 -#define AVX2_FLOAT_BLOCK 8 +#define YMM_FLOAT_BLOCK 8 #define AVX2_DOUBLE_BLOCK 4 -#define AVX512_FLOAT_BLOCK 16 +#define ZMM_FLOAT_BLOCK 16 #define AVX512_DOUBLE_BLOCK 8 template @@ -88,7 +88,7 @@ template <> inline void vec_scal(const int n, const float a, const float* x, float* y) { #ifdef __AVX__ - constexpr int block = AVX_FLOAT_BLOCK; + constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { vec_scal(n, a, x, y); return; @@ -142,7 +142,7 @@ template <> inline void vec_bias_sub(const int n, const float a, const float* x, float* y) { #ifdef __AVX__ - constexpr int block = AVX_FLOAT_BLOCK; + constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { vec_bias_sub(n, a, x, y); return; @@ -200,7 +200,7 @@ inline void vec_cross(const int n, const float* x, const float* y, const float* z, float* out) { #ifdef __AVX__ - constexpr int block = AVX_FLOAT_BLOCK; + constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { vec_cross(n, x, y, z, out); return; @@ -257,7 +257,7 @@ template <> inline void vec_add_bias(const int n, const float a, const float* x, float* y) { #ifdef __AVX__ - constexpr int block = AVX_FLOAT_BLOCK; + constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { vec_add_bias(n, a, x, y); return; @@ -326,7 +326,7 @@ template <> inline void vec_sigmoid(const int n, const float* x, float* y) { #ifdef __AVX__ - constexpr int block = AVX_FLOAT_BLOCK; + constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { vec_sigmoid(n, x, y); return; @@ -415,7 +415,7 @@ template <> inline void vec_relu(const int n, const float* x, float* y) { #ifdef __AVX__ - constexpr int block = AVX_FLOAT_BLOCK; + constexpr int block = YMM_FLOAT_BLOCK; if (n < block * 4) { vec_relu(n, x, y); return; diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index e46f60f764ab9f1c292db339a5b38b976de5a11a..e3b600d4427672faa477341e207a5eab2bcf383d 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -41,7 +41,7 @@ void VXXJitCode::generate() { } else if (scalar_index_ == 2) { vbroadcastss(ymm_src2, ptr[param2]); } - for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) { + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { if (scalar_index_ != 1) { vmovups(ymm_src1, ptr[param1 + offset]); } @@ -57,9 +57,9 @@ void VXXJitCode::generate() { vmaxps(ymm_dst, ymm_zero, ymm_dst); } vmovups(ptr[param3 + offset], ymm_dst); - offset += sizeof(float) * AVX_FLOAT_BLOCK; + offset += sizeof(float) * YMM_FLOAT_BLOCK; } - int rest = num_ % AVX_FLOAT_BLOCK; + int rest = num_ % YMM_FLOAT_BLOCK; if (rest >= 4) { if (scalar_index_ != 1) { vmovups(xmm_src1, ptr[param1 + offset]); @@ -118,18 +118,237 @@ void VXXJitCode::generate() { ret(); } -bool ReluJitCode::init(int d) { return MayIUse(avx); } +#define ALIGN32 __attribute__((aligned(32))) +#define EXP_HIG 88.3762626647949f +#define EXP_LOW -88.3762626647949f +#define CEPHES_LOG2EF 1.44269504088896341 +#define CEPHES_EXP_C1 0.693359375 +#define CEPHES_EXP_C2 -2.12194440e-4 +#define CEPHES_EXP_P0 1.9875691500E-4 +#define CEPHES_EXP_P1 1.3981999507E-3 +#define CEPHES_EXP_P2 8.3334519073E-3 +#define CEPHES_EXP_P3 4.1665795894E-2 +#define CEPHES_EXP_P4 1.6666665459E-1 +#define CEPHES_EXP_P5 5.0000001201E-1 -void ReluJitCode::generate() { - int offset = 0; +#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val + +#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float) + +static const float exp_float_consts[] ALIGN32 = { + REPEAT_8TIMES(1.f), + REPEAT_8TIMES(2.f), + REPEAT_8TIMES(0.5f), + REPEAT_8TIMES(EXP_HIG), + REPEAT_8TIMES(EXP_LOW), + REPEAT_8TIMES(CEPHES_LOG2EF), + REPEAT_8TIMES(CEPHES_EXP_C1), + REPEAT_8TIMES(CEPHES_EXP_C2), + REPEAT_8TIMES(CEPHES_EXP_P0), + REPEAT_8TIMES(CEPHES_EXP_P1), + REPEAT_8TIMES(CEPHES_EXP_P2), + REPEAT_8TIMES(CEPHES_EXP_P3), + REPEAT_8TIMES(CEPHES_EXP_P4), + REPEAT_8TIMES(CEPHES_EXP_P5), + REPEAT_8TIMES(EXP_MAX_INPUT), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; + +static const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; +static int g_tmp_mem[16] ALIGN32 = {0}; + +bool VActJitCode::init(int d, operand_type type) { + bool ok = MayIUse(avx); + if (type == operand_type::relu) { + return ok; + } else if (type == operand_type::exp) { + // exp is slower than mkl when d >= 256 + return ok && d % 8 == 0 && d < 256; + } else { + // TODO(TJ): support more + return ok && d % 8 == 0; + } +} + +void VActJitCode::relu_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, ymm_t& ymm_zero) { + vmaxps(ymm_dst, ymm_zero, ymm_src); +} + +void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, + int fy_idx, int mask_idx, int tmp_idx) { + assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore + // check all idx can not equal + ymm_t ymm_fx = ymm_t(fx_idx); + ymm_t ymm_fy = ymm_t(fy_idx); + ymm_t ymm_mask = ymm_t(mask_idx); + ymm_t ymm_tmp = ymm_t(tmp_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); + vminps(ymm_src, ymm_src, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); + vmaxps(ymm_src, ymm_src, ymm_tmp); + // express exp(x) as exp(g + n*log(2)) + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); + vmulps(ymm_fx, ymm_src, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); + vaddps(ymm_fx, ymm_fx, ymm_tmp); + vroundps(ymm_fy, ymm_fx, 0x01); + // if greater, substract 1 + vcmpgtps(ymm_mask, ymm_fy, ymm_fx); + vmovaps(ymm_tmp, ptr[reg_ptr_global]); + vandps(ymm_mask, ymm_mask, ymm_tmp); + vsubps(ymm_fx, ymm_fy, ymm_mask); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); + vmulps(ymm_fy, ymm_fx, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); + ymm_t ymm_z = ymm_t(ymm_mask.getIdx()); + vmulps(ymm_z, ymm_fx, ymm_tmp); + vsubps(ymm_src, ymm_src, ymm_fy); + vsubps(ymm_src, ymm_src, ymm_z); + vmulps(ymm_z, ymm_src, ymm_src); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); + vmulps(ymm_dst, ymm_src, ymm_tmp); + for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; + i += (YMM_FLOAT_BLOCK * sizeof(float))) { + vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4 + vaddps(ymm_dst, ymm_dst, ymm_tmp); + vmulps(ymm_dst, ymm_dst, ymm_src); + } + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); + vaddps(ymm_dst, ymm_dst, ymm_tmp); + vmulps(ymm_dst, ymm_dst, ymm_z); + vaddps(ymm_dst, ymm_dst, ymm_src); + vmovaps(ymm_tmp, ptr[reg_ptr_global]); + vaddps(ymm_dst, ymm_dst, ymm_tmp); + // build 2^n + ymm_t ymm_int = ymm_fx; + vcvttps2dq(ymm_int, ymm_fx); + mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); + vmovdqa(ymm_tmp, ptr[reg_ptr_global]); + if (MayIUse(avx2)) { + vpaddd(ymm_int, ymm_int, ymm_tmp); + vpslld(ymm_int, ymm_int, 23); + } else if (MayIUse(avx)) { + xmm_t xtmp1 = xmm_t(ymm_int.getIdx()); + xmm_t xtmp2 = xmm_t(ymm_tmp.getIdx()); + reg64_t reg_ptr_tmp = reg_ptr_global; + mov(reg_ptr_tmp, reinterpret_cast(g_tmp_mem)); + vmovdqa(ptr[reg_ptr_tmp], ymm_int); + vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], ymm_tmp); + vpaddd(xtmp1, xtmp1, xtmp2); + vpslld(xtmp1, xtmp1, 23); + vmovdqa(ptr[reg_ptr_tmp], xtmp1); + // next 128bits + vmovdqa(xtmp1, ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)]); + vmovdqa(xtmp2, + ptr[reg_ptr_tmp + + (YMM_FLOAT_BLOCK + 4 /*xmm float block*/) * sizeof(float)]); + vpaddd(xtmp1, xtmp1, xtmp2); + vpslld(xtmp1, xtmp1, 23); + vmovdqa(ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)], xtmp1); + // load out + vmovdqa(ymm_int, ptr[reg_ptr_tmp]); + } + vmulps(ymm_dst, ymm_dst, ymm_int); + pop(reg_ptr_global); +} + +void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, + int fy_idx, int mask_idx, int tmp_idx) { + // y = 1 / (1 + e^-x) + ymm_t ymm_tmp = ymm_t(tmp_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); + vminps(ymm_src, ymm_src, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); + vmaxps(ymm_src, ymm_src, ymm_tmp); + vxorps(ymm_tmp, ymm_tmp, ymm_tmp); + vsubps(ymm_src, ymm_tmp, ymm_src); + exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vaddps(ymm_dst, ymm_dst, ymm_tmp); + vdivps(ymm_dst, ymm_tmp, ymm_dst); + pop(reg_ptr_global); +} + +void VActJitCode::tanh_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, + int fy_idx, int mask_idx, int tmp_idx) { + // y = 2 / (1 + e^(-2x)) - 1 + ymm_t ymm_tmp = ymm_t(tmp_idx); + ymm_t ymm_zero = ymm_t(mask_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); vxorps(ymm_zero, ymm_zero, ymm_zero); - for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) { + vsubps(ymm_tmp, ymm_zero, ymm_tmp); + vmulps(ymm_src, ymm_src, ymm_tmp); + exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vaddps(ymm_dst, ymm_dst, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); + vdivps(ymm_dst, ymm_tmp, ymm_dst); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vsubps(ymm_dst, ymm_dst, ymm_tmp); + pop(reg_ptr_global); +} + +void VActJitCode::generate() { + xmm_t xmm_zero = xmm_t(2); + ymm_t ymm_zero = ymm_t(2); + if (type_ == operand_type::relu) { + vxorps(ymm_zero, ymm_zero, ymm_zero); + } + int offset = 0; + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { vmovups(ymm_src, ptr[param1 + offset]); - vmaxps(ymm_dst, ymm_zero, ymm_src); + switch (type_) { + case operand_type::relu: + relu_ymm(ymm_dst, ymm_src, ymm_zero); + break; + case operand_type::exp: + exp_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); + break; + case operand_type::sigmoid: + sigmoid_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); + break; + case operand_type::tanh: + tanh_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); + break; + case operand_type::identity: + break; + default: + break; + } vmovups(ptr[param2 + offset], ymm_dst); - offset += sizeof(float) * AVX_FLOAT_BLOCK; + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + if (type_ != operand_type::relu) { + // TODO(TJ): remove me + ret(); + return; } - int rest = num_ % AVX_FLOAT_BLOCK; + int rest = num_ % YMM_FLOAT_BLOCK; if (rest >= 4) { vmovups(xmm_src, ptr[param1 + offset]); vmaxps(xmm_dst, xmm_zero, xmm_src); @@ -151,6 +370,7 @@ void ReluJitCode::generate() { } ret(); } + } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 3c242870a24c5bb29d34d4b99406c5df8cec6763..71205b211b7f571f8081640ef60222de051ff49d 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -29,7 +29,16 @@ using ymm_t = const Xbyak::Ymm; using zmm_t = const Xbyak::Zmm; using Label = Xbyak::Label; -typedef enum { mul = 0, add } operand_type; +typedef enum { + mul = 0, + add, + sub, + relu, + exp, + sigmoid, + tanh, + identity +} operand_type; // function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) class VXXJitCode : public JitCode { @@ -85,26 +94,65 @@ class VXXJitCode : public JitCode { ymm_t ymm_zero = ymm_t(3); }; -class ReluJitCode : public JitCode { +class VActJitCode : public JitCode { public: - DECLARE_JIT_CODE(ReluJitCode); - explicit ReluJitCode(int d, size_t code_size = 256 * 1024, + const char* name() const override { + std::string base = "VActJitCode"; + switch (type_) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + return base.c_str(); + } + + explicit VActJitCode(int d, operand_type type, size_t code_size = 256 * 1024, void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), num_(d) {} - static bool init(int d); + : JitCode(code_size, code_ptr), num_(d), type_(type) {} + static bool init(int d, operand_type type); void generate() override; - private: + protected: + // compute relu with ymm + void relu_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, + const Xbyak::Ymm& zero); + + // compute exp with ymm + void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, + int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + + // compute sigmoid with ymm + void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, + int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + + // compute tanh with ymm + void tanh_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, + int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + + protected: int num_; + operand_type type_; reg64_t param1{abi_param1}; reg64_t param2{abi_param2}; - xmm_t xmm_zero = xmm_t(0); - xmm_t xmm_src = xmm_t(1); - xmm_t xmm_dst = xmm_t(1); + xmm_t xmm_src = xmm_t(0); + ymm_t ymm_src = ymm_t(0); - ymm_t ymm_zero = ymm_t(0); - ymm_t ymm_src = ymm_t(1); + xmm_t xmm_dst = xmm_t(1); ymm_t ymm_dst = ymm_t(1); }; diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index cd3a45e66773c89e45e80ab77ebd925abd6cbe53..4d8d3cd79a16a3ea61c4f63da3493e105847d30b 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -29,9 +29,9 @@ namespace jitkernel { #define SIGMOID_THRESHOLD_MIN -40.0 #define SIGMOID_THRESHOLD_MAX 13.0 #define EXP_MAX_INPUT 40.0 -#define AVX_FLOAT_BLOCK 8 -#define AVX2_FLOAT_BLOCK 8 -#define AVX512_FLOAT_BLOCK 16 +#define XMM_FLOAT_BLOCK 4 +#define YMM_FLOAT_BLOCK 8 +#define ZMM_FLOAT_BLOCK 16 typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block; @@ -97,39 +97,23 @@ class VAddBiasKernel : public Kernel { template class VActKernel : public Kernel { public: - virtual void ComputeDeprecated(const T *x, T *y) const = 0; + void (*Compute)(const T *, T *, int); }; template -class VReluKernel : public VActKernel { - public: - virtual void ComputeDeprecated(const T *x, T *y) const = 0; - void (*Compute)(const T *, T *, int); -}; +class VReluKernel : public VActKernel {}; template -class VIdentityKernel : public VActKernel { - public: - virtual void ComputeDeprecated(const T *x, T *y) const = 0; -}; +class VIdentityKernel : public VActKernel {}; template -class VExpKernel : public VActKernel { - public: - virtual void ComputeDeprecated(const T *x, T *y) const = 0; -}; +class VExpKernel : public VActKernel {}; template -class VSigmoidKernel : public VActKernel { - public: - virtual void ComputeDeprecated(const T *x, T *y) const = 0; -}; +class VSigmoidKernel : public VActKernel {}; template -class VTanhKernel : public VActKernel { - public: - virtual void ComputeDeprecated(const T *x, T *y) const = 0; -}; +class VTanhKernel : public VActKernel {}; template class LSTMKernel : public Kernel { diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index cf46a210afbd4903dc3841f27765c390f721c763..36a50f20434f313e93bfa3dd2c9d46963024caf7 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -25,10 +25,6 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/mklml.h" #endif -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { @@ -128,23 +124,16 @@ void VScalMKL(const double* a, const double* x, double* y, int n) { #endif -#define DECLARE_STATIC_FUNC \ - static inline std::string name(int d) { \ - PADDLE_THROW("DType should be either float or double"); \ - } \ - static inline bool useJIT(int d) { return false; } \ - static inline bool useMKL(int d) { return false; } - /* VMUL JitKernel */ template class VMulKernelImpl : public VMulKernel { public: - DECLARE_STATIC_FUNC; + JITKERNEL_DECLARE_STATIC_FUNC; explicit VMulKernelImpl(int d) : VMulKernel() { #ifdef PADDLE_WITH_XBYAK if (useJIT(d)) { // roughly estimate the size of code - size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; + size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8; jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 0, false, sz > 4096 ? sz : 4096)); this->Compute = @@ -191,11 +180,11 @@ bool VMulKernelImpl::useMKL(int d) { template class VAddKernelImpl : public VAddKernel { public: - DECLARE_STATIC_FUNC; + JITKERNEL_DECLARE_STATIC_FUNC; explicit VAddKernelImpl(int d) : VAddKernel() { #ifdef PADDLE_WITH_XBYAK if (useJIT(d)) { - size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; + size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8; jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, false, sz > 4096 ? sz : 4096)); this->Compute = @@ -241,11 +230,11 @@ bool VAddKernelImpl::useMKL(int d) { template class VAddReluKernelImpl : public VAddReluKernel { public: - DECLARE_STATIC_FUNC; + JITKERNEL_DECLARE_STATIC_FUNC; explicit VAddReluKernelImpl(int d) : VAddReluKernel() { #ifdef PADDLE_WITH_XBYAK if (useJIT(d)) { - size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; + size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8; jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, true, sz > 4096 ? sz : 4096)); this->Compute = @@ -273,11 +262,11 @@ bool VAddReluKernelImpl::useJIT(int d) { template class VScalKernelImpl : public VScalKernel { public: - DECLARE_STATIC_FUNC; + JITKERNEL_DECLARE_STATIC_FUNC; explicit VScalKernelImpl(int d) : VScalKernel() { #ifdef PADDLE_WITH_XBYAK if (useJIT(d)) { - size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; + size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8; jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 1, false, sz > 4096 ? sz : 4096)); this->Compute = @@ -322,11 +311,11 @@ bool VScalKernelImpl::useMKL(int d) { template class VAddBiasKernelImpl : public VAddBiasKernel { public: - DECLARE_STATIC_FUNC; + JITKERNEL_DECLARE_STATIC_FUNC; explicit VAddBiasKernelImpl(int d) : VAddBiasKernel() { #ifdef PADDLE_WITH_XBYAK if (useJIT(d)) { - size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; + size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8; jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 1, false, sz > 4096 ? sz : 4096)); this->Compute = @@ -355,15 +344,15 @@ bool VAddBiasKernelImpl::useJIT(int d) { template class VReluKernelImpl : public VReluKernel { public: - DECLARE_STATIC_FUNC; + JITKERNEL_DECLARE_STATIC_FUNC; explicit VReluKernelImpl(int d) : VReluKernel() { - this->num_ = d; // TODO(TJ): remove me when ComputeDeprecated done #ifdef PADDLE_WITH_XBYAK if (useJIT(d)) { - size_t sz = 96 /*init*/ + - d / AVX_FLOAT_BLOCK * 4 /* instructions*/ * - 8 /*everage byte for each instruction*/; - jitcode_.reset(new gen::ReluJitCode(d, sz > 4096 ? sz : 4096)); + size_t sz = 96 /* init size */ + + d / YMM_FLOAT_BLOCK * 4 /* instructions */ * + 8 /* average bytes for each instruction */; + jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::relu, + sz > 4096 ? sz : 4096)); this->Compute = jitcode_->getCode(); return; } @@ -371,24 +360,32 @@ class VReluKernelImpl : public VReluKernel { this->Compute = VReluRefer; } - void ComputeDeprecated(const T* x, T* y) const override { - VReluRefer(x, y, this->num_); - } #ifdef PADDLE_WITH_XBYAK private: - std::unique_ptr jitcode_{nullptr}; + std::unique_ptr jitcode_{nullptr}; #endif }; #ifdef PADDLE_WITH_XBYAK template <> bool VReluKernelImpl::useJIT(int d) { - return gen::ReluJitCode::init(d); + return gen::VActJitCode::init(d, gen::operand_type::relu); } #endif -#undef DECLARE_STATIC_FUNC +template +inline void VIdentityRefer(const T* x, T* y, int n) {} + +/* An empty JitKernel */ +template +class VIdentityKernelImpl : public VIdentityKernel { + public: + JITKERNEL_DECLARE_STATIC_FUNC; + explicit VIdentityKernelImpl(int d) : VIdentityKernel() { + this->Compute = VIdentityRefer; + } +}; REGISTER_JITKERNEL(vmul, VMulKernel); REGISTER_JITKERNEL(vadd, VAddKernel); @@ -396,16 +393,7 @@ REGISTER_JITKERNEL(vaddrelu, VAddReluKernel); REGISTER_JITKERNEL(vscal, VScalKernel); REGISTER_JITKERNEL(vaddbias, VAddBiasKernel); REGISTER_JITKERNEL(vrelu, VReluKernel); - -/* An empty JitKernel */ -template -class VIdentityKernelImpl : public VIdentityKernel { - public: - explicit VIdentityKernelImpl(int d) : VIdentityKernel() { this->num_ = d; } - void ComputeDeprecated(const T* x, T* y) const override {} -}; - -REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel); +REGISTER_JITKERNEL(videntity, VIdentityKernel); } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc index a4861c347e44ad86a066861d3375b556302a84bc..4d26b81948238f18b097f535534fcfe9049b93c3 100644 --- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc +++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc @@ -105,14 +105,14 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ - this->end_ = this->num_ / AVX_FLOAT_BLOCK; \ - this->rest_ = this->num_ % AVX_FLOAT_BLOCK; \ + this->end_ = this->num_ / YMM_FLOAT_BLOCK; \ + this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \ } \ template <> \ void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ - INIT_ALPHA(AVX_FLOAT_BLOCK) \ + INIT_ALPHA(YMM_FLOAT_BLOCK) \ /* Use the column-major strategy to get the location of maximum score.*/ \ int seq_offset = 0; \ constexpr int state_trans_base_idx = 2; \ @@ -150,7 +150,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { max_score = _mm256_max_ps(max_score, score_v); \ trans_offset += this->num_; \ } \ - UPDATE_ALPHA(AVX_FLOAT_BLOCK) \ + UPDATE_ALPHA(YMM_FLOAT_BLOCK) \ } \ seq_offset += this->num_; \ } \ @@ -161,14 +161,14 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { CRFDecodeKernelImpl::CRFDecodeKernelImpl(int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ - this->end_ = this->num_ / AVX2_FLOAT_BLOCK; \ - this->rest_ = this->num_ % AVX2_FLOAT_BLOCK; \ + this->end_ = this->num_ / YMM_FLOAT_BLOCK; \ + this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \ } \ template <> \ void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ - INIT_ALPHA(AVX2_FLOAT_BLOCK) \ + INIT_ALPHA(YMM_FLOAT_BLOCK) \ /* Use the column-major strategy to get the location of maximum score.*/ \ int seq_offset = 0; \ constexpr int state_trans_base_idx = 2; \ @@ -196,7 +196,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { max_score = _mm256_max_ps(max_score, score_v); \ trans_offset += this->num_; \ } \ - UPDATE_ALPHA(AVX2_FLOAT_BLOCK) \ + UPDATE_ALPHA(YMM_FLOAT_BLOCK) \ } \ seq_offset += this->num_; \ } \ @@ -208,14 +208,14 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ - this->end_ = this->num_ / AVX512_FLOAT_BLOCK; \ - this->rest_ = this->num_ % AVX512_FLOAT_BLOCK; \ + this->end_ = this->num_ / ZMM_FLOAT_BLOCK; \ + this->rest_ = this->num_ % ZMM_FLOAT_BLOCK; \ } \ template <> \ void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ - INIT_ALPHA(AVX512_FLOAT_BLOCK) \ + INIT_ALPHA(ZMM_FLOAT_BLOCK) \ /* Use the column-major strategy to get the location of maximum score.*/ \ int seq_offset = 0; \ constexpr int state_trans_base_idx = 2; \ @@ -250,7 +250,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { this->num_ + j_offset), \ max_j); \ /* Calculate the offset of next step*/ \ - j_offset += AVX512_FLOAT_BLOCK; \ + j_offset += ZMM_FLOAT_BLOCK; \ if (j == this->end_ - 1) { \ if (this->rest_ > 0) { \ j_offset += last_offset; \ diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index 2ac9e1092362f60ea3d89da0c971a365b45f39ea..f2cb8fb74e5d7f8f142bff73981b324d92bb9f7c 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -16,6 +16,11 @@ limitations under the License. */ #include // for exp #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" + +#ifdef PADDLE_WITH_XBYAK +#include "paddle/fluid/operators/math/jit_code.h" +#endif + #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" #endif @@ -30,41 +35,239 @@ namespace math { namespace jitkernel { namespace jit = platform::jit; +// TODO(TJ): move refer codes to one file +// Refer code only focus on correctness +template +void VExpRefer(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +template +void VSigmoidRefer(const T* x, T* y, int n) { + // y = 1 / (1 + e^-x) + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); + } +} + +template +void VTanhRefer(const T* x, T* y, int n) { + // y = 2 * sigmoid(2x) - 1 + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoidRefer(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} + +#ifdef PADDLE_WITH_MKLML +// try to use MKL to speedup +template +void VExpMKL(const T* x, T* y, int n); + +template <> +void VExpMKL(const float* x, float* y, int n) { + platform::dynload::vsExp(n, x, y); +} + +template <> +void VExpMKL(const double* x, double* y, int n) { + platform::dynload::vdExp(n, x, y); +} + +template +void VSigmoidMKL(const T* x, T* y, int n) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(0) - y[i]; + } + VExpMKL(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } +} + +template +void VTanhMKL(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoidMKL(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} +#endif + /* VExp JitKernel */ -template +template class VExpKernelImpl : public VExpKernel { public: - explicit VExpKernelImpl(int d) : VExpKernel() { this->num_ = d; } - void ComputeDeprecated(const T* x, T* y) const override { - for (int i = 0; i < this->num_; ++i) { - y[i] = std::exp(x[i]); + JITKERNEL_DECLARE_STATIC_FUNC; + explicit VExpKernelImpl(int d) : VExpKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(d)) { + size_t sz = 96 + d / YMM_FLOAT_BLOCK * 70 * 8; + jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::exp, + sz > 4096 ? sz : 4096)); + this->Compute = jitcode_->getCode(); + return; + } +#endif +#ifdef PADDLE_WITH_MKLML + if (useMKL(d)) { + this->Compute = VExpMKL; + return; } +#endif + this->Compute = VExpRefer; } + +#ifdef PADDLE_WITH_XBYAK + + private: + std::unique_ptr jitcode_{nullptr}; +#endif }; +#ifdef PADDLE_WITH_XBYAK +template <> +bool VExpKernelImpl::useJIT(int d) { + return gen::VActJitCode::init(d, gen::operand_type::exp); +} +#endif + +#ifdef PADDLE_WITH_MKLML +template <> +bool VExpKernelImpl::useMKL(int d) { + return d > 512; +} + +template <> +bool VExpKernelImpl::useMKL(int d) { + return true; +} + +#endif + +/* VSigmoid JitKernel */ +template +class VSigmoidKernelImpl : public VSigmoidKernel { + public: + JITKERNEL_DECLARE_STATIC_FUNC; + explicit VSigmoidKernelImpl(int d) : VSigmoidKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(d)) { + size_t sz = 96 + d / YMM_FLOAT_BLOCK * 82 * 8; + jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::sigmoid, + sz > 4096 ? sz : 4096)); + this->Compute = jitcode_->getCode(); + return; + } +#endif + #ifdef PADDLE_WITH_MKLML -#define MKL_FLOAT(isa, block) \ - template <> \ - void VExpKernelImpl::ComputeDeprecated(const float* x, \ - float* y) const { \ - platform::dynload::vsExp(this->num_, x, y); \ + // strictly it's a better impl with MKL, then is refer + if (useMKL(d)) { + this->Compute = VSigmoidMKL; + return; + } +#endif + this->Compute = VSigmoidRefer; } -#define MKL_DOUBLE(isa, block) \ - template <> \ - void VExpKernelImpl::ComputeDeprecated( \ - const double* x, double* y) const { \ - platform::dynload::vdExp(this->num_, x, y); \ +#ifdef PADDLE_WITH_XBYAK + + private: + std::unique_ptr jitcode_{nullptr}; +#endif +}; + +#ifdef PADDLE_WITH_XBYAK +template <> +bool VSigmoidKernelImpl::useJIT(int d) { + return gen::VActJitCode::init(d, gen::operand_type::sigmoid); +} +#endif + +#ifdef PADDLE_WITH_MKLML +template <> +bool VSigmoidKernelImpl::useMKL(int d) { + return d > 512; +} + +template <> +bool VSigmoidKernelImpl::useMKL(int d) { + return true; +} +#endif + +/* VTanh JitKernel */ +template +class VTanhKernelImpl : public VTanhKernel { + public: + JITKERNEL_DECLARE_STATIC_FUNC; + explicit VTanhKernelImpl(int d) : VTanhKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(d)) { + size_t sz = 96 + d / YMM_FLOAT_BLOCK * 84 * 8; + jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::tanh, + sz > 4096 ? sz : 4096)); + this->Compute = jitcode_->getCode(); + return; + } +#endif + +#ifdef PADDLE_WITH_MKLML + // strictly it's a better impl with MKL, then is refer + if (useMKL(d)) { + this->Compute = VTanhMKL; + return; + } +#endif + this->Compute = VTanhRefer; } -FOR_EACH_ISA(MKL_FLOAT, kLT8); -FOR_EACH_ISA(MKL_FLOAT, kGT8LT16); -FOR_EACH_ISA(MKL_FLOAT, kGT16); -FOR_EACH_ISA_BLOCK(MKL_DOUBLE); + +#ifdef PADDLE_WITH_XBYAK + + private: + std::unique_ptr jitcode_{nullptr}; #endif +}; -namespace detail { +#ifdef PADDLE_WITH_XBYAK +template <> +bool VTanhKernelImpl::useJIT(int d) { + return gen::VActJitCode::init(d, gen::operand_type::tanh); +} +#endif -#ifdef __AVX__ +#ifdef PADDLE_WITH_MKLML +template <> +bool VTanhKernelImpl::useMKL(int d) { + return d > 512; +} + +template <> +bool VTanhKernelImpl::useMKL(int d) { + return true; +} +#endif + +REGISTER_JITKERNEL(vexp, VExpKernel); +REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel); +REGISTER_JITKERNEL(vtanh, VTanhKernel); + +namespace detail { #define ALIGN32 __attribute__((aligned(32))) @@ -195,7 +398,6 @@ __m256 ExpAVX(__m256 x) { y = _mm256_mul_ps(y, pow2n); return y; } -#endif #ifdef __AVX2__ __m256 ExpAVX2(__m256 x) { @@ -210,334 +412,6 @@ __m256 ExpAVX2(__m256 x) { #endif } // namespace detail - -#define INTRI8_FLOAT(isa, expisa) \ - template <> \ - void VExpKernelImpl::ComputeDeprecated(const float* x, \ - float* y) const { \ - __m256 tmp = _mm256_loadu_ps(x); \ - _mm256_storeu_ps(y, expisa(tmp)); \ - } - -#define INTRI16_FLOAT(isa, expisa) \ - template <> \ - void VExpKernelImpl::ComputeDeprecated(const float* x, \ - float* y) const { \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - tmp0 = expisa(tmp0); \ - tmp1 = expisa(tmp1); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ - } - -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx, detail::ExpAVX); -INTRI16_FLOAT(jit::avx, detail::ExpAVX); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2, detail::ExpAVX2); -INTRI16_FLOAT(jit::avx2, detail::ExpAVX2); -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2); -INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2); -#endif -// TODO(TJ): eq16 test and complete avx512 - -#undef INTRI8_FLOAT -#undef INTRI16_FLOAT -#undef MKL_FLOAT -#undef MKL_DOUBLE - -REGISTER_JITKERNEL_DEPRECATED(vexp, VExpKernel); - -/* VSigmoid JitKernel */ -template -class VSigmoidKernelImpl : public VSigmoidKernel { - public: - explicit VSigmoidKernelImpl(int d) : VSigmoidKernel() { - this->num_ = d; - vexp_ = KernelPool::Instance().template Get>(d); - } - void ComputeDeprecated(const T* x, T* y) const override { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < this->num_; ++i) { - y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); - y[i] = static_cast(0) - y[i]; - } - vexp_->ComputeDeprecated(y, y); - for (int i = 0; i < this->num_; ++i) { - y[i] = static_cast(1) / (static_cast(1) + y[i]); - } - } - - private: - std::shared_ptr> vexp_; -}; - -#define INTRI_SIGMOID(tmp, min, max, expisa) \ - tmp = _mm256_max_ps(tmp, min); \ - tmp = _mm256_min_ps(tmp, max); \ - tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); \ - tmp = expisa(tmp); \ - tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ - tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp) - -#define INTRI8_FLOAT(isa, expisa) \ - template <> \ - void VSigmoidKernelImpl::ComputeDeprecated( \ - const float* x, float* y) const { \ - /* TODO(TJ): try to use static const*/ \ - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ - __m256 tmp = _mm256_loadu_ps(x); \ - INTRI_SIGMOID(tmp, min, max, expisa); \ - _mm256_storeu_ps(y, tmp); \ - } - -#define INTRI16_FLOAT(isa, expisa) \ - template <> \ - void VSigmoidKernelImpl::ComputeDeprecated( \ - const float* x, float* y) const { \ - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - INTRI_SIGMOID(tmp0, min, max, expisa); \ - INTRI_SIGMOID(tmp1, min, max, expisa); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ - } - -#define INTRI_GT8LT16_FLOAT(isa, expisa) \ - template <> \ - VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ - : VSigmoidKernel() { \ - this->num_ = d; \ - this->end_ = AVX_FLOAT_BLOCK; \ - this->rest_ = d - this->end_; \ - vexp_ = \ - KernelPool::Instance().template Get>(this->rest_); \ - } \ - template <> \ - void VSigmoidKernelImpl::ComputeDeprecated( \ - const float* x, float* y) const { \ - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ - __m256 tmp = _mm256_loadu_ps(x); \ - INTRI_SIGMOID(tmp, min, max, expisa); \ - _mm256_storeu_ps(y, tmp); \ - const float min_ = SIGMOID_THRESHOLD_MIN; \ - const float max_ = SIGMOID_THRESHOLD_MAX; \ - for (int i = this->end_; i < this->num_; ++i) { \ - y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ - y[i] = 0.f - y[i]; \ - } \ - vexp_->ComputeDeprecated(y + this->end_, y + this->end_); \ - for (int i = this->end_; i < this->num_; ++i) { \ - y[i] = 1.f / (1.f + y[i]); \ - } \ - } - -#define INTRI_GT16_FLOAT(isa, expisa) \ - template <> \ - VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ - : VSigmoidKernel() { \ - this->num_ = d; \ - this->rest_ = d % AVX_FLOAT_BLOCK; \ - this->end_ = d - this->rest_; \ - vexp_ = \ - KernelPool::Instance().template Get>(this->rest_); \ - } \ - template <> \ - void VSigmoidKernelImpl::ComputeDeprecated( \ - const float* x, float* y) const { \ - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ - for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ - __m256 tmp = _mm256_loadu_ps(x + i); \ - INTRI_SIGMOID(tmp, min, max, expisa); \ - _mm256_storeu_ps(y + i, tmp); \ - } \ - const float min_ = SIGMOID_THRESHOLD_MIN; \ - const float max_ = SIGMOID_THRESHOLD_MAX; \ - for (int i = this->end_; i < this->num_; ++i) { \ - y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ - y[i] = 0.f - y[i]; \ - } \ - vexp_->ComputeDeprecated(y + this->end_, y + this->end_); \ - for (int i = this->end_; i < this->num_; ++i) { \ - y[i] = 1.f / (1.f + y[i]); \ - } \ - } - -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx, detail::ExpAVX); -INTRI16_FLOAT(jit::avx, detail::ExpAVX); -INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX); -INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2, detail::ExpAVX2); -INTRI16_FLOAT(jit::avx2, detail::ExpAVX2); -// maybe use avx at gt8lt16 and gt16 -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2); -INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2); -// maybe use avx2 at gt8lt16 and gt16 -#endif - -#undef INTRI8_FLOAT -#undef INTRI16_FLOAT -#undef INTRI_GT8LT16_FLOAT -#undef INTRI_GT16_FLOAT -#undef INTRI_VSIGMOID - -REGISTER_JITKERNEL_DEPRECATED(vsigmoid, VSigmoidKernel); - -/* VTanh JitKernel */ -template -class VTanhKernelImpl : public VTanhKernel { - public: - explicit VTanhKernelImpl(int d) : VTanhKernel() { - this->num_ = d; - vscal_ = KernelPool::Instance().template Get>(d); - vsigmoid_ = KernelPool::Instance().template Get>(d); - vaddbias_ = KernelPool::Instance().template Get>(d); - } - void ComputeDeprecated(const T* x, T* y) const override { - const T a = static_cast(2), b = static_cast(-1); - vscal_->Compute(&a, x, y, this->num_); - vsigmoid_->ComputeDeprecated(y, y); - vscal_->Compute(&a, y, y, this->num_); - vaddbias_->Compute(&b, y, y, this->num_); - } - - private: - std::shared_ptr> vscal_; - std::shared_ptr> vsigmoid_; - std::shared_ptr> vaddbias_; -}; - -#define INTRI_VTANH(tmp, expisa) \ - tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), tmp); \ - tmp = _mm256_min_ps(tmp, _mm256_set1_ps(EXP_MAX_INPUT)); \ - tmp = expisa(tmp); \ - tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ - tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \ - tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f)) - -#define INTRI8_FLOAT(isa, expisa) \ - template <> \ - void VTanhKernelImpl::ComputeDeprecated(const float* x, \ - float* y) const { \ - __m256 tmp = _mm256_loadu_ps(x); \ - INTRI_VTANH(tmp, expisa); \ - _mm256_storeu_ps(y, tmp); \ - } - -#define INTRI16_FLOAT(isa, expisa) \ - template <> \ - void VTanhKernelImpl::ComputeDeprecated(const float* x, \ - float* y) const { \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - INTRI_VTANH(tmp0, expisa); \ - INTRI_VTANH(tmp1, expisa); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ - } - -#define INTRI_GT8LT16_FLOAT(isa, expisa) \ - template <> \ - VTanhKernelImpl::VTanhKernelImpl(int d) \ - : VTanhKernel() { \ - this->num_ = d; \ - this->end_ = AVX_FLOAT_BLOCK; \ - this->rest_ = d - this->end_; \ - vscal_ = \ - KernelPool::Instance().template Get>(this->rest_); \ - vsigmoid_ = KernelPool::Instance().template Get>( \ - this->rest_); \ - vaddbias_ = KernelPool::Instance().template Get>( \ - this->rest_); \ - } \ - template <> \ - void VTanhKernelImpl::ComputeDeprecated( \ - const float* x, float* y) const { \ - __m256 tmp = _mm256_loadu_ps(x); \ - INTRI_VTANH(tmp, expisa); \ - _mm256_storeu_ps(y, tmp); \ - x += AVX_FLOAT_BLOCK; \ - y += AVX_FLOAT_BLOCK; \ - const float a = 2.f, b = -1.f; \ - vscal_->Compute(&a, x, y, this->num_); \ - vsigmoid_->ComputeDeprecated(y, y); \ - vscal_->Compute(&a, y, y, this->num_); \ - vaddbias_->Compute(&b, y, y, this->num_); \ - } - -#define INTRI_GT16_FLOAT(isa, expisa) \ - template <> \ - VTanhKernelImpl::VTanhKernelImpl(int d) \ - : VTanhKernel() { \ - this->num_ = d; \ - this->rest_ = d % AVX_FLOAT_BLOCK; \ - this->end_ = d - this->rest_; \ - vscal_ = \ - KernelPool::Instance().template Get>(this->rest_); \ - vsigmoid_ = KernelPool::Instance().template Get>( \ - this->rest_); \ - vaddbias_ = KernelPool::Instance().template Get>( \ - this->rest_); \ - } \ - template <> \ - void VTanhKernelImpl::ComputeDeprecated(const float* x, \ - float* y) const { \ - for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ - __m256 tmp = _mm256_loadu_ps(x + i); \ - INTRI_VTANH(tmp, expisa); \ - _mm256_storeu_ps(y + i, tmp); \ - } \ - x += this->end_; \ - y += this->end_; \ - const float a = 2.f, b = -1.f; \ - vscal_->Compute(&a, x, y, this->num_); \ - vsigmoid_->ComputeDeprecated(y, y); \ - vscal_->Compute(&a, y, y, this->num_); \ - vaddbias_->Compute(&b, y, y, this->num_); \ - } - -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx, detail::ExpAVX); -INTRI16_FLOAT(jit::avx, detail::ExpAVX); -INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX); -INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2, detail::ExpAVX2); -INTRI16_FLOAT(jit::avx2, detail::ExpAVX2); -// maybe use avx at gt8lt16 and gt16 -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2); -INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2); -// maybe use avx at gt8lt16 and gt16 -#endif - -#undef INTRI8_FLOAT -#undef INTRI16_FLOAT -#undef INTRI_GT8LT16_FLOAT -#undef INTRI_GT16_FLOAT -#undef INTRI_VTANH - -REGISTER_JITKERNEL_DEPRECATED(vtanh, VTanhKernel); - -#undef JITKERNEL_NEW_ACT_IMPL - } // namespace jitkernel } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h index a8169ea48ae3eee5a8cba291be4496c4c6074221..8acf60cfbfd3d47ad52862241b7635aba6982ebf 100644 --- a/paddle/fluid/operators/math/jit_kernel_macro.h +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -15,12 +15,20 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { namespace math { namespace jitkernel { +#define JITKERNEL_DECLARE_STATIC_FUNC \ + static inline std::string name(int d) { \ + PADDLE_THROW("DType should be either float or double"); \ + } \ + static inline bool useJIT(int d) { return false; } \ + static inline bool useMKL(int d) { return false; } + #define JITKERNEL_DEFINE_NAME(ker_key, ker_class) \ template <> \ std::string ker_class##Impl::name(int d) { \ @@ -86,17 +94,17 @@ namespace jitkernel { namespace jit = platform::jit; // TODO(TJ): below defines are deprecated, would be remove recently -#define SEARCH_BLOCK(macro_, ker, dtype, isa) \ - if (d < AVX_FLOAT_BLOCK) { \ - macro_(ker, dtype, isa, kLT8); \ - } else if (d == AVX_FLOAT_BLOCK) { \ - macro_(ker, dtype, isa, kEQ8); \ - } else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \ - macro_(ker, dtype, isa, kGT8LT16); \ - } else if (d == AVX512_FLOAT_BLOCK) { \ - macro_(ker, dtype, isa, kEQ16); \ - } else { \ - macro_(ker, dtype, isa, kGT16); \ +#define SEARCH_BLOCK(macro_, ker, dtype, isa) \ + if (d < YMM_FLOAT_BLOCK) { \ + macro_(ker, dtype, isa, kLT8); \ + } else if (d == YMM_FLOAT_BLOCK) { \ + macro_(ker, dtype, isa, kEQ8); \ + } else if (d > YMM_FLOAT_BLOCK && d < ZMM_FLOAT_BLOCK) { \ + macro_(ker, dtype, isa, kGT8LT16); \ + } else if (d == ZMM_FLOAT_BLOCK) { \ + macro_(ker, dtype, isa, kEQ16); \ + } else { \ + macro_(ker, dtype, isa, kGT16); \ } #define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index 926221f0a75c461e275a72f16b4339ae28a8e988..e79b0400ab75d1488a26450bd8cde4a0979fc995 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -175,26 +175,26 @@ class LSTMKernelImpl : public LSTMKernel { void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, T* checked) const override { // gates: W_ch, W_ih, W_fh, W_oh - act_gate_d3_->ComputeDeprecated(gates + d_, gates + d_); + act_gate_d3_->Compute(gates + d_, gates + d_, d3_); /* C_t = C_t-1 * fgated + cand_gated * igated */ - act_cand_d_->ComputeDeprecated(gates, gates); + act_cand_d_->Compute(gates, gates, d_); vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->ComputeDeprecated(ct, gates + d2_); + act_cell_d_->Compute(ct, gates + d2_, d_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { /* C_t = igated * cgated*/ - act_gate_d_->ComputeDeprecated(gates + d_, gates + d_); - act_cand_d_->ComputeDeprecated(gates, gates); + act_gate_d_->Compute(gates + d_, gates + d_, d_); + act_cand_d_->Compute(gates, gates, d_); vmul_d_->Compute(gates, gates + d_, ct, d_); /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_); - act_cell_d_->ComputeDeprecated(ct, gates + d2_); + act_gate_d_->Compute(gates + d3_, gates + d3_, d_); + act_cell_d_->Compute(ct, gates + d2_, d_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } @@ -292,32 +292,32 @@ class PeepholeKernelImpl : public LSTMKernel { vmul_d_->Compute(wp_data, ct_1, checked, d_); vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_); vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_); - act_gate_d2_->ComputeDeprecated(gates + d_, gates + d_); + act_gate_d2_->Compute(gates + d_, gates + d_, d2_); /* C_t = C_t-1 * fgated + cand_gated * igated*/ - act_cand_d_->ComputeDeprecated(gates, gates); + act_cand_d_->Compute(gates, gates, d_); vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); /* get ogated*/ vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); - act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_); + act_gate_d_->Compute(gates + d3_, gates + d3_, d_); /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->ComputeDeprecated(ct, gates + d2_); + act_cell_d_->Compute(ct, gates + d2_, d_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { /* C_t = igated * cgated*/ - act_gate_d_->ComputeDeprecated(gates + d_, gates + d_); - act_cand_d_->ComputeDeprecated(gates, gates); + act_gate_d_->Compute(gates + d_, gates + d_, d_); + act_cand_d_->Compute(gates, gates, d_); vmul_d_->Compute(gates, gates + d_, ct, d_); /* get outgated, put W_oc * C_t on igated */ vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_); - act_cell_d_->ComputeDeprecated(ct, gates + d2_); + act_gate_d_->Compute(gates + d3_, gates + d3_, d_); + act_cell_d_->Compute(ct, gates + d2_, d_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } @@ -376,20 +376,20 @@ class GRUKernelImpl : public GRUKernel { } void ComputeH1(T* gates, T* ht) const override { - act_gate_d_->ComputeDeprecated(gates, gates); - act_state_d_->ComputeDeprecated(gates + d2_, gates + d2_); + act_gate_d_->Compute(gates, gates, d_); + act_state_d_->Compute(gates + d2_, gates + d2_, d_); vmul_d_->Compute(gates, gates + d2_, ht, d_); } void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override { // W: {W_update, W_reset; W_state} - act_gate_d2_->ComputeDeprecated(gates, gates); + act_gate_d2_->Compute(gates, gates, d2_); vmul_d_->Compute(ht_1, gates + d_, ht, d_); } void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override { T* y = gates + d2_; - act_state_d_->ComputeDeprecated(y, y); + act_state_d_->Compute(y, y, d_); // out = zt*ht~ + (1-zt)*ht_1 for (int i = 0; i < d_; ++i) { ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 5e1f91ffae03796be2817d0461900c2512938c77..5a6f87fe1f7d10d65d03d78c168d61719cec772e 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -181,7 +181,8 @@ TEST(JitKernel, vexp) { auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->ComputeDeprecated(x_data, ztgt_data); + // ker->Compute(x_data, ztgt_data); + ker->Compute(x_data, ztgt_data, d); } auto ttgte = GetCurrentUS(); @@ -222,7 +223,7 @@ void vsigmoid_better( y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = 0.f - y[i]; } - vexp->ComputeDeprecated(y, y); + vexp->Compute(y, y, n); for (int i = 0; i < n; ++i) { y[i] = 1.f / (1.f + y[i]); } @@ -253,7 +254,7 @@ TEST(JitKernel, vsigmoid) { auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->ComputeDeprecated(x_data, ztgt_data); + ker->Compute(x_data, ztgt_data, d); } auto ttgte = GetCurrentUS(); @@ -287,7 +288,7 @@ void vtanh_better( const int n, const float* x, float* y) { const float a = 2.f, b = -1.f; vscal->Compute(&a, x, y, n); - vsigmoid->ComputeDeprecated(y, y); + vsigmoid->Compute(y, y, n); vscal->Compute(&a, y, y, n); vaddbias->Compute(&b, y, y, n); } @@ -321,7 +322,7 @@ TEST(JitKernel, vtanh) { auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->ComputeDeprecated(x_data, ztgt_data); + ker->Compute(x_data, ztgt_data, d); } auto ttgte = GetCurrentUS(); @@ -344,8 +345,8 @@ void lstm_ctht_ref( const std::shared_ptr< const paddle::operators::math::jitkernel::VExpKernel>& vexp_1, const int d, float* gates, const float* ct_1, float* ct, float* ht) { - vsigmoid_3d->ComputeDeprecated(gates + d, gates + d); - vtanh_d->ComputeDeprecated(gates, gates); + vsigmoid_3d->Compute(gates + d, gates + d, 3 * d); + vtanh_d->Compute(gates, gates, d); const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; const float min = SIGMOID_THRESHOLD_MIN; const float max = SIGMOID_THRESHOLD_MAX; @@ -355,7 +356,7 @@ void lstm_ctht_ref( // H_t = act_cell(C_t) * ogated float tmp = ct[k] * 2; tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); - vexp_1->ComputeDeprecated(&tmp, &tmp); + vexp_1->Compute(&tmp, &tmp, 1); tmp = 2.f / (1.f + tmp) - 1.f; ht[k] = tmp * o[k]; } @@ -373,13 +374,13 @@ void lstm_ctht_better( const paddle::operators::math::jitkernel::VAddKernel>& vadd_d, const int d, float* gates, const float* ct_1, float* ct, float* ht) { int d2 = d * 2; - vsigmoid_3d->ComputeDeprecated(gates + d, gates + d); - vtanh_d->ComputeDeprecated(gates, gates); + vsigmoid_3d->Compute(gates + d, gates + d, 3 * d); + vtanh_d->Compute(gates, gates, d); vmul_d->Compute(gates, gates + d, gates + d, d); vmul_d->Compute(ct_1, gates + d2, gates + d2, d); vadd_d->Compute(gates + d, gates + d2, ct, d); /* H_t = act_cell(C_t) * ogated */ - vtanh_d->ComputeDeprecated(ct, gates + d2); + vtanh_d->Compute(ct, gates + d2, d); vmul_d->Compute(gates + d2, gates + d * 3, ht, d); } @@ -736,7 +737,7 @@ void vaddrelu_better( const paddle::operators::math::jitkernel::VReluKernel>& vrelu, const float* x, const float* y, float* z, int d) { vadd->Compute(x, y, z, d); - vrelu->ComputeDeprecated(z, z); + vrelu->Compute(z, z, d); } TEST(JitKernel, vaddrelu) {