From e6d8aca3bf249df98bb2a3e27c2bd5663cc7ebd8 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 8 Oct 2018 15:37:03 +0800 Subject: [PATCH] refine code and fix --- paddle/fluid/operators/math/jit_kernel.h | 12 +- .../fluid/operators/math/jit_kernel_blas.cc | 190 +++++++++-------- paddle/fluid/operators/math/jit_kernel_exp.cc | 201 +++++++++--------- .../fluid/operators/math/jit_kernel_macro.h | 2 +- .../fluid/operators/math/jit_kernel_test.cc | 22 +- 5 files changed, 214 insertions(+), 213 deletions(-) diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 8a247da4503..173cc368874 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -64,32 +64,32 @@ class KernelPool { template class VMulKernel : public Kernel { public: - virtual void Compute(const int n, const T *x, const T *y, T *z) const = 0; + virtual void Compute(const T *x, const T *y, T *z) const = 0; }; template class VAddKernel : public Kernel { public: - virtual void Compute(const int n, const T *x, const T *y, T *z) const = 0; + virtual void Compute(const T *x, const T *y, T *z) const = 0; }; template class VScalKernel : public Kernel { public: - virtual void Compute(const int n, const T a, const T *x, T *y) const = 0; - virtual void Compute(const int n, const T a, T *x) const = 0; + virtual void Compute(const T a, const T *x, T *y) const = 0; + virtual void Compute(const T a, T *x) const = 0; }; template class VAddBiasKernel : public Kernel { public: - virtual void Compute(const int n, const T a, const T *x, T *y) const = 0; + virtual void Compute(const T a, const T *x, T *y) const = 0; }; template class VExpKernel : public Kernel { public: - virtual void Compute(const int n, const T *x, T *y) const = 0; + virtual void Compute(const T *x, T *y) const = 0; }; template diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index d0ee97a43c6..4ea1a8cd5c6 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -34,41 +34,42 @@ namespace jit = platform::jit; template class VMulKernelImpl : public VMulKernel { public: - void Compute(const int n, const T* x, const T* y, T* z) const override { - for (int i = 0; i < n; ++i) { + explicit VMulKernelImpl(int d) : VMulKernel() { this->num_ = d; } + void Compute(const T* x, const T* y, T* z) const override { + for (int i = 0; i < this->num_; ++i) { z[i] = x[i] * y[i]; } } }; #ifdef PADDLE_WITH_MKLML -#define MKL_FLOAT(isa, block) \ - template <> \ - void VMulKernelImpl::Compute( \ - const int n, const float* x, const float* y, float* z) const { \ - platform::dynload::vsMul(n, x, y, z); \ +#define MKL_FLOAT(isa, block) \ + template <> \ + void VMulKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + platform::dynload::vsMul(this->num_, x, y, z); \ } -#define MKL_DOUBLE(isa, block) \ - template <> \ - void VMulKernelImpl::Compute( \ - const int n, const double* x, const double* y, double* z) const { \ - platform::dynload::vdMul(n, x, y, z); \ +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VMulKernelImpl::Compute( \ + const double* x, const double* y, double* z) const { \ + platform::dynload::vdMul(this->num_, x, y, z); \ } FOR_EACH_ISA(MKL_FLOAT, kGT16); FOR_EACH_ISA_BLOCK(MKL_DOUBLE); #endif -#define INTRI8_FLOAT(isa) \ - template <> \ - void VMulKernelImpl::Compute( \ - const int n, const float* x, const float* y, float* z) const { \ - __m256 
tmpx, tmpy; \ - tmpx = _mm256_loadu_ps(x); \ - tmpy = _mm256_loadu_ps(y); \ - tmpx = _mm256_mul_ps(tmpx, tmpy); \ - _mm256_storeu_ps(z, tmpx); \ +#define INTRI8_FLOAT(isa) \ + template <> \ + void VMulKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 tmpx, tmpy; \ + tmpx = _mm256_loadu_ps(x); \ + tmpy = _mm256_loadu_ps(y); \ + tmpx = _mm256_mul_ps(tmpx, tmpy); \ + _mm256_storeu_ps(z, tmpx); \ } // avx > for > mkl @@ -90,41 +91,42 @@ INTRI8_FLOAT(jit::avx512f); template class VAddKernelImpl : public VAddKernel { public: - void Compute(const int n, const T* x, const T* y, T* z) const override { - for (int i = 0; i < n; ++i) { + explicit VAddKernelImpl(int d) : VAddKernel() { this->num_ = d; } + void Compute(const T* x, const T* y, T* z) const override { + for (int i = 0; i < this->num_; ++i) { z[i] = x[i] + y[i]; } } }; #ifdef PADDLE_WITH_MKLML -#define MKL_FLOAT(isa, block) \ - template <> \ - void VAddKernelImpl::Compute( \ - const int n, const float* x, const float* y, float* z) const { \ - platform::dynload::vsAdd(n, x, y, z); \ +#define MKL_FLOAT(isa, block) \ + template <> \ + void VAddKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + platform::dynload::vsAdd(this->num_, x, y, z); \ } -#define MKL_DOUBLE(isa, block) \ - template <> \ - void VAddKernelImpl::Compute( \ - const int n, const double* x, const double* y, double* z) const { \ - platform::dynload::vdAdd(n, x, y, z); \ +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VAddKernelImpl::Compute( \ + const double* x, const double* y, double* z) const { \ + platform::dynload::vdAdd(this->num_, x, y, z); \ } FOR_EACH_ISA(MKL_FLOAT, kGT16); FOR_EACH_ISA_BLOCK(MKL_DOUBLE); #endif -#define INTRI8_FLOAT(isa) \ - template <> \ - void VAddKernelImpl::Compute( \ - const int n, const float* x, const float* y, float* z) const { \ - __m256 tmpx, tmpy; \ - tmpx = _mm256_loadu_ps(x); \ - tmpy = _mm256_loadu_ps(y); \ - tmpx = _mm256_add_ps(tmpx, tmpy); \ - _mm256_storeu_ps(z, tmpx); \ +#define INTRI8_FLOAT(isa) \ + template <> \ + void VAddKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 tmpx, tmpy; \ + tmpx = _mm256_loadu_ps(x); \ + tmpy = _mm256_loadu_ps(y); \ + tmpx = _mm256_add_ps(tmpx, tmpy); \ + _mm256_storeu_ps(z, tmpx); \ } #ifdef __AVX__ INTRI8_FLOAT(jit::avx); @@ -145,56 +147,57 @@ INTRI8_FLOAT(jit::avx512f); template class VScalKernelImpl : public VScalKernel { public: - void Compute(const int n, const T a, const T* x, T* y) const override { - for (int i = 0; i < n; ++i) { + explicit VScalKernelImpl(int d) : VScalKernel() { this->num_ = d; } + void Compute(const T a, const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { y[i] = a * x[i]; } } - void Compute(const int n, const T a, T* x) const override { - for (int i = 0; i < n; ++i) { + void Compute(const T a, T* x) const override { + for (int i = 0; i < this->num_; ++i) { x[i] = a * x[i]; } } }; #ifdef PADDLE_WITH_MKLML -#define MKL_FLOAT(isa, block) \ - template <> \ - void VScalKernelImpl::Compute(const int n, const float a, \ - float* x) const { \ - platform::dynload::cblas_sscal(n, a, x, 1); \ +#define MKL_FLOAT(isa, block) \ + template <> \ + void VScalKernelImpl::Compute(const float a, float* x) \ + const { \ + platform::dynload::cblas_sscal(this->num_, a, x, 1); \ } -#define MKL_DOUBLE(isa, block) \ - template <> \ - void VScalKernelImpl::Compute( \ - const int n, const double a, double* x) const { \ - platform::dynload::cblas_dscal(n, a, x, 1); \ 
+#define MKL_DOUBLE(isa, block) \ + template <> \ + void VScalKernelImpl::Compute(const double a, double* x) \ + const { \ + platform::dynload::cblas_dscal(this->num_, a, x, 1); \ } FOR_EACH_ISA(MKL_FLOAT, kGT16); FOR_EACH_ISA_BLOCK(MKL_DOUBLE); #endif -#define INTRI8_FLOAT(isa) \ - template <> \ - void VScalKernelImpl::Compute( \ - const int n, const float a, const float* x, float* y) const { \ - __m256 tmp; \ - __m256 scalar = _mm256_set1_ps(a); \ - tmp = _mm256_loadu_ps(x); \ - tmp = _mm256_mul_ps(tmp, scalar); \ - _mm256_storeu_ps(y, tmp); \ +#define INTRI8_FLOAT(isa) \ + template <> \ + void VScalKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp; \ + __m256 scalar = _mm256_set1_ps(a); \ + tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(y, tmp); \ } -#define INTRI8_INPLACE_FLOAT(isa) \ - template <> \ - void VScalKernelImpl::Compute(const int n, const float a, \ - float* x) const { \ - __m256 tmp; \ - __m256 scalar = _mm256_set1_ps(a); \ - tmp = _mm256_loadu_ps(x); \ - tmp = _mm256_mul_ps(tmp, scalar); \ - _mm256_storeu_ps(x, tmp); \ +#define INTRI8_INPLACE_FLOAT(isa) \ + template <> \ + void VScalKernelImpl::Compute(const float a, float* x) \ + const { \ + __m256 tmp; \ + __m256 scalar = _mm256_set1_ps(a); \ + tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(x, tmp); \ } #ifdef __AVX__ @@ -220,32 +223,33 @@ INTRI8_INPLACE_FLOAT(jit::avx512f); template class VAddBiasKernelImpl : public VAddBiasKernel { public: - void Compute(const int n, const T a, const T* x, T* y) const override { - for (int i = 0; i < n; ++i) { + explicit VAddBiasKernelImpl(int d) : VAddBiasKernel() { this->num_ = d; } + void Compute(const T a, const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { y[i] = x[i] + a; } } }; -#define INTRI8_FLOAT(isa) \ - template <> \ - void VAddBiasKernelImpl::Compute( \ - const int n, const float a, const float* x, float* y) const { \ - __m256 tmp = _mm256_loadu_ps(x); \ - tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a)); \ - _mm256_storeu_ps(y, tmp); \ +#define INTRI8_FLOAT(isa) \ + template <> \ + void VAddBiasKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a)); \ + _mm256_storeu_ps(y, tmp); \ } -#define INTRI16_FLOAT(isa) \ - template <> \ - void VAddBiasKernelImpl::Compute( \ - const int n, const float a, const float* x, float* y) const { \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a)); \ - tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a)); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ +#define INTRI16_FLOAT(isa) \ + template <> \ + void VAddBiasKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a)); \ + tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a)); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ } #ifdef __AVX__ diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index ca4c4f4a42a..7e28a3a1877 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -40,26 +40,27 @@ namespace jit = platform::jit; template class VExpKernelImpl : public VExpKernel { public: - void 
Compute(const int n, const T* x, T* y) const override { - for (int i = 0; i < n; ++i) { + explicit VExpKernelImpl(int d) : VExpKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { y[i] = std::exp(x[i]); } } }; #ifdef PADDLE_WITH_MKLML -#define MKL_FLOAT(isa, block) \ - template <> \ - void VExpKernelImpl::Compute(const int n, const float* x, \ - float* y) const { \ - platform::dynload::vsExp(n, x, y); \ +#define MKL_FLOAT(isa, block) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + platform::dynload::vsExp(this->num_, x, y); \ } -#define MKL_DOUBLE(isa, block) \ - template <> \ - void VExpKernelImpl::Compute( \ - const int n, const double* x, double* y) const { \ - platform::dynload::vdExp(n, x, y); \ +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VExpKernelImpl::Compute(const double* x, double* y) \ + const { \ + platform::dynload::vdExp(this->num_, x, y); \ } FOR_EACH_ISA(MKL_FLOAT, kLT8); FOR_EACH_ISA(MKL_FLOAT, kGT8LT16); @@ -67,24 +68,24 @@ FOR_EACH_ISA(MKL_FLOAT, kGT16); FOR_EACH_ISA_BLOCK(MKL_DOUBLE); #endif -#define INTRI8_FLOAT(isa) \ - template <> \ - void VExpKernelImpl::Compute(const int n, const float* x, \ - float* y) const { \ - __m256 tmp = _mm256_loadu_ps(x); \ - _mm256_storeu_ps(y, detail::Exp(tmp)); \ +#define INTRI8_FLOAT(isa) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + _mm256_storeu_ps(y, detail::Exp(tmp)); \ } -#define INTRI16_FLOAT(isa) \ - template <> \ - void VExpKernelImpl::Compute(const int n, const float* x, \ - float* y) const { \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - tmp0 = detail::Exp(tmp0); \ - tmp1 = detail::Exp(tmp1); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ +#define INTRI16_FLOAT(isa) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = detail::Exp(tmp0); \ + tmp1 = detail::Exp(tmp1); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ } #ifdef __AVX__ @@ -123,7 +124,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel { y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = static_cast(0) - y[i]; } - vexp_->Compute(this->num_, y, y); + vexp_->Compute(y, y); for (int i = 0; i < this->num_; ++i) { y[i] = static_cast(1) / (static_cast(1) + y[i]); } @@ -166,64 +167,66 @@ class VSigmoidKernelImpl : public VSigmoidKernel { _mm256_storeu_ps(y + 8, tmp1); \ } -#define INTRI_GT8LT16_FLOAT(isa) \ - template <> \ - VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ - : VSigmoidKernel() { \ - this->num_ = d; \ - this->end_ = AVX_FLOAT_BLOCK; \ - this->rest_ = d - this->end_; \ - vexp_ = KernelPool::Instance().template Get>(d); \ - } \ - template <> \ - void VSigmoidKernelImpl::Compute(const float* x, \ - float* y) const { \ - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ - __m256 tmp = _mm256_loadu_ps(x); \ - INTRI_SIGMOID(tmp, min, max); \ - _mm256_storeu_ps(y, tmp); \ - const float min_ = SIGMOID_THRESHOLD_MIN; \ - const float max_ = SIGMOID_THRESHOLD_MAX; \ - for (int i = this->end_; i < this->num_; ++i) { \ - y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? 
max_ : x[i]); \ - y[i] = 0.f - y[i]; \ - } \ - vexp_->Compute(this->rest_, y + this->end_, y + this->end_); \ - for (int i = this->end_; i < this->num_; ++i) { \ - y[i] = 1.f / (1.f + y[i]); \ - } \ +#define INTRI_GT8LT16_FLOAT(isa) \ + template <> \ + VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ + : VSigmoidKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - this->end_; \ + vexp_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + } \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_SIGMOID(tmp, min, max); \ + _mm256_storeu_ps(y, tmp); \ + const float min_ = SIGMOID_THRESHOLD_MIN; \ + const float max_ = SIGMOID_THRESHOLD_MAX; \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ + y[i] = 0.f - y[i]; \ + } \ + vexp_->Compute(y + this->end_, y + this->end_); \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = 1.f / (1.f + y[i]); \ + } \ } -#define INTRI_GT16_FLOAT(isa) \ - template <> \ - VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ - : VSigmoidKernel() { \ - this->num_ = d; \ - this->rest_ = d % AVX_FLOAT_BLOCK; \ - this->end_ = d - this->rest_; \ - vexp_ = KernelPool::Instance().template Get>(d); \ - } \ - template <> \ - void VSigmoidKernelImpl::Compute(const float* x, \ - float* y) const { \ - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ - for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ - __m256 tmp = _mm256_loadu_ps(x + i); \ - INTRI_SIGMOID(tmp, min, max); \ - _mm256_storeu_ps(y + i, tmp); \ - } \ - const float min_ = SIGMOID_THRESHOLD_MIN; \ - const float max_ = SIGMOID_THRESHOLD_MAX; \ - for (int i = this->end_; i < this->num_; ++i) { \ - y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ - y[i] = 0.f - y[i]; \ - } \ - vexp_->Compute(this->rest_, y + this->end_, y + this->end_); \ - for (int i = this->end_; i < this->num_; ++i) { \ - y[i] = 1.f / (1.f + y[i]); \ - } \ +#define INTRI_GT16_FLOAT(isa) \ + template <> \ + VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ + : VSigmoidKernel() { \ + this->num_ = d; \ + this->rest_ = d % AVX_FLOAT_BLOCK; \ + this->end_ = d - this->rest_; \ + vexp_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + } \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + INTRI_SIGMOID(tmp, min, max); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + const float min_ = SIGMOID_THRESHOLD_MIN; \ + const float max_ = SIGMOID_THRESHOLD_MAX; \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? 
max_ : x[i]); \ + y[i] = 0.f - y[i]; \ + } \ + vexp_->Compute(y + this->end_, y + this->end_); \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = 1.f / (1.f + y[i]); \ + } \ } #ifdef __AVX__ @@ -251,12 +254,7 @@ INTRI16_FLOAT(jit::avx512f); #undef INTRI_GT16_FLOAT #undef INTRI_VSIGMOID -#define JITKERNEL_NEW_ACT_IMPL(ker, dtype, isa, k) \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>(d)) - -REGISTER_JITKERNEL_ARGS(vsigmoid, VSigmoidKernel, JITKERNEL_DECLARE, - JITKERNEL_KEY, JITKERNEL_NEW_ACT_IMPL); +REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel); /* VTanh JitKernel */ template @@ -269,10 +267,10 @@ class VTanhKernelImpl : public VTanhKernel { vaddbias_ = KernelPool::Instance().template Get>(d); } void Compute(const T* x, T* y) const override { - vscal_->Compute(this->num_, static_cast(2), x, y); + vscal_->Compute(static_cast(2), x, y); vsigmoid_->Compute(y, y); - vscal_->Compute(this->num_, static_cast(2), y); - vaddbias_->Compute(this->num_, static_cast(-1), y, y); + vscal_->Compute(static_cast(2), y); + vaddbias_->Compute(static_cast(-1), y, y); } private: @@ -332,10 +330,10 @@ class VTanhKernelImpl : public VTanhKernel { _mm256_storeu_ps(y, tmp); \ x += AVX_FLOAT_BLOCK; \ y += AVX_FLOAT_BLOCK; \ - vscal_->Compute(this->rest_, 2.f, x, y); \ + vscal_->Compute(2.f, x, y); \ vsigmoid_->Compute(y, y); \ - vscal_->Compute(this->rest_, 2.f, y); \ - vaddbias_->Compute(this->rest_, -1.f, y, y); \ + vscal_->Compute(2.f, y); \ + vaddbias_->Compute(-1.f, y, y); \ } #define INTRI_GT16_FLOAT(isa) \ @@ -362,10 +360,10 @@ class VTanhKernelImpl : public VTanhKernel { } \ x += this->end_; \ y += this->end_; \ - vscal_->Compute(this->rest_, 2.f, x, y); \ + vscal_->Compute(2.f, x, y); \ vsigmoid_->Compute(y, y); \ - vscal_->Compute(this->rest_, 2.f, y); \ - vaddbias_->Compute(this->rest_, -1.f, y, y); \ + vscal_->Compute(2.f, y); \ + vaddbias_->Compute(-1.f, y, y); \ } #ifdef __AVX__ @@ -391,8 +389,7 @@ INTRI16_FLOAT(jit::avx512f); #undef INTRI_GT16_FLOAT #undef INTRI_VTANH -REGISTER_JITKERNEL_ARGS(vtanh, VTanhKernel, JITKERNEL_DECLARE, JITKERNEL_KEY, - JITKERNEL_NEW_ACT_IMPL); +REGISTER_JITKERNEL(vtanh, VTanhKernel); #undef JITKERNEL_NEW_ACT_IMPL diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h index 2b63c695243..d8e55f26735 100644 --- a/paddle/fluid/operators/math/jit_kernel_macro.h +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -57,7 +57,7 @@ namespace jit = platform::jit; #define JITKERNEL_NEW_IMPL(ker, dtype, isa, k) \ p = std::dynamic_pointer_cast>( \ - std::make_shared>()) + std::make_shared>(d)) #define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key, \ marco_declare, macro_key, macro_impl) \ diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 290605749f5..5e9e5c5b292 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -73,7 +73,7 @@ TEST(JitKernel, vaddbias) { auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(d, a, x_data, ztgt_data); + ker->Compute(a, x_data, ztgt_data); } auto ttgte = GetCurrentUS(); @@ -99,7 +99,7 @@ void vexp_mkl(const int n, const float* x, float* y) { TEST(JitKernel, vexp) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 128}) { + for (int d : {7, 8, 15, 16, 30, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data(), -2.f, 
2.f); @@ -124,7 +124,7 @@ TEST(JitKernel, vexp) { auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(d, x_data, ztgt_data); + ker->Compute(x_data, ztgt_data); } auto ttgte = GetCurrentUS(); @@ -164,7 +164,7 @@ void vsigmoid_better( y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = 0.f - y[i]; } - vexp->Compute(n, y, y); + vexp->Compute(y, y); for (int i = 0; i < n; ++i) { y[i] = 1.f / (1.f + y[i]); } @@ -226,10 +226,10 @@ void vtanh_better( const paddle::operators::math::jitkernel::VAddBiasKernel>& vaddbias, const int n, const float* x, float* y) { - vscal->Compute(n, 2.f, x, y); + vscal->Compute(2.f, x, y); vsigmoid->Compute(y, y); - vscal->Compute(n, 2.f, y); - vaddbias->Compute(n, -1.f, y, y); + vscal->Compute(2.f, y); + vaddbias->Compute(-1.f, y, y); } TEST(JitKernel, vtanh) { @@ -359,12 +359,12 @@ TEST(JitKernel, vscal) { auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(d, a, x_data, ztgt_data); + ker->Compute(a, x_data, ztgt_data); } auto ttgte = GetCurrentUS(); auto ttgts1 = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(d, a, y_data); + ker->Compute(a, y_data); } auto ttgte1 = GetCurrentUS(); VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat @@ -444,7 +444,7 @@ TEST(JitKernel, vmul) { auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(d, x_data, y_data, ztgt_data); + ker->Compute(x_data, y_data, ztgt_data); } auto ttgte = GetCurrentUS(); @@ -523,7 +523,7 @@ TEST(JitKernel, vadd) { auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(d, x_data, y_data, ztgt_data); + ker->Compute(x_data, y_data, ztgt_data); } auto ttgte = GetCurrentUS(); -- GitLab
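
The heart of this patch is an API refactor: the element count `n` is dropped from every `Compute(...)` signature and is instead stored in the kernel object as `num_`, fixed at construction time when the kernel is fetched from `KernelPool` by size (`JITKERNEL_NEW_IMPL` now forwards `d` into the constructor). It also fixes the remainder handling in the sigmoid kernels: the cached `vexp_` used for the scalar tail is now created with `this->rest_` elements instead of the full `d`, matching the `vexp_->Compute(y + this->end_, y + this->end_)` calls that cover only the tail. The sketch below is a minimal, self-contained illustration of the size-in-constructor pattern; the class and pool here are simplified stand-ins (scalar reference path only, no dtype/ISA/block keys), not Paddle's actual implementation.

// Stand-alone sketch of the pattern this patch adopts (illustrative names,
// not Paddle's real classes): the vector length is bound once, when the
// kernel is constructed, so Compute() no longer takes `n` on every call.
#include <cmath>
#include <cstdio>
#include <memory>
#include <unordered_map>

template <typename T>
class VExpKernel {
 public:
  explicit VExpKernel(int d) : num_(d) {}
  // The size is implicit in the kernel object; this mirrors the reference
  // (non-vectorized) path of VExpKernelImpl after the refactor.
  void Compute(const T* x, T* y) const {
    for (int i = 0; i < num_; ++i) {
      y[i] = std::exp(x[i]);
    }
  }

 protected:
  int num_;
};

// One kernel instance per size, cached; the real KernelPool also keys on
// dtype and ISA, and picks AVX/MKL specializations by block size.
class KernelPool {
 public:
  static KernelPool& Instance() {
    static KernelPool pool;
    return pool;
  }
  std::shared_ptr<const VExpKernel<float>> Get(int d) {
    auto it = kers_.find(d);
    if (it == kers_.end()) {
      it = kers_.emplace(d, std::make_shared<VExpKernel<float>>(d)).first;
    }
    return it->second;
  }

 private:
  std::unordered_map<int, std::shared_ptr<VExpKernel<float>>> kers_;
};

int main() {
  const int d = 8;
  float x[d], y[d];
  for (int i = 0; i < d; ++i) x[i] = 0.1f * static_cast<float>(i);
  auto ker = KernelPool::Instance().Get(d);
  ker->Compute(x, y);  // no size argument: this kernel was built for d == 8
  printf("exp(%f) = %f\n", x[d - 1], y[d - 1]);
  return 0;
}

Binding the size at construction lets each cached kernel specialize for its length (the AVX `block == 8` and `block == 16` paths above, for instance) without threading `n` through every call site, at the cost of one pooled kernel instance per distinct size.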