diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 52b459a6a2e56b7c256efdb535b4652c64bae23c..61c3cb34a2472c0ba7d2a7ea5abf8e826a793951 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/lrn_op.h" #include +#include "paddle/fluid/operators/math/blas.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -29,34 +30,43 @@ struct LRNFunctor { const framework::Tensor& input, framework::Tensor* out, framework::Tensor* mid, int N, int C, int H, int W, int n, T k, T alpha, T beta) { - auto x_v = framework::EigenVector::Flatten(input); - - const int start = -(n - 1) / 2; - const int end = start + n; - - auto e_mid = framework::EigenTensor::From(*mid); - e_mid = e_mid.constant(k); - - auto e_x = framework::EigenTensor::From(input); - for (int m = 0; m < N; m++) { - for (int i = 0; i < C; i++) { - for (int c = start; c < end; c++) { - int ch = i + c; - if (ch >= 0 && ch < C) { - auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto r = e_x.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - s += alpha * r.square(); - } - } + const T* idata = input.data(); + auto place = ctx.GetPlace(); + auto blas = math::GetBlas(ctx); + T* odata = out->mutable_data(place); + T* mdata = mid->mutable_data(place); + Tensor squared; + T* sdata = squared.mutable_data({1, C + n - 1, H, W}, place); + std::memset(sdata, 0, sizeof(T) * squared.numel()); + for (int i = 0; i < mid->numel(); ++i) { + mdata[i] = k; + } + int img_size = H * W; + int fea_size = C * img_size; + int pre_pad = (n - 1) / 2; + // compute batches one by one + for (int i = 0; i < N; ++i) { + blas.VSQR(fea_size, idata + i * fea_size, sdata + pre_pad * img_size); + // init the first channel of mid + for (int c = 0; c < n; ++c) { + blas.AXPY(img_size, alpha, sdata + c * img_size, mdata + i * fea_size); + } + for (int c = 1; c < C; ++c) { + // copy previous scale + int mid_offset = i * fea_size + c * img_size; + std::memcpy(mdata + mid_offset, mdata + mid_offset - img_size, + img_size * sizeof(T)); + // add last + blas.AXPY(img_size, alpha, sdata + (c + n - 1) * img_size, + mdata + mid_offset); + // sub rest + blas.AXPY(img_size, -alpha, sdata + (c - 1) * img_size, + mdata + mid_offset); } } - - auto out_e = framework::EigenVector::Flatten(*out); - out_e = x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); + // compute the final output + blas.VPOW(mid->numel(), mdata, -beta, odata); + blas.VMUL(mid->numel(), odata, idata, odata); } }; template struct LRNFunctor; @@ -156,6 +166,9 @@ class LRNOp : public framework::OperatorWithKernel { auto x_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4."); + int n = ctx->Attrs().Get("n"); + PADDLE_ENFORCE(n > 0 && n % 2 == 1, "n should be positive odd value"); + ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); ctx->SetOutputDim("MidOut", x_dim); diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index 0fd3175e8579df9e61368cc151a94fa45e433884..12d39c3815395896343238b536110aecac66a376 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -60,7 +60,6 @@ class LRNKernel : public framework::OpKernel { T beta = ctx.Attr("beta"); T k = ctx.Attr("k"); - PADDLE_ENFORCE(n > 0, "n should >= 0"); PADDLE_ENFORCE(alpha >= 0.0, "alpha should >= 0.0"); PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0"); PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0"); diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index da185d93c09f9b06bd5968b9c8e93176f9ef014b..5d0d562030d2a20e4a1cefd3c36c6533fd35dc96 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -152,6 +152,12 @@ class Blas { template void VEXP(int n, const T* x, T* y) const; + template + void VSQR(int n, const T* x, T* y) const; + + template + void VPOW(int n, const T* x, T alpha, T* y) const; + template void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta, T* C) const; @@ -238,6 +244,16 @@ class BlasT : private Blas { Base()->template VEXP(args...); } + template + void VSQR(ARGS... args) const { + Base()->template VSQR(args...); + } + + template + void VPOW(ARGS... args) const { + Base()->template VPOW(args...); + } + template void GEMV(ARGS... args) const { Base()->template GEMV(args...); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index e1df78d11e41c5f74e244643f40c6d0581fa6a4a..59454669be9e0f92a6fc0db52445307d88e1c7d8 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once +#include #include #include #include "paddle/fluid/operators/math/math_function.h" @@ -102,6 +103,16 @@ struct CBlas { static void VEXP(ARGS... args) { platform::dynload::vsExp(args...); } + + template + static void VSQR(ARGS... args) { + platform::dynload::vsSqr(args...); + } + + template + static void VPOW(ARGS... args) { + platform::dynload::vsPowx(args...); + } }; template <> @@ -182,6 +193,16 @@ struct CBlas { static void VEXP(ARGS... args) { platform::dynload::vdExp(args...); } + + template + static void VSQR(ARGS... args) { + platform::dynload::vdSqr(args...); + } + + template + static void VPOW(ARGS... args) { + platform::dynload::vdPowx(args...); + } }; #else @@ -241,6 +262,8 @@ struct CBlas { } static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } + static void VSQR(...) { PADDLE_THROW("float16 VSQR not supported on CPU"); } + static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; #ifdef PADDLE_WITH_MKLML @@ -398,6 +421,31 @@ void Blas::VEXP(int n, const T *x, T *y) const { #endif } +template <> +template +void Blas::VSQR(int n, const T *x, T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VSQR(n, x, y); +#else + for (int i = 0; i < n; ++i) { + y[i] = std::sqrt(x[i]); + } +#endif +} + +template <> +template +void Blas::VPOW(int n, const T *x, T a, + T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VPOW(n, x, a, y); +#else + for (int i = 0; i < n; ++i) { + y[i] = std::pow(x[i], a); + } +#endif +} + template <> template T Blas::DOT(int n, const T *x, const T *y) const { diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index aa20553ceffceded09447693c6e92f55fb48702d..9273e9b1e72f0ad7abd6c20d4a34283fbe24378a 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -76,6 +76,10 @@ extern void* mklml_dso_handle; __macro(vdMul); \ __macro(vsExp); \ __macro(vdExp); \ + __macro(vsSqr); \ + __macro(vdSqr); \ + __macro(vsPowx); \ + __macro(vdPowx); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);