From c7e739f5425b894d513dfbd853cb30ef01797fb1 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 6 Dec 2017 16:38:12 +0800
Subject: [PATCH] Add LRN efficient GPU implement. (#5894)

Add LRN efficient GPU implement
---
 paddle/operators/lrn_op.cc                  | 104 ++++++++++++-
 paddle/operators/lrn_op.cu                  | 160 +++++++++++++++++++-
 paddle/operators/lrn_op.h                   | 115 ++++----------
 python/paddle/v2/fluid/tests/test_lrn_op.py |   3 +-
 4 files changed, 289 insertions(+), 93 deletions(-)

diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc
index 00392b7967d..e20340e77bf 100644
--- a/paddle/operators/lrn_op.cc
+++ b/paddle/operators/lrn_op.cc
@@ -19,6 +19,103 @@ namespace operators {
 
 using framework::Tensor;
 
+template <typename T>
+struct LRNFunctor<platform::CPUPlace, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out,
+                  framework::Tensor* mid, int N, int C, int H, int W, int n,
+                  T k, T alpha, T beta) {
+    auto x_v = framework::EigenVector<T>::Flatten(input);
+
+    const int start = -(n - 1) / 2;
+    const int end = start + n;
+
+    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
+    e_mid = e_mid.constant(k);
+
+    auto e_x = framework::EigenTensor<T, 4>::From(input);
+    for (int m = 0; m < N; m++) {
+      for (int i = 0; i < C; i++) {
+        for (int c = start; c <= end; c++) {
+          int ch = i + c;
+          if (ch >= 0 && ch < C) {
+            auto s = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+            auto r = e_x.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                               Eigen::array<int, 4>({{1, 1, H, W}}));
+
+            s += alpha * r.square();
+          }
+        }
+      }
+    }
+
+    auto out_e = framework::EigenVector<T>::Flatten(*out);
+    out_e = x_v * e_mid.reshape(Eigen::DSizes<int, 1>(e_mid.size())).pow(-beta);
+  }
+};
+template struct LRNFunctor<platform::CPUPlace, float>;
+template struct LRNFunctor<platform::CPUPlace, double>;
+
+template <typename T>
+struct LRNGradFunctor<platform::CPUPlace, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& x, const framework::Tensor& out,
+                  const framework::Tensor& mid, framework::Tensor* x_g,
+                  const framework::Tensor& out_g, int N, int C, int H, int W,
+                  int n, T alpha, T beta) {
+    T ratio = -2 * alpha * beta;
+    auto x_g_e = framework::EigenVector<T>::Flatten(*x_g);
+    x_g_e = x_g_e.constant(0.0);
+
+    auto e_x = framework::EigenTensor<T, 4>::From(x);
+    auto e_x_g = framework::EigenTensor<T, 4>::From(*x_g);
+    auto e_out = framework::EigenTensor<T, 4>::From(out);
+    auto e_out_g = framework::EigenTensor<T, 4>::From(out_g);
+    auto e_mid = framework::EigenTensor<T, 4>::From(mid);
+
+    const int start = -(n - 1) / 2;
+    const int end = start + n;
+    for (int m = 0; m < N; m++) {
+      for (int i = 0; i < C; i++) {
+        auto i_x = e_x.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                             Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_x_g = e_x_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                     Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_mid = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        i_x_g = i_mid.pow(-beta) * i_out_g;
+        for (int c = start; c <= end; c++) {
+          int ch = i + c;
+          if (ch < 0 || ch >= C) {
+            continue;
+          }
+
+          auto c_out = e_out.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                   Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          auto c_mid = e_mid.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                   Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          auto c_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                       Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          i_x_g += ratio * c_out_g * c_out * i_x / c_mid;
+        }
+      }
+    }
+  }
+};
+template struct LRNGradFunctor<platform::CPUPlace, float>;
+template struct LRNGradFunctor<platform::CPUPlace, double>;
+
 class LRNOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -83,8 +180,8 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Local Response Normalization Operator.
 
-This operator comes from the paper
-"ImageNet Classification with Deep Convolutional Neural Networks".
+This operator comes from the paper:
+<<ImageNet Classification with Deep Convolutional Neural Networks>>.
 
 The original formula is:
@@ -119,8 +216,7 @@ class LRNOpGrad : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("MidOut")),
-                   "Input(MidOut@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("MidOut"), "Input(MidOut) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null");
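For reference, the CPU functor added above implements the forward pass stated in the operator's DOC string: for each channel i, MidOut = k + alpha * (sum of squared inputs over a window of neighboring channels), then Out = X * MidOut^(-beta). The standalone scalar sketch below (plain C++, not Paddle API) makes that arithmetic explicit; note it sums over exactly n channels, the way the GPU kernels later in this patch count the window, whereas the Eigen loop above iterates c <= end and therefore touches one extra channel.

#include <cmath>
#include <vector>

// Scalar reference for the cross-map LRN forward pass. Layout is NCHW,
// flattened as ((m*C + c)*H + h)*W + w. Channels outside [0, C) contribute
// zero (border padding).
void LRNForwardReference(const std::vector<float>& x, std::vector<float>& out,
                         int N, int C, int H, int W, int n, float k,
                         float alpha, float beta) {
  const int start = -(n - 1) / 2;
  out.resize(x.size());
  for (int m = 0; m < N; ++m)
    for (int i = 0; i < C; ++i)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w) {
          float accum = 0.f;
          for (int c = start; c < start + n; ++c) {  // window of n channels
            int ch = i + c;
            if (ch < 0 || ch >= C) continue;
            float v = x[((m * C + ch) * H + h) * W + w];
            accum += v * v;
          }
          float mid = k + alpha * accum;             // MidOut
          int idx = ((m * C + i) * H + h) * W + w;
          out[idx] = x[idx] * std::pow(mid, -beta);  // Out = X * MidOut^-beta
        }
}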
diff --git a/paddle/operators/lrn_op.cu b/paddle/operators/lrn_op.cu
index 607dc6d86a7..e9a86712333 100644
--- a/paddle/operators/lrn_op.cu
+++ b/paddle/operators/lrn_op.cu
@@ -12,11 +12,167 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/operators/lrn_op.h"
 
-namespace ops = paddle::operators;
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void KeCMRNormFillScale(int img_size, const T* in, T* mid, int C,
+                                   int H, int W, int size, T k, T alpha) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < img_size) {
+    const int w = idx % W;
+    const int h = (idx / W) % H;
+    const int n = idx / W / H;
+    const int offset = (n * C * H + h) * W + w;
+
+    in += offset;
+    mid += offset;
+    const int step = H * W;
+    const int pre_pad = (size - 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+
+    T accum = 0;
+    int index = 0;
+    while (index < C + post_pad) {
+      if (index < C) {
+        T val = in[index * step];
+        accum += val * val;
+      }
+      if (index >= size) {
+        T val = in[(index - size) * step];
+        accum -= val * val;
+      }
+      if (index >= post_pad) {
+        mid[(index - post_pad) * step] = k + accum * alpha;
+      }
+      ++index;
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeCMRNormOutput(int input_size, const T* in, const T* mid,
+                                T negative_beta, T* out) {
+  const int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index < input_size) {
+    out[index] = in[index] * pow(mid[index], negative_beta);
+  }
+}
+
+template <typename T>
+void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs,
+                    T* outputs, T* mid, int N, int C, int H, int W, int n, T k,
+                    T alpha, T beta) {
+  int img_size = N * H * W;
+  const int block_size = 1024;
+  int grid_size = (img_size + block_size - 1) / block_size;
+
+  KeCMRNormFillScale<
+      T><<<grid_size, block_size, 0, ctx.cuda_device_context().stream()>>>(
+      img_size, inputs, mid, C, H, W, n, k, alpha);
+
+  int input_size = N * H * W * C;
+  grid_size = (input_size + block_size - 1) / block_size;
+  KeCMRNormOutput<
+      T><<<grid_size, block_size, 0, ctx.cuda_device_context().stream()>>>(
+      input_size, inputs, mid, -beta, outputs);
+}
+
+template <typename T>
+struct LRNFunctor<platform::GPUPlace, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out,
+                  framework::Tensor* mid, int N, int C, int H, int W, int n,
+                  T k, T alpha, T beta) {
+    CrossMapNormal<T>(
+        ctx, input.data<T>(), out->mutable_data<T>(ctx.GetPlace()),
+        mid->mutable_data<T>(ctx.GetPlace()), N, C, H, W, n, k, alpha, beta);
+  }
+};
+
+template struct LRNFunctor<platform::GPUPlace, float>;
+template struct LRNFunctor<platform::GPUPlace, double>;
+
+template <typename T>
+__global__ void KeCMRNormDiff(int img_size, const T* x, const T* out,
+                              const T* mid, T* x_g, const T* out_g, int C,
+                              int H, int W, int size, T negative_beta,
+                              T ratio) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < img_size) {
+    const int w = idx % W;
+    const int h = (idx / W) % H;
+    const int n = idx / W / H;
+    const int offset = (n * C * H + h) * W + w;
+    x += offset;
+    out += offset;
+    mid += offset;
+    out_g += offset;
+    x_g += offset;
+
+    const int step = H * W;
+    const int pre_pad = size - (size + 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+
+    int index = 0;
+    T accum = 0;
+    // TODO(gongwb): optimize this with thread shared array.
+    while (index < C + post_pad) {
+      if (index < C) {
+        x_g[index * step] = 0.0;
+        accum += out_g[index * step] * out[index * step] / mid[index * step];
+      }
+      if (index >= size) {
+        accum -= out_g[(index - size) * step] * out[(index - size) * step] /
+                 mid[(index - size) * step];
+      }
+      if (index >= post_pad) {
+        x_g[(index - post_pad) * step] +=
+            out_g[(index - post_pad) * step] *
+                pow(mid[(index - post_pad) * step], negative_beta) -
+            ratio * x[(index - post_pad) * step] * accum;
+      }
+      ++index;
+    }
+  }
+}
+
+template <typename T>
+void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x,
+                        const T* out, const T* mid, T* x_g, const T* out_g,
+                        int N, int C, int H, int W, int n, T alpha, T beta) {
+  int img_size = N * H * W;
+
+  const int block_size = 1024;
+  int grid_size = (img_size + block_size - 1) / block_size;
+
+  KeCMRNormDiff<
+      T><<<grid_size, block_size, 0, ctx.cuda_device_context().stream()>>>(
+      img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta,
+      2.0f * alpha * beta);
+}
+
+template <typename T>
+struct LRNGradFunctor<platform::GPUPlace, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& x, const framework::Tensor& out,
+                  const framework::Tensor& mid, framework::Tensor* x_g,
+                  const framework::Tensor& out_g, int N, int C, int H, int W,
+                  int n, T alpha, T beta) {
+    CrossMapNormalGrad<T>(ctx, x.data<T>(), out.data<T>(), mid.data<T>(),
+                          x_g->mutable_data<T>(ctx.GetPlace()),
+                          out_g.data<T>(), N, C, H, W, n, alpha, beta);
+  }
+};
+
+template struct LRNGradFunctor<platform::GPUPlace, float>;
+template struct LRNGradFunctor<platform::GPUPlace, double>;
+}  // namespace operators
+}  // namespace paddle
 
+namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(lrn, ops::LRNKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(lrn_grad,
                        ops::LRNGradKernel<paddle::platform::GPUPlace, float>);
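The part of the GPU forward path worth calling out is the running window in KeCMRNormFillScale: each thread owns one (batch, h, w) column, sweeps the channel axis once, adds the square of the element entering the window, subtracts the one that left, and writes k + alpha * accum once post_pad elements of lookahead are available. That makes the cost O(C) per column instead of O(C * n). Below is a self-contained sketch of the same pattern on a plain 1-D array (hypothetical helper name, not part of the patch):

#include <cstdio>
#include <vector>

// Single-sweep sliding sum of squares over a window of `size` elements,
// mirroring KeCMRNormFillScale along one channel column.
std::vector<float> SlidingSumOfSquares(const std::vector<float>& in, int size,
                                       float k, float alpha) {
  const int C = static_cast<int>(in.size());
  const int pre_pad = (size - 1) / 2;
  const int post_pad = size - pre_pad - 1;
  std::vector<float> mid(C, 0.f);
  float accum = 0.f;
  for (int index = 0; index < C + post_pad; ++index) {
    if (index < C)
      accum += in[index] * in[index];                    // element enters window
    if (index >= size)
      accum -= in[index - size] * in[index - size];      // element leaves window
    if (index >= post_pad)
      mid[index - post_pad] = k + accum * alpha;         // emit with a delay
  }
  return mid;
}

int main() {
  std::vector<float> x = {1, 2, 3, 4, 5};
  for (float v : SlidingSumOfSquares(x, /*size=*/3, /*k=*/1.f, /*alpha=*/1e-4f))
    std::printf("%g ", v);  // each entry: k + alpha * sum of squares in window
  return 0;
}

The backward kernel KeCMRNormDiff reuses the same single-sweep structure, accumulating out_g * out / mid over the window instead of squared inputs.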
diff --git a/paddle/operators/lrn_op.h b/paddle/operators/lrn_op.h
index 606c6574430..aa7539db4a2 100644
--- a/paddle/operators/lrn_op.h
+++ b/paddle/operators/lrn_op.h
@@ -21,6 +21,14 @@ namespace paddle {
 namespace operators {
 
+template <typename Place, typename T>
+struct LRNFunctor {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out,
+                  framework::Tensor* mid, int N, int C, int H, int W, int n,
+                  T k, T alpha, T beta);
+};
+
 template <typename Place, typename T>
 class LRNKernel : public framework::OpKernel<T> {
  public:
@@ -31,8 +39,8 @@ class LRNKernel : public framework::OpKernel<T> {
   // f(x) represents outputs
   void Compute(const framework::ExecutionContext& ctx) const override {
     // input
-    const Tensor* x = ctx.Input<Tensor>("X");
-    auto x_dims = x->dims();
+    const Tensor& x = *ctx.Input<Tensor>("X");
+    auto x_dims = x.dims();
 
     // NCHW
     int N = x_dims[0];
@@ -57,38 +65,20 @@ class LRNKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0");
     PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0");
 
-    auto x_v = framework::EigenVector<T>::Flatten(*x);
-
-    const int start = -(n - 1) / 2;
-    const int end = start + n;
-
-    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
-    e_mid.device(ctx.GetEigenDevice<Place>()) = e_mid.constant(k);
-
-    auto e_x = framework::EigenTensor<T, 4>::From(*x);
-    for (int m = 0; m < N; m++) {
-      for (int i = 0; i < C; i++) {
-        for (int c = start; c <= end; c++) {
-          int ch = i + c;
-          if (ch >= 0 && ch < C) {
-            auto s = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                 Eigen::array<int, 4>({{1, 1, H, W}}));
-
-            auto r = e_x.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                               Eigen::array<int, 4>({{1, 1, H, W}}));
-
-            s.device(ctx.GetEigenDevice<Place>()) += alpha * r.square();
-          }
-        }
-      }
-    }
-
-    auto out_e = framework::EigenVector<T>::Flatten(*out);
-    out_e.device(ctx.GetEigenDevice<Place>()) =
-        x_v * e_mid.reshape(Eigen::DSizes<int, 1>(e_mid.size())).pow(-beta);
+    LRNFunctor<Place, T> f;
+    f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta);
   }
 };
 
+template <typename Place, typename T>
+struct LRNGradFunctor {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& x, const framework::Tensor& out,
+                  const framework::Tensor& mid, framework::Tensor* x_g,
+                  const framework::Tensor& out_g, int N, int C, int H, int W,
+                  int n, T alpha, T beta);
+};
+
 /**
  * \brief Backward calculation for normalization with across maps.
  *
@@ -97,7 +87,7 @@ class LRNKernel : public framework::OpKernel<T> {
  * The implementation of this Function is derived from the
  * CrossMapNormalFunc implementation.
  *
- * InputGrad = OutputGrad * denoms ^ (-beta)
+ * InputGrad = OutputGrad * MidOut ^ (-beta)
  *    -- upper
  *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / MidOut) * InputValue
  *    -- lower
@@ -113,18 +103,15 @@ class LRNGradKernel : public framework::OpKernel<T> {
  public:
   using Tensor = framework::Tensor;
   void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* out = ctx.Input<Tensor>("Out");
-    const Tensor* out_g = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    const Tensor* mid = ctx.Input<Tensor>("MidOut");
+    const Tensor& x = *ctx.Input<Tensor>("X");
+    const Tensor& out = *ctx.Input<Tensor>("Out");
+    const Tensor& out_g = *ctx.Input<Tensor>(framework::GradVarName("Out"));
+    const Tensor& mid = *ctx.Input<Tensor>("MidOut");
 
     auto x_g = ctx.Output<Tensor>(framework::GradVarName("X"));
     x_g->mutable_data<T>(ctx.GetPlace());
 
-    auto x_g_e = framework::EigenVector<T>::Flatten(*x_g);
-    x_g_e.device(ctx.GetEigenDevice<Place>()) = x_g_e.constant(0.0);
-
-    auto x_dims = x->dims();
+    auto x_dims = x.dims();
     int N = x_dims[0];
     int C = x_dims[1];
     int H = x_dims[2];
@@ -133,51 +120,9 @@ class LRNGradKernel : public framework::OpKernel<T> {
     int n = ctx.Attr<int>("n");
     T alpha = ctx.Attr<T>("alpha");
     T beta = ctx.Attr<T>("beta");
-    T ratio = -2 * alpha * beta;
-
-    auto e_x = framework::EigenTensor<T, 4>::From(*x);
-    auto e_x_g = framework::EigenTensor<T, 4>::From(*x_g);
-    auto e_out = framework::EigenTensor<T, 4>::From(*out);
-    auto e_out_g = framework::EigenTensor<T, 4>::From(*out_g);
-    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
-
-    const int start = -(n - 1) / 2;
-    const int end = start + n;
-    for (int m = 0; m < N; m++) {
-      for (int i = 0; i < C; i++) {
-        auto i_x = e_x.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                             Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        auto i_x_g = e_x_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                 Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        auto i_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                     Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        auto i_mid = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                 Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        i_x_g.device(ctx.GetEigenDevice<Place>()) = i_mid.pow(-beta) * i_out_g;
-        for (int c = start; c <= end; c++) {
-          int ch = i + c;
-          if (ch < 0 || ch >= C) {
-            continue;
-          }
-
-          auto c_out = e_out.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                                   Eigen::array<int, 4>({{1, 1, H, W}}));
-
-          auto c_mid = e_mid.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                                   Eigen::array<int, 4>({{1, 1, H, W}}));
-
-          auto c_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                                       Eigen::array<int, 4>({{1, 1, H, W}}));
-
-          i_x_g.device(ctx.GetEigenDevice<Place>()) +=
-              ratio * c_out_g * c_out * i_x / c_mid;
-        }
-      }
-    }
+
+    LRNGradFunctor<Place, T> f;
+    f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta);
   }
 };
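The header's doc comment gives the gradient: InputGrad = OutputGrad * MidOut^(-beta) - 2 * alpha * beta * InputValue * sum(OutputGrad * OutputValue / MidOut), summed over the channels whose window contains the current one. A scalar sketch along a single channel column (hypothetical helper, not Paddle code; it assumes the symmetric default window, for which the forward and backward windows coincide):

#include <cmath>
#include <vector>

// Scalar reference for the LRN backward pass along one (batch, h, w) column:
//   x_g[i] = out_g[i] * mid[i]^(-beta)
//          - 2*alpha*beta * x[i] * sum_{ch in window(i)} out_g[ch]*out[ch]/mid[ch]
std::vector<float> LRNBackwardColumn(const std::vector<float>& x,
                                     const std::vector<float>& out,
                                     const std::vector<float>& mid,
                                     const std::vector<float>& out_g, int size,
                                     float alpha, float beta) {
  const int C = static_cast<int>(x.size());
  const int pre_pad = (size - 1) / 2;
  std::vector<float> x_g(C, 0.f);
  for (int i = 0; i < C; ++i) {
    float accum = 0.f;
    for (int c = -pre_pad; c < size - pre_pad; ++c) {
      int ch = i + c;  // channels whose normalization window contains i
      if (ch < 0 || ch >= C) continue;
      accum += out_g[ch] * out[ch] / mid[ch];
    }
    x_g[i] = out_g[i] * std::pow(mid[i], -beta) -
             2.f * alpha * beta * x[i] * accum;
  }
  return x_g;
}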
diff --git a/python/paddle/v2/fluid/tests/test_lrn_op.py b/python/paddle/v2/fluid/tests/test_lrn_op.py
index 7e34b3c91c1..9abb09e53a7 100644
--- a/python/paddle/v2/fluid/tests/test_lrn_op.py
+++ b/python/paddle/v2/fluid/tests/test_lrn_op.py
@@ -23,7 +23,7 @@ class TestLRNOp(OpTest):
         start = -(self.n - 1) / 2
         end = start + self.n
 
-        mid = np.empty((self.N, self.C, self.H, self.W), dtype=float)
+        mid = np.empty((self.N, self.C, self.H, self.W)).astype("float32")
         mid.fill(self.k)
         for m in range(0, self.N):
             for i in range(0, self.C):
@@ -74,5 +74,4 @@ class TestLRNOp(OpTest):
 
 
 if __name__ == "__main__":
-    exit(0)  # LRN grad implement wrong
     unittest.main()
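Taken together, the refactor follows a standard template-dispatch idiom: lrn_op.h declares LRNFunctor and LRNGradFunctor against a Place tag so the kernels stay backend-neutral, and lrn_op.cc / lrn_op.cu define and explicitly instantiate the CPUPlace / GPUPlace versions. A minimal standalone sketch of that idiom, with hypothetical tag types standing in for platform::CPUPlace and platform::GPUPlace:

#include <cstdio>

struct CPUPlace {};  // stand-in tag types for this sketch
struct GPUPlace {};

// Primary template: declared in the "header", defined per backend. Each
// backend translation unit would supply its own partial specialization.
template <typename Place, typename T>
struct LRNFunctor;

template <typename T>
struct LRNFunctor<CPUPlace, T> {
  void operator()(T /*alpha*/) { std::puts("CPU path"); }
};

template <typename T>
struct LRNFunctor<GPUPlace, T> {
  void operator()(T /*alpha*/) { std::puts("GPU path"); }
};

// The backend-neutral kernel only names LRNFunctor<Place, T>; the Place
// template argument selects the implementation at compile time.
template <typename Place, typename T>
void Compute(T alpha) {
  LRNFunctor<Place, T> f;
  f(alpha);
}

int main() {
  Compute<CPUPlace, float>(1e-4f);
  Compute<GPUPlace, float>(1e-4f);
  return 0;
}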
--
GitLab