diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 47e31dce1fc14639ab4809a7d67f1e7e70881f62..687e9b0145682bcae8b51fefe580d804202a41d4 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -133,6 +133,7 @@ paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'po paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '52343203de40afe29607397e13aaf0d2')) paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '55db6ae7275fb9678a6814aebab81a9c')) paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '404741b5690228c493a2d9f59c6b1122')) +paddle.fluid.layers.instance_norm (ArgSpec(args=['input', 'epsilon', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None)), ('document', 'c124b947a6ac4d01f491275561b9c1ab')) paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', '2460b30fb87037555208fa8ac6fc1787')) paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '83e08f21af41ac8bac37aeab1f86fdd0')) paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '6d3b135bb3834d58ef2cb581ead1487c')) diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 6e89d73eb236ee7844c7de3c273e0b0f275a3e33..6c7dbe0db4e3545200ff77c9331b6b656d9de2ea 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/norm_utils.h" namespace paddle { namespace operators { @@ -96,26 +97,5 @@ class BatchNormGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override; }; -inline void ExtractNCWHD(const framework::DDim &dims, - const DataLayout &data_layout, int *N, int *C, int *H, - int *W, int *D) { - *N = dims[0]; - if (dims.size() == 2) { - *C = dims[1]; - *H = 1; - *W = 1; - *D = 1; - } else { - *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; - *W = dims.size() > 3 - ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) - : 1; - *D = dims.size() > 4 - ? (data_layout == DataLayout::kNCHW ? 
dims[4] : dims[3]) - : 1; - } -} - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6375c92de2d219e9e66ce8899fed991a1a75d00d --- /dev/null +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -0,0 +1,646 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/instance_norm_op.h" +#include +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +void InstanceNormOp::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of Instance Norm Op should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true, + "Input(Scale) of Instance Norm Op should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Bias"), true, + "Input(Bias) of Instance Norm Op should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true, + "Output(Y) of Instance Norm Op should not be null."); + + PADDLE_ENFORCE_EQ( + ctx->HasOutput("SavedMean"), true, + "Output(SavedMean) of Instance Norm Op should not be null."); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("SavedVariance"), true, + "Output(SavedVariance) of Instance Norm Op should not be null."); + + const auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "the dimension of input X must greater than or equal to 2"); + PADDLE_ENFORCE_LE(x_dims.size(), 5, + "the dimension of input X must smaller than or equal to 5"); + auto N = x_dims[0]; + auto C = x_dims[1]; + auto NxC = N * C; + + auto scale_dim = ctx->GetInputDim("Scale"); + auto bias_dim = ctx->GetInputDim("Bias"); + + PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL); + PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL); + + bool check = !((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 || + framework::product(bias_dim) <= 0)); + + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], C); + PADDLE_ENFORCE_EQ(bias_dim[0], C); + } + + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("SavedMean", {NxC}); + ctx->SetOutputDim("SavedVariance", {NxC}); + ctx->ShareLoD("X", "Y"); +} + +framework::OpKernelType InstanceNormOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto input_data_type = ctx.Input("X")->type(); + // By default, the type of the scale, bias, mean, + // and var tensors should both be float. (For float or float16 input tensor) + // or double (For double input tensor). 
+ auto in_param_type = framework::proto::VarType::FP32; + if (input_data_type == framework::proto::VarType::FP64) { + in_param_type = framework::proto::VarType::FP64; + } + PADDLE_ENFORCE_EQ(in_param_type, ctx.Input("Scale")->type(), + "Scale input should be of float type"); + PADDLE_ENFORCE_EQ(in_param_type, ctx.Input("Bias")->type(), + "Bias input should be of float type"); + + return framework::OpKernelType(input_data_type, ctx.GetPlace()); +} + +void InstanceNormOpMaker::Make() { + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddInput("X", "The input tensor"); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddOutput("Y", "result after normalization"); + AddOutput("SavedMean", + "Mean of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("SavedVariance", + "Variance of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddComment(R"DOC( +Instance Normalization. + +Instance Norm has been implemented as disscussed in the paper: +https://arxiv.org/pdf/1607.08022.pdf +Can be used as a normalizer function for conv2d and fully_connected operations. +The required data format for this layer is as following: +NCHW `[batch, in_channels, in_height, in_width]` + +)DOC"); +} + +template +class InstanceNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + T epsilon = static_cast(ctx.Attr("epsilon")); + + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + + const int N = x_dims[0]; + const int C = x_dims[1]; + const int NxC = N * C; + + const int sample_size = x->numel() / N / C; + + auto *y = ctx.Output("Y"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + + auto &dev_ctx = ctx.template device_context(); + auto *place = dev_ctx.eigen_device(); + + Eigen::DSizes bcast(1, sample_size); + Eigen::DSizes C_shape(C, 1); + Eigen::DSizes NxC_shape(NxC, 1); + Eigen::DSizes shape(NxC, sample_size); + + math::SetConstant set_constant; + + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, saved_mean, static_cast(0)); + set_constant(dev_ctx, saved_variance, static_cast(0)); + + auto saved_mean_a = framework::EigenVector::Flatten(*saved_mean); + auto saved_mean_e = saved_mean_a.reshape(NxC_shape); + auto saved_variance_a = framework::EigenVector::Flatten(*saved_variance); + auto saved_variance_e = saved_variance_a.reshape(NxC_shape); + + auto x_e = framework::EigenVector::Flatten(*x); + auto x_arr = x_e.reshape(shape); + + Eigen::DSizes rdims(1); + + saved_mean_e.device(*place) = x_arr.mean(rdims); + auto saved_variance_arr = + (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon; + + saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse(); + + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + auto scale_e = framework::EigenVector::Flatten(*scale); + auto scale_arr = scale_e.reshape(C_shape); + auto bias_e = framework::EigenVector::Flatten(*bias); + auto bias_arr = bias_e.reshape(C_shape); + + y->mutable_data(ctx.GetPlace()); + auto y_e = 
framework::EigenVector::Flatten(*y); + auto y_arr = y_e.reshape(shape); + + // (x - mean) * inv_std * scale + bias + Eigen::DSizes bcast_param(N, sample_size); + y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) * + saved_variance_e.broadcast(bcast) * + scale_arr.broadcast(bcast_param) + + bias_arr.broadcast(bcast_param); + } +}; + +void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true, + "Input(scale) should not be null"); + + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Y")), true, + "Input(Y@GRAD) should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("SavedMean"), true, + "Input(SavedMean) should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("SavedVariance"), true, + "Input(SavedVariance) should not be null"); + + // check output + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, + "Output(x@GRAD) should not be null"); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Bias")), true, + "Output(Scale@GRAD) and Output(Bias@GRAD) should not be " + "null at the same time"); + } + const auto x_dims = ctx->GetInputDim("X"); + const int C = x_dims[1]; + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); + } +} + +framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("cannot find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("cannot find Y@GRAD"); + } + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); +} + +template +class InstanceNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_inv_variance = ctx.Input("SavedVariance"); + + const auto &x_dims = x->dims(); + + const int N = x_dims[0]; + const int C = x_dims[1]; + const int NxC = N * C; + const int sample_size = x->numel() / N / C; + + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + d_x->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); + auto *place = dev_ctx.eigen_device(); + + Eigen::DSizes rdims(0); + Eigen::DSizes mean_rdims(1); + Eigen::DSizes rshape(NxC, sample_size); + Eigen::DSizes bcast(1, sample_size); + Eigen::DSizes C_shape(C, 1); + Eigen::DSizes NxC_shape(NxC, 1); + Eigen::DSizes param_shape(N, C); + Eigen::DSizes shape(NxC, sample_size); + + auto scale_e = framework::EigenVector::Flatten(*scale); + auto mean_e = framework::EigenVector::Flatten(*saved_mean); + auto inv_var_e = framework::EigenVector::Flatten(*saved_inv_variance); + auto dy_e = framework::EigenVector::Flatten(*d_y); + auto x_e = framework::EigenVector::Flatten(*x); + + auto 
scale_arr = scale_e.reshape(C_shape); + auto mean_arr = mean_e.reshape(NxC_shape); + auto inv_var_arr = inv_var_e.reshape(NxC_shape); + auto dy_arr = dy_e.reshape(shape); + auto x_arr = x_e.reshape(shape); + + auto tmp = + (x_arr - mean_arr.broadcast(bcast)) * inv_var_arr.broadcast(bcast); + + math::SetConstant set_constant; + // math: d_bias = np.sum(d_y, axis=(n,h,w)) + // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w)) + if (d_scale && d_bias) { + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, d_scale, static_cast(0)); + set_constant(dev_ctx, d_bias, static_cast(0)); + + auto d_scale_e = framework::EigenVector::Flatten(*d_scale); + auto d_bias_e = framework::EigenVector::Flatten(*d_bias); + auto d_scale_data = d_scale_e.reshape(C_shape); + auto d_bias_data = d_bias_e.reshape(C_shape); + d_bias_data.device(*place) = + dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims); + d_scale_data.device(*place) = + (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims); + } + + auto dy_mean = dy_arr.mean(mean_rdims).reshape(NxC_shape).broadcast(bcast); + + Eigen::DSizes bcast_param(N, sample_size); + set_constant(dev_ctx, d_x, static_cast(0)); + // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y, + // axis=(h,w)) + // - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X - + // mean), + // axis=(h,w)) + auto dx_e = framework::EigenVector::Flatten(*d_x); + auto dx_arr = dx_e.reshape(shape); + dx_arr.device(*place) = scale_arr.broadcast(bcast_param) * + inv_var_arr.broadcast(bcast) * + (dy_arr - dy_mean - + tmp * + (dy_arr * tmp) + .mean(mean_rdims) + .reshape(NxC_shape) + .broadcast(bcast)); + } +}; + +std::unique_ptr InstanceNormGradMaker::Apply() const { + auto *op = new framework::OpDesc(); + op->SetType("instance_norm_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + + op->SetInput("Scale", Input("Scale")); + op->SetInput("Bias", Input("Bias")); + op->SetInput("SavedMean", Output("SavedMean")); + op->SetInput("SavedVariance", Output("SavedVariance")); + + op->SetAttrMap(Attrs()); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + + return std::unique_ptr(op); +} + +void InstanceNormDoubleGradOp::InferShape( + framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true, + "Input(Scale) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("SavedMean"), true, + "Input(SavedMean) should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("SavedVariance"), true, + "Input(SavedVariance) should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("DDX"), true, + "Input(DDX) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("DY"), true, + "Input(Y@GRAD) should not be null"); + + // check output + PADDLE_ENFORCE_EQ(ctx->HasOutput("DX"), true, + "Output(DX) should not be null"); + + const auto x_dims = ctx->GetInputDim("X"); + const int C = x_dims[1]; + if (ctx->HasOutput("DX")) { + ctx->SetOutputDim("DX", x_dims); + } + if (ctx->HasOutput("DScale")) { + ctx->SetOutputDim("DScale", {C}); + } + if (ctx->HasOutput("DDY")) { + ctx->ShareDim("X", "DDY"); + } +} + +framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + const auto 
*var = ctx.InputVar("DY"); + if (var == nullptr) { + PADDLE_THROW("cannot find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("cannot find Y@GRAD"); + } + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); +} + +std::unique_ptr InstanceNormDoubleGradMaker::Apply() const { + auto *op = new framework::OpDesc(); + op->SetType("instance_norm_grad_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Scale", Input("Scale")); + op->SetInput("SavedMean", Input("SavedMean")); + op->SetInput("SavedVariance", Input("SavedVariance")); + op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); + op->SetInput("DDScale", OutputGrad(framework::GradVarName("Scale"))); + op->SetInput("DDBias", OutputGrad(framework::GradVarName("Bias"))); + op->SetInput("DY", Input(framework::GradVarName("Y"))); + + op->SetAttrMap(Attrs()); + op->SetOutput("DX", InputGrad("X")); + op->SetOutput("DScale", InputGrad("Scale")); + op->SetOutput("DDY", InputGrad(framework::GradVarName("Y"))); + return std::unique_ptr(op); +} + +template +class InstanceNormDoubleGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *X = ctx.Input("X"); + const auto *Scale = ctx.Input("Scale"); + const auto *dY = ctx.Input("DY"); + const auto *Saved_mean = ctx.Input("SavedMean"); + const auto *Saved_variance = ctx.Input("SavedVariance"); + const auto *ddX = ctx.Input("DDX"); + const auto *ddScale = ctx.Input("DDScale"); + const auto *ddBias = ctx.Input("DDBias"); + + auto *dX = ctx.Output("DX"); + auto *dScale = ctx.Output("DScale"); + auto *ddY = ctx.Output("DDY"); + + const auto &x_dims = X->dims(); + int N, C, H, W, D; + ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + const int sample_size = X->numel() / N / C; + const int NxC = N * C; + + const T *mean_data = Saved_mean->data(); + const T *inv_var_data = Saved_variance->data(); + Tensor mean_tensor; + Tensor inv_var_tensor; + ConstEigenArrayMap x_arr(X->data(), sample_size, NxC); + ConstEigenVectorArrayMap mean_arr(mean_data, NxC); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, NxC); + + Tensor mean_tile; + mean_tile.Resize({sample_size, NxC}); + mean_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), + sample_size, NxC); + + Tensor inv_var_tile; + inv_var_tile.Resize({sample_size, NxC}); + inv_var_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap inv_var_tile_data( + inv_var_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); + + mean_tile_data = mean_arr.transpose().replicate(sample_size, 1); + inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 1); + + ConstEigenVectorArrayMap scale_arr(Scale->data(), C); + + Tensor scale_tile; + scale_tile.Resize({sample_size, NxC}); + scale_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), + sample_size, NxC); + scale_tile_data = scale_arr.transpose().replicate(sample_size, N); + + ConstEigenArrayMap dy_arr(dY->data(), sample_size, NxC); + ConstEigenArrayMap ddx_arr(ddX->data(), sample_size, NxC); + + // math: dx = scale * ((x - mean) * inv_var / HxW * (np.mean(ddx, + // axis=(h,w)) * + // np.sum(dy, axis=(h,w)) - + // np.sum(dy * ddx, axis=(h,w)) + 3 * np.mean(dy * (x - mean), + // axis=(h,w)) * inv_var.pow(2) * + // np.sum(ddx * (x - mean), axis=(h,w))) + 
inv_var.pow(3) / HxW * + // np.sum(ddx * (x - mean)) * + // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * + // np.sum(dy, + // axis=(h,w)) * (x - mean) * + // (np.mean(ddx, axis=(h,w)) - ddx) + ddr * (dy * inv_var - inv_var + // * + // np.mean(dy, axis=(h,w)) - + // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), + // axis=(h,w)))) + + auto &dev_ctx = ctx.template device_context(); + math::SetConstant set_constant; + + Tensor x_sub_mean_mul_invstd; + x_sub_mean_mul_invstd.Resize({sample_size, NxC}); + x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); + EigenArrayMap x_sub_mean_mul_invstd_arr( + x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), sample_size, + NxC); + x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; + + if (dX) { + dX->mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, dX, static_cast(0)); + EigenArrayMap dx_arr(dX->mutable_data(ctx.GetPlace()), sample_size, + NxC); + + if (ddX) { + dx_arr += + x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / + sample_size * + (ddx_arr.colwise().sum() * dy_arr.colwise().sum() / sample_size - + (dy_arr * ddx_arr).colwise().sum() + + 3. * (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() * + (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + + dx_arr += (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size * inv_var_tile_data * inv_var_tile_data * + (dy_arr.colwise().sum() / sample_size - dy_arr); + + dx_arr += (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size * inv_var_tile_data * inv_var_tile_data * + (ddx_arr.colwise().sum() / sample_size - ddx_arr); + + dx_arr = scale_tile_data * dx_arr.eval(); + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + + Tensor ddscale_tile; + ddscale_tile.Resize({sample_size, NxC}); + ddscale_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap ddscale_tile_data( + ddscale_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); + ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); + + dx_arr += (dy_arr * inv_var_tile_data - + dy_arr.colwise().sum() / sample_size * inv_var_tile_data - + x_sub_mean_mul_invstd_arr * inv_var_tile_data * + (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size) * + ddscale_tile_data; + } + } + if (dScale) { + // math: dscale = inv_var * (dy - np.mean(dy, axis=(h,w) - (x-mean) * + // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(h,w)))) * ddx + dScale->mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, dScale, static_cast(0)); + EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), + C); + if (ddX) { + Tensor first_grad; + first_grad.Resize({sample_size, NxC}); + first_grad.mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, &first_grad, static_cast(0)); + EigenArrayMap first_grad_arr( + first_grad.mutable_data(ctx.GetPlace()), sample_size, NxC); + + first_grad_arr += + inv_var_tile_data * + (dy_arr - dy_arr.colwise().sum() / sample_size - + x_sub_mean_mul_invstd_arr * + (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + first_grad_arr = first_grad_arr.eval() * ddx_arr; + for (int nc = 0; nc < NxC; ++nc) { + int c = nc % C; + dscale_arr(c) += first_grad_arr.colwise().sum()(nc); + } + } + } + if (ddY) { + // math: ddy = (x - mean) * inv_var * ddscale + ddbias + + // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * + // np.mean(ddx * (x - mean), axis=(h,w))) + ddY->mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, ddY, static_cast(0)); + 
EigenArrayMap ddy_arr(ddY->mutable_data(ctx.GetPlace()), + sample_size, NxC); + if (ddX) { + ddy_arr += scale_tile_data * inv_var_tile_data * + (ddx_arr - ddx_arr.colwise().sum() / sample_size - + x_sub_mean_mul_invstd_arr * + (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + } + if (ddScale && ddBias) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({sample_size, NxC}); + ddscale_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap ddscale_tile_data( + ddscale_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); + ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); + + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + Tensor ddbias_tile; + ddbias_tile.Resize({sample_size, NxC}); + ddbias_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap ddbias_tile_data( + ddbias_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); + ddbias_tile_data = ddbias_arr.transpose().replicate(sample_size, N); + + ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; + ddy_arr += ddbias_tile_data; + } + } + } +}; + +DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInference, + {"DY", "DDY"}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(instance_norm, ops::InstanceNormOp, ops::InstanceNormOpMaker, + ops::InstanceNormOpInferVarType, ops::InstanceNormGradMaker); +REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp, + ops::InstanceNormDoubleGradMaker); +REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp, + ops::InstanceNormDoubleGradOpInplaceInference); + +REGISTER_OP_CPU_KERNEL( + instance_norm, + ops::InstanceNormKernel, + ops::InstanceNormKernel); +REGISTER_OP_CPU_KERNEL( + instance_norm_grad, + ops::InstanceNormGradKernel, + ops::InstanceNormGradKernel); +REGISTER_OP_CPU_KERNEL( + instance_norm_grad_grad, + ops::InstanceNormDoubleGradKernel, + ops::InstanceNormDoubleGradKernel); diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..20954342371fa6ecace76fdfc5726638ab9ce78e --- /dev/null +++ b/paddle/fluid/operators/instance_norm_op.cu @@ -0,0 +1,593 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/instance_norm_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; +template +using CudnnDataType = platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +static __global__ void repeat_param(const T *input, T *output, + const int repeat_num, const int C) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < repeat_num * C; + i += blockDim.x * gridDim.x) { + int index = i % C; + output[i] = input[index]; + } +} + +template +static __global__ void add_param(const T *input, T *output, + const int repeat_num, const int C) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ou_storage; + for (int i = blockIdx.x; i < C; i += gridDim.x) { + T ou = static_cast(0); + for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { + const int index = j * C + i; + ou += static_cast(input[index]); + } + ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); + if (threadIdx.x == 0) { + output[i] = ou; + } + __syncthreads(); + + if (AVG) { + output[i] /= repeat_num; + } + } +} + +template +class InstanceNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + "It must be CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + + auto *x = ctx.Input("X"); + auto &x_dims = x->dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + "the dimension of input X must greater than or equal to 2"); + PADDLE_ENFORCE_LE( + x_dims.size(), 5, + "the dimension of input X must smaller than or equal to 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + Tensor x_tmp; + x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D}); + + auto *y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + VLOG(3) << "Setting descriptors."; + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + + auto &dev_ctx = ctx.template device_context(); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); + + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + Tensor scale_tmp = + ctx.AllocateTmpTensor({NxC}, dev_ctx); + scale_tmp.mutable_data(ctx.GetPlace()); + Tensor bias_tmp = + ctx.AllocateTmpTensor({NxC}, dev_ctx); + bias_tmp.mutable_data(ctx.GetPlace()); + + const int n = x->numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min((NxC + block - 1) / block, max_blocks); + + repeat_param<<>>( + scale->data(), scale_tmp.data(), N, C); + repeat_param<<>>( + bias->data(), bias_tmp.data(), N, C); + + auto handle = dev_ctx.cudnn_handle(); + + math::SetConstant> + functor; + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + functor(dev_ctx, saved_mean, static_cast>(0)); + functor(dev_ctx, saved_variance, static_cast>(0)); + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x_tmp.template data(), + data_desc_, y->template mutable_data(ctx.GetPlace()), in_param_desc_, + scale_tmp.template data>(), + bias_tmp.template data>(), 0, nullptr, nullptr, + epsilon, saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()))); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); + } +}; + +template +class InstanceNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + "It must use CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + const auto *scale = ctx.Input("Scale"); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + + const auto &x_dims = x->dims(); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + + Tensor x_tmp, d_y_tmp; + x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D}); + d_y_tmp.ShareDataWith(*d_y).Resize({1, NxC, H, W, D}); + + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + if (d_scale && d_bias) { + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + } + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(scale->dims()[0], C); + + auto &dev_ctx = ctx.template device_context(); + + const int n = x->numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(NxC, max_blocks); + const int grid1 = (C + block - 1) / block; + + Tensor scale_tmp = + ctx.AllocateTmpTensor({NxC}, dev_ctx); + scale_tmp.mutable_data(ctx.GetPlace()); + Tensor d_scale_tmp = + ctx.AllocateTmpTensor({NxC}, dev_ctx); + Tensor d_bias_tmp = + ctx.AllocateTmpTensor({NxC}, dev_ctx); + 
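+    // The per-channel Scale (length C) is tiled to length N * C by
+    // repeat_param below so that every (n, c) instance gets its own copy;
+    // the input was already reshaped to a (1, N*C, H, W, D) view above,
+    // which lets the cuDNN batch-norm backward routine produce per-instance
+    // statistics. The N*C-sized d_scale_tmp / d_bias_tmp results are then
+    // reduced back to C entries by add_param further down.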
repeat_param<<>>( + scale->data(), scale_tmp.data(), N, C); + + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + + if ((H * W * D) == 1) { + framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); + math::SetConstant> + functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); + const void *saved_mean_data = + saved_mean->template data>(); + const void *saved_var_data = + saved_var->template data>(); + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), CudnnDataType::kZero(), + CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, + x_tmp.template data(), data_desc_, d_y_tmp.template data(), + data_desc_, d_x->template mutable_data(ctx.GetPlace()), + in_param_desc_, scale_tmp.template data>(), + d_scale_tmp.template mutable_data>( + ctx.GetPlace()), + d_bias_tmp.template mutable_data>(ctx.GetPlace()), + epsilon, saved_mean_data, saved_var_data)); + + if (d_scale && d_bias) { + add_param<<>>( + d_scale_tmp.data(), d_scale->data(), N, C); + add_param<<>>( + d_bias_tmp.data(), d_bias->data(), N, C); + } + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); + } +}; + +static __device__ __forceinline__ float real_sqrt(float x) { + return 1. / sqrtf(x); +} +static __device__ __forceinline__ double real_sqrt(double x) { + return 1. 
/ sqrt(x); +} + +template +__global__ void DoubleGradComputeDX(const T *x, const T *mean, + const T *variance, const T *ddx, + const T *dy, const T *scale, + const T *ddscale, int C, int sample_size, + const double epsilon, T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T dy_sum_val; + __shared__ T ddx_sum_val; + __shared__ T dy_mul_ddx_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + T dy_sum = 0; + T ddx_sum = 0; + T dy_mul_ddx_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T ddx_i = ddx[i]; + T dy_i = dy[i]; + T tmp = x[i] - mean_val; + + dy_sum += dy_i; + ddx_sum += ddx_i; + dy_mul_ddx_sum += (ddx_i * dy_i); + + dy_mul_x_sub_mean_sum += (dy_i * tmp); + ddx_mul_x_sub_mean_sum += (ddx_i * tmp); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + dy_mul_ddx_sum = + BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + ddx_sum_val = ddx_sum; + dy_mul_ddx_sum_val = dy_mul_ddx_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] += + ((x[i] - mean_val) * var_val * var_val * var_val / sample_size * + (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + + 3. 
* dy_mul_x_sub_mean_sum_val * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + + ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (dy_sum_val / sample_size - dy[i]) + + dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (ddx_sum_val / sample_size - ddx[i])) * + scale[c]; + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val - + (x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val * + var_val / sample_size) * + ddscale[c]; + } + } +} + +template +__global__ void DoubleGradComputeDDY(const T *x, const T *mean, + const T *variance, const T *ddscale, + const T *ddbias, const T *ddx, + const T *scale, int C, int sample_size, + const double epsilon, T *ddy) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T ddx_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + T ddx_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T ddx_i = ddx[i]; + ddx_sum += ddx_i; + ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val)); + } + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + ddx_sum_val = ddx_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += scale[c] * var_val * + (ddx[i] - ddx_sum_val / sample_size - + (x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val * + var_val / sample_size); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += (x[i] - mean_val) * var_val * ddscale[c]; + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += ddbias[c]; + } + } +} + +template +__global__ void DoubleGradComputeDScale(const T *x, const T *mean, + const T *variance, const T *ddx, + const T *dy, int C, int sample_size, + const double epsilon, T *dscale) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; + __shared__ T dy_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + + T dy_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T dy_i = dy[i]; + dy_sum += dy_i; + dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val)); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; 
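+    // Only thread 0 holds the final block-reduced sums, so it publishes them
+    // to shared memory; the __syncthreads() below makes the values visible to
+    // all threads before the dscale contribution is accumulated.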
+ } + __syncthreads(); + + if (ddx != nullptr) { + T dscale_tmp = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dscale_tmp += + ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size - + dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) * + var_val * var_val / sample_size); + } + dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); + + if (threadIdx.x == 0) { + dscale[ncid] += dscale_tmp; + } + __syncthreads(); + } +} + +template +class InstanceNormDoubleGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *X = ctx.Input("X"); + const auto *Scale = ctx.Input("Scale"); + const auto *dY = ctx.Input("DY"); + const auto *Saved_mean = ctx.Input("SavedMean"); + const auto *Saved_variance = ctx.Input("SavedVariance"); + const auto *running_mean = ctx.Input("Mean"); + const auto *running_var = ctx.Input("Variance"); + const auto *ddX = ctx.Input("DDX"); + const auto *ddScale = ctx.Input("DDScale"); + const auto *ddBias = ctx.Input("DDBias"); + const double epsilon = static_cast(ctx.Attr("epsilon")); + + auto *dX = ctx.Output("DX"); + auto *dScale = ctx.Output("DScale"); + auto *ddY = ctx.Output("DDY"); + + const T *x_data = X->data(); + const T *scale_data = Scale->data(); + const T *dy_data = dY->data(); + const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); + + const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); + const T *ddbias_data = (ddScale == nullptr ? nullptr : ddBias->data()); + + const T *mean_data = Saved_mean->data(); + const T *variance_data = Saved_variance->data(); + + auto &x_dims = X->dims(); + int N, C, H, W, D; + ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + const int n = X->numel(); + int sample_size = n / N / C; + + auto &dev_ctx = ctx.template device_context(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = NxC; + const int grid1 = (C + block - 1) / block; + + math::SetConstant set_zero; + + if (dX) { + T *dx_data = dX->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, dX, static_cast(0)); + DoubleGradComputeDX<<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, + ddscale_data, C, sample_size, epsilon, dx_data); + } + if (dScale) { + Tensor dscale_tmp = + ctx.AllocateTmpTensor({NxC}, dev_ctx); + set_zero(dev_ctx, &dscale_tmp, static_cast(0)); + T *dscale_tmp_data = dscale_tmp.mutable_data(ctx.GetPlace()); + + T *dscale_data = dScale->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, dScale, static_cast(0)); + DoubleGradComputeDScale<<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, C, sample_size, + epsilon, dscale_tmp_data); + add_param<<>>( + dscale_tmp.data(), dScale->data(), N, C); + } + if (ddY) { + T *ddy_data = ddY->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, ddY, static_cast(0)); + DoubleGradComputeDDY<<>>( + x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, + scale_data, C, sample_size, epsilon, ddy_data); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL( + instance_norm, ops::InstanceNormKernel, + ops::InstanceNormKernel); +REGISTER_OP_CUDA_KERNEL( + instance_norm_grad, + ops::InstanceNormGradKernel, + ops::InstanceNormGradKernel); +REGISTER_OP_CUDA_KERNEL( + instance_norm_grad_grad, + 
ops::InstanceNormDoubleGradKernel, + ops::InstanceNormDoubleGradKernel); diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..509c1ff038d1f2169bc54bbdef5f8dc210a78120 --- /dev/null +++ b/paddle/fluid/operators/instance_norm_op.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/norm_utils.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +class InstanceNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override; +}; + +class InstanceNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override; +}; + +class InstanceNormDoubleGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override; +}; + +class InstanceNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +class InstanceNormGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override; +}; + +class InstanceNormDoubleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override; +}; + +class InstanceNormOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", "Y"}}; + } +}; + +template +class InstanceNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override; +}; + +template +class InstanceNormGradKernel : public framework::OpKernel { + public: + void Compute(const 
framework::ExecutionContext &ctx) const override; +}; + +template +class InstanceNormDoubleGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/norm_utils.h b/paddle/fluid/operators/norm_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fee06fe5dd4fae2e843bc639bba4afc259b78ea5 --- /dev/null +++ b/paddle/fluid/operators/norm_utils.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using DataLayout = framework::DataLayout; + +inline void ExtractNCWHD(const framework::DDim &dims, + const DataLayout &data_layout, int *N, int *C, int *H, + int *W, int *D) { + *N = dims[0]; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) + : 1; + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu index 059effd22d851b14b73fd9ae974e362d563f3cbd..4f584cbb56add3d1783450d0ac0d313b001860c1 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu +++ b/paddle/fluid/operators/sync_batch_norm_op.cu @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/norm_utils.h" #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/nccl_helper.h" diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 606acbfbeeeb814d03feb1ed3258259eecdc690f..76ceff6fde2d431cbafc4f1ebe3fa89904f915dc 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -61,6 +61,7 @@ __all__ = [ 'adaptive_pool2d', 'adaptive_pool3d', 'batch_norm', + 'instance_norm', 'data_norm', 'beam_search_decode', 'conv2d_transpose', @@ -3498,6 +3499,128 @@ def batch_norm(input, return helper.append_activation(batch_norm_out) +def instance_norm(input, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + name=None): + """ + **Instance Normalization Layer** + + Can be used as a normalizer function for conv2d and fully_connected operations. + The required data format for this layer is one of the following: + + DataLayout: NCHW `[batch, in_channels, in_height, in_width]` + + Refer to `Instance Normalization: The Missing Ingredient for + Fast Stylization `_ + for more details. 
+ + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean of one feature map in mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance of one feature map in mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + + When use_global_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global (or running) statistics. (It usually got from the + pre-trained model.) + The training and testing (or inference) have the same behavior: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta + + Args: + input(variable): The rank of input variable can be 2, 3, 4, 5. + epsilon(float, Default 1e-05): A value added to the denominator for + numerical stability. Default is 1e-5. + param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. + If the Initializer of the param_attr is not set, the parameter is initialized + with Xavier. Default: None. + bias_attr(ParamAttr|None): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + Default: None. + name(string, Default None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: A tensor variable which is the result after applying instance normalization on the input. + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + x = fluid.layers.data(name='x', shape=[3, 7, 3, 7], dtype='float32', append_batch_size=False) + hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') + hidden2 = fluid.layers.instance_norm(input=hidden1) + """ + assert bias_attr is not False, "bias_attr should not be False in instance_norm." 
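+    # For each (n, c) instance the op normalizes over the spatial dimensions:
+    #   y = scale[c] * (x - mean) / sqrt(var + epsilon) + bias[c]
+    # SavedMean holds the per-instance means and SavedVariance holds the
+    # per-instance inverse standard deviations, both of shape [N * C].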
+ helper = LayerHelper('instance_norm', **locals()) + dtype = helper.input_dtype() + + # use fp32 for in parameter + if dtype == core.VarDesc.VarType.FP16: + dtype = core.VarDesc.VarType.FP32 + + input_shape = input.shape + channel_num = input_shape[1] + + param_shape = [channel_num] + + # create parameter + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0)) + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=param_shape, + dtype=dtype, + is_bias=True, + default_initializer=Constant(0.0)) + + # create output + saved_mean = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + saved_variance = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + + instance_norm_out = helper.create_variable_for_type_inference(dtype) + + helper.append_op( + type="instance_norm", + inputs={ + "X": input, + "Scale": scale, + "Bias": bias, + }, + outputs={ + "Y": instance_norm_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance + }, + attrs={"epsilon": epsilon, }) + + return instance_norm_out + + def data_norm(input, act=None, epsilon=1e-05, diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py new file mode 100644 index 0000000000000000000000000000000000000000..7c2a4d212faad42b3bb41490732a7ad1d082e302 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py @@ -0,0 +1,188 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
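+# The reference helpers below mirror the operator: the forward pass normalizes
+# each (N, C) feature map with its own mean and variance, and the backward
+# pass checks d_x, d_scale and d_bias against the analytic formulas spelled
+# out in _reference_instance_norm_grad.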
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.op import Operator +from op_test import OpTest + + +def _reference_instance_norm_naive(x, scale, bias, epsilon, mean, var): + x_shape = x.shape + if len(x_shape) == 2: + x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) + n, c, h, w = x.shape + + mean_tile = np.reshape(mean, (n, c, 1, 1)) + mean_tile = np.tile(mean_tile, (1, 1, h, w)) + var_tile = np.reshape(var, (n, c, 1, 1)) + var_tile = np.tile(var_tile, (1, 1, h, w)) + + x_norm = (x - mean_tile) / np.sqrt(var_tile + epsilon).astype('float32') + scale_tile = np.reshape(scale, (1, c, 1, 1)) + scale_tile = np.tile(scale_tile, (n, 1, h, w)) + bias_tile = np.reshape(bias, (1, c, 1, 1)) + bias_tile = np.tile(bias_tile, (n, 1, h, w)) + y = scale_tile * x_norm + bias_tile + if len(x_shape) == 2: + y = np.reshape(y, x_shape) + return y, mean, var + + +def _reference_instance_norm_grad(x, d_y, scale, mean, var, epsilon): + # d_scale = sum(d_y * (x-mean) / sqrt(var+epsilon)) + # d_offset = sum(d_y) + # d_x = scale / sqrt(var+epsilon) * (d_y - np.mean(d_y, axis=(2,3)) - (x-mean)/sqrt(var+epsilon)* np.mean(y_grad * (x-mean)/sqrt(var+epsilon), axis=(2,3))) + n, c, h, w = x.shape + + d_bias = np.sum(d_y, axis=(0, 2, 3)) + + mean_tile = np.reshape(mean, (n, c, 1, 1)) + mean_tile = np.tile(mean_tile, (1, 1, h, w)) + var_tile = np.reshape(var, (n, c, 1, 1)) + var_tile = np.tile(var_tile, (1, 1, h, w)) + + d_scale = np.sum(d_y * (x - mean_tile) * var_tile, axis=(0, 2, 3)) + var_inv = var_tile + scale_tile = np.reshape(scale, (1, c, 1, 1)) + scale_tile = np.tile(scale_tile, (n, 1, h, w)) + + d_x = scale_tile * var_inv * (d_y - np.mean( + d_y, axis=(2, 3), keepdims=True) - (x - mean_tile) * var_inv * np.mean( + d_y * (x - mean_tile) * var_inv, axis=(2, 3), keepdims=True)) + return d_x, d_scale, d_bias + + +def _cal_mean_variance(x, epsilon, mean_shape): + mean = np.reshape(np.mean(x, axis=(2, 3)), mean_shape) + var = np.reshape(np.var(x, axis=(2, 3)), mean_shape) + return mean, var + + +class TestInstanceNormOpTraining(unittest.TestCase): + def setUp(self): + self.epsilon = 1e-5 + self.init_test_case() + + def init_test_case(self): + self.use_global_stats = False + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'saved_mean', 'saved_variance', 'x@GRAD', 'scale@GRAD', + 'bias@GRAD' + ] + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + + def set_global_mean_var(self, mean_shape, x): + mean, variance = _cal_mean_variance(x, self.epsilon, mean_shape) + return mean, variance + + def test_forward_backward(self): + def test_with_place(place, shape): + epsilon = self.epsilon + n, c, h, w = shape[0], shape[1], shape[2], shape[3] + scale_shape = [c] + mean_shape = [n * c] + + np.random.seed() + x = np.random.random_sample(shape).astype(np.float32) + scale = np.random.random_sample(scale_shape).astype(np.float32) + bias = np.random.random_sample(scale_shape).astype(np.float32) + mean, variance = self.set_global_mean_var(mean_shape, x) + d_y = np.random.random_sample(shape).astype(np.float32) + + y, saved_mean, variance_tmp = _reference_instance_norm_naive( + x, scale, bias, epsilon, mean, variance) + + saved_variance = 1 / np.sqrt(variance_tmp + epsilon) + + d_x, d_scale, d_bias = _reference_instance_norm_grad( + x, d_y, scale, saved_mean, saved_variance, epsilon) + + var_dict = locals() + var_dict['y@GRAD'] = d_y 
+ var_dict['x@GRAD'] = d_x + var_dict['scale@GRAD'] = d_scale + var_dict['bias@GRAD'] = d_bias + + var_names = [ + 'x', 'scale', 'bias', 'y', 'saved_mean', 'saved_variance' + ] + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, + dtype='float32', + shape=ground_truth[name].shape) + in_op = block.append_op( + type="instance_norm", + inputs={ + "X": block.var("x"), + "Scale": block.var("scale"), + "Bias": block.var("bias"), + }, + outputs={ + "Y": block.var("y"), + "SavedMean": block.var("saved_mean"), + "SavedVariance": block.var("saved_variance") + }, + attrs={"epsilon": epsilon, }) + + block.create_var(name="y@GRAD", dtype='float32', shape=y.shape) + + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + in_op.desc, self.no_grad_set, []) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) + + exe = fluid.Executor(place) + out = exe.run(program, + feed={ + name: var_dict[name] + for name in ['x', 'scale', 'bias', 'y@GRAD'] + }, + fetch_list=self.fetch_list) + + for id, name in enumerate(self.fetch_list): + self.__assert_close(var_dict[name], out[id], name) + print("op test forward passes: ", str(place)) + + places = [core.CPUPlace()] + + if core.is_compiled_with_cuda() and core.op_support_gpu( + "instance_norm"): + places.append(core.CUDAPlace(0)) + for place in places: + test_with_place(place, [2, 3, 4, 5]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..4f29467a3c5515eaff985e22aed4eccf16867757 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -0,0 +1,53 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.core as core +import gradient_checker + +from decorator_helper import prog_scope + + +class TestInstanceNormDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + prog = fluid.Program() + with fluid.program_guard(prog): + np.random.seed() + shape = [2, 3, 4, 5] + dtype = "float32" + eps = 0.005 + atol = 1e-4 + x = layers.create_parameter(dtype=dtype, shape=shape, name='x') + z = fluid.layers.instance_norm(input=x) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + gradient_checker.double_grad_check( + [x], z, x_init=x_arr, atol=atol, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +if __name__ == "__main__": + unittest.main()
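
As a quick cross-check for reviewers, here is a minimal NumPy sketch (not part of the patch) of the forward computation that the new operator and the `fluid.layers.instance_norm` wrapper implement: per-(N, C) mean and variance taken over the spatial dimensions, followed by scale and shift. The helper name `instance_norm_ref`, the input shape, and the epsilon value below are illustrative assumptions only.

    import numpy as np

    def instance_norm_ref(x, scale, bias, epsilon=1e-5):
        """Reference instance norm for NCHW input; scale and bias have shape (C,)."""
        n, c, h, w = x.shape
        # Statistics are computed per instance and per channel, over H x W.
        mean = x.mean(axis=(2, 3), keepdims=True)
        var = x.var(axis=(2, 3), keepdims=True)
        x_hat = (x - mean) / np.sqrt(var + epsilon)
        return scale.reshape(1, c, 1, 1) * x_hat + bias.reshape(1, c, 1, 1)

    if __name__ == "__main__":
        x = np.random.random_sample((2, 3, 4, 5)).astype(np.float32)
        scale = np.ones(3, dtype=np.float32)
        bias = np.zeros(3, dtype=np.float32)
        y = instance_norm_ref(x, scale, bias)
        # With scale == 1 and bias == 0, every (n, c) feature map of y should have
        # roughly zero mean and unit variance.
        print(np.abs(y.mean(axis=(2, 3))).max(), np.abs(y.var(axis=(2, 3)) - 1).max())

With `scale` fixed to ones and `bias` to zeros, each (n, c) feature map of the output should come out with mean close to 0 and variance close to 1, which mirrors the property test_instance_norm_op.py verifies against the operator's output.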