diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc
index 2cbd48cf093e263c3d152f72606de65bde613688..de92de453a354110dcaedae9b4bc1e53ab46a554 100644
--- a/paddle/fluid/operators/instance_norm_op.cc
+++ b/paddle/fluid/operators/instance_norm_op.cc
@@ -17,93 +17,16 @@ limitations under the License. */
 #include
 #include
 #include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/ternary.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
 namespace operators {
-void InstanceNormOp::InferShape(framework::InferShapeContext *ctx) const {
-  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNorm");
-  OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "InstanceNorm");
-  OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean",
-                 "InstanceNorm");
-  OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance",
-                 "InstanceNorm");
-
-  const auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_NE(phi::product(x_dims), 0,
-                    platform::errors::PreconditionNotMet(
-                        "The Input variable X(%s) has not "
-                        "been initialized. You may need to confirm "
-                        "if you put exe.run(startup_program) "
-                        "after optimizer.minimize function.",
-                        ctx->Inputs("X").front()));
-  PADDLE_ENFORCE_GE(
-      x_dims.size(), 2,
-      platform::errors::InvalidArgument(
-          "ShapeError: the dimension of input X must "
-          "greater than or equal to 2. But received: the shape of input "
-          "X = [%s], the dimension of input X =[%d]",
-          x_dims, x_dims.size()));
-  PADDLE_ENFORCE_LE(
-      x_dims.size(), 5,
-      platform::errors::InvalidArgument(
-          "ShapeError: the dimension of input X must "
-          "smaller than or equal to 5, But received: the shape of input "
-          "X = [%s], the dimension of input X = [%d]",
-          x_dims, x_dims.size()));
-  auto N = x_dims[0];
-  auto C = x_dims[1];
-  auto NxC = N * C;
-
-  if (ctx->HasInput("Scale")) {
-    auto scale_dim = ctx->GetInputDim("Scale");
-
-    PADDLE_ENFORCE_EQ(
-        scale_dim.size(), 1UL,
-        platform::errors::InvalidArgument(
-            "ShapeError: the dimension of scale must equal to 1."
-            "But received: the shape of scale is [%s], the dimension "
-            "of scale is [%d]",
-            scale_dim, scale_dim.size()));
-
-    bool check = !((!ctx->IsRuntime()) && (phi::product(scale_dim) <= 0));
-
-    if (check) {
-      PADDLE_ENFORCE_EQ(scale_dim[0], C,
-                        platform::errors::InvalidArgument(
-                            "ShapeError: the shape of scale must equal to [%d]"
-                            "But received: the shape of scale is [%d]",
-                            C, scale_dim[0]));
-    }
-  }
-  if (ctx->HasInput("Bias")) {
-    auto bias_dim = ctx->GetInputDim("Bias");
-    PADDLE_ENFORCE_EQ(
-        bias_dim.size(), 1UL,
-        platform::errors::InvalidArgument(
-            "ShapeError: the dimension of bias must equal to 1."
- "But received: the shape of bias is [%s],the dimension " - "of bias is [%d]", - bias_dim, bias_dim.size())); - - bool check = !((!ctx->IsRuntime()) && (phi::product(bias_dim) <= 0)); - if (check) { - PADDLE_ENFORCE_EQ(bias_dim[0], C, - platform::errors::InvalidArgument( - "ShapeError: the shape of bias must equal to [%d]" - "But received: the shape of bias is [%d]", - C, bias_dim[0])); - } - } - - ctx->SetOutputDim("Y", x_dims); - ctx->SetOutputDim("SavedMean", {NxC}); - ctx->SetOutputDim("SavedVariance", {NxC}); - ctx->ShareLoD("X", "Y"); -} - framework::OpKernelType InstanceNormOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); @@ -170,29 +93,6 @@ NCHW `[batch, in_channels, in_height, in_width]` )DOC"); } -void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", - framework::GradVarName("Y"), "InstanceNormGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", - "InstanceNormGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance", - "InstanceNormGrad"); - - // check output - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", - framework::GradVarName("X"), "InstanceNormGrad"); - const auto x_dims = ctx->GetInputDim("X"); - const int C = x_dims[1]; - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - if (ctx->HasOutput(framework::GradVarName("Scale"))) { - ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); - } - if (ctx->HasOutput(framework::GradVarName("Bias"))) { - ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); - } -} - framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { const auto *var = ctx.InputVar(framework::GradVarName("Y")); @@ -214,34 +114,6 @@ framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -void InstanceNormDoubleGradOp::InferShape( - framework::InferShapeContext *ctx) const { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", - "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance", - "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", - "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "InstanceNormDoubleGrad"); - - // check output - OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", - "InstanceNormDoubleGrad"); - - const auto x_dims = ctx->GetInputDim("X"); - const int C = x_dims[1]; - if (ctx->HasOutput("DX")) { - ctx->SetOutputDim("DX", x_dims); - } - if (ctx->HasOutput("DScale")) { - ctx->SetOutputDim("DScale", {C}); - } - if (ctx->HasOutput("DDY")) { - ctx->ShareDim("X", "DDY"); - } -} - framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { const auto *var = ctx.InputVar("DY"); @@ -263,213 +135,6 @@ framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class InstanceNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = 
ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; - - const auto &x_dims = X->dims(); - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - const int sample_size = X->numel() / N / C; - const int NxC = N * C; - - const T *mean_data = Saved_mean->data(); - const T *inv_var_data = Saved_variance->data(); - Tensor mean_tensor; - Tensor inv_var_tensor; - ConstEigenArrayMap x_arr(X->data(), sample_size, NxC); - ConstEigenVectorArrayMap mean_arr(mean_data, NxC); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, NxC); - - Tensor mean_tile; - mean_tile.Resize({sample_size, NxC}); - mean_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), - sample_size, NxC); - - Tensor inv_var_tile; - inv_var_tile.Resize({sample_size, NxC}); - inv_var_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap inv_var_tile_data( - inv_var_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - - mean_tile_data = mean_arr.transpose().replicate(sample_size, 1); - inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 1); - - Tensor Scale_data; - if (!Scale) { - Scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &Scale_data, static_cast(1)); - } - ConstEigenVectorArrayMap scale_arr( - Scale ? Scale->data() : Scale_data.data(), C); - - Tensor scale_tile; - scale_tile.Resize({sample_size, NxC}); - scale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), - sample_size, NxC); - scale_tile_data = scale_arr.transpose().replicate(sample_size, N); - - ConstEigenArrayMap dy_arr(dY->data(), sample_size, NxC); - ConstEigenArrayMap ddx_arr(ddX->data(), sample_size, NxC); - - // math: dx = scale * ((x - mean) * inv_var / HxW * (np.mean(ddx, - // axis=(h,w)) * - // np.sum(dy, axis=(h,w)) - - // np.sum(dy * ddx, axis=(h,w)) + 3 * np.mean(dy * (x - mean), - // axis=(h,w)) * inv_var.pow(2) * - // np.sum(ddx * (x - mean), axis=(h,w))) + inv_var.pow(3) / HxW * - // np.sum(ddx * (x - mean)) * - // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * - // np.sum(dy, - // axis=(h,w)) * (x - mean) * - // (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var - - // inv_var * - // np.mean(dy, axis=(h,w)) - - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(h,w))) - - Tensor x_sub_mean_mul_invstd; - x_sub_mean_mul_invstd.Resize({sample_size, NxC}); - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); - EigenArrayMap x_sub_mean_mul_invstd_arr( - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), sample_size, - NxC); - x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dX, static_cast(0)); - EigenArrayMap dx_arr(dX->mutable_data(ctx.GetPlace()), sample_size, - NxC); - - if (ddX) { - dx_arr += - x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / - sample_size * - (ddx_arr.colwise().sum() * dy_arr.colwise().sum() / sample_size - - (dy_arr * ddx_arr).colwise().sum() + - 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() * - (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size); - - dx_arr += (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size * inv_var_tile_data * inv_var_tile_data * - (dy_arr.colwise().sum() / sample_size - dy_arr); - - dx_arr += (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size * inv_var_tile_data * inv_var_tile_data * - (ddx_arr.colwise().sum() / sample_size - ddx_arr); - - dx_arr = scale_tile_data * dx_arr; - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - - Tensor ddscale_tile; - ddscale_tile.Resize({sample_size, NxC}); - ddscale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); - - dx_arr += (dy_arr * inv_var_tile_data - - dy_arr.colwise().sum() / sample_size * inv_var_tile_data - - x_sub_mean_mul_invstd_arr * inv_var_tile_data * - (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size) * - ddscale_tile_data; - } - } - if (dScale) { - // math: dscale = inv_var * (dy - np.mean(dy, axis=(h,w) - (x-mean) * - // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(h,w)))) * ddx - dScale->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dScale, static_cast(0)); - EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), - C); - if (ddX) { - Tensor first_grad; - first_grad.Resize({sample_size, NxC}); - first_grad.mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, &first_grad, static_cast(0)); - EigenArrayMap first_grad_arr( - first_grad.mutable_data(ctx.GetPlace()), sample_size, NxC); - - first_grad_arr += - inv_var_tile_data * - (dy_arr - - dy_arr.colwise().sum().replicate(sample_size, 1) / sample_size - - x_sub_mean_mul_invstd_arr * - (dy_arr * x_sub_mean_mul_invstd_arr) - .colwise() - .sum() - .replicate(sample_size, 1) / - sample_size); - first_grad_arr = first_grad_arr * ddx_arr; - for (int nc = 0; nc < NxC; ++nc) { - int c = nc % C; - dscale_arr(c) += first_grad_arr.colwise().sum()(nc); - } - } - } - if (ddY) { - // math: ddy = (x - mean) * inv_var * ddscale + ddbias + - // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * - // np.mean(ddx * (x - mean), axis=(h,w))) - ddY->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, ddY, static_cast(0)); - EigenArrayMap ddy_arr(ddY->mutable_data(ctx.GetPlace()), - sample_size, NxC); - if (ddX) { - ddy_arr += scale_tile_data * inv_var_tile_data * - (ddx_arr - ddx_arr.colwise().sum() / sample_size - - x_sub_mean_mul_invstd_arr * - (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size); - } - if (ddScale && ddBias) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({sample_size, NxC}); - ddscale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); - - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({sample_size, NxC}); - ddbias_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - ddbias_tile_data = ddbias_arr.transpose().replicate(sample_size, N); - - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - ddy_arr += ddbias_tile_data; - } - } - } -}; - 
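
The fluid CPU kernel removed above reappears later in this patch as a phi kernel in paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc, with a GPU counterpart in paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu. A minimal sketch of the new entry point and its CPU registration follows; the template parameter list and the paddle::optional<const DenseTensor&> argument types are assumptions filled in here, since angle-bracket contents are stripped throughout this diff.

// Sketch of the phi-style replacement; optional<> argument types are assumed.
template <typename T, typename Context>
void InstanceNormDoubleGradKernel(const Context& dev_ctx,
                                  const DenseTensor& x,
                                  paddle::optional<const DenseTensor&> scale,
                                  const DenseTensor& saved_mean,
                                  const DenseTensor& saved_variance,
                                  const DenseTensor& dy,
                                  paddle::optional<const DenseTensor&> ddx,
                                  paddle::optional<const DenseTensor&> ddscale,
                                  paddle::optional<const DenseTensor&> ddbias,
                                  float epsilon,
                                  DenseTensor* dx,
                                  DenseTensor* dscale,
                                  DenseTensor* ddy);

// CPU registration, as added later in this patch:
PD_REGISTER_KERNEL(instance_norm_double_grad,
                   CPU,
                   ALL_LAYOUT,
                   phi::InstanceNormDoubleGradKernel,
                   float,
                   double) {}

The GPU file registers the same kernel name with PD_REGISTER_KERNEL(..., GPU, ...), restricted to float under HIP because MIOpen does not support double.
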
 DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInferer,
                            {"DY", "DDY"});
@@ -477,22 +142,26 @@ DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInferer,
 } // namespace paddle
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(instance_norm, InstanceNormInferShapeFunctor,
+                            PD_INFER_META(phi::InstanceNormInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(instance_norm_grad,
+                            InstanceNormGradInferShapeFunctor,
+                            PD_INFER_META(phi::InstanceNormGradInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(
+    instance_norm_grad_grad, InstanceNormDoubleGradInferShapeFunctor,
+    PD_INFER_META(phi::InstanceNormDoubleGradInferMeta));
 REGISTER_OPERATOR(instance_norm, ops::InstanceNormOp, ops::InstanceNormOpMaker,
                   ops::InstanceNormOpInferVarType,
                   ops::InstanceNormGradMaker<paddle::framework::OpDesc>,
-                  ops::InstanceNormGradMaker<paddle::imperative::OpBase>);
+                  ops::InstanceNormGradMaker<paddle::imperative::OpBase>,
+                  InstanceNormInferShapeFunctor);
 REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp,
                   ops::InstanceNormDoubleGradMaker<paddle::framework::OpDesc>,
-                  ops::InstanceNormDoubleGradMaker<paddle::imperative::OpBase>);
+                  ops::InstanceNormDoubleGradMaker<paddle::imperative::OpBase>,
+                  InstanceNormGradInferShapeFunctor);
 REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp,
-                  ops::InstanceNormDoubleGradOpInplaceInferer);
-
-REGISTER_OP_CPU_KERNEL(
-    instance_norm_grad_grad,
-    ops::InstanceNormDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                      float>,
-    ops::InstanceNormDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                      double>);
+                  ops::InstanceNormDoubleGradOpInplaceInferer,
+                  InstanceNormDoubleGradInferShapeFunctor);
 REGISTER_OP_VERSION(instance_norm)
     .AddCheckpoint(
diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu
deleted file mode 100644
index 192422429371b45243b6724f652d4891e1266fd3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/instance_norm_op.cu
+++ /dev/null
@@ -1,434 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/operators/instance_norm_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using DataLayout = framework::DataLayout; -template -using CudnnDataType = platform::CudnnDataType; -template -using BatchNormParamType = typename CudnnDataType::BatchNormParamType; - -template -static __global__ void repeat_param(const T *input, T *output, - const int repeat_num, const int C) { - CUDA_KERNEL_LOOP(i, repeat_num * C) { - int index = i % C; - output[i] = input[index]; - } -} - -template -static __global__ void add_param(const T *input, T *output, - const int repeat_num, const int C) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage ou_storage; - for (int i = blockIdx.x; i < C; i += gridDim.x) { - T ou = static_cast(0); - for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { - const int index = j * C + i; - ou += static_cast(input[index]); - } - ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); - if (threadIdx.x == 0) { - output[i] = ou; - } - __syncthreads(); - - if (AVG) { - output[i] /= repeat_num; - } - } -} - -template -static __global__ void GradComputeDX(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *mean, - const T *x, - const BatchNormParamType *variance, - const int C, const int sample_size, - T *dx) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - BatchNormParamType mean_val = mean[ncid]; - BatchNormParamType inv_var_val = variance[ncid]; - - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; - __shared__ BatchNormParamType dy_sum_val; - __shared__ BatchNormParamType dy_x_sub_mean_sum_val; - - BatchNormParamType dy_sum = static_cast>(0); - BatchNormParamType dy_x_sub_mean_sum = - static_cast>(0); - - for (int i = beg_idx; i < end_idx; i += BlockDim) { - BatchNormParamType dy_i = static_cast>(dy[i]); - dy_sum += dy_i; - dy_x_sub_mean_sum += - dy_i * (static_cast>(x[i]) - mean_val); - } - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_x_sub_mean_sum = - BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; - } - __syncthreads(); - - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] = - (static_cast>(dy[i]) - - dy_sum_val / static_cast>(sample_size) - - (static_cast>(x[i]) - mean_val) * - dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) * - scale[c] * inv_var_val; - } -} - -static __device__ __forceinline__ float real_sqrt(float x) { - return 1. / sqrtf(x); -} -static __device__ __forceinline__ double real_sqrt(double x) { - return 1. 
/ sqrt(x); -} - -template -__global__ void DoubleGradComputeDX(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, const T *scale, - const T *ddscale, int C, int sample_size, - const double epsilon, T *dx) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage ddx_storage; - __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; - __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; - __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; - __shared__ T dy_sum_val; - __shared__ T ddx_sum_val; - __shared__ T dy_mul_ddx_sum_val; - __shared__ T dy_mul_x_sub_mean_sum_val; - __shared__ T ddx_mul_x_sub_mean_sum_val; - - T dy_sum = 0; - T ddx_sum = 0; - T dy_mul_ddx_sum = 0; - T dy_mul_x_sub_mean_sum = 0; - T ddx_mul_x_sub_mean_sum = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - T ddx_i = ddx[i]; - T dy_i = dy[i]; - T tmp = x[i] - mean_val; - - dy_sum += dy_i; - ddx_sum += ddx_i; - dy_mul_ddx_sum += (ddx_i * dy_i); - - dy_mul_x_sub_mean_sum += (dy_i * tmp); - ddx_mul_x_sub_mean_sum += (ddx_i * tmp); - } - - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); - dy_mul_ddx_sum = - BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); - dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) - .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); - ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) - .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - ddx_sum_val = ddx_sum; - dy_mul_ddx_sum_val = dy_mul_ddx_sum; - dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; - ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; - } - __syncthreads(); - - if (ddx != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] += - ((x[i] - mean_val) * var_val * var_val * var_val / sample_size * - (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + - 3. 
* dy_mul_x_sub_mean_sum_val * var_val * - ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + - ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * - var_val * (dy_sum_val / sample_size - dy[i]) + - dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * - var_val * (ddx_sum_val / sample_size - ddx[i])) * - scale[c]; - } - } - __syncthreads(); - if (ddscale != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val - - (x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val * - var_val / sample_size) * - ddscale[c]; - } - } -} - -template -__global__ void DoubleGradComputeDDY(const T *x, const T *mean, - const T *variance, const T *ddscale, - const T *ddbias, const T *ddx, - const T *scale, int C, int sample_size, - const double epsilon, T *ddy) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage ddx_storage; - __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; - __shared__ T ddx_sum_val; - __shared__ T ddx_mul_x_sub_mean_sum_val; - - T ddx_sum = 0; - T ddx_mul_x_sub_mean_sum = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - T ddx_i = ddx[i]; - ddx_sum += ddx_i; - ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val)); - } - ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); - ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) - .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - ddx_sum_val = ddx_sum; - ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; - } - __syncthreads(); - - if (ddx != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += scale[c] * var_val * - (ddx[i] - ddx_sum_val / sample_size - - (x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val * - var_val / sample_size); - } - } - __syncthreads(); - if (ddscale != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += (x[i] - mean_val) * var_val * ddscale[c]; - } - } - __syncthreads(); - if (ddbias != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += ddbias[c]; - } - } -} - -template -__global__ void DoubleGradComputeDScale(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, int C, int sample_size, - const double epsilon, T *dscale) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; - __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; - __shared__ T dy_sum_val; - __shared__ T dy_mul_x_sub_mean_sum_val; - - T dy_sum = 0; - T dy_mul_x_sub_mean_sum = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - T dy_i = dy[i]; - dy_sum += dy_i; - dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val)); - } - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) - .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; 
- } - __syncthreads(); - - if (ddx != nullptr) { - T dscale_tmp = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dscale_tmp += - ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size - - dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) * - var_val * var_val / sample_size); - } - dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); - - if (threadIdx.x == 0) { - dscale[ncid] += dscale_tmp; - } - __syncthreads(); - } -} - -template -class InstanceNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - const double epsilon = static_cast(ctx.Attr("epsilon")); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - const T *x_data = X->data(); - const T *dy_data = dY->data(); - const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); - - const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); - const T *ddbias_data = (ddScale == nullptr ? nullptr : ddBias->data()); - - const T *mean_data = Saved_mean->data(); - const T *variance_data = Saved_variance->data(); - - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_zero; - - auto &x_dims = X->dims(); - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - int NxC = N * C; - const int n = X->numel(); - int sample_size = n / N / C; - - Tensor scale_tmp; - if (!Scale) { - scale_tmp.mutable_data({C}, ctx.GetPlace()); - set_zero(dev_ctx, &scale_tmp, static_cast(1)); - } - const T *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); - - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = NxC; - const int grid1 = (C + block - 1) / block; - - if (dX) { - T *dx_data = dX->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dX, static_cast(0)); - DoubleGradComputeDX<<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, - ddscale_data, C, sample_size, epsilon, dx_data); - } - if (dScale) { - Tensor dscale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - set_zero(dev_ctx, &dscale_tmp, static_cast(0)); - T *dscale_tmp_data = dscale_tmp.mutable_data(ctx.GetPlace()); - - T *dscale_data = dScale->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dScale, static_cast(0)); - DoubleGradComputeDScale<<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, C, sample_size, - epsilon, dscale_tmp_data); - add_param<<>>( - dscale_tmp.data(), dScale->data(), N, C); - } - if (ddY) { - T *ddy_data = ddY->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, ddY, static_cast(0)); - DoubleGradComputeDDY<<>>( - x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, - scale_data, C, sample_size, epsilon, ddy_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL(instance_norm_grad_grad, - ops::InstanceNormDoubleGradKernel< - paddle::platform::CUDADeviceContext, float>); -#else -REGISTER_OP_CUDA_KERNEL( - instance_norm_grad_grad, - ops::InstanceNormDoubleGradKernel, - ops::InstanceNormDoubleGradKernel); -#endif diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h index 493f54ab3baa6dbf9166ed709b392fce1c9fb889..265e4acef0d7a28345d429f4ced4a7de588ea2e2 100644 --- a/paddle/fluid/operators/instance_norm_op.h +++ b/paddle/fluid/operators/instance_norm_op.h @@ -16,9 +16,7 @@ limitations under the License. 
*/ #include #include #include -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/norm_utils.h" namespace paddle { namespace operators { @@ -27,22 +25,9 @@ using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; -template -using EigenArrayMap = - Eigen::Map>; -template -using ConstEigenArrayMap = - Eigen::Map>; -template -using EigenVectorArrayMap = Eigen::Map>; -template -using ConstEigenVectorArrayMap = - Eigen::Map>; - class InstanceNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -52,7 +37,6 @@ class InstanceNormOp : public framework::OperatorWithKernel { class InstanceNormGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -62,7 +46,6 @@ class InstanceNormGradOp : public framework::OperatorWithKernel { class InstanceNormDoubleGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -130,23 +113,5 @@ class InstanceNormOpInferVarType } }; -template -class InstanceNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -template -class InstanceNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -template -class InstanceNormDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 602942abf4d346ab4ebc4fd08ec55ebe18e3558d..6b13a28c7083744b5bddecb84ef3c8c373d9f1d4 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -312,6 +312,63 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, dx->share_meta(dout); } +void InstanceNormGradInferMeta(const MetaTensor& x, + const MetaTensor& y_grad, + paddle::optional scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + float epsilon, + MetaTensor* x_grad, + MetaTensor* scale_grad, + MetaTensor* bias_grad) { + PADDLE_ENFORCE_NE( + x_grad, + nullptr, + phi::errors::InvalidArgument( + "The X@GRAD in InstanceNormGradInferMeta can't be nullptr.")); + const auto x_dims = x.dims(); + const int C = x_dims[1]; + x_grad->set_dims(x_dims); + x_grad->set_dtype(x.dtype()); + x_grad->set_layout(x.layout()); + if (scale_grad) { + scale_grad->set_dims({C}); + } + if (bias_grad) { + bias_grad->set_dims({C}); + } +} +void InstanceNormDoubleGradInferMeta( + const MetaTensor& x, + paddle::optional scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + const MetaTensor& dy, + paddle::optional ddx, + paddle::optional ddscale, + paddle::optional ddbias, + float epsilon, + MetaTensor* dx, + MetaTensor* dscale, + MetaTensor* ddy) { + PADDLE_ENFORCE_NE( + dx, + nullptr, + phi::errors::InvalidArgument( + "The DX in InstanceNormDoubleGradInferMeta 
can't be nullptr.")); + const auto x_dims = x.dims(); + const int C = x_dims[1]; + dx->set_dims(x_dims); + dx->set_dtype(x.dtype()); + dx->set_layout(x.layout()); + if (dscale) { + dscale->set_dims({C}); + } + if (ddy) { + ddy->share_dims(x); + } +} + void KernelWithXShapeInferMeta(const MetaTensor& xshape, MetaTensor* dx) { auto xshape_dims = xshape.dims(); auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index c35b58d0f56e41e073e5d76144ae846ffdb15d88..855b25d7ed4f8eebfd869d024477e1a220580954 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -144,6 +144,30 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, int axis, MetaTensor* dx); +void InstanceNormGradInferMeta(const MetaTensor& x, + const MetaTensor& y_grad, + paddle::optional scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + float epsilon, + MetaTensor* x_grad, + MetaTensor* scale_grad, + MetaTensor* bias_grad); + +void InstanceNormDoubleGradInferMeta( + const MetaTensor& x, + paddle::optional scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + const MetaTensor& dy, + paddle::optional ddx, + paddle::optional ddscale, + paddle::optional ddbias, + float epsilon, + MetaTensor* dx, + MetaTensor* dscale, + MetaTensor* ddy); + void KernelWithXShapeInferMeta(const MetaTensor& xshape, MetaTensor* dx); void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index ae8c7dd61c3bbfca86fd3acea613527474d7901c..e3f946b247f0912f6988092a9d2306ac096e5dec 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -191,6 +191,111 @@ void ArangeInferMeta(const MetaTensor& start, out->set_dtype(start.dtype()); } +void InstanceNormInferMeta(const MetaTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + MetaTensor* y, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaConfig config) { + PADDLE_ENFORCE_NE(y, + nullptr, + phi::errors::InvalidArgument( + "The y in InstanceNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE( + saved_mean, + nullptr, + phi::errors::InvalidArgument( + "The saved_mean in InstanceNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE( + saved_variance, + nullptr, + phi::errors::InvalidArgument( + "The saved_variance in InstanceNormInferMeta can't be nullptr.")); + const auto x_dims = x.dims(); + PADDLE_ENFORCE_NE(phi::product(x_dims), + 0, + phi::errors::PreconditionNotMet( + "The Input variable X has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input X must " + "greater than or equal to 2. 
But received: the shape of input " + "X = [%s], the dimension of input X =[%d]", + x_dims, + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input X must " + "smaller than or equal to 5, But received: the shape of input " + "X = [%s], the dimension of input X = [%d]", + x_dims, + x_dims.size())); + auto N = x_dims[0]; + auto C = x_dims[1]; + auto NxC = N * C; + const auto scale_ptr = scale.get_ptr(); + if (scale_ptr) { + auto scale_dim = scale_ptr->dims(); + PADDLE_ENFORCE_EQ( + scale_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of scale must equal to 1." + "But received: the shape of scale is [%s], the dimension " + "of scale is [%d]", + scale_dim, + scale_dim.size())); + bool check = !((!config.is_runtime) && (phi::product(scale_dim) <= 0)); + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of scale must equal to [%d]" + "But received: the shape of scale is [%d]", + C, + scale_dim[0])); + } + } + const auto bias_ptr = bias.get_ptr(); + if (bias_ptr) { + auto bias_dim = bias_ptr->dims(); + PADDLE_ENFORCE_EQ( + bias_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of bias must equal to 1." + "But received: the shape of bias is [%s],the dimension " + "of bias is [%d]", + bias_dim, + bias_dim.size())); + bool check = !((!config.is_runtime) && (phi::product(bias_dim) <= 0)); + if (check) { + PADDLE_ENFORCE_EQ(bias_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of bias must equal to [%d]" + "But received: the shape of bias is [%d]", + C, + bias_dim[0])); + } + } + y->set_dims(x_dims); + saved_mean->set_dims({NxC}); + saved_variance->set_dims({NxC}); + y->share_lod(x); + y->set_dtype(x.dtype()); + y->set_layout(x.layout()); +} + void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 4f561e0adf19d9443e1404b9858be7a8caa6ae9f..b2fb30a4da2d62fad59e3759e326ed06338599ac 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -52,6 +52,15 @@ void ArangeInferMeta(const MetaTensor& start, const MetaTensor& step, MetaTensor* out); +void InstanceNormInferMeta(const MetaTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + MetaTensor* y, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaConfig config = MetaConfig()); + void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, diff --git a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc index 07b3c5a18fdb5a4fe024f88147b636bc25650773..dcb4289ae8d75c62a25a8d8180d80e5008b17722 100644 --- a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc @@ -23,8 +23,22 @@ #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" + namespace phi { +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; +template +using EigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; + template void InstanceNormGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ 
-136,6 +150,188 @@ void InstanceNormGradKernel(const Context& dev_ctx, .broadcast(bcast)); } +template +void InstanceNormDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& dy, + paddle::optional ddx, + paddle::optional ddscale, + paddle::optional ddbias, + float epsilon, + DenseTensor* dx, + DenseTensor* dscale, + DenseTensor* ddy) { + const auto* Scale = scale.get_ptr(); + const auto* ddScale = ddscale.get_ptr(); + const auto* ddX = ddx.get_ptr(); + const auto* ddBias = ddbias.get_ptr(); + phi::funcs::SetConstant set_constant; + const auto& x_dims = x.dims(); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + const int sample_size = x.numel() / N / C; + const int NxC = N * C; + + const T* mean_data = saved_mean.data(); + const T* inv_var_data = saved_variance.data(); + DenseTensor mean_tensor; + DenseTensor inv_var_tensor; + ConstEigenArrayMap x_arr(x.data(), sample_size, NxC); + ConstEigenVectorArrayMap mean_arr(mean_data, NxC); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, NxC); + + DenseTensor mean_tile; + mean_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&mean_tile); + EigenArrayMap mean_tile_data(mean_tile.data(), sample_size, NxC); + DenseTensor inv_var_tile; + inv_var_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&inv_var_tile); + EigenArrayMap inv_var_tile_data(inv_var_tile.data(), sample_size, NxC); + + mean_tile_data = mean_arr.transpose().replicate(sample_size, 1); + inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 1); + + DenseTensor Scale_data; + if (!Scale) { + Scale_data.Resize({C}); + dev_ctx.template Alloc(&Scale_data); + set_constant(dev_ctx, &Scale_data, static_cast(1)); + } + ConstEigenVectorArrayMap scale_arr( + Scale ? 
Scale->data() : Scale_data.data(), C); + + DenseTensor scale_tile; + scale_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&scale_tile); + EigenArrayMap scale_tile_data(scale_tile.data(), sample_size, NxC); + scale_tile_data = scale_arr.transpose().replicate(sample_size, N); + ConstEigenArrayMap dy_arr(dy.data(), sample_size, NxC); + ConstEigenArrayMap ddx_arr(ddX->data(), sample_size, NxC); + // math: dx = scale * ((x - mean) * inv_var / HxW * (np.mean(ddx, + // axis=(h,w)) * np.sum(dy, axis=(h,w)) - + // np.sum(dy * ddx, axis=(h,w)) + 3 * np.mean(dy * (x - mean), + // axis=(h,w)) * inv_var.pow(2) * + // np.sum(ddx * (x - mean), axis=(h,w))) + inv_var.pow(3) / HxW * + // np.sum(ddx * (x - mean)) * + // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * + // np.sum(dy, axis=(h,w)) * (x - mean) * + // (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var - + // inv_var * np.mean(dy, axis=(h,w)) - inv_var.pow(3) * + // (x - mean) * np.mean(dy * (x - mean), axis=(h,w))) + + DenseTensor x_sub_mean_mul_invstd; + x_sub_mean_mul_invstd.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&x_sub_mean_mul_invstd); + EigenArrayMap x_sub_mean_mul_invstd_arr( + x_sub_mean_mul_invstd.data(), sample_size, NxC); + x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; + + if (dx) { + dev_ctx.template Alloc(dx); + set_constant(dev_ctx, dx, static_cast(0)); + EigenArrayMap dx_arr(dx->data(), sample_size, NxC); + if (ddX) { + dx_arr += + x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / + sample_size * + (ddx_arr.colwise().sum() * dy_arr.colwise().sum() / sample_size - + (dy_arr * ddx_arr).colwise().sum() + + 3. * (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() * + (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + dx_arr += (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size * inv_var_tile_data * inv_var_tile_data * + (dy_arr.colwise().sum() / sample_size - dy_arr); + dx_arr += (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size * inv_var_tile_data * inv_var_tile_data * + (ddx_arr.colwise().sum() / sample_size - ddx_arr); + dx_arr = scale_tile_data * dx_arr; + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + DenseTensor ddscale_tile; + ddscale_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&ddscale_tile); + EigenArrayMap ddscale_tile_data( + ddscale_tile.data(), sample_size, NxC); + ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); + dx_arr += (dy_arr * inv_var_tile_data - + dy_arr.colwise().sum() / sample_size * inv_var_tile_data - + x_sub_mean_mul_invstd_arr * inv_var_tile_data * + (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size) * + ddscale_tile_data; + } + } + if (dscale) { + // math: dscale = inv_var * (dy - np.mean(dy, axis=(h,w) - (x-mean) * + // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(h,w)))) * ddx + dev_ctx.template Alloc(dscale); + set_constant(dev_ctx, dscale, static_cast(0)); + EigenVectorArrayMap dscale_arr(dscale->data(), C); + if (ddX) { + DenseTensor first_grad; + first_grad.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&first_grad); + set_constant(dev_ctx, &first_grad, static_cast(0)); + EigenArrayMap first_grad_arr(first_grad.data(), sample_size, NxC); + first_grad_arr += + inv_var_tile_data * + (dy_arr - + dy_arr.colwise().sum().replicate(sample_size, 1) / sample_size - + x_sub_mean_mul_invstd_arr * + (dy_arr * x_sub_mean_mul_invstd_arr) + .colwise() + .sum() + 
.replicate(sample_size, 1) / + sample_size); + first_grad_arr = first_grad_arr * ddx_arr; + for (int nc = 0; nc < NxC; ++nc) { + int c = nc % C; + dscale_arr(c) += first_grad_arr.colwise().sum()(nc); + } + } + } + if (ddy) { + // math: ddy = (x - mean) * inv_var * ddscale + ddbias + + // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * + // np.mean(ddx * (x - mean), axis=(h,w))) + dev_ctx.template Alloc(ddy); + set_constant(dev_ctx, ddy, static_cast(0)); + EigenArrayMap ddy_arr(ddy->data(), sample_size, NxC); + if (ddX) { + ddy_arr += scale_tile_data * inv_var_tile_data * + (ddx_arr - ddx_arr.colwise().sum() / sample_size - + x_sub_mean_mul_invstd_arr * + (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + } + if (ddScale && ddBias) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + DenseTensor ddscale_tile; + ddscale_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&ddscale_tile); + EigenArrayMap ddscale_tile_data( + ddscale_tile.data(), sample_size, NxC); + ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); + + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + DenseTensor ddbias_tile; + ddbias_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&ddbias_tile); + EigenArrayMap ddbias_tile_data( + ddbias_tile.data(), sample_size, NxC); + ddbias_tile_data = ddbias_arr.transpose().replicate(sample_size, N); + + ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; + ddy_arr += ddbias_tile_data; + } + } +} } // namespace phi PD_REGISTER_KERNEL(instance_norm_grad, @@ -144,3 +340,9 @@ PD_REGISTER_KERNEL(instance_norm_grad, phi::InstanceNormGradKernel, float, double) {} +PD_REGISTER_KERNEL(instance_norm_double_grad, + CPU, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/funcs/norm_utils.h b/paddle/phi/kernels/funcs/norm_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..2d0a879e41c783a801021db38711d997f62011b4 --- /dev/null +++ b/paddle/phi/kernels/funcs/norm_utils.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/ddim.h" + +namespace phi { +namespace funcs { +inline void ExtractNCWHD(const phi::DDim &dims, + const DataLayout &data_layout, + int *N, + int *C, + int *H, + int *W, + int *D) { + *N = dims[0]; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? 
dims[4] : dims[3]) + : 1; + } +} +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index ad3b8579ddf674b7194ef84ca045afb997665bcf..e808ef644a24635c7b43124db0d420a9bac132e5 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -20,7 +20,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/operators/norm_utils.cu.h" -#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/layout_utils.h" @@ -351,7 +351,7 @@ void BatchNormGradRawKernel(const Context &ctx, x_dims.size(), x_dims)); int N, C, H, W, D; - paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); // init output if (d_x) { diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 361e62e566035e2ea783f0abbcda9ecf9a77039c..e2aeec723628c3c12952a952b3bd02cefd64e811 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/fluid/operators/norm_utils.cu.h" -#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/layout_utils.h" @@ -179,7 +179,7 @@ void BatchNormKernel(const Context &ctx, ctx.template Alloc(y); int N, C, H, W, D; - paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); auto dtype = paddle::platform::CudnnDataType::type; diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu index 15c9c30626593f964b14df6fd2763c5af223c31b..387127de48deae33c3d2d3aceaab3ed35bf99558 100644 --- a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -14,16 +14,15 @@ #include "paddle/phi/kernels/instance_norm_grad_kernel.h" -#include "paddle/fluid/operators/norm_utils.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/phi/kernels/gpu/instance_norm_utils.h" namespace phi { - template static __global__ void GradComputeDX(const T *dy, const BatchNormParamType *scale, @@ -37,16 +36,13 @@ static __global__ void GradComputeDX(const T *dy, int end_idx = (blockIdx.x + 1) * sample_size; int ncid = blockIdx.x; int c = ncid % C; - BatchNormParamType mean_val = mean[ncid]; BatchNormParamType inv_var_val = variance[ncid]; - typedef cub::BlockReduce, BlockDim> BlockReduce; __shared__ typename BlockReduce::TempStorage dy_storage; __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; __shared__ BatchNormParamType dy_sum_val; __shared__ BatchNormParamType dy_x_sub_mean_sum_val; - BatchNormParamType dy_sum = static_cast>(0); BatchNormParamType dy_x_sub_mean_sum = static_cast>(0); @@ -60,13 +56,11 @@ static __global__ void GradComputeDX(const T *dy, dy_sum = 
BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); - if (threadIdx.x == 0) { dy_sum_val = dy_sum; dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; } __syncthreads(); - for (int i = beg_idx; i < end_idx; i += BlockDim) { dx[i] = (static_cast>(dy[i]) - @@ -77,6 +71,222 @@ static __global__ void GradComputeDX(const T *dy, } } +static __device__ __forceinline__ float real_sqrt(float x) { + return 1. / sqrtf(x); +} +static __device__ __forceinline__ double real_sqrt(double x) { + return 1. / sqrt(x); +} + +template +__global__ void DoubleGradComputeDX(const T *x, + const T *mean, + const T *variance, + const T *ddx, + const T *dy, + const T *scale, + const T *ddscale, + int C, + int sample_size, + const double epsilon, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T dy_sum_val; + __shared__ T ddx_sum_val; + __shared__ T dy_mul_ddx_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + T dy_sum = 0; + T ddx_sum = 0; + T dy_mul_ddx_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T ddx_i = ddx[i]; + T dy_i = dy[i]; + T tmp = x[i] - mean_val; + + dy_sum += dy_i; + ddx_sum += ddx_i; + dy_mul_ddx_sum += (ddx_i * dy_i); + + dy_mul_x_sub_mean_sum += (dy_i * tmp); + ddx_mul_x_sub_mean_sum += (ddx_i * tmp); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + dy_mul_ddx_sum = + BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + ddx_sum_val = ddx_sum; + dy_mul_ddx_sum_val = dy_mul_ddx_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] += + ((x[i] - mean_val) * var_val * var_val * var_val / sample_size * + (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + + 3. 
* dy_mul_x_sub_mean_sum_val * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + + ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (dy_sum_val / sample_size - dy[i]) + + dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (ddx_sum_val / sample_size - ddx[i])) * + scale[c]; + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val - + (x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val * + var_val / sample_size) * + ddscale[c]; + } + } +} + +template +__global__ void DoubleGradComputeDDY(const T *x, + const T *mean, + const T *variance, + const T *ddscale, + const T *ddbias, + const T *ddx, + const T *scale, + int C, + int sample_size, + const double epsilon, + T *ddy) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T ddx_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + T ddx_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T ddx_i = ddx[i]; + ddx_sum += ddx_i; + ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val)); + } + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + if (threadIdx.x == 0) { + ddx_sum_val = ddx_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += scale[c] * var_val * + (ddx[i] - ddx_sum_val / sample_size - + (x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val * + var_val / sample_size); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += (x[i] - mean_val) * var_val * ddscale[c]; + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += ddbias[c]; + } + } +} + +template +__global__ void DoubleGradComputeDScale(const T *x, + const T *mean, + const T *variance, + const T *ddx, + const T *dy, + int C, + int sample_size, + const double epsilon, + T *dscale) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; + __shared__ T dy_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + + T dy_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T dy_i = dy[i]; + dy_sum += dy_i; + dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val)); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_mul_x_sub_mean_sum_val = 
+template <typename T, int BlockDim>
+__global__ void DoubleGradComputeDScale(const T *x,
+                                        const T *mean,
+                                        const T *variance,
+                                        const T *ddx,
+                                        const T *dy,
+                                        int C,
+                                        int sample_size,
+                                        const double epsilon,
+                                        T *dscale) {
+  int beg_idx = blockIdx.x * sample_size + threadIdx.x;
+  int end_idx = (blockIdx.x + 1) * sample_size;
+  int ncid = blockIdx.x;
+  int c = ncid % C;
+  T mean_val = mean[ncid];
+  T var_val = variance[ncid];
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage dy_storage;
+  __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage;
+  __shared__ typename BlockReduce::TempStorage dscale_tmp_storage;
+  __shared__ T dy_sum_val;
+  __shared__ T dy_mul_x_sub_mean_sum_val;
+
+  T dy_sum = 0;
+  T dy_mul_x_sub_mean_sum = 0;
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    T dy_i = dy[i];
+    dy_sum += dy_i;
+    dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val));
+  }
+  dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
+  dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage)
+                              .Reduce(dy_mul_x_sub_mean_sum, cub::Sum());
+
+  if (threadIdx.x == 0) {
+    dy_sum_val = dy_sum;
+    dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum;
+  }
+  __syncthreads();
+  if (ddx != nullptr) {
+    T dscale_tmp = 0;
+    for (int i = beg_idx; i < end_idx; i += BlockDim) {
+      dscale_tmp +=
+          ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size -
+                              dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) *
+                                  var_val * var_val / sample_size);
+    }
+    dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum());
+    if (threadIdx.x == 0) {
+      dscale[ncid] += dscale_tmp;
+    }
+    __syncthreads();
+  }
+}
+
 template <typename T, typename Context>
 void InstanceNormGradKernel(const Context &dev_ctx,
                             const DenseTensor &x,
@@ -94,8 +304,7 @@ void InstanceNormGradKernel(const Context &dev_ctx,
   const auto &x_dims = x.dims();
 
   int N, C, H, W, D;
-  paddle::operators::ExtractNCWHD(
-      x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
+  funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
   int NxC = N * C;
 
   DenseTensor x_tmp, d_y_tmp;
@@ -303,12 +512,120 @@ void InstanceNormGradKernel(const Context &dev_ctx,
         paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
 #endif
 }
+
+template <typename T, typename Context>
+void InstanceNormDoubleGradKernel(const Context &dev_ctx,
+                                  const DenseTensor &x,
+                                  paddle::optional<const DenseTensor &> scale,
+                                  const DenseTensor &saved_mean,
+                                  const DenseTensor &saved_variance,
+                                  const DenseTensor &dy,
+                                  paddle::optional<const DenseTensor &> ddx,
+                                  paddle::optional<const DenseTensor &> ddscale,
+                                  paddle::optional<const DenseTensor &> ddbias,
+                                  float epsilon_f,
+                                  DenseTensor *dx,
+                                  DenseTensor *dscale,
+                                  DenseTensor *ddy) {
+  const auto *Scale = scale.get_ptr();
+  const auto *ddX = ddx.get_ptr();
+  const auto *ddScale = ddscale.get_ptr();
+  const auto *ddBias = ddbias.get_ptr();
+  const double epsilon = static_cast<double>(epsilon_f);
+  const T *x_data = x.data<T>();
+  const T *dy_data = dy.data<T>();
+  const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data<T>());
+  const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>());
+  const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data<T>());
+  const T *mean_data = saved_mean.data<T>();
+  const T *variance_data = saved_variance.data<T>();
+  phi::funcs::SetConstant<GPUContext, T> set_zero;
+  auto &x_dims = x.dims();
+  int N, C, H, W, D;
+  funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
+  int NxC = N * C;
+  const int n = x.numel();
+  int sample_size = n / N / C;
+
+  DenseTensor scale_tmp;
+  if (!Scale) {
+    scale_tmp.Resize({C});
+    dev_ctx.template Alloc<T>(&scale_tmp);
+    set_zero(dev_ctx, &scale_tmp, static_cast<T>(1));
+  }
+  const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>();
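+  // One CUDA block (of `block` threads) handles one (n, c) instance; each of
+  // dx, dscale and ddy below is computed only when that output is requested.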
+  const int block = 512;
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(max_threads / block, 1);
+  const int grid = NxC;
+  const int grid1 = (C + block - 1) / block;
+
+  if (dx) {
+    T *dx_data = dev_ctx.template Alloc<T>(dx);
+    set_zero(dev_ctx, dx, static_cast<T>(0));
+    DoubleGradComputeDX<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
+        x_data,
+        mean_data,
+        variance_data,
+        ddx_data,
+        dy_data,
+        scale_data,
+        ddscale_data,
+        C,
+        sample_size,
+        epsilon,
+        dx_data);
+  }
+  if (dscale) {
+    DenseTensor dscale_tmp;
+    dscale_tmp.Resize({NxC});
+    dev_ctx.template Alloc<T>(&dscale_tmp);
+    set_zero(dev_ctx, &dscale_tmp, static_cast<T>(0));
+    T *dscale_tmp_data = dscale_tmp.data<T>();
+
+    T *dscale_data = dev_ctx.template Alloc<T>(dscale);
+    set_zero(dev_ctx, dscale, static_cast<T>(0));
+    DoubleGradComputeDScale<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
+        x_data,
+        mean_data,
+        variance_data,
+        ddx_data,
+        dy_data,
+        C,
+        sample_size,
+        epsilon,
+        dscale_tmp_data);
+    add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
+        dscale_tmp.data<T>(), dscale->data<T>(), N, C);
+  }
+  if (ddy) {
+    T *ddy_data = dev_ctx.template Alloc<T>(ddy);
+    set_zero(dev_ctx, ddy, static_cast<T>(0));
+    DoubleGradComputeDDY<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
+        x_data,
+        mean_data,
+        variance_data,
+        ddscale_data,
+        ddbias_data,
+        ddx_data,
+        scale_data,
+        C,
+        sample_size,
+        epsilon,
+        ddy_data);
+  }
+}
 }  // namespace phi
 
 #ifdef PADDLE_WITH_HIP
 // MIOPEN do not support double
 PD_REGISTER_KERNEL(
     instance_norm_grad, GPU, ALL_LAYOUT, phi::InstanceNormGradKernel, float) {}
+PD_REGISTER_KERNEL(instance_norm_double_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::InstanceNormDoubleGradKernel,
+                   float) {}
 #else
 PD_REGISTER_KERNEL(instance_norm_grad,
                    GPU,
@@ -316,4 +633,10 @@ PD_REGISTER_KERNEL(instance_norm_grad,
                    phi::InstanceNormGradKernel,
                    float,
                    double) {}
+PD_REGISTER_KERNEL(instance_norm_double_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::InstanceNormDoubleGradKernel,
+                   float,
+                   double) {}
 #endif
diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu
index cf8f0fb78788ce118e25a7f4f8f0765bf1e50537..81d94007501904b3235ef39e9233384550a74b6d 100644
--- a/paddle/phi/kernels/gpu/instance_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu
@@ -14,11 +14,11 @@
 
 #include "paddle/phi/kernels/instance_norm_kernel.h"
 
-#include "paddle/fluid/operators/norm_utils.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/norm_utils.h"
 #include "paddle/phi/kernels/gpu/instance_norm_utils.h"
 
 namespace phi {
@@ -51,8 +51,7 @@ void InstanceNormKernel(const Context &dev_ctx,
                           "the size of X's dimensions is [%d]",
                           x_dims.size()));
   int N, C, H, W, D;
-  paddle::operators::ExtractNCWHD(
-      x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
+  funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
   int NxC = N * C;
   DenseTensor x_tmp;
   x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D});
diff --git a/paddle/phi/kernels/instance_norm_grad_kernel.h b/paddle/phi/kernels/instance_norm_grad_kernel.h
index 041302a7cfb67bdb756de9cb0e639062fd6f2a7b..7924c767ab61e6e5ec6a0fc8b914c815c6a17b0e 100644
--- a/paddle/phi/kernels/instance_norm_grad_kernel.h
+++ b/paddle/phi/kernels/instance_norm_grad_kernel.h
@@ -30,4 +30,19 @@ void InstanceNormGradKernel(const Context& dev_ctx,
                             DenseTensor* scale_grad,
                             DenseTensor* bias_grad);
 
+template <typename T, typename Context>
+void InstanceNormDoubleGradKernel(const Context& dev_ctx,
+                                  const DenseTensor& x,
+                                  paddle::optional<const DenseTensor&> scale,
+                                  const DenseTensor& saved_mean,
+                                  const DenseTensor& saved_variance,
+                                  const DenseTensor& dy,
+                                  paddle::optional<const DenseTensor&> ddx,
+                                  paddle::optional<const DenseTensor&> ddscale,
+                                  paddle::optional<const DenseTensor&> ddbias,
+                                  float epsilon,
+                                  DenseTensor* dx,
+                                  DenseTensor* dscale,
+                                  DenseTensor* ddy);
+
 }  // namespace phi
diff --git a/paddle/phi/ops/compat/instance_norm_sig.cc b/paddle/phi/ops/compat/instance_norm_sig.cc
index b65e84588db130bb8e909518d9591d3847129cbc..2b490078512b1ef6c3574b08f15e55594c578435 100644
--- a/paddle/phi/ops/compat/instance_norm_sig.cc
+++ b/paddle/phi/ops/compat/instance_norm_sig.cc
@@ -31,8 +31,26 @@ KernelSignature InstanceNormGradOpArgumentMapping(
                          {"epsilon"},
                          {"X@GRAD", "Scale@GRAD", "Bias@GRAD"});
 }
+KernelSignature InstanceNormDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("instance_norm_double_grad",
+                         {"X",
+                          "Scale",
+                          "SavedMean",
+                          "SavedVariance",
+                          "DY",
+                          "DDX",
+                          "DDScale",
+                          "DDBias"},
+                         {"epsilon"},
+                         {"DX", "DScale", "DDY"});
+}
 }  // namespace phi
 
+PD_REGISTER_BASE_KERNEL_NAME(instance_norm_grad_grad,
+                             instance_norm_double_grad);
 PD_REGISTER_ARG_MAPPING_FN(instance_norm, phi::InstanceNormOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad,
                            phi::InstanceNormGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad_grad,
+                           phi::InstanceNormDoubleGradOpArgumentMapping);