diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index a7d96437e95c4c1880edddb64a409d5f1a45a359..2cbd48cf093e263c3d152f72606de65bde613688 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -170,104 +170,6 @@ NCHW `[batch, in_channels, in_height, in_width]` )DOC"); } -template -class InstanceNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - T epsilon = static_cast(ctx.Attr("epsilon")); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - - const int N = x_dims[0]; - const int C = x_dims[1]; - const int NxC = N * C; - - const int sample_size = x->numel() / N / C; - - auto *y = ctx.Output("Y"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - auto &dev_ctx = ctx.template device_context(); - auto *place = dev_ctx.eigen_device(); - - Eigen::DSizes shape(NxC, sample_size); -// Once eigen on Windows is updated, the if branch can be removed. -#ifndef EIGEN_HAS_INDEX_LIST - Eigen::DSizes bcast(1, sample_size); - Eigen::DSizes C_shape(C, 1); - Eigen::DSizes NxC_shape(NxC, 1); - Eigen::DSizes rdims(1); -#else - Eigen::IndexList, int> bcast; - bcast.set(1, sample_size); - Eigen::IndexList> C_shape; - C_shape.set(0, C); - Eigen::IndexList> NxC_shape; - NxC_shape.set(0, NxC); - Eigen::IndexList> rdims; -#endif - - phi::funcs::SetConstant set_constant; - - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, saved_mean, static_cast(0)); - set_constant(dev_ctx, saved_variance, static_cast(0)); - - auto saved_mean_a = framework::EigenVector::Flatten(*saved_mean); - auto saved_mean_e = saved_mean_a.reshape(NxC_shape); - auto saved_variance_a = framework::EigenVector::Flatten(*saved_variance); - auto saved_variance_e = saved_variance_a.reshape(NxC_shape); - - auto x_e = framework::EigenVector::Flatten(*x); - auto x_arr = x_e.reshape(shape); - - saved_mean_e.device(*place) = x_arr.mean(rdims); - auto saved_variance_arr = - (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon; - - saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse(); - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - Tensor scale_data; - Tensor bias_data; - if (!scale) { - scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_data, static_cast(1)); - } - - if (!bias) { - bias_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &bias_data, static_cast(0)); - } - auto scale_e = scale - ? framework::EigenVector::Flatten(*scale) - : framework::EigenVector::Flatten( - const_cast(scale_data)); - auto scale_arr = scale_e.reshape(C_shape); - auto bias_e = bias ? 
framework::EigenVector::Flatten(*bias) - : framework::EigenVector::Flatten( - const_cast(bias_data)); - auto bias_arr = bias_e.reshape(C_shape); - - y->mutable_data(ctx.GetPlace()); - auto y_e = framework::EigenVector::Flatten(*y); - auto y_arr = y_e.reshape(shape); - - // (x - mean) * inv_std * scale + bias - Eigen::DSizes bcast_param(N, sample_size); - y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) * - saved_variance_e.broadcast(bcast) * - scale_arr.broadcast(bcast_param) + - bias_arr.broadcast(bcast_param); - } -}; - void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormGrad"); OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", @@ -312,120 +214,6 @@ framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class InstanceNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - - const auto &x_dims = x->dims(); - - const int N = x_dims[0]; - const int C = x_dims[1]; - const int NxC = N * C; - const int sample_size = x->numel() / N / C; - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - d_x->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - auto *place = dev_ctx.eigen_device(); - - Eigen::DSizes rshape(NxC, sample_size); - Eigen::DSizes param_shape(N, C); - Eigen::DSizes shape(NxC, sample_size); -#ifndef EIGEN_HAS_INDEX_LIST - Eigen::DSizes rdims(0); - Eigen::DSizes mean_rdims(1); - Eigen::DSizes bcast(1, sample_size); - Eigen::DSizes C_shape(C, 1); - Eigen::DSizes NxC_shape(NxC, 1); -#else - Eigen::IndexList> rdims; - Eigen::IndexList> mean_rdims; - Eigen::IndexList, int> bcast; - bcast.set(1, sample_size); - Eigen::IndexList> C_shape; - C_shape.set(0, C); - Eigen::IndexList> NxC_shape; - NxC_shape.set(0, NxC); -#endif - - phi::funcs::SetConstant set_constant; - - Tensor scale_data; - if (!scale) { - scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_data, static_cast(1)); - } - - auto scale_e = scale - ? 
framework::EigenVector::Flatten(*scale) - : framework::EigenVector::Flatten( - const_cast(scale_data)); - auto mean_e = framework::EigenVector::Flatten(*saved_mean); - auto inv_var_e = framework::EigenVector::Flatten(*saved_inv_variance); - auto dy_e = framework::EigenVector::Flatten(*d_y); - auto x_e = framework::EigenVector::Flatten(*x); - - auto scale_arr = scale_e.reshape(C_shape); - auto mean_arr = mean_e.reshape(NxC_shape); - auto inv_var_arr = inv_var_e.reshape(NxC_shape); - auto dy_arr = dy_e.reshape(shape); - auto x_arr = x_e.reshape(shape); - - auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) * - inv_var_arr.eval().broadcast(bcast); - - // math: d_bias = np.sum(d_y, axis=(n,h,w)) - // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w)) - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, d_scale, static_cast(0)); - set_constant(dev_ctx, d_bias, static_cast(0)); - - auto d_scale_e = framework::EigenVector::Flatten(*d_scale); - auto d_scale_data = d_scale_e.reshape(C_shape); - auto d_bias_e = framework::EigenVector::Flatten(*d_bias); - auto d_bias_data = d_bias_e.reshape(C_shape); - d_bias_data.device(*place) = - dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims); - d_scale_data.device(*place) = - (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims); - } - - auto dy_mean = - dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast); - - Eigen::DSizes bcast_param(N, sample_size); - set_constant(dev_ctx, d_x, static_cast(0)); - // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y, - // axis=(h,w)) - // - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X - - // mean), - // axis=(h,w)) - auto dx_e = framework::EigenVector::Flatten(*d_x); - auto dx_arr = dx_e.reshape(shape); - dx_arr.device(*place) = scale_arr.broadcast(bcast_param) * - inv_var_arr.broadcast(bcast) * - (dy_arr - dy_mean - - tmp * - (dy_arr * tmp) - .mean(mean_rdims) - .reshape(NxC_shape) - .eval() - .broadcast(bcast)); - } -}; - void InstanceNormDoubleGradOp::InferShape( framework::InferShapeContext *ctx) const { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormDoubleGrad"); @@ -699,14 +487,6 @@ REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp, REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp, ops::InstanceNormDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - instance_norm, - ops::InstanceNormKernel, - ops::InstanceNormKernel); -REGISTER_OP_CPU_KERNEL( - instance_norm_grad, - ops::InstanceNormGradKernel, - ops::InstanceNormGradKernel); REGISTER_OP_CPU_KERNEL( instance_norm_grad_grad, ops::InstanceNormDoubleGradKernel -class InstanceNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("It must be CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - - auto *x = ctx.Input("X"); - auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), 2, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the size of X's dimensions must greater than " - "or equal to 2. 
But received: " - "the size of X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE(x_dims.size(), 5, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the size of X's dimensions must smaller than" - "or equal to 5. But received: " - "the size of X's dimensions is [%d]", - x_dims.size())); - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - int NxC = N * C; - Tensor x_tmp; - x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D}); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); -#endif - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - - VLOG(3) << "Setting descriptors."; - std::vector dims; - std::vector strides; - dims = {1, NxC, H, W, D}; - strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; - - auto &dev_ctx = ctx.template device_context(); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, miopenBNSpatial)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); -#endif - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - Tensor scale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - scale_tmp.mutable_data(ctx.GetPlace()); - Tensor bias_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - bias_tmp.mutable_data(ctx.GetPlace()); - - const int n = x->numel(); - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min((NxC + block - 1) / block, max_blocks); - - phi::funcs::SetConstant set_constant; - if (scale) { - repeat_param<<>>( - scale->data(), scale_tmp.data(), N, C); - } else { - set_constant(dev_ctx, &scale_tmp, static_cast(1)); - } - if (bias) { - repeat_param<<>>( - bias->data(), bias_tmp.data(), N, C); - } else { - set_constant(dev_ctx, &bias_tmp, static_cast(0)); - } - - auto handle = dev_ctx.cudnn_handle(); - - phi::funcs::SetConstant> - functor; - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); - functor(dev_ctx, saved_mean, static_cast>(0)); - functor(dev_ctx, saved_variance, static_cast>(0)); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardTraining( - handle, miopenBNSpatial, - const_cast( - static_cast(CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, static_cast(x_tmp.template data()), - data_desc_, - static_cast(y->template mutable_data(ctx.GetPlace())), - in_param_desc_, - const_cast(static_cast( - scale_tmp.template data>())), - const_cast(static_cast( - bias_tmp.template data>())), - 0, nullptr, nullptr, epsilon, - static_cast( - saved_mean->template mutable_data>( - ctx.GetPlace())), - static_cast( - saved_variance->template mutable_data>( - ctx.GetPlace())))); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x_tmp.template data(), - data_desc_, y->template mutable_data(ctx.GetPlace()), - in_param_desc_, scale_tmp.template data>(), - bias_tmp.template data>(), 0, nullptr, - nullptr, epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); -#endif - } -}; - template static __global__ void GradComputeDX(const T *dy, const BatchNormParamType *scale, @@ -297,203 +122,6 @@ static __global__ void GradComputeDX(const T *dy, } } -template -class InstanceNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - const 
auto *scale = ctx.Input("Scale"); - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - int NxC = N * C; - - Tensor x_tmp, d_y_tmp; - x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D}); - d_y_tmp.ShareDataWith(*d_y).Resize({1, NxC, H, W, D}); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - d_x->mutable_data(ctx.GetPlace()); - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - } - if (scale) { - PADDLE_ENFORCE_EQ( - scale->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the size of scale's dimensions must be equal to 1. But " - "received: the size of scale's dimensions" - "is [%d]", - scale->dims().size())); - PADDLE_ENFORCE_EQ(scale->dims()[0], C, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the first dimension of scale must be equal to " - "Channels([%d]). But received: " - "the first dimension of scale is [%d]," - "the dimensions of scale is [%s], ", - C, scale->dims()[0], scale->dims())); - } - - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; - - const int n = x->numel(); - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(NxC, max_blocks); - const int grid1 = (C + block - 1) / block; - - Tensor scale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - scale_tmp.mutable_data(ctx.GetPlace()); - Tensor d_scale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - Tensor d_bias_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - if (scale) { - repeat_param<<>>( - scale->data(), scale_tmp.data(), N, C); - } else { - set_constant(dev_ctx, &scale_tmp, static_cast(1)); - } - - std::vector dims; - std::vector strides; - dims = {1, NxC, H, W, D}; - strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; - - if ((H * W * D) == 1) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - phi::funcs::SetConstant> - functor; - functor(dev_ctx, d_scale, static_cast>(0)); - functor(dev_ctx, d_bias, static_cast>(0)); - return; - } - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); -#endif - - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? 
x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, miopenBNSpatial)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); -#endif - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const auto *saved_mean_data = - saved_mean->template data>(); - const auto *saved_var_data = - saved_var->template data>(); - if (d_scale && d_bias) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenBatchNormalizationBackward( - dev_ctx.cudnn_handle(), miopenBNSpatial, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x_tmp.template data(), - data_desc_, d_y_tmp.template data(), data_desc_, - d_x->template mutable_data(ctx.GetPlace()), in_param_desc_, - scale_tmp.template data>(), - d_scale_tmp.template mutable_data>( - ctx.GetPlace()), - d_bias_tmp.template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL, - CudnnDataType::kOne(), CudnnDataType::kZero(), - CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, - x_tmp.template data(), data_desc_, d_y_tmp.template data(), - data_desc_, d_x->template mutable_data(ctx.GetPlace()), - in_param_desc_, scale_tmp.template data>(), - d_scale_tmp.template mutable_data>( - ctx.GetPlace()), - d_bias_tmp.template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#endif - } else { - if (d_x) { - GradComputeDX<<>>( - d_y->data(), scale_tmp.data>(), - saved_mean_data, x->data(), saved_var_data, C, H * W * D, - d_x->data()); - } - } - - if (d_scale && d_bias) { - add_param<<>>( - d_scale_tmp.data(), d_scale->data(), N, C); - add_param<<>>( - d_bias_tmp.data(), d_bias->data(), N, C); - } - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); -#endif - } -}; - static __device__ __forceinline__ float real_sqrt(float x) { return 1. 
/ sqrtf(x); } @@ -793,22 +421,10 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; #ifdef PADDLE_WITH_HIP // MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - instance_norm, ops::InstanceNormKernel); -REGISTER_OP_CUDA_KERNEL( - instance_norm_grad, - ops::InstanceNormGradKernel); REGISTER_OP_CUDA_KERNEL(instance_norm_grad_grad, ops::InstanceNormDoubleGradKernel< paddle::platform::CUDADeviceContext, float>); #else -REGISTER_OP_CUDA_KERNEL( - instance_norm, ops::InstanceNormKernel, - ops::InstanceNormKernel); -REGISTER_OP_CUDA_KERNEL( - instance_norm_grad, - ops::InstanceNormGradKernel, - ops::InstanceNormGradKernel); REGISTER_OP_CUDA_KERNEL( instance_norm_grad_grad, ops::InstanceNormDoubleGradKernel +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" +#include "paddle/phi/kernels/funcs/math_function.h" +namespace phi { + +template +void InstanceNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& d_y, + paddle::optional scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + float epsilon, + DenseTensor* d_x, + DenseTensor* d_scale, + DenseTensor* d_bias) { + const auto* scale_ptr = scale.get_ptr(); + + const auto& x_dims = x.dims(); + + const int N = x_dims[0]; + const int C = x_dims[1]; + const int NxC = N * C; + const int sample_size = x.numel() / N / C; + + dev_ctx.template Alloc(d_x); + auto* place = dev_ctx.eigen_device(); + + Eigen::DSizes rshape(NxC, sample_size); + Eigen::DSizes param_shape(N, C); + Eigen::DSizes shape(NxC, sample_size); +#ifndef EIGEN_HAS_INDEX_LIST + Eigen::DSizes rdims(0); + Eigen::DSizes mean_rdims(1); + Eigen::DSizes bcast(1, sample_size); + Eigen::DSizes C_shape(C, 1); + Eigen::DSizes NxC_shape(NxC, 1); +#else + Eigen::IndexList> rdims; + Eigen::IndexList> mean_rdims; + Eigen::IndexList, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList> C_shape; + C_shape.set(0, C); + Eigen::IndexList> NxC_shape; + NxC_shape.set(0, NxC); +#endif + + phi::funcs::SetConstant set_constant; + + DenseTensor scale_data; + if (!scale_ptr) { + scale_data.Resize({C}); + dev_ctx.template Alloc(&scale_data); + set_constant(dev_ctx, &scale_data, static_cast(1)); + } + + auto scale_e = + scale_ptr + ? 
EigenVector::Flatten(*scale_ptr) + : EigenVector::Flatten(const_cast(scale_data)); + auto mean_e = EigenVector::Flatten(saved_mean); + auto inv_var_e = EigenVector::Flatten(saved_variance); + auto dy_e = EigenVector::Flatten(d_y); + auto x_e = EigenVector::Flatten(x); + + auto scale_arr = scale_e.reshape(C_shape); + auto mean_arr = mean_e.reshape(NxC_shape); + auto inv_var_arr = inv_var_e.reshape(NxC_shape); + auto dy_arr = dy_e.reshape(shape); + auto x_arr = x_e.reshape(shape); + + auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) * + inv_var_arr.eval().broadcast(bcast); + + // math: d_bias = np.sum(d_y, axis=(n,h,w)) + // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w)) + if (d_scale && d_bias) { + dev_ctx.template Alloc(d_scale); + dev_ctx.template Alloc(d_bias); + set_constant(dev_ctx, d_scale, static_cast(0)); + set_constant(dev_ctx, d_bias, static_cast(0)); + + auto d_scale_e = EigenVector::Flatten(*d_scale); + auto d_scale_data = d_scale_e.reshape(C_shape); + auto d_bias_e = EigenVector::Flatten(*d_bias); + auto d_bias_data = d_bias_e.reshape(C_shape); + d_bias_data.device(*place) = + dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims); + d_scale_data.device(*place) = + (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims); + } + + auto dy_mean = + dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast); + + Eigen::DSizes bcast_param(N, sample_size); + set_constant(dev_ctx, d_x, static_cast(0)); + // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y, + // axis=(h,w)) + // - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X - + // mean), + // axis=(h,w)) + auto dx_e = EigenVector::Flatten(*d_x); + auto dx_arr = dx_e.reshape(shape); + dx_arr.device(*place) = scale_arr.broadcast(bcast_param) * + inv_var_arr.broadcast(bcast) * + (dy_arr - dy_mean - + tmp * + (dy_arr * tmp) + .mean(mean_rdims) + .reshape(NxC_shape) + .eval() + .broadcast(bcast)); +} + +} // namespace phi + +PD_REGISTER_KERNEL(instance_norm_grad, + CPU, + ALL_LAYOUT, + phi::InstanceNormGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/instance_norm_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f89ecba901c0436175342719c9142144177c032c --- /dev/null +++ b/paddle/phi/kernels/cpu/instance_norm_kernel.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/instance_norm_kernel.h" + +#include +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void InstanceNormKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon_f, + DenseTensor* y, + DenseTensor* saved_mean, + DenseTensor* saved_variance) { + const auto& x_dims = x.dims(); + T epsilon = static_cast(epsilon_f); + const int N = x_dims[0]; + const int C = x_dims[1]; + const int NxC = N * C; + const int sample_size = x.numel() / N / C; + auto* place = dev_ctx.eigen_device(); + + Eigen::DSizes shape(NxC, sample_size); +// Once eigen on Windows is updated, the if branch can be removed. +#ifndef EIGEN_HAS_INDEX_LIST + Eigen::DSizes bcast(1, sample_size); + Eigen::DSizes C_shape(C, 1); + Eigen::DSizes NxC_shape(NxC, 1); + Eigen::DSizes rdims(1); +#else + Eigen::IndexList, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList> C_shape; + C_shape.set(0, C); + Eigen::IndexList> NxC_shape; + NxC_shape.set(0, NxC); + Eigen::IndexList> rdims; +#endif + + phi::funcs::SetConstant set_constant; + dev_ctx.template Alloc(saved_mean); + dev_ctx.template Alloc(saved_variance); + set_constant(dev_ctx, saved_mean, static_cast(0)); + set_constant(dev_ctx, saved_variance, static_cast(0)); + + auto saved_mean_a = EigenVector::Flatten(*saved_mean); + auto saved_mean_e = saved_mean_a.reshape(NxC_shape); + auto saved_variance_a = EigenVector::Flatten(*saved_variance); + auto saved_variance_e = saved_variance_a.reshape(NxC_shape); + + auto x_e = EigenVector::Flatten(x); + auto x_arr = x_e.reshape(shape); + + saved_mean_e.device(*place) = x_arr.mean(rdims); + auto saved_variance_arr = + (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon; + + saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse(); + + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + DenseTensor scale_data; + DenseTensor bias_data; + if (!scale_ptr) { + scale_data.Resize({C}); + dev_ctx.template Alloc(&scale_data); + set_constant(dev_ctx, &scale_data, static_cast(1)); + } + + if (!bias_ptr) { + bias_data.Resize({C}); + dev_ctx.template Alloc(&bias_data); + set_constant(dev_ctx, &bias_data, static_cast(0)); + } + auto scale_e = + scale_ptr + ? EigenVector::Flatten(*scale_ptr) + : EigenVector::Flatten(const_cast(scale_data)); + auto scale_arr = scale_e.reshape(C_shape); + auto bias_e = + bias_ptr + ? 
EigenVector::Flatten(*bias_ptr) + : EigenVector::Flatten(const_cast(bias_data)); + auto bias_arr = bias_e.reshape(C_shape); + + dev_ctx.template Alloc(y); + auto y_e = EigenVector::Flatten(*y); + auto y_arr = y_e.reshape(shape); + + // (x - mean) * inv_std * scale + bias + Eigen::DSizes bcast_param(N, sample_size); + y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) * + saved_variance_e.broadcast(bcast) * + scale_arr.broadcast(bcast_param) + + bias_arr.broadcast(bcast_param); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + instance_norm, CPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..15c9c30626593f964b14df6fd2763c5af223c31b --- /dev/null +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -0,0 +1,319 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/instance_norm_grad_kernel.h" + +#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/instance_norm_utils.h" + +namespace phi { + +template +static __global__ void GradComputeDX(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *mean, + const T *x, + const BatchNormParamType *variance, + const int C, + const int sample_size, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + BatchNormParamType mean_val = mean[ncid]; + BatchNormParamType inv_var_val = variance[ncid]; + + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; + __shared__ BatchNormParamType dy_sum_val; + __shared__ BatchNormParamType dy_x_sub_mean_sum_val; + + BatchNormParamType dy_sum = static_cast>(0); + BatchNormParamType dy_x_sub_mean_sum = + static_cast>(0); + + for (int i = beg_idx; i < end_idx; i += BlockDim) { + BatchNormParamType dy_i = static_cast>(dy[i]); + dy_sum += dy_i; + dy_x_sub_mean_sum += + dy_i * (static_cast>(x[i]) - mean_val); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_x_sub_mean_sum = + BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; + } + __syncthreads(); + + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] = + (static_cast>(dy[i]) - + dy_sum_val / static_cast>(sample_size) - + (static_cast>(x[i]) - mean_val) * + dy_x_sub_mean_sum_val * inv_var_val * inv_var_val 
/ sample_size) * + scale[c] * inv_var_val; + } +} + +template +void InstanceNormGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &d_y, + paddle::optional scale, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + float epsilon_f, + DenseTensor *d_x, + DenseTensor *d_scale, + DenseTensor *d_bias) { + double epsilon = static_cast(epsilon_f); + const auto *scale_ptr = scale.get_ptr(); + + const auto &x_dims = x.dims(); + + int N, C, H, W, D; + paddle::operators::ExtractNCWHD( + x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + + DenseTensor x_tmp, d_y_tmp; + x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); + d_y_tmp.ShareDataWith(d_y).Resize({1, NxC, H, W, D}); + + dev_ctx.template Alloc(d_x); + if (d_scale && d_bias) { + dev_ctx.template Alloc(d_scale); + dev_ctx.template Alloc(d_bias); + } + if (scale_ptr) { + PADDLE_ENFORCE_EQ( + scale_ptr->dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of scale's dimensions must be equal to 1. But " + "received: the size of scale's dimensions" + "is [%d]", + scale_ptr->dims().size())); + PADDLE_ENFORCE_EQ(scale_ptr->dims()[0], + C, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the first dimension of scale must be equal to " + "Channels([%d]). But received: " + "the first dimension of scale is [%d]," + "the dimensions of scale is [%s], ", + C, + scale_ptr->dims()[0], + scale_ptr->dims())); + } + + phi::funcs::SetConstant set_constant; + + const int n = x.numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(NxC, max_blocks); + const int grid1 = (C + block - 1) / block; + + DenseTensor scale_tmp; + scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&scale_tmp); + + DenseTensor d_scale_tmp; + d_scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&d_scale_tmp); + + DenseTensor d_bias_tmp; + d_bias_tmp.Resize({NxC}); + dev_ctx.template Alloc(&d_bias_tmp); + + if (scale_ptr) { + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + + if ((H * W * D) == 1) { + phi::Copy(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x); + phi::funcs::SetConstant> functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); +#endif + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. 
Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); +#endif + + const auto *saved_mean_data = + saved_mean.template data>(); + const auto *saved_var_data = + saved_variance.template data>(); + if (d_scale && d_bias) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenBatchNormalizationBackward( + dev_ctx.cudnn_handle(), + miopenBNSpatial, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + d_y_tmp.template data(), + data_desc_, + d_x->template data(), + in_param_desc_, + scale_tmp.template data>(), + d_scale_tmp.template data>(), + d_bias_tmp.template data>(), + epsilon, + saved_mean_data, + saved_var_data)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + d_y_tmp.template data(), + data_desc_, + d_x->template data(), + in_param_desc_, + scale_tmp.template data>(), + d_scale_tmp.template data>(), + d_bias_tmp.template data>(), + epsilon, + saved_mean_data, + saved_var_data)); +#endif + } else { + if (d_x) { + GradComputeDX<<>>( + d_y.data(), + scale_tmp.data>(), + saved_mean_data, + x.data(), + saved_var_data, + C, + H * W * D, + d_x->data()); + } + } + + if (d_scale && d_bias) { + add_param<<>>( + d_scale_tmp.data(), d_scale->data(), N, C); + add_param<<>>( + d_bias_tmp.data(), d_bias->data(), N, C); + } + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); +#endif +} +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL( + instance_norm_grad, GPU, ALL_LAYOUT, phi::InstanceNormGradKernel, float) {} +#else +PD_REGISTER_KERNEL(instance_norm_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormGradKernel, + float, + double) {} +#endif diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf8f0fb78788ce118e25a7f4f8f0765bf1e50537 --- /dev/null +++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu @@ -0,0 +1,221 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/instance_norm_kernel.h" + +#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/instance_norm_utils.h" + +namespace phi { + +template +void InstanceNormKernel(const Context &dev_ctx, + const DenseTensor &x, + paddle::optional scale, + paddle::optional bias, + float epsilon_f, + DenseTensor *y, + DenseTensor *saved_mean, + DenseTensor *saved_variance) { + double epsilon = static_cast(epsilon_f); + auto &x_dims = x.dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of X's dimensions must greater than " + "or equal to 2. But received: " + "the size of X's dimensions is [%d]", + x_dims.size())); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of X's dimensions must smaller than" + "or equal to 5. But received: " + "the size of X's dimensions is [%d]", + x_dims.size())); + int N, C, H, W, D; + paddle::operators::ExtractNCWHD( + x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + DenseTensor x_tmp; + x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); + dev_ctx.template Alloc(y); + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); +#endif + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + VLOG(3) << "Setting descriptors."; + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); +#endif + + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + DenseTensor scale_tmp; + scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&scale_tmp); + DenseTensor bias_tmp; + bias_tmp.Resize({NxC}); + dev_ctx.template Alloc(&bias_tmp); + + const int n = x.numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min((NxC + block - 1) / block, max_blocks); + + phi::funcs::SetConstant set_constant; + if (scale_ptr) { + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + if (bias_ptr) { + repeat_param<<>>( + bias_ptr->data(), bias_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &bias_tmp, static_cast(0)); + } + + auto handle = dev_ctx.cudnn_handle(); + + phi::funcs::SetConstant> functor; + dev_ctx.template Alloc>(saved_mean); + dev_ctx.template Alloc>(saved_variance); + functor(dev_ctx, saved_mean, static_cast>(0)); + functor(dev_ctx, saved_variance, static_cast>(0)); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenBatchNormalizationForwardTraining( + handle, + miopenBNSpatial, + const_cast( + static_cast(CudnnDataType::kOne())), + const_cast( + static_cast(CudnnDataType::kZero())), + data_desc_, + static_cast(x_tmp.template data()), + data_desc_, + static_cast(y->template data()), + in_param_desc_, + const_cast(static_cast( + scale_tmp.template data>())), + const_cast(static_cast( + bias_tmp.template data>())), + 0, + nullptr, + nullptr, + epsilon, + static_cast( + saved_mean->template data>()), + static_cast( + saved_variance->template data>()))); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + y->template data(), + in_param_desc_, + scale_tmp.template data>(), + bias_tmp.template data>(), + 0, + nullptr, + nullptr, + epsilon, + saved_mean->template data>(), + saved_variance->template data>())); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); +#endif +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL( + instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float) {} +#else +PD_REGISTER_KERNEL( + instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {} +#endif diff --git a/paddle/phi/kernels/gpu/instance_norm_utils.h 
b/paddle/phi/kernels/gpu/instance_norm_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..50dfe4ad222c0c3ddbe4ba08c2a7e0082e7e4d84
--- /dev/null
+++ b/paddle/phi/kernels/gpu/instance_norm_utils.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cfloat>
+#include <string>
+#include <vector>
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+
+namespace phi {
+
+template <typename T>
+using CudnnDataType = paddle::platform::CudnnDataType<T>;
+template <typename T>
+using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
+
+template <typename T>
+static __global__ void repeat_param(const T *input,
+                                    T *output,
+                                    const int repeat_num,
+                                    const int C) {
+  CUDA_KERNEL_LOOP(i, repeat_num * C) {
+    int index = i % C;
+    output[i] = input[index];
+  }
+}
+
+template <typename T, int BlockDim, bool AVG>
+static __global__ void add_param(const T *input,
+                                 T *output,
+                                 const int repeat_num,
+                                 const int C) {
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage ou_storage;
+  for (int i = blockIdx.x; i < C; i += gridDim.x) {
+    T ou = static_cast<T>(0);
+    for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) {
+      const int index = j * C + i;
+      ou += static_cast<T>(input[index]);
+    }
+    ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum());
+    if (threadIdx.x == 0) {
+      output[i] = ou;
+    }
+    __syncthreads();
+
+    if (AVG) {
+      output[i] /= repeat_num;
+    }
+  }
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/instance_norm_grad_kernel.h b/paddle/phi/kernels/instance_norm_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..041302a7cfb67bdb756de9cb0e639062fd6f2a7b
--- /dev/null
+++ b/paddle/phi/kernels/instance_norm_grad_kernel.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void InstanceNormGradKernel(const Context& dev_ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& y_grad,
+                            paddle::optional<const DenseTensor&> scale,
+                            const DenseTensor& saved_mean,
+                            const DenseTensor& saved_variance,
+                            float epsilon,
+                            DenseTensor* x_grad,
+                            DenseTensor* scale_grad,
+                            DenseTensor* bias_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/instance_norm_kernel.h b/paddle/phi/kernels/instance_norm_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c50025a73ce0219def84699529ebb4fe347f78a
--- /dev/null
+++ b/paddle/phi/kernels/instance_norm_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void InstanceNormKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        paddle::optional<const DenseTensor&> scale,
+                        paddle::optional<const DenseTensor&> bias,
+                        float epsilon,
+                        DenseTensor* y,
+                        DenseTensor* saved_mean,
+                        DenseTensor* saved_variance);
+
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/instance_norm_sig.cc b/paddle/phi/ops/compat/instance_norm_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b65e84588db130bb8e909518d9591d3847129cbc
--- /dev/null
+++ b/paddle/phi/ops/compat/instance_norm_sig.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature InstanceNormOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("instance_norm",
+                         {"X", "Scale", "Bias"},
+                         {"epsilon"},
+                         {"Y", "SavedMean", "SavedVariance"});
+}
+
+KernelSignature InstanceNormGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("instance_norm_grad",
+                         {"X", "Y@GRAD", "Scale", "SavedMean", "SavedVariance"},
+                         {"epsilon"},
+                         {"X@GRAD", "Scale@GRAD", "Bias@GRAD"});
+}
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(instance_norm, phi::InstanceNormOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad,
+                           phi::InstanceNormGradOpArgumentMapping);
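
Note for reviewers (not part of the patch): the Eigen path on CPU and the cuDNN/MIOpen path on GPU both compute the same per-instance statistics, with `SavedMean` holding the per-(n, c) mean and `SavedVariance` holding the inverse standard deviation (`saved_variance_arr.sqrt().inverse()` in the CPU kernel). The sketch below is a minimal scalar C++ reference for that forward math, assuming a flat `[N, C, H*W]` layout; the function name and signature are invented for illustration and do not exist in the codebase.

```cpp
// Standalone reference for the instance_norm forward pass migrated above.
// Assumption: x is laid out as [N, C, H*W]; scale/bias are per-channel [C].
#include <cmath>
#include <vector>

void InstanceNormReference(const std::vector<float>& x,      // [N * C * sample_size]
                           const std::vector<float>& scale,  // [C]
                           const std::vector<float>& bias,   // [C]
                           int N, int C, int sample_size, float epsilon,
                           std::vector<float>* y,             // same shape as x
                           std::vector<float>* saved_mean,    // [N * C]
                           std::vector<float>* saved_inv_std  /* [N * C] */) {
  y->assign(x.size(), 0.f);
  saved_mean->assign(N * C, 0.f);
  saved_inv_std->assign(N * C, 0.f);
  for (int nc = 0; nc < N * C; ++nc) {
    const float* xp = x.data() + nc * sample_size;
    // Statistics are taken over the spatial positions of one (n, c) instance.
    float mean = 0.f;
    for (int i = 0; i < sample_size; ++i) mean += xp[i];
    mean /= sample_size;
    float var = 0.f;
    for (int i = 0; i < sample_size; ++i) var += (xp[i] - mean) * (xp[i] - mean);
    var /= sample_size;
    const float inv_std = 1.f / std::sqrt(var + epsilon);
    (*saved_mean)[nc] = mean;
    (*saved_inv_std)[nc] = inv_std;  // mirrors saved_variance_arr.sqrt().inverse()
    // Normalize, then apply the per-channel affine transform.
    const int c = nc % C;
    float* yp = y->data() + nc * sample_size;
    for (int i = 0; i < sample_size; ++i) {
      yp[i] = (xp[i] - mean) * inv_std * scale[c] + bias[c];
    }
  }
}
```

Because `SavedVariance` stores the inverse standard deviation rather than the variance, the backward kernels can multiply by it directly instead of dividing, which is exactly what both the Eigen path and `GradComputeDX` do.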
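
In the same spirit, the backward pass implemented by the Eigen code (and by `GradComputeDX` on GPU) corresponds to the following, writing $\hat{x}_{nchw} = (x_{nchw} - \mu_{nc})\,\sigma^{-1}_{nc}$ with $\sigma^{-1}_{nc}$ the saved inverse standard deviation and $\gamma_c$ the scale:

$$
\frac{\partial L}{\partial x_{nchw}}
  = \gamma_c\,\sigma^{-1}_{nc}\!\left(
      \frac{\partial L}{\partial y_{nchw}}
      - \operatorname{mean}_{h,w}\!\left(\frac{\partial L}{\partial y}\right)
      - \hat{x}_{nchw}\,\operatorname{mean}_{h,w}\!\left(\frac{\partial L}{\partial y}\,\hat{x}\right)
    \right),
\quad
\frac{\partial L}{\partial \gamma_c} = \sum_{n,h,w}\frac{\partial L}{\partial y_{nchw}}\,\hat{x}_{nchw},
\quad
\frac{\partial L}{\partial \beta_c} = \sum_{n,h,w}\frac{\partial L}{\partial y_{nchw}}.
$$

The in-code `// math:` comments write the spatial reductions for `d_x` as `np.sum`, while the implementation uses the spatial mean (the sum divided by `sample_size`), which is the correctly normalized form.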