// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/fused/fused_bn_activation_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"

DECLARE_bool(cudnn_batchnorm_spatial_persistent);

namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;

template <typename T>
class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()), true,
        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    float momentum = ctx.Attr<float>("momentum");
    std::string act_type = ctx.Attr<std::string>("act_type");

    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
      LOG(ERROR) << "Provided epsilon is smaller than "
                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
                 << "CUDNN_BN_MIN_EPSILON instead.";
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

    // Get the size for each dimension.
    // NHWC [batch_size, in_height, in_width, in_channels]
    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
    PADDLE_ENFORCE_EQ(x_dims.size() >= 2 && x_dims.size() <= 5, true,
                      platform::errors::PreconditionNotMet(
                          "The Input dim size should be between 2 and 5"));

    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");

    // Run training mode.
    // Obtain running mean and running inv var, and see if we need to
    // initialize them.
    auto *mean_out = ctx.Output<Tensor>("MeanOut");
    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());

    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());

    auto *y = ctx.Output<Tensor>("Y");
    y->mutable_data<T>(ctx.GetPlace());

    int N, C, H, W, D;
    const DataLayout data_layout = DataLayout::kNHWC;
    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
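    // NOTE: ExtractNCWHD (from norm_utils.h) unpacks x_dims for the given
    // layout; for NHWC, C is read from the last dimension and any missing
    // spatial dimensions are set to 1, so N * H * W * D below is the number
    // of elements that are normalized per channel.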
    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

    if ((N * H * W * D) == 1) {
      // Only one element in the normalization dimension:
      // skip the batch norm calculation and let y = act(x).
      auto x_v = framework::EigenVector<T>::Flatten(*x);
      auto y_v = framework::EigenVector<T>::Flatten(*y);
      auto &dev = *dev_ctx.eigen_device();
      if (act_type == "relu") {
        ReluFunctor<T>()(dev, x_v, y_v);
      } else {
        PADDLE_THROW(
            platform::errors::Unimplemented("Unsupported activation type"));
      }
      return;
    }

    // ------------------- cudnn descriptors ---------------------
    auto handle = dev_ctx.cudnn_handle();
    cudnnTensorDescriptor_t data_desc_;
    cudnnTensorDescriptor_t bn_param_desc_;
    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;

    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));

    VLOG(3) << "Setting descriptors.";
    std::vector<int> dims = {N, C, H, W, D};
    std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};

    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
                                                         data_desc_, mode_));

    double this_factor = 1. - momentum;
    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
    platform::ScopedActivationDescriptor scope_act_desc;
    cudnnActivationDescriptor_t activation_desc_ =
        scope_act_desc.descriptor<T>(act_type);
    size_t workspace_size = 0;
    size_t reserve_space_size = 0;
    void *reserve_space_ptr = nullptr;
    void *workspace_ptr = nullptr;
    Tensor workspace_tensor;
    // Create reserve space and workspace for batch norm.
    // Create a tensor for each batchnorm op; it will be used in the
    // backward, so this tensor must not be a temporary.
    auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
    PADDLE_ENFORCE_NOT_NULL(
        reserve_space,
        platform::errors::NotFound(
            "The argument ReserveSpace of batch_norm op is not found."));

    // --------------- cudnn batchnorm workspace ---------------
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::
            cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
                /*handle=*/handle,
                /*mode=*/mode_,
                /*bnOps=*/bnOps_,
                /*xDesc=*/data_desc_,
                /*zDesc=*/nullptr,
                /*yDesc=*/data_desc_,
                /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
                /*activationDesc=*/activation_desc_,
                /*sizeInBytes=*/&workspace_size));

    // -------------- cudnn batchnorm reserve space --------------
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::
            cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
                /*handle=*/handle,
                /*mode=*/mode_,
                /*bnOps=*/bnOps_,
                /*activationDesc=*/activation_desc_,
                /*xDesc=*/data_desc_,
                /*sizeInBytes=*/&reserve_space_size));

    reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
                                                    reserve_space_size);
    workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
                                                  workspace_size);
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
            handle, mode_, bnOps_, CudnnDataType<T>::kOne(),
            CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
            nullptr, nullptr, data_desc_, y->template data<T>(),
            bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
            bias->template data<BatchNormParamType<T>>(), this_factor,
            mean_out->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),
            variance_out->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),
            epsilon,
            saved_mean->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),
            saved_variance->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),
            activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
            reserve_space_size));

    // Clean up when exiting.
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
  }
};
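// NOTE: The gradient kernel below relies on the ReserveSpace tensor written
// by cudnnBatchNormalizationForwardTrainingEx above; that is why the forward
// kernel stores it as a persistent op output rather than a temporary.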
template <typename T>
class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()), true,
        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    std::string act_type = ctx.Attr<std::string>("act_type");

    const auto *x = ctx.Input<Tensor>("X");
    const auto *y = ctx.Input<Tensor>("Y");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");
    const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");

    const auto &x_dims = x->dims();
    PADDLE_ENFORCE_EQ(x_dims.size() >= 2 && x_dims.size() <= 5, true,
                      platform::errors::PreconditionNotMet(
                          "The Input dim size should be between 2 and 5"));

    int N, C, H, W, D;
    const DataLayout data_layout = DataLayout::kNHWC;
    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);

    // Initialize the outputs.
    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    d_x->mutable_data<T>(ctx.GetPlace());
    PADDLE_ENFORCE_EQ(
        d_scale && d_bias, true,
        platform::errors::PreconditionNotMet(
            "Both the scale grad and the bias grad must not be null."));
    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL,
                      platform::errors::PreconditionNotMet(
                          "The scale only has one dimension."));
    PADDLE_ENFORCE_EQ(
        scale->dims()[0], C,
        platform::errors::PreconditionNotMet(
            "The size of scale is equal to the channel of Input(X)."));

    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

    if ((N * H * W * D) == 1) {
      if (act_type == "relu") {
        auto x_v = framework::EigenVector<T>::Flatten(*x);
        auto y_v = framework::EigenVector<T>::Flatten(*y);
        auto dx_v = framework::EigenVector<T>::Flatten(*d_x);
        auto dy_v = framework::EigenVector<T>::Flatten(*d_y);
        auto &dev = *dev_ctx.eigen_device();
        ReluGradFunctor<T>()(dev, x_v, y_v, dy_v, dx_v);
      } else {
        PADDLE_THROW(
            platform::errors::Unimplemented("Unsupported activation type"));
      }
      math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
          functor;
      functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
      functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
      return;
    }

    std::vector<int> dims = {N, C, H, W, D};
    std::vector<int> strides = {H * W * C * D, 1, W * D * C, D * C, C};

    // ------------------- cudnn descriptors ---------------------
    cudnnTensorDescriptor_t data_desc_;
    cudnnTensorDescriptor_t bn_param_desc_;
    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;

    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
      LOG(ERROR) << "Provided epsilon is smaller than "
                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
                 << "CUDNN_BN_MIN_EPSILON instead.";
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
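    // NOTE: Per the cuDNN docs, cudnnDeriveBNTensorDescriptor fills
    // bn_param_desc_ with the per-channel parameter shape (1 x C x 1 x 1 for
    // the spatial modes), matching the 1-D Scale/Bias tensors checked above.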
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
                                                         data_desc_, mode_));

    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
    const auto *saved_mean_data =
        saved_mean->template data<BatchNormParamType<T>>();
    const auto *saved_var_data =
        saved_var->template data<BatchNormParamType<T>>();

    size_t workspace_size = 0;
    void *workspace_ptr = nullptr;
    Tensor workspace_tensor;
    auto reserve_space_size = reserve_space->memory_size();
    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
    platform::ScopedActivationDescriptor scope_act_desc;
    cudnnActivationDescriptor_t activation_desc_ =
        scope_act_desc.descriptor<T>(act_type);

    // --------------- cudnn batchnorm workspace ---------------
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize(
            /*handle=*/dev_ctx.cudnn_handle(),
            /*mode=*/mode_,
            /*bnOps=*/bnOps_,
            /*xDesc=*/data_desc_,
            /*yDesc=*/data_desc_,
            /*dyDesc=*/data_desc_,
            /*dzDesc=*/nullptr,
            /*dxDesc=*/data_desc_,
            /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
            /*activationDesc=*/activation_desc_,
            /*sizeInBytes=*/&workspace_size));

    workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
                                                  workspace_size);

    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnBatchNormalizationBackwardEx(
            /*handle=*/dev_ctx.cudnn_handle(),
            /*mode=*/mode_,
            /*bnOps=*/bnOps_,
            /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
            /*betaDataDiff=*/CudnnDataType<T>::kZero(),
            /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
            /*betaParamDiff=*/CudnnDataType<T>::kZero(),
            /*xDesc=*/data_desc_,
            /*xData=*/x->template data<T>(),
            /*yDesc=*/data_desc_,
            /*yData=*/y->template data<T>(),
            /*dyDesc=*/data_desc_,
            /*dyData=*/d_y->template data<T>(),
            /*dzDesc=*/nullptr,
            /*dzData=*/nullptr,
            /*dxDesc=*/data_desc_,
            /*dxData=*/d_x->template mutable_data<T>(ctx.GetPlace()),
            /*dBnScaleBiasDesc=*/bn_param_desc_,
            /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
            /*bnBiasData=*/bias->template data<BatchNormParamType<T>>(),
            /*dBnScaleData=*/d_scale
                ->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
            /*dBnBiasData=*/d_bias
                ->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
            /*epsilon=*/epsilon,
            /*savedMean=*/saved_mean_data,
            /*savedInvVariance=*/saved_var_data,
            /*activationDesc=*/activation_desc_,
            /*workspace=*/workspace_ptr,
            /*workSpaceSizeInBytes=*/workspace_size,
            /*reserveSpace=*/
            const_cast<T *>(reserve_space->template data<T>()),
            /*reserveSpaceSizeInBytes=*/reserve_space_size));

    // Clean up when exiting.
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
  }
};

}  // namespace operators
}  // namespace paddle

#if CUDNN_VERSION >= 7401
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
    fused_batch_norm_act,
    ops::FusedBatchNormActKernel<plat::CUDADeviceContext, float>,
    ops::FusedBatchNormActKernel<plat::CUDADeviceContext, double>,
    ops::FusedBatchNormActKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
    fused_batch_norm_act_grad,
    ops::FusedBatchNormActGradKernel<plat::CUDADeviceContext, float>,
    ops::FusedBatchNormActGradKernel<plat::CUDADeviceContext, double>,
    ops::FusedBatchNormActGradKernel<plat::CUDADeviceContext, plat::float16>);
#endif
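// NOTE: The CUDNN_VERSION >= 7401 guard above reflects that the fused
// cudnnBatchNormalizationForwardTrainingEx / cudnnBatchNormalizationBackwardEx
// entry points used here first appeared around cuDNN 7.4.1; on older cuDNN
// releases these fused kernels are simply not registered.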