From 7879477fe2849415020c812b287bfffc9cf2a63c Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Fri, 23 Apr 2021 10:50:46 +0800 Subject: [PATCH] [ROCM] add cuda kenrel for batch_norm_op (#32393) --- paddle/fluid/operators/batch_norm_op.cu | 506 ++++++++++++++++++------ paddle/fluid/operators/norm_utils.cu.h | 43 +- 2 files changed, 409 insertions(+), 140 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 444c24b826b..41dc87ac1ba 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,6 +41,83 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; +template +static __global__ void BNForwardInference( + const T *x, const BatchNormParamType *mean, + const BatchNormParamType *variance, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int num = N * C * HxW; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType x_sub_mean = + static_cast>(x[i]) - mean[c]; + BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); + y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( + const T *x, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, double exponentialAverageFactor, T *y, + BatchNormParamType *mean, BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + template class BatchNormKernel : public framework::OpKernel { @@ -80,8 +157,12 @@ class BatchNormKernel auto dtype = platform::CudnnDataType::type; #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = test_mode || @@ -111,14 +192,15 @@ class BatchNormKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -138,7 +220,8 @@ class BatchNormKernel epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -161,14 +244,15 @@ class BatchNormKernel } #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// Note: PERSISTENT not implemented for inference +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor( +// bn_param_desc_, data_desc_, test_mode ? 
miopenBNSpatial : mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -226,28 +310,53 @@ class BatchNormKernel C, est_var->dims()[0], est_var->dims())); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardInference( - handle, miopenBNSpatial, - const_cast( - static_cast(CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - const_cast(static_cast( - est_mean->template data>())), - const_cast(static_cast( - est_var->template data>())), - epsilon)); + const int block_size = 256; + const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; + if (compute_format == DataLayout::kNCHW) { + BNForwardInference< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } else { + BNForwardInference< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardInference( +// handle, miopenBNSpatial, +// const_cast( +// static_cast(CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// const_cast(static_cast( +// est_mean->template data>())), +// const_cast(static_cast( +// est_var->template data>())), +// epsilon)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardInference( @@ -365,34 +474,66 @@ class BatchNormKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardTraining( - handle, mode_, const_cast(static_cast( - CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - this_factor, - static_cast( - mean_out->template mutable_data>( - ctx.GetPlace())), - static_cast(variance_out->template mutable_data< - BatchNormParamType>(ctx.GetPlace())), - epsilon, - static_cast( - saved_mean->template mutable_data>( - ctx.GetPlace())), - static_cast(saved_variance->template mutable_data< - BatchNormParamType>(ctx.GetPlace())))); + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + 
const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( @@ -423,11 +564,12 @@ class BatchNormKernel ctx, &transformed_y, y); } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. 
PADDLE_ENFORCE_CUDA_SUCCESS( @@ -439,7 +581,7 @@ class BatchNormKernel }; template -static __global__ void KeBNBackwardScaleBias( +static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( const T *dy, const T *x, const BatchNormParamType *mean, const BatchNormParamType *variance, const double epsilon, const int N, const int C, const int HxW, BatchNormParamType *dscale, @@ -526,13 +668,97 @@ class InplaceHelper { }; template -static __global__ void BNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *mean, - const T *x, - const BatchNormParamType *variance, - const int C, const int N, const int HxW, - T *dx) { +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( + const T *dy, const T *x, const BatchNormParamType *scale, + const BatchNormParamType *saved_mean, + const BatchNormParamType *saved_inv_variance, const int C, const int N, + const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType inv_var_val; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType dscale_val; + __shared__ BatchNormParamType dbias_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + if (saved_mean && saved_inv_variance) { + if (threadIdx.x == 0) { + inv_var_val = saved_inv_variance[i]; + mean_val = saved_mean[i]; + } + } else { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = + static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = + static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + inv_var_val = + 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); + } + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + ds_sum += + dy_i * (static_cast>(x[index]) - mean_val); + db_sum += dy_i; + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale_val = ds_sum * inv_var_val; + dbias_val = db_sum; + dscale[i] = dscale_val; + dbias[i] = dbias_val; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = scale[i] * inv_var_val * + (static_cast>(dy[index]) - + dbias_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_val) * + inv_var_val * dscale_val / inner_size); + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( + const T *dy, const BatchNormParamType *scale, + const BatchNormParamType *mean, const T *x, + const BatchNormParamType *variance, const int C, const int N, + const int HxW, T *dx) { const int outer_size = C; const int inner_size = N * HxW; typedef cub::BlockReduce, BlockDim> BlockReduce; @@ -567,7 +793,6 @@ static __global__ void BNBackwardData(const T *dy, dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; } __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { const int index = layout == framework::DataLayout::kNCHW ? (j / HxW * C + i) * HxW + j % HxW @@ -668,8 +893,12 @@ class BatchNormGradKernel auto dtype = platform::CudnnDataType::type; const auto *reserve_space = ctx.Input("ReserveSpace"); #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && @@ -714,7 +943,11 @@ class BatchNormGradKernel auto &dev_ctx = ctx.template device_context(); const int num = transformed_x.numel(); +#ifdef HIPCC + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid1 = (num + block - 1) / block; @@ -734,14 +967,15 @@ class BatchNormGradKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -759,7 +993,8 @@ class BatchNormGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -771,13 +1006,14 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 0, 1) #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? 
x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, +// data_desc_, mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -871,20 +1107,49 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); + if (compute_format == DataLayout::kNCHW) { + BNBackward< + T, block, + DataLayout::kNCHW><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } else { + BNBackward< + T, block, + DataLayout::kNHWC><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), +// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( @@ -931,11 +1196,12 @@ class BatchNormGradKernel } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. 
+// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 9fcc6292338..843736833f8 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -32,6 +32,12 @@ namespace cub = hipcub; #include "paddle/fluid/platform/cudnn_helper.h" #endif +#ifdef __HIPCC__ +#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) +#else +#define LAUNCH_BOUNDS(BlockDim) +#endif + namespace paddle { namespace operators { @@ -58,12 +64,10 @@ using DataLayout = framework::DataLayout; // axis=(n,h,w))) template -__global__ void DoubleGradComputeDX(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, const T *scale, - const T *ddscale, const int N, const int C, - const int sample_size, const double epsilon, - T *dx) { +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDX( + const T *x, const T *mean, const T *variance, const T *ddx, const T *dy, + const T *scale, const T *ddscale, const int N, const int C, + const int sample_size, const double epsilon, T *dx) { const int outer_size = C; const int inner_size = N * sample_size; @@ -160,12 +164,10 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean, // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * // np.mean(ddx * (x - mean), axis=(n,h,w))) template -__global__ void DoubleGradComputeDDY(const T *x, const T *mean, - const T *variance, const T *ddscale, - const T *ddbias, const T *ddx, - const T *scale, const int N, const int C, - const int sample_size, - const double epsilon, T *ddy) { +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDDY( + const T *x, const T *mean, const T *variance, const T *ddscale, + const T *ddbias, const T *ddx, const T *scale, const int N, const int C, + const int sample_size, const double epsilon, T *ddy) { const int outer_size = C; const int inner_size = N * sample_size; @@ -238,11 +240,10 @@ __global__ void DoubleGradComputeDDY(const T *x, const T *mean, // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * // ddx template -__global__ void DoubleGradComputeDScale(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, const int N, const int C, - const int sample_size, - const double epsilon, T *dscale) { +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDScale( + const T *x, const T *mean, const T *variance, const T *ddx, const T *dy, + const int N, const int C, const int sample_size, const double epsilon, + T *dscale) { const int outer_size = C; const int inner_size = N * sample_size; @@ -302,7 +303,7 @@ __global__ void DoubleGradComputeDScale(const T *x, const T *mean, // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var template -__global__ void DoubleGradComputeDScaleWithGlobal( +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDScaleWithGlobal( const T *ddx, const T *variance, const T *dy, const double epsilon, const int N, const int C, const int sample_size, T *dscale) { int outer_size = C; @@ -422,8 +423,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, set_constant(dev_ctx, &scale_tmp, static_cast(1)); } const T *scale_data = Scale ? 
Scale->data<T>() : scale_tmp.data<T>();
-
+#ifdef __HIPCC__
+  const int block = 256;
+#else
   const int block = 512;
+#endif
   int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
   const int max_blocks = std::max(max_threads / block, 1);
   int grid = std::min(C, max_blocks);
@@ -532,6 +536,5 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
     }
   }
 }
-
 }  // namespace operators
 }  // namespace paddle
-- 
GitLab
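
For readers who want to sanity-check the new ROCm inference path against a CPU result: the element-wise transform applied by the BNForwardInference kernel above, in the NCHW case, is equivalent to the plain C++ sketch below. The helper name ref_bn_inference_nchw and the use of float are illustrative assumptions and are not part of this patch.

#include <cmath>
#include <vector>

// Reference-only sketch (not Paddle code):
//   y = scale * (x - mean) / sqrt(var + eps) + bias
// with per-channel parameters and NCHW element ordering, matching BNForwardInference.
void ref_bn_inference_nchw(const std::vector<float> &x,
                           const std::vector<float> &mean,
                           const std::vector<float> &variance,
                           const std::vector<float> &scale,
                           const std::vector<float> &bias,
                           int N, int C, int HxW, double epsilon,
                           std::vector<float> *y) {
  for (int i = 0; i < N * C * HxW; ++i) {
    const int c = i / HxW % C;  // channel of element i in NCHW layout
    const float inv_std =
        1.0f / std::sqrt(variance[c] + static_cast<float>(epsilon));
    (*y)[i] = scale[c] * (x[i] - mean[c]) * inv_std + bias[c];
  }
}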
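
BNForwardTraining launches one thread block per channel (grid = min(C, max_blocks), block = 256 on ROCm) and uses cub::BlockReduce to accumulate the sum and squared sum over the N * HxW elements of that channel; on HIP the kernels are additionally annotated with __launch_bounds__ via the LAUNCH_BOUNDS macro defined in norm_utils.cu.h. A minimal CPU sketch of the per-channel statistics it produces, assuming float NCHW input and illustrative names, is:

#include <cmath>
#include <vector>

// Reference-only sketch (not Paddle code) of the statistics produced by
// BNForwardTraining. Note the factor convention used by the kernel:
//   running = (1 - factor) * batch_stat + factor * running_old
void ref_bn_training_stats(const std::vector<float> &x,  // NCHW, size N*C*HxW
                           int N, int C, int HxW, double epsilon, double factor,
                           std::vector<float> *running_mean,
                           std::vector<float> *running_var,
                           std::vector<float> *saved_mean,
                           std::vector<float> *saved_inv_std) {
  const double m = static_cast<double>(N) * HxW;  // elements per channel
  for (int c = 0; c < C; ++c) {
    double sum = 0.0, sq_sum = 0.0;
    for (int n = 0; n < N; ++n) {
      for (int k = 0; k < HxW; ++k) {
        const double v = x[(n * C + c) * HxW + k];
        sum += v;
        sq_sum += v * v;
      }
    }
    const double mean = sum / m;
    const double var = sq_sum / m - mean * mean;  // biased (population) variance
    (*saved_mean)[c] = static_cast<float>(mean);
    (*saved_inv_std)[c] = static_cast<float>(1.0 / std::sqrt(var + epsilon));
    (*running_mean)[c] =
        static_cast<float>((1.0 - factor) * mean + factor * (*running_mean)[c]);
    (*running_var)[c] =
        static_cast<float>((1.0 - factor) * var + factor * (*running_var)[c]);
  }
}

Keeping the whole reduction inside a single block per channel avoids a grid-wide reduction and extra kernel launches for the mean and variance passes.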
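
The new BNBackward kernel replaces miopenBatchNormalizationBackward on the ROCm path: it computes dscale, dbias, and dx in a single pass, recomputing the per-channel mean and inverse standard deviation only when saved_mean / saved_inv_variance are not supplied. For reference, the gradients it produces reduce to the following CPU sketch (float, NCHW, illustrative names; not part of the patch):

#include <cmath>
#include <vector>

// Reference-only sketch (not Paddle code) of the gradients BNBackward computes.
// For each channel c with m = N * HxW elements:
//   dbias_c  = sum_i dy_i
//   dscale_c = inv_std_c * sum_i dy_i * (x_i - mean_c)
//   dx_i     = scale_c * inv_std_c *
//              (dy_i - dbias_c / m - (x_i - mean_c) * inv_std_c * dscale_c / m)
void ref_bn_backward_nchw(const std::vector<float> &x,
                          const std::vector<float> &dy,
                          const std::vector<float> &scale,
                          const std::vector<float> &mean,     // saved batch mean
                          const std::vector<float> &inv_std,  // saved 1/sqrt(var+eps)
                          int N, int C, int HxW,
                          std::vector<float> *dx,
                          std::vector<float> *dscale,
                          std::vector<float> *dbias) {
  const float m = static_cast<float>(N) * HxW;
  for (int c = 0; c < C; ++c) {
    float ds = 0.0f, db = 0.0f;
    for (int n = 0; n < N; ++n) {
      for (int k = 0; k < HxW; ++k) {
        const int i = (n * C + c) * HxW + k;
        ds += dy[i] * (x[i] - mean[c]);
        db += dy[i];
      }
    }
    (*dscale)[c] = ds * inv_std[c];
    (*dbias)[c] = db;
    for (int n = 0; n < N; ++n) {
      for (int k = 0; k < HxW; ++k) {
        const int i = (n * C + c) * HxW + k;
        (*dx)[i] = scale[c] * inv_std[c] *
                   (dy[i] - db / m -
                    (x[i] - mean[c]) * inv_std[c] * (*dscale)[c] / m);
      }
    }
  }
}

This is the standard batch-norm backward; the kernel performs both channel reductions with cub::BlockReduce inside one block per channel, mirroring the structure of the forward training kernel.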