diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu
index a47e20dde5ffc1ffc53f4dd50adc2a18ed3fa3f9..2534b7cf491e61d755826bd9f886920dc213f37d 100644
--- a/paddle/fluid/operators/batch_norm_op.cu
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -94,8 +94,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     cudnnTensorDescriptor_t bn_param_desc_;
     cudnnBatchNormMode_t mode_;
 
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
 
     if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
@@ -124,12 +125,14 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       dims = {N, C, H, W, D};
       strides = {H * W * D * C, 1, W * D * C, D * C, C};
     }
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         data_desc_, CudnnDataType<T>::type,
         x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
     // Note: PERSISTENT not implemented for inference
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_, is_test ? CUDNN_BATCHNORM_SPATIAL : mode_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDeriveBNTensorDescriptor(
+            bn_param_desc_, data_desc_,
+            is_test ? CUDNN_BATCHNORM_SPATIAL : mode_));
 
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *bias = ctx.Input<Tensor>("Bias");
@@ -149,17 +152,18 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       PADDLE_ENFORCE_EQ(est_mean->dims()[0], C);
       PADDLE_ENFORCE_EQ(est_var->dims()[0], C);
 
-      CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardInference(
-          handle,
-          // Note: PERSISTENT not implemented for inference
-          CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
-          CudnnDataType<T>::kZero(), data_desc_,
-          transformed_x.template data<T>(), data_desc_,
-          transformed_y.template mutable_data<T>(ctx.GetPlace()),
-          bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
-          bias->template data<BatchNormParamType<T>>(),
-          est_mean->template data<BatchNormParamType<T>>(),
-          est_var->template data<BatchNormParamType<T>>(), epsilon));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnBatchNormalizationForwardInference(
+              handle,
+              // Note: PERSISTENT not implemented for inference
+              CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
+              CudnnDataType<T>::kZero(), data_desc_,
+              transformed_x.template data<T>(), data_desc_,
+              transformed_y.template mutable_data<T>(ctx.GetPlace()),
+              bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+              bias->template data<BatchNormParamType<T>>(),
+              est_mean->template data<BatchNormParamType<T>>(),
+              est_var->template data<BatchNormParamType<T>>(), epsilon));
     } else {
       // if MomentumTensor is set, use MomentumTensor value, momentum
       // is only used in this training branch
@@ -214,7 +218,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
                 "The argument ReserveSpace of batch_norm op is not found."));
 
         // --------------- cudnn batchnorm workspace ---------------
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::
                 cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
                     /*handle=*/handle,
@@ -228,7 +232,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
                     /*sizeInBytes=*/&workspace_size));
 
         // -------------- cudnn batchnorm reserve space --------------
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::
                 cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
                     /*handle=*/handle,
@@ -242,7 +246,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
             ctx.GetPlace(), transformed_x.type(), reserve_space_size);
         workspace_ptr = workspace_tensor.mutable_data(
             ctx.GetPlace(), transformed_x.type(), workspace_size);
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
                 handle, mode_, CUDNN_BATCHNORM_OPS_BN,
                 CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
@@ -264,7 +268,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       }
 #endif
       if (!called) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::cudnnBatchNormalizationForwardTraining(
                 handle, mode_, CudnnDataType<T>::kOne(),
                 CudnnDataType<T>::kZero(), data_desc_,
@@ -292,8 +296,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
           ctx, &transformed_y, y);
     }
     // clean when exit.
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
   }
 };
@@ -516,9 +521,9 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     cudnnTensorDescriptor_t bn_param_desc_;
     cudnnBatchNormMode_t mode_;
 
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
     if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
       LOG(ERROR) << "Provided epsilon is smaller than "
@@ -536,11 +541,12 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     mode_ = CUDNN_BATCHNORM_SPATIAL;
 #endif
 
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         data_desc_, CudnnDataType<T>::type,
         x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_, mode_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
+                                                         data_desc_, mode_));
 
     const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
     const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
@@ -559,76 +565,79 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
         Tensor workspace_tensor;
         auto reserve_space_size = reserve_space->memory_size();
         // --------------- cudnn batchnorm workspace ---------------
-        CUDNN_ENFORCE(platform::dynload::
-                          cudnnGetBatchNormalizationBackwardExWorkspaceSize(
-                              /*handle=*/dev_ctx.cudnn_handle(),
-                              /*mode=*/mode_,
-                              /*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
-                              /*xDesc=*/data_desc_,
-                              /*yDesc=*/data_desc_,
-                              /*dyDesc=*/data_desc_,
-                              /*dzDesc=*/nullptr,
-                              /*dxDesc=*/data_desc_,
-                              /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
-                              /*activationDesc=*/nullptr,
-                              /*sizeInBytes=*/&workspace_size));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::
+                cudnnGetBatchNormalizationBackwardExWorkspaceSize(
+                    /*handle=*/dev_ctx.cudnn_handle(),
+                    /*mode=*/mode_,
+                    /*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
+                    /*xDesc=*/data_desc_,
+                    /*yDesc=*/data_desc_,
+                    /*dyDesc=*/data_desc_,
+                    /*dzDesc=*/nullptr,
+                    /*dxDesc=*/data_desc_,
+                    /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
+                    /*activationDesc=*/nullptr,
+                    /*sizeInBytes=*/&workspace_size));
 
         workspace_ptr = workspace_tensor.mutable_data(
             ctx.GetPlace(), transformed_x.type(), workspace_size);
 
-        CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackwardEx(
-            /*handle=*/dev_ctx.cudnn_handle(),
-            /*mode=*/mode_,
-            /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
-            /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
-            /*betaDataDiff=*/CudnnDataType<T>::kZero(),
-            /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
-            /*betaParamDiff=*/CudnnDataType<T>::kZero(),
-            /*xDesc=*/data_desc_,
-            /*xData=*/transformed_x.template data<T>(),
-            /*yDesc=*/nullptr,
-            /*yData=*/nullptr,
-            /*dyDesc=*/data_desc_,
-            /*dyData=*/transformed_d_y.template data<T>(),
-            /*dzDesc=*/nullptr,
-            /*dzData=*/nullptr,
-            /*dxDesc=*/data_desc_,
-            /*dxData=*/transformed_d_x.template mutable_data<T>(
-                ctx.GetPlace()),
-            /*dBnScaleBiasDesc=*/bn_param_desc_,
-            /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
-            /*bnBiasData=*/nullptr,
-            /*dBnScaleData=*/d_scale
-                ->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-            /*dBnBiasData=*/d_bias
-                ->template mutable_data<BatchNormParamType<T>>(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnBatchNormalizationBackwardEx(
+                /*handle=*/dev_ctx.cudnn_handle(),
+                /*mode=*/mode_,
+                /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
+                /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
+                /*betaDataDiff=*/CudnnDataType<T>::kZero(),
+                /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
+                /*betaParamDiff=*/CudnnDataType<T>::kZero(),
+                /*xDesc=*/data_desc_,
+                /*xData=*/transformed_x.template data<T>(),
+                /*yDesc=*/nullptr,
+                /*yData=*/nullptr,
+                /*dyDesc=*/data_desc_,
+                /*dyData=*/transformed_d_y.template data<T>(),
+                /*dzDesc=*/nullptr,
+                /*dzData=*/nullptr,
+                /*dxDesc=*/data_desc_,
+                /*dxData=*/transformed_d_x.template mutable_data<T>(
                     ctx.GetPlace()),
-            /*epsilon=*/epsilon,
-            /*savedMean=*/saved_mean_data,
-            /*savedInvVariance=*/saved_var_data,
-            /*activationDesc=*/nullptr,
-            /*workspace=*/workspace_ptr,
-            /*workSpaceSizeInBytes=*/workspace_size,
-            /*reserveSpace=*/const_cast<T*>(
-                reserve_space->template data<T>()),
-            /*reserveSpaceSizeInBytes=*/reserve_space_size));
+                /*dBnScaleBiasDesc=*/bn_param_desc_,
+                /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
+                /*bnBiasData=*/nullptr,
+                /*dBnScaleData=*/d_scale
+                    ->template mutable_data<BatchNormParamType<T>>(
+                        ctx.GetPlace()),
+                /*dBnBiasData=*/d_bias
+                    ->template mutable_data<BatchNormParamType<T>>(
+                        ctx.GetPlace()),
+                /*epsilon=*/epsilon,
+                /*savedMean=*/saved_mean_data,
+                /*savedInvVariance=*/saved_var_data,
+                /*activationDesc=*/nullptr,
+                /*workspace=*/workspace_ptr,
+                /*workSpaceSizeInBytes=*/workspace_size,
+                /*reserveSpace=*/const_cast<T*>(
+                    reserve_space->template data<T>()),
+                /*reserveSpaceSizeInBytes=*/reserve_space_size));
       }
 #endif
       if (!called) {
-        CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
-            dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
-            CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
-            CudnnDataType<T>::kZero(), data_desc_,
-            transformed_x.template data<T>(), data_desc_,
-            transformed_d_y.template data<T>(), data_desc_,
-            transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
-            bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
-            d_scale->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
-            d_bias->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
-            epsilon, saved_mean_data, saved_var_data));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnBatchNormalizationBackward(
+                dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
+                CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
+                CudnnDataType<T>::kZero(), data_desc_,
+                transformed_x.template data<T>(), data_desc_,
+                transformed_d_y.template data<T>(), data_desc_,
+                transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
+                bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+                d_scale->template mutable_data<BatchNormParamType<T>>(
+                    ctx.GetPlace()),
+                d_bias->template mutable_data<BatchNormParamType<T>>(
+                    ctx.GetPlace()),
+                epsilon, saved_mean_data, saved_var_data));
       }
 
       if (data_layout == DataLayout::kNHWC &&
@@ -658,9 +667,9 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     }
     // clean when exit.
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
   } else {
     const auto *running_mean = ctx.Input<Tensor>("Mean");
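The batch_norm hunks above are representative of the whole patch: every CUDNN_ENFORCE(...) becomes PADDLE_ENFORCE_CUDA_SUCCESS(...), still wrapping the cudnnStatus_t returned by the dynloaded cuDNN call, but reporting failures through Paddle's enforce machinery. As a rough standalone sketch of what such a status-checking wrapper does (the CHECK_CUDNN macro below is hypothetical, not Paddle's; PADDLE_ENFORCE_CUDA_SUCCESS throws an enforce exception rather than aborting):

// Minimal sketch of a cuDNN status-checking wrapper; only cudnn.h is assumed.
#include <cudnn.h>
#include <cstdio>
#include <cstdlib>

#define CHECK_CUDNN(call)                                        \
  do {                                                           \
    cudnnStatus_t s_ = (call);                                   \
    if (s_ != CUDNN_STATUS_SUCCESS) {                            \
      std::fprintf(stderr, "cuDNN error: %s at %s:%d\n",         \
                   cudnnGetErrorString(s_), __FILE__, __LINE__); \
      std::abort();                                              \
    }                                                            \
  } while (0)

int main() {
  cudnnHandle_t handle;
  CHECK_CUDNN(cudnnCreate(&handle));   // fails with a readable message if no usable GPU
  CHECK_CUDNN(cudnnDestroy(handle));
  return 0;
}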
diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h
index ed4a09aed41e5fd19557bcacc400b68e802b0d2f..0d4a3c56bb2984d193942955af9da80f66e73a01 100644
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -133,12 +133,14 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
     } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_DEFAULT_MATH));
       VLOG(5) << "NOT use cudnn_tensor_op_math";
     }
 #endif
@@ -148,10 +150,11 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
       int perf_count;
       int best_algo_idx = 0;
       std::unique_ptr<perf_t[]> perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]);
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
-          args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(),
-          args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, &perf_count,
-          perf_results.get()));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
+              args.handle, args.idesc.desc(), args.wdesc.desc(),
+              args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS,
+              &perf_count, perf_results.get()));
       algo = (perf_results.get())[best_algo_idx].algo;
       workspace_size = GetWorkspaceSize(args, algo);
@@ -163,17 +166,20 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
                 << workspace_size_limit << ")";
       }
       if (!has_got_workspace_size) {
-        CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-            args.handle, args.idesc.desc(), args.wdesc.desc(),
-            args.cdesc.desc(), args.odesc.desc(),
-            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
-            &algo));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+                args.handle, args.idesc.desc(), args.wdesc.desc(),
+                args.cdesc.desc(), args.odesc.desc(),
+                CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &algo));
       }
 #else
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(),
-          args.odesc.desc(), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              args.handle, args.idesc.desc(), args.wdesc.desc(),
+              args.cdesc.desc(), args.odesc.desc(),
+              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &algo));
 #endif
       VLOG(3) << "choose algo " << algo;
     } else {
@@ -197,7 +203,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
       std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
 
       auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
                 args.handle, args.idesc.desc(), args.x->data<T>(),
                 args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
@@ -223,9 +229,10 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
 
   static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
     size_t workspace_size = 0;
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-        args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(),
-        args.odesc.desc(), algo, &workspace_size));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+            args.handle, args.idesc.desc(), args.wdesc.desc(),
+            args.cdesc.desc(), args.odesc.desc(), algo, &workspace_size));
     return workspace_size;
   }
 };
@@ -249,12 +256,14 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
     } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_DEFAULT_MATH));
       VLOG(5) << "NOT use cudnn_tensor_op_math";
     }
 #endif
@@ -265,7 +274,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
       int best_algo_idx = 0;
       std::unique_ptr<perf_t[]> perf_results(
           new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]);
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
               args.handle, args.wdesc.desc(), args.odesc.desc(),
               args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS,
@@ -294,7 +303,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
                 << workspace_size_limit << ")";
       }
       if (!has_got_workspace_size) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 args.handle, args.wdesc.desc(), args.odesc.desc(),
                 args.cdesc.desc(), args.idesc.desc(),
@@ -302,10 +311,12 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
                 workspace_size_limit, &algo));
       }
 #else
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-          args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(),
-          args.idesc.desc(), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+              args.handle, args.wdesc.desc(), args.odesc.desc(),
+              args.cdesc.desc(), args.idesc.desc(),
+              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &algo));
 #endif
     } else if (deterministic) {
       return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
@@ -330,7 +341,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
       std::array<perf_t, kNUM_CUDNN_BWD_DATA_ALGS> perf_stat;
 
       auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::
                 cudnnFindConvolutionBackwardDataAlgorithmEx(
                     args.handle, args.wdesc.desc(), args.w->data<T>(),
@@ -359,7 +370,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
 
   static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
     size_t workspace_size = 0;
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
             args.handle, args.wdesc.desc(), args.odesc.desc(),
             args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size));
@@ -385,12 +396,14 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
     } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_DEFAULT_MATH));
       VLOG(5) << "NOT use cudnn_tensor_op_math";
     }
 #endif
@@ -403,7 +416,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
       int best_algo_idx = 0;
       std::unique_ptr<perf_t[]> perf_results(
           new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]);
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
               args.handle, args.idesc.desc(), args.odesc.desc(),
               args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS,
@@ -418,7 +431,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
                 << workspace_size_limit << ")";
       }
       if (!has_got_workspace_size) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                 args.handle, args.idesc.desc(), args.odesc.desc(),
                 args.cdesc.desc(), args.wdesc.desc(),
@@ -426,7 +439,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
                 workspace_size_limit, &algo));
       }
 #else
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
               args.handle, args.idesc.desc(), args.odesc.desc(),
               args.cdesc.desc(), args.wdesc.desc(),
@@ -455,7 +468,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
       int returned_algo_count;
       std::array<perf_t, kNUM_CUDNN_BWD_FILTER_ALGS> perf_stat;
       auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::
                 cudnnFindConvolutionBackwardFilterAlgorithmEx(
                     args.handle, args.idesc.desc(), args.x->data<T>(),
@@ -483,7 +496,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
 
   static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
     size_t workspace_size = 0;
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
             args.handle, args.idesc.desc(), args.odesc.desc(),
             args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size));
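The SearchAlgorithm hunks above keep two selection paths: the cuDNN 7 *_v7 queries, which return a list of candidates ordered fastest-first, and the older single-algorithm query under a workspace limit. A hedged sketch of the v7 pattern, as a standalone helper over already-configured descriptors (not Paddle's SearchAlgorithm itself):

// Sketch: pick a forward algorithm via cudnnGetConvolutionForwardAlgorithm_v7,
// keeping the first candidate whose workspace fits `limit`. Descriptors are
// assumed created and configured by the caller; error checks are elided.
#include <cudnn.h>
#include <vector>

cudnnConvolutionFwdAlgo_t PickFwdAlgo(cudnnHandle_t handle,
                                      cudnnTensorDescriptor_t x,
                                      cudnnFilterDescriptor_t w,
                                      cudnnConvolutionDescriptor_t conv,
                                      cudnnTensorDescriptor_t y,
                                      size_t limit) {
  int returned = 0;
  std::vector<cudnnConvolutionFwdAlgoPerf_t> perf(CUDNN_CONVOLUTION_FWD_ALGO_COUNT);
  cudnnGetConvolutionForwardAlgorithm_v7(handle, x, w, conv, y,
                                         static_cast<int>(perf.size()),
                                         &returned, perf.data());
  for (int i = 0; i < returned; ++i) {
    if (perf[i].status == CUDNN_STATUS_SUCCESS && perf[i].memory <= limit) {
      return perf[i].algo;  // candidates come back sorted fastest-first
    }
  }
  return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;  // low-workspace fallback
}

Picking the first candidate whose reported workspace fits the limit mirrors the has_got_workspace_size fallback logic in the hunks above.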
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu
index f6399ee0d3ef0180cca801debe5ed1a251294c85..ad11906f3f1439287ef986897f842181a307dd68 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_cudnn_op.cu
@@ -237,8 +237,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // cudnn 7 can support groups, no need to do it manually
     // FIXME(typhoonzero): find a better way to disable groups
     // rather than setting it to 1.
-    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
-        args.cdesc.desc(), groups));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(),
+                                                         groups));
     groups = 1;
 #endif
     args.idesc.set(transformed_input, layout_format);
@@ -276,12 +277,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     for (int i = 0; i < groups; i++) {
       workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
-            CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-                handle, &alpha, args.idesc.desc(),
-                input_data + i * group_offset_in, args.wdesc.desc(),
-                filter_data + i * group_offset_filter, args.cdesc.desc(), algo,
-                workspace_ptr, workspace_size, &beta, args.odesc.desc(),
-                output_data + i * group_offset_out));
+            PADDLE_ENFORCE_CUDA_SUCCESS(
+                platform::dynload::cudnnConvolutionForward(
+                    handle, &alpha, args.idesc.desc(),
+                    input_data + i * group_offset_in, args.wdesc.desc(),
+                    filter_data + i * group_offset_filter, args.cdesc.desc(),
+                    algo, workspace_ptr, workspace_size, &beta,
+                    args.odesc.desc(), output_data + i * group_offset_out));
           },
           workspace_size);
     }
@@ -596,13 +598,14 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(
             [&](void* cudnn_workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-                  handle, &alpha, args1.wdesc.desc(),
-                  filter_data + i * group_offset_filter, args1.odesc.desc(),
-                  output_grad_data + i * group_offset_out, args1.cdesc.desc(),
-                  data_algo, cudnn_workspace_ptr, workspace_size, &beta,
-                  args1.idesc.desc(),
-                  transformed_input_grad_data + i * group_offset_in));
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionBackwardData(
+                      handle, &alpha, args1.wdesc.desc(),
+                      filter_data + i * group_offset_filter, args1.odesc.desc(),
+                      output_grad_data + i * group_offset_out,
+                      args1.cdesc.desc(), data_algo, cudnn_workspace_ptr,
+                      workspace_size, &beta, args1.idesc.desc(),
+                      transformed_input_grad_data + i * group_offset_in));
             },
             workspace_size);
       }
@@ -639,13 +642,14 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(
             [&](void* cudnn_workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-                  handle, &alpha, args2.idesc.desc(),
-                  input_data + i * group_offset_in, args2.odesc.desc(),
-                  output_grad_data + i * group_offset_out, args2.cdesc.desc(),
-                  filter_algo, cudnn_workspace_ptr, workspace_size, &beta,
-                  args2.wdesc.desc(),
-                  filter_grad_data + i * group_offset_filter));
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionBackwardFilter(
+                      handle, &alpha, args2.idesc.desc(),
+                      input_data + i * group_offset_in, args2.odesc.desc(),
+                      output_grad_data + i * group_offset_out,
+                      args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
+                      workspace_size, &beta, args2.wdesc.desc(),
+                      filter_grad_data + i * group_offset_filter));
             },
             workspace_size);
       }
@@ -993,12 +997,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
       for (int i = 0; i < groups; i++) {
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-                  handle, &alpha, args1.idesc.desc(),
-                  ddx + i * group_offset_in, args1.wdesc.desc(),
-                  w + i * group_offset_filter, args1.cdesc.desc(), fwd_algo1,
-                  workspace_ptr, workspace_size, &beta, args1.odesc.desc(),
-                  transformed_ddy_channel + i * group_offset_out));
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionForward(
+                      handle, &alpha, args1.idesc.desc(),
+                      ddx + i * group_offset_in, args1.wdesc.desc(),
+                      w + i * group_offset_filter, args1.cdesc.desc(),
+                      fwd_algo1, workspace_ptr, workspace_size, &beta,
+                      args1.odesc.desc(),
+                      transformed_ddy_channel + i * group_offset_out));
             },
             workspace_size);
       }
@@ -1007,12 +1013,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
       for (int i = 0; i < groups; i++) {
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-                  handle, &alpha, args2.idesc.desc(), x + i * group_offset_in,
-                  args2.wdesc.desc(), ddw + i * group_offset_filter,
-                  args2.cdesc.desc(), fwd_algo2, workspace_ptr,
-                  workspace_size, &alpha, args2.odesc.desc(),
-                  transformed_ddy_channel + i * group_offset_out));
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionForward(
+                      handle, &alpha, args2.idesc.desc(),
+                      x + i * group_offset_in, args2.wdesc.desc(),
+                      ddw + i * group_offset_filter, args2.cdesc.desc(),
+                      fwd_algo2, workspace_ptr, workspace_size, &alpha,
+                      args2.odesc.desc(),
+                      transformed_ddy_channel + i * group_offset_out));
             },
             workspace_size);
       }
@@ -1028,13 +1036,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
       for (int i = 0; i < groups; i++) {
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-                  handle, &alpha, args3.idesc.desc(), ddx + i * group_offset_in,
-                  args3.odesc.desc(),
-                  transformed_dy_channel + i * group_offset_out,
-                  args3.cdesc.desc(), filter_algo, workspace_ptr,
-                  workspace_size, &beta, args3.wdesc.desc(),
-                  dw + i * group_offset_filter));
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionBackwardFilter(
+                      handle, &alpha, args3.idesc.desc(),
+                      ddx + i * group_offset_in, args3.odesc.desc(),
+                      transformed_dy_channel + i * group_offset_out,
+                      args3.cdesc.desc(), filter_algo, workspace_ptr,
+                      workspace_size, &beta, args3.wdesc.desc(),
+                      dw + i * group_offset_filter));
             },
             workspace_size);
       }
@@ -1045,13 +1054,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
       for (int i = 0; i < groups; i++) {
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-                  handle, &alpha, args4.wdesc.desc(),
-                  ddw + i * group_offset_filter, args4.odesc.desc(),
-                  transformed_dy_channel + i * group_offset_out,
-                  args4.cdesc.desc(), data_algo, workspace_ptr, workspace_size,
-                  &beta, args4.idesc.desc(),
-                  transformed_dx + i * group_offset_in));
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionBackwardData(
+                      handle, &alpha, args4.wdesc.desc(),
+                      ddw + i * group_offset_filter, args4.odesc.desc(),
+                      transformed_dy_channel + i * group_offset_out,
+                      args4.cdesc.desc(), data_algo, workspace_ptr,
+                      workspace_size, &beta, args4.idesc.desc(),
+                      transformed_dx + i * group_offset_in));
             },
             workspace_size);
       }
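In the conv kernels above, the pre-cuDNN 7 path loops over groups and offsets the input, output, and filter pointers per group (group_offset_in and friends), whereas cuDNN 7 handles grouping through cudnnSetConvolutionGroupCount. The offsets are plain NCHW strides; a small sketch of that arithmetic, with illustrative names not taken from the patch:

// Sketch: per-group pointer offsets for NCHW buffers when emulating grouped
// convolution with `groups` separate cuDNN calls (pre-cuDNN 7 path).
#include <cstddef>

struct GroupOffsets {
  std::ptrdiff_t in, out, filter;
};

GroupOffsets ComputeGroupOffsets(int ic, int oc, int ih, int iw,
                                 int oh, int ow, int kh, int kw, int groups) {
  GroupOffsets g;
  g.in = static_cast<std::ptrdiff_t>(ic / groups) * ih * iw;   // input channels per group
  g.out = static_cast<std::ptrdiff_t>(oc / groups) * oh * ow;  // output channels per group
  g.filter = static_cast<std::ptrdiff_t>(oc / groups) * (ic / groups) * kh * kw;
  return g;
}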
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
index 6294b16a5458d4a87084df21a3b1835d0f95af53..b827f200c0ef80625604d9bbd15a432307cff984 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
@@ -236,19 +236,21 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     // Get the algorithm
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-        handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
-        // dxDesc: Handle to the previously initialized output tensor
-        // descriptor.
-        cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-        workspace_size_limit, &algo));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+            handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
+            // dxDesc: Handle to the previously initialized output tensor
+            // descriptor.
+            cudnn_output_desc,
+            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+            workspace_size_limit, &algo));
 
     if (algo == 0 && FLAGS_cudnn_deterministic) {
       algo = static_cast<cudnnConvolutionBwdDataAlgo_t>(1);
     }
 
     // get workspace size able to allocate
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
             handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
             cudnn_output_desc, algo, &workspace_size_in_bytes));
@@ -263,11 +265,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     for (int g = 0; g < groups; g++) {
       auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-            handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
-            cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
-            algo, cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_output_desc, transformed_output_data + output_offset * g));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnConvolutionBackwardData(
+                handle, &alpha, cudnn_filter_desc,
+                filter_data + filter_offset * g, cudnn_input_desc,
+                input_data + input_offset * g, cudnn_conv_desc, algo,
+                cudnn_workspace, workspace_size_in_bytes, &beta,
+                cudnn_output_desc,
+                transformed_output_data + output_offset * g));
       };
       workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
@@ -466,19 +471,21 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
       // choose backward algorithm for data
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &data_algo));
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_input_desc, data_algo, &fwd_ws_size));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
+              cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &data_algo));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+              handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
+              cudnn_input_desc, data_algo, &fwd_ws_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
     }
 
     if (filter_grad) {
       // choose backward algorithm for filter
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
               handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
               cudnn_filter_desc,
@@ -486,7 +493,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
               workspace_size_limit, &filter_algo));
 
       // get workspace for backwards filter algorithm
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
               handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
               cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
@@ -507,12 +514,13 @@
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int g = 0; g < groups; g++) {
         auto cudnn_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-              handle, &alpha, cudnn_output_desc,
-              output_grad_data + output_grad_offset * g, cudnn_filter_desc,
-              filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
-              cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-              input_grad_data + input_offset * g));
+          PADDLE_ENFORCE_CUDA_SUCCESS(
+              platform::dynload::cudnnConvolutionForward(
+                  handle, &alpha, cudnn_output_desc,
+                  output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+                  filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+                  cudnn_workspace, workspace_size_in_bytes, &beta,
+                  cudnn_input_desc, input_grad_data + input_offset * g));
         };
         workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
       }
@@ -543,12 +551,13 @@
       // Gradient with respect to the filter
       for (int g = 0; g < groups; g++) {
         auto cudnn_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-              handle, &alpha, cudnn_output_desc,
-              output_grad_data + output_grad_offset * g, cudnn_input_desc,
-              input_data + input_offset * g, cudnn_conv_desc, filter_algo,
-              cudnn_workspace, workspace_size_in_bytes, &beta,
-              cudnn_filter_desc, filter_grad_data + filter_offset * g));
+          PADDLE_ENFORCE_CUDA_SUCCESS(
+              platform::dynload::cudnnConvolutionBackwardFilter(
+                  handle, &alpha, cudnn_output_desc,
+                  output_grad_data + output_grad_offset * g, cudnn_input_desc,
+                  input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+                  cudnn_workspace, workspace_size_in_bytes, &beta,
+                  cudnn_filter_desc, filter_grad_data + filter_offset * g));
         };
         workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
       }
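A point worth noting in the transposed-convolution kernels above: the "forward" pass is computed with cudnnConvolutionBackwardData (and the gradients with cudnnConvolutionForward / cudnnConvolutionBackwardFilter), and a single workspace is sized to the maximum of the individual requirements. A sketch of that sizing step, under assumed, caller-configured descriptors:

// Sketch: size one workspace for several cuDNN calls by taking the max of
// the individual requirements, mirroring the std::max(...) pattern above.
#include <cudnn.h>
#include <algorithm>

size_t MaxWorkspace(cudnnHandle_t h, cudnnFilterDescriptor_t w,
                    cudnnTensorDescriptor_t dy, cudnnConvolutionDescriptor_t c,
                    cudnnTensorDescriptor_t dx,
                    cudnnConvolutionBwdDataAlgo_t algo,
                    size_t other_requirement) {
  size_t ws = 0;
  cudnnGetConvolutionBackwardDataWorkspaceSize(h, w, dy, c, dx, algo, &ws);
  return std::max(ws, other_requirement);  // one buffer serves both calls
}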
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index f0f9e8842d4c3d58d20455e1ff2377069bf5dbfc..579dddee8e82183b778f03595bb4657002262073 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -91,7 +91,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
 
     if (is_test) {
       // for inference
-      CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardInference(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
           handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
           cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
           init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
@@ -101,7 +101,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
           cudnn_rnn_cache->workspace_size_));
     } else {
       // for train
-      CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardTraining(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
           handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
           cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
           init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
@@ -230,7 +230,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     auto run_seq_len = input_dims[0];
     PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_,
                       "cudnn running seq_len CAN not greater max_lengh");
-    CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardData(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
         handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
         cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_,
         out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data,
@@ -242,7 +242,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
         cudnn_rnn_cache->workspace_size_, reserve_data,
         cudnn_rnn_cache->reserve_size_));
 
-    CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardWeights(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights(
         handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
         cudnn_rnn_cache->x_desc_, input->data<T>(), cudnn_rnn_cache->hx_desc_,
         init_h->data<T>(), cudnn_rnn_cache->y_desc_, out->data<T>(),
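For the LSTM kernels above, cuDNN distinguishes per-call scratch workspace from the reserve space that must persist from forward training to the backward calls (reserve_data is produced by the forward pass and consumed by cudnnRNNBackwardData above). A sketch of the two size queries, assuming descriptors already configured by the caller:

// Sketch: query RNN scratch vs. reserve sizes. `workspace` is per-call
// scratch; `reserve` must outlive the forward pass when training.
#include <cudnn.h>

void QueryRnnSizes(cudnnHandle_t h, cudnnRNNDescriptor_t rnn, int seq_len,
                   const cudnnTensorDescriptor_t* x_descs,
                   size_t* workspace, size_t* reserve) {
  cudnnGetRNNWorkspaceSize(h, rnn, seq_len, x_descs, workspace);
  cudnnGetRNNTrainingReserveSize(h, rnn, seq_len, x_descs, reserve);
}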
diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h
index 7f18b839271a29523cc06c999c28cc0394717397..cd33338abc6223a0ae122cbb60f040562b48a761 100644
--- a/paddle/fluid/operators/cudnn_rnn_cache.h
+++ b/paddle/fluid/operators/cudnn_rnn_cache.h
@@ -92,13 +92,13 @@ struct CudnnRNNCache {
     int stride_a[3];
 
     for (size_t i = 0; i < max_length_; ++i) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i]));
       dim_a[0] = batch_size_;
       dim_a[1] = input_size_;
@@ -107,9 +107,9 @@ struct CudnnRNNCache {
       stride_a[0] = dim_a[2] * dim_a[1];
       stride_a[1] = dim_a[2];
       stride_a[2] = 1;
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
 
       dim_a[0] = batch_size_;
@@ -120,9 +120,9 @@ struct CudnnRNNCache {
       stride_a[1] = dim_a[2];
       stride_a[2] = 1;
 
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
     }
 
@@ -134,63 +134,74 @@ struct CudnnRNNCache {
     stride_a[1] = dim_a[2];
     stride_a[2] = 1;
 
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
-
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
 
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_));
 
     size_t state_size;
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size);
-        dropout_state_.Resize({static_cast<int64_t>(state_size)}));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
+    dropout_state_.Resize({static_cast<int64_t>(state_size)});
     auto *dropout_state_data = dropout_state_.mutable_data<uint8_t>(place);
-    CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor(
         dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
         seed_));
 
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
 
 #if CUDNN_VERSION >= 6000
-    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6(
         handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
         CUDNN_LINEAR_INPUT,
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
         CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
 #else
-    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor(
         rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
         CUDNN_DATA_FLOAT));
 #endif
 
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
 
-    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize(
         handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT));
 
     PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel,
@@ -199,15 +210,16 @@ struct CudnnRNNCache {
     dim_w[0] = weights_size_ / sizeof(float);
     dim_w[1] = 1;
     dim_w[2] = 1;
-    CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
         w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
         dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
 
-    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize(
         handle, rnn_desc_, max_length_, x_desc_, &workspace_size_));
-    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize(
-        handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetRNNTrainingReserveSize(
+            handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
 
     reserve_data_.Resize({static_cast<int64_t>(reserve_size_)});
     reserve_data_.mutable_data<uint8_t>(place);
@@ -218,13 +230,13 @@ struct CudnnRNNCache {
   void release() {
     for (size_t i = 0; i < max_length_; ++i) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i]));
     }
 
     delete[] x_desc_;
     delete[] y_desc_;
     delete[] dx_desc_;
     delete[] dy_desc_;
 
@@ -233,21 +245,32 @@ struct CudnnRNNCache {
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_));
-
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_));
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_));
 
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyFilterDescriptor(w_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_));
  }
 };
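One hunk in cudnn_rnn_cache.h above is more than a rename: the old code had a misplaced parenthesis that made dropout_state_.Resize(...) part of the CUDNN_ENFORCE argument list, and the new code closes the check right after cudnnDropoutGetStatesSize and resizes separately. The intended order, as a standalone sketch (the states buffer must be device memory; freeing it is elided here):

// Sketch of the corrected dropout-descriptor setup order: query the state
// size, allocate that many bytes on the device, then initialize the
// descriptor with the buffer.
#include <cuda_runtime.h>
#include <cudnn.h>

void SetupDropout(cudnnHandle_t h, cudnnDropoutDescriptor_t desc,
                  float prob, unsigned long long seed) {
  size_t state_size = 0;
  cudnnDropoutGetStatesSize(h, &state_size);  // 1) bytes cuDNN needs
  void* states = nullptr;
  cudaMalloc(&states, state_size);            // 2) device allocation
  cudnnSetDropoutDescriptor(desc, h, prob, states, state_size, seed);  // 3)
}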
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu
index 17edccb4ea9eee27481fa3488b2094bacabd77ed..905e19f0d8f7dd602268c59c4417c0260ab53ce1 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -164,8 +164,9 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(padding_common, strides, dilations);
-    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
-        cudnn_conv_desc, groups));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnSetConvolutionGroupCount(cudnn_conv_desc,
+                                                         groups));
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize<int>(transformed_input.dims()));
@@ -196,16 +197,17 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     auto handle = dev_ctx.cudnn_handle();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
 
-    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
         cudnn_conv_desc, CUDNN_DEFAULT_MATH));
 
     auto x_dims = framework::vectorize(transformed_input.dims());
     auto f_dims = framework::vectorize(filter->dims());
     if (!exhaustive_search) {
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+              cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &algo));
       VLOG(3) << "cuDNN forward algo " << algo;
     } else {
       auto search_func = [&]() {
@@ -213,7 +215,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
         std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
             fwd_perf_stat;
         auto cudnn_find_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(
+          PADDLE_ENFORCE_CUDA_SUCCESS(
              platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
                   handle, cudnn_input_desc, input_data, cudnn_filter_desc,
                   filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
@@ -248,9 +250,10 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       VLOG(3) << "choose algo " << algo;
     }
 
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-        cudnn_output_desc, algo, &workspace_size_in_bytes));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+            handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+            cudnn_output_desc, algo, &workspace_size_in_bytes));
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
@@ -262,13 +265,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       // ------------- cudnn conv forward and bias add ---------------------
       ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
       auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward(
             handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
             filter_data, cudnn_conv_desc, algo, cudnn_workspace,
             workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
       };
       workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
-      CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnAddTensor(
           handle, &alpha, cudnn_bias_desc, bias_data, &alpha,
           cudnn_output_desc, output_data));
     } else {
@@ -279,12 +282,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       ScalingParamType<T> alpha1 = 1.0f;
       ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
       auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-            handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
-            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-            workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
-            cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
-            output_data));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnConvolutionBiasActivationForward(
+                handle, &alpha1, cudnn_input_desc, input_data,
+                cudnn_filter_desc, filter_data, cudnn_conv_desc, algo,
+                cudnn_workspace, workspace_size_in_bytes, &alpha2,
+                cudnn_output_desc, residual_data, cudnn_bias_desc, bias_data,
+                cudnn_act_desc, cudnn_output_desc, output_data));
       };
       workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
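The fused kernel above centers on cudnnConvolutionBiasActivationForward, which computes roughly y = act(alpha1 * conv(x, w) + alpha2 * z + bias); setting alpha2 to residual ? 1.0f : 0.0f is what toggles the residual addition. A call-shape sketch with the operands named (a thin, hypothetical wrapper whose only purpose is to document argument order):

// Sketch: argument order of cudnnConvolutionBiasActivationForward.
// y = act(alpha1 * conv(x, w) + alpha2 * z + bias); all descriptors and
// buffers are assumed prepared by the caller.
#include <cudnn.h>

cudnnStatus_t FusedConvBiasAct(
    cudnnHandle_t h, const float* alpha1, cudnnTensorDescriptor_t x_desc,
    const void* x, cudnnFilterDescriptor_t w_desc, const void* w,
    cudnnConvolutionDescriptor_t conv, cudnnConvolutionFwdAlgo_t algo,
    void* ws, size_t ws_bytes, const float* alpha2,
    cudnnTensorDescriptor_t z_desc, const void* z,
    cudnnTensorDescriptor_t b_desc, const void* bias,
    cudnnActivationDescriptor_t act, cudnnTensorDescriptor_t y_desc, void* y) {
  return cudnnConvolutionBiasActivationForward(h, alpha1, x_desc, x, w_desc, w,
                                               conv, algo, ws, ws_bytes, alpha2,
                                               z_desc, z, b_desc, bias, act,
                                               y_desc, y);
}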
diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
index 63e97ab5d98cdb906bab1aaf759de00d16455729..9d6b9665f85ab528767ae8f65356d7f3901ad274 100644
--- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
@@ -95,15 +95,15 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionDescriptor_t* conv_desc =
         new cudnnConvolutionDescriptor_t[4];
     for (int i = 0; i < 4; ++i) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i]));
     }
@@ -127,11 +127,11 @@
     for (int i = 0; i < 4; ++i) {
       filter_dims.push_back(framework::vectorize(filters[i]->dims()));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
          filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data()));
       bias_dims.push_back({1, filter_dims[i][0], 1, 1});
       bias_strides.push_back({filter_dims[i][0], 1, 1, 1});
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(),
          bias_strides[i].data()));
       in_dims.push_back({n, filter_dims[i][1], h, w});
@@ -140,22 +140,25 @@
       out_strides.push_back({oc * h * w, h * w, w, 1});
 
       if (i < 2) {
-        CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
-            conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(),
-            CUDNN_CROSS_CORRELATION, compute_type));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnSetConvolutionNdDescriptor(
+                conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(),
+                CUDNN_CROSS_CORRELATION, compute_type));
       } else {
-        CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
-            conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(),
-            CUDNN_CROSS_CORRELATION, compute_type));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnSetConvolutionNdDescriptor(
+                conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(),
+                CUDNN_CROSS_CORRELATION, compute_type));
       }
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          conv_desc[i], CUDNN_DEFAULT_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(conv_desc[i],
+                                                         CUDNN_DEFAULT_MATH));
     }
     in_dims[2][1] *= 2;
     in_strides[2][0] = oc * h * w;
     out_strides[2][0] = filter_dims[2][0] * h * w;  // this out is continuous.
     in_strides[3][0] = filter_dims[2][0] * h * w;
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2));
 
     cudnnConvolutionFwdAlgo_t algo[4];
@@ -171,19 +174,21 @@
     for (int i = 0; i < 4; ++i) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data()));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          out_desc[i], cudnn_dtype, 4, out_dims[i].data(),
          out_strides[i].data()));
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
-          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
-          &algo[i]));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
+              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &algo[i]));
       size_t tmp_size = 0;
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-          handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
-          algo[i], &tmp_size));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+              handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
+              algo[i], &tmp_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
     }
     cudnnActivationDescriptor_t cudnn_act_desc =
@@ -196,7 +201,7 @@
 
     // branch1: pool + 1x1 conv
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingForward(
         handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
         pool_out_desc, temp_data));
 
@@ -218,13 +223,14 @@
     for (int i = 0; i < 4; ++i) {
       auto func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-            handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
-            static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
-            algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
-            out_desc[i], out_datas[i], bias_desc[i],
-            static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
-            out_desc[i], out_datas[i]));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnConvolutionBiasActivationForward(
+                handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
+                static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
+                algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
+                out_desc[i], out_datas[i], bias_desc[i],
+                static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
+                out_desc[i], out_datas[i]));
       };
       auto workspace_handle = dev_ctx.cudnn_workspace_handle();
       workspace_handle.RunFunc(func, workspace_size_in_bytes);
@@ -232,31 +238,35 @@
     cudnnTensorDescriptor_t x_desc;
     cudnnTensorDescriptor_t y_desc;
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&x_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&y_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&x_desc));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&y_desc));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data()));
 
-    CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
        handle, CudnnDataType<T>::kOne(), x_desc,
        static_cast<const void*>(out_datas[2]), CudnnDataType<T>::kZero(),
        y_desc, static_cast<void*>(output_data + (oc0 + oc1) * h * w)));
 
     for (int i = 0; i < 4; ++i) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i]));
     }
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(x_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(y_desc));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(x_desc));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(y_desc));
   }
 };
 #endif
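The final cudnnTransformTensor in the inception kernel above copies between two descriptors with equal dims but different strides, which is how the concatenated branch output is written in place. A sketch of such a strided device copy (hypothetical shapes, error checks elided):

// Sketch: cudnnTransformTensor as a strided device copy. Two Nd descriptors
// with equal dims but different strides describe source and destination views.
#include <cudnn.h>

void StridedCopy(cudnnHandle_t h, const float* src, float* dst,
                 const int dims[4], const int src_strides[4],
                 const int dst_strides[4]) {
  cudnnTensorDescriptor_t s, d;
  cudnnCreateTensorDescriptor(&s);
  cudnnCreateTensorDescriptor(&d);
  cudnnSetTensorNdDescriptor(s, CUDNN_DATA_FLOAT, 4, dims, src_strides);
  cudnnSetTensorNdDescriptor(d, CUDNN_DATA_FLOAT, 4, dims, dst_strides);
  const float one = 1.0f, zero = 0.0f;
  cudnnTransformTensor(h, &one, s, src, &zero, d, dst);  // dst = 1*src + 0*dst
  cudnnDestroyTensorDescriptor(s);
  cudnnDestroyTensorDescriptor(d);
}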
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
@@ -45,8 +45,10 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
 
     cudnnTensorDescriptor_t in_desc;
     cudnnTensorDescriptor_t out_desc;
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
     cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
 
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
@@ -85,12 +87,12 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
         dims_y[i] = 1;
       }
 
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
-      CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
          handle, CudnnDataType<T>::kOne(), in_desc,
          static_cast<const void*>(ins[k]->data<T>()),
          CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
@@ -101,8 +103,10 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
         odata += flat_shape[1];
       }
     }
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
   }
 };
 
diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
index c765d344d0be88b56a9d7b5b9ac1d572d2aa5c24..92f235c1d204719dc72631f79ecbf6cad8e357c1 100644
--- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
@@ -59,7 +59,7 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
         DataLayout::kNCHW, framework::vectorize<int>(output->dims()));
 
-    CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerForward(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward(
         handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
         input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
         output_data));
@@ -111,12 +111,13 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
         output_grad_desc.descriptor<T>(
             DataLayout::kNCHW, framework::vectorize<int>(output_grad->dims()));
 
-    CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerBackward(
-        handle, cudnn_st_dest, CudnnDataType<T>::kOne(), cudnn_input_desc,
-        input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
-        input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
-        output_grad_data, grid_data, CudnnDataType<T>::kZero(),
-        grid_grad_data));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnSpatialTfSamplerBackward(
+            handle, cudnn_st_dest, CudnnDataType<T>::kOne(), cudnn_input_desc,
+            input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
+            input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
+            output_grad_data, grid_data, CudnnDataType<T>::kZero(),
+            grid_grad_data));
   }
 };
 
diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu
index 4c04f6c315b1319b80b7db10b58a746f0491eeae..86b1cef5e371b594401eb8a6ae19afcff4fef9a2 100644
--- a/paddle/fluid/operators/instance_norm_op.cu
+++ b/paddle/fluid/operators/instance_norm_op.cu
@@ -94,8 +94,9 @@ class InstanceNormKernel<platform::CUDADeviceContext, T>
     cudnnTensorDescriptor_t data_desc_;
     cudnnTensorDescriptor_t in_param_desc_;
 
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
 
     if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
@@ -113,11 +114,12 @@ class InstanceNormKernel<platform::CUDADeviceContext, T>
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
 
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         data_desc_, CudnnDataType<T>::type,
         x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDeriveBNTensorDescriptor(
+            in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
 
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *bias = ctx.Input<Tensor>("Bias");
@@ -152,19 +154,22 @@ class InstanceNormKernel<platform::CUDADeviceContext, T>
     functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
     functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
 
-    CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
-        handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
-        CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
-        data_desc_, y->template mutable_data<T>(ctx.GetPlace()), in_param_desc_,
-        scale_tmp.template data<BatchNormParamType<T>>(),
-        bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr, nullptr,
-        epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
-                     ctx.GetPlace()),
-        saved_variance->template mutable_data<BatchNormParamType<T>>(
-            ctx.GetPlace())));
-
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnBatchNormalizationForwardTraining(
+            handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
+            CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
+            data_desc_, y->template mutable_data<T>(ctx.GetPlace()),
+            in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
+            bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr,
+            nullptr, epsilon,
+            saved_mean->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            saved_variance->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace())));
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
   }
 };
@@ -291,8 +296,9 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
     cudnnTensorDescriptor_t data_desc_;
     cudnnTensorDescriptor_t in_param_desc_;
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
 
     if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
       LOG(ERROR) << "Provided epsilon is smaller than "
@@ -301,11 +307,12 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
     }
     epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         data_desc_, CudnnDataType<T>::type,
         x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDeriveBNTensorDescriptor(
+            in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
 
     const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
     const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
@@ -314,18 +321,19 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
     const auto *saved_var_data =
         saved_var->template data<BatchNormParamType<T>>();
     if (d_scale && d_bias) {
-      CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
-          dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL,
-          CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
-          CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
-          x_tmp.template data<T>(), data_desc_, d_y_tmp.template data<T>(),
-          data_desc_, d_x->template mutable_data<T>(ctx.GetPlace()),
-          in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
-          d_scale_tmp.template mutable_data<BatchNormParamType<T>>(
-              ctx.GetPlace()),
-          d_bias_tmp.template mutable_data<BatchNormParamType<T>>(
-              ctx.GetPlace()),
-          epsilon, saved_mean_data, saved_var_data));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnBatchNormalizationBackward(
+              dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL,
+              CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
+              CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
+              x_tmp.template data<T>(), data_desc_, d_y_tmp.template data<T>(),
+              data_desc_, d_x->template mutable_data<T>(ctx.GetPlace()),
+              in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
+              d_scale_tmp.template mutable_data<BatchNormParamType<T>>(
+                  ctx.GetPlace()),
+              d_bias_tmp.template mutable_data<BatchNormParamType<T>>(
+                  ctx.GetPlace()),
+              epsilon, saved_mean_data, saved_var_data));
     } else {
       if (d_x) {
         GradComputeDX<T, block><<<NxC, block, 0, dev_ctx.stream()>>>(
@@ -342,8 +350,9 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
           d_bias_tmp.data<T>(), d_bias->data<T>(), N, C);
     }
 
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
   }
 };
diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index 1c0970c0aa4692bcc51bd69b025b82ecff5bec65..742dc7f4449e2cecb771380dceb92b4006f93693 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -49,7 +49,7 @@ void SoftmaxCUDNNFunctor<T>::operator()(
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
   cudnnTensorDescriptor_t cudnn_y_desc =
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxForward(
+  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward(
       context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
       CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_x_desc,
       X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
@@ -80,7 +80,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
       dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
   cudnnTensorDescriptor_t cudnn_ygrad_desc =
       dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxBackward(
+  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward(
       context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
       CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_y_desc,
       Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index a634c328d9b8ada3e647e3b5be178439d3a09435..9317a01833307af6be88692ccfdae362d943c56f 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -156,7 +156,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
     auto handle = ctx.cuda_device_context().cudnn_handle();
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
 
-    CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingForward(
         handle, cudnn_pool_desc, &alpha, cudnn_input_desc,
         tranformed_input_data, &beta, cudnn_output_desc,
         tranformed_output_data));
@@ -312,7 +312,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
       T *input_grad_data = transformed_input_grad.mutable_data<T>(
           transformed_input_grad.dims(), ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
-      CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingBackward(
          handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
          cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data,
          &beta, cudnn_input_desc, input_grad_data));
diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h
index 4e0185183eb801d9370c22a4cf6e6487efcb097f..e0ba1aaa6bd8b8b29b0d02259d91b9bb9470e35f 100644
--- a/paddle/fluid/platform/cudnn_desc.h
+++ b/paddle/fluid/platform/cudnn_desc.h
@@ -83,19 +83,21 @@ class ActivationDescriptor {
   struct Deleter {
     void operator()(T* t) {
       if (t != nullptr) {
-        CUDNN_ENFORCE(dynload::cudnnDestroyActivationDescriptor(t));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            dynload::cudnnDestroyActivationDescriptor(t));
         t = nullptr;
       }
     }
   };
   ActivationDescriptor() {
     T* raw_ptr;
-    CUDNN_ENFORCE(dynload::cudnnCreateActivationDescriptor(&raw_ptr));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnCreateActivationDescriptor(&raw_ptr));
     desc_.reset(raw_ptr);
   }
   template <typename T>
   void set(cudnnActivationMode_t mode, const T& coef) {
-    CUDNN_ENFORCE(dynload::cudnnSetActivationDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor(
         desc_.get(), mode, CUDNN_NOT_PROPAGATE_NAN, static_cast<double>(coef)));
   }
@@ -112,14 +114,14 @@ class TensorDescriptor {
   struct Deleter {
     void operator()(T* t) {
       if (t != nullptr) {
-        CUDNN_ENFORCE(dynload::cudnnDestroyTensorDescriptor(t));
+        PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(t));
         t = nullptr;
       }
     }
   };
   TensorDescriptor() {
     T* raw_ptr;
-    CUDNN_ENFORCE(dynload::cudnnCreateTensorDescriptor(&raw_ptr));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&raw_ptr));
     desc_.reset(raw_ptr);
   }
   T* desc() { return desc_.get(); }
@@ -135,7 +137,7 @@ class TensorDescriptor {
     if (groups > 1) {
       dims_with_group[1] = dims_with_group[1] / groups;
     }
-    CUDNN_ENFORCE(dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor(
         desc_.get(), ToCudnnDataType(tensor.type()), dims_with_group.size(),
         dims_with_group.data(), strides.data()));
   }
@@ -148,7 +150,7 @@ class TensorDescriptor {
     } else {
       transformed_dims = dims;
     }
-    CUDNN_ENFORCE(dynload::cudnnSetTensorNdDescriptorEx(
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx(
         desc_.get(), format, ToCudnnDataType(tensor.type()),
         transformed_dims.size(), transformed_dims.data()));
   }
@@ -163,14 +165,14 @@ class FilterDescriptor {
   struct Deleter {
     void operator()(T* t) {
      if (t != nullptr) {
-        CUDNN_ENFORCE(dynload::cudnnDestroyFilterDescriptor(t));
+        PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(t));
         t = nullptr;
       }
     }
   };
   FilterDescriptor() {
     T* raw_ptr;
-    CUDNN_ENFORCE(dynload::cudnnCreateFilterDescriptor(&raw_ptr));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&raw_ptr));
     desc_.reset(raw_ptr);
   }
   T* desc() { return desc_.get(); }
@@ -188,7 +190,7 @@ class FilterDescriptor {
     if (groups > 1) {
       transformed_dims[1] = transformed_dims[1] / groups;
     }
-    CUDNN_ENFORCE(dynload::cudnnSetFilterNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor(
         desc_.get(), ToCudnnDataType(tensor.type()), format,
         transformed_dims.size(), transformed_dims.data()));
   }
@@ -203,14 +205,16 @@ class ConvolutionDescriptor {
   struct Deleter {
     void operator()(T* t) {
       if (t != nullptr) {
-        CUDNN_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(t));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            dynload::cudnnDestroyConvolutionDescriptor(t));
         t = nullptr;
       }
     }
   };
   ConvolutionDescriptor() {
     T* raw_ptr;
-    CUDNN_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&raw_ptr));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnCreateConvolutionDescriptor(&raw_ptr));
     desc_.reset(raw_ptr);
   }
   T* desc() { return desc_.get(); }
@@ -222,18 +226,19 @@ class ConvolutionDescriptor {
     cudnnDataType_t compute_type =
         (dtype == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
     T* desc = desc_.get();
-    CUDNN_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor(
         desc, pads.size(), pads.data(), strides.data(), dilations.data(),
         CUDNN_CROSS_CORRELATION, compute_type));
 #if CUDNN_VERSION_MIN(7, 0, 1)
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnSetConvolutionGroupCount(desc, groups));
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
         desc, CUDNN_DEFAULT_MATH));
     if (dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          desc, CUDNN_TENSOR_OP_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(desc,
+                                                         CUDNN_TENSOR_OP_MATH));
     }
 #endif
 #endif
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 0a634ede292d3a3a59b5d9a3f2c13f3e7e09e110..df6e2c7293524bb5f224e68be40c794b8000aa94 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -60,14 +60,6 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
 #define CUDNN_VERSION_MIN(major, minor, patch) \
   (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
 
-#define CUDNN_ENFORCE(condition)                                     \
-  do {                                                               \
-    auto status = condition;                                         \
-    if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) {                  \
-      PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
-    }                                                                \
-  } while (false)
-
 enum class DataLayout {  // Not use
   kNHWC,
   kNCHW,
@@ -467,7 +459,7 @@ class ScopedActivationDescriptor {
         PADDLE_THROW("unrecognized activation mode: %d .",
                      static_cast<int>(activation_mode));
     }
-    CUDNN_ENFORCE(dynload::cudnnSetActivationDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor(
         desc_, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling));
     return desc_;
   }
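
Reviewer note: the cudnn_helper.h hunk above deletes the cuDNN-specific
CUDNN_ENFORCE macro, so every call site touched by this patch now relies on
PADDLE_ENFORCE_CUDA_SUCCESS accepting a cudnnStatus_t. Below is a minimal
sketch of the contract being assumed, for illustration only:
DEMO_ENFORCE_CUDNN_SUCCESS is a hypothetical stand-in, not Paddle's actual
macro, which additionally reports file/line context and handles plain CUDA
statuses through the same entry point.

    // Hypothetical stand-in mirroring what the removed CUDNN_ENFORCE did:
    // evaluate the call exactly once, then throw with cuDNN's own error
    // string on any status other than CUDNN_STATUS_SUCCESS.
    #include <cudnn.h>

    #include <stdexcept>
    #include <string>

    #define DEMO_ENFORCE_CUDNN_SUCCESS(call)                          \
      do {                                                            \
        cudnnStatus_t demo_status = (call);                           \
        if (demo_status != CUDNN_STATUS_SUCCESS) {                    \
          throw std::runtime_error(std::string("cuDNN error: ") +     \
                                   cudnnGetErrorString(demo_status)); \
        }                                                             \
      } while (false)

    // Usage mirrors the migrated call sites, e.g.:
    //   cudnnTensorDescriptor_t desc;
    //   DEMO_ENFORCE_CUDNN_SUCCESS(cudnnCreateTensorDescriptor(&desc));

Under that assumption the migration changes error reporting (one unified
macro for CUDA and cuDNN statuses) without changing behavior at any call
site.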