Unverified commit ba8414d3, authored by Chen Weihang, committed by GitHub

replace CUDNN_ENFORCE with PADDLE_ENFORCE_CUDA_SUCCESS, test=develop (#22109)

Parent: cce9af0e
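Context for the mechanical change below: CUDNN_ENFORCE was a cuDNN-specific checking macro, while PADDLE_ENFORCE_CUDA_SUCCESS is the generic CUDA-family status check, so each wrapped cuDNN call keeps its behavior and only the wrapper name (plus clang-format line wrapping) changes. The following is a minimal sketch of how such a macro can work; it is not Paddle's actual implementation, and the helper names (StatusToString, IsSuccess, ENFORCE_CUDA_SUCCESS) are invented for illustration:

```cpp
// Hedged sketch of a PADDLE_ENFORCE_CUDA_SUCCESS-style check; NOT Paddle's
// real code. The wrapped expression returns a library status code, and the
// macro throws with expression/file/line context on any non-success value.
#include <cuda_runtime.h>
#include <cudnn.h>
#include <sstream>
#include <stdexcept>

inline const char* StatusToString(cudaError_t e) { return cudaGetErrorString(e); }
inline const char* StatusToString(cudnnStatus_t s) { return cudnnGetErrorString(s); }

inline bool IsSuccess(cudaError_t e) { return e == cudaSuccess; }
inline bool IsSuccess(cudnnStatus_t s) { return s == CUDNN_STATUS_SUCCESS; }

template <typename Status>
void EnforceCudaSuccess(Status status, const char* expr, const char* file,
                        int line) {
  if (!IsSuccess(status)) {
    std::ostringstream msg;
    msg << expr << " failed at " << file << ":" << line << ": "
        << StatusToString(status);
    throw std::runtime_error(msg.str());
  }
}

#define ENFORCE_CUDA_SUCCESS(expr) \
  EnforceCudaSuccess((expr), #expr, __FILE__, __LINE__)
```

Overloading on the status type is one way a single macro can serve both plain CUDA runtime calls and cuDNN calls, which is presumably what makes the blanket replacement in this commit possible.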
@@ -94,8 +94,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     cudnnTensorDescriptor_t bn_param_desc_;
     cudnnBatchNormMode_t mode_;
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
     if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
@@ -124,12 +125,14 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       dims = {N, C, H, W, D};
       strides = {H * W * D * C, 1, W * D * C, D * C, C};
     }
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         data_desc_, CudnnDataType<T>::type,
         x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
     // Note: PERSISTENT not implemented for inference
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_, is_test ? CUDNN_BATCHNORM_SPATIAL : mode_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDeriveBNTensorDescriptor(
+            bn_param_desc_, data_desc_,
+            is_test ? CUDNN_BATCHNORM_SPATIAL : mode_));
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *bias = ctx.Input<Tensor>("Bias");
@@ -149,17 +152,18 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       PADDLE_ENFORCE_EQ(est_mean->dims()[0], C);
       PADDLE_ENFORCE_EQ(est_var->dims()[0], C);
-      CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardInference(
-          handle,
-          // Note: PERSISTENT not implemented for inference
-          CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
-          CudnnDataType<T>::kZero(), data_desc_,
-          transformed_x.template data<T>(), data_desc_,
-          transformed_y.template mutable_data<T>(ctx.GetPlace()),
-          bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
-          bias->template data<BatchNormParamType<T>>(),
-          est_mean->template data<BatchNormParamType<T>>(),
-          est_var->template data<BatchNormParamType<T>>(), epsilon));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnBatchNormalizationForwardInference(
+              handle,
+              // Note: PERSISTENT not implemented for inference
+              CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
+              CudnnDataType<T>::kZero(), data_desc_,
+              transformed_x.template data<T>(), data_desc_,
+              transformed_y.template mutable_data<T>(ctx.GetPlace()),
+              bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+              bias->template data<BatchNormParamType<T>>(),
+              est_mean->template data<BatchNormParamType<T>>(),
+              est_var->template data<BatchNormParamType<T>>(), epsilon));
     } else {
       // if MomentumTensor is set, use MomentumTensor value, momentum
       // is only used in this training branch
@@ -214,7 +218,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
                 "The argument ReserveSpace of batch_norm op is not found."));
         // --------------- cudnn batchnorm workspace ---------------
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::
                 cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
                     /*handle=*/handle,
@@ -228,7 +232,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
                     /*sizeInBytes=*/&workspace_size));
         // -------------- cudnn batchnorm reserve space --------------
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::
                 cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
                     /*handle=*/handle,
@@ -242,7 +246,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
             ctx.GetPlace(), transformed_x.type(), reserve_space_size);
         workspace_ptr = workspace_tensor.mutable_data(
             ctx.GetPlace(), transformed_x.type(), workspace_size);
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
            platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
                handle, mode_, CUDNN_BATCHNORM_OPS_BN,
                CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
@@ -264,7 +268,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       }
 #endif
       if (!called) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
            platform::dynload::cudnnBatchNormalizationForwardTraining(
                handle, mode_, CudnnDataType<T>::kOne(),
                CudnnDataType<T>::kZero(), data_desc_,
@@ -292,8 +296,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
           ctx, &transformed_y, y);
     }
     // clean when exit.
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
   }
 };
@@ -516,9 +521,9 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     cudnnTensorDescriptor_t bn_param_desc_;
     cudnnBatchNormMode_t mode_;
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
     if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
       LOG(ERROR) << "Provided epsilon is smaller than "
@@ -536,11 +541,12 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     mode_ = CUDNN_BATCHNORM_SPATIAL;
 #endif
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         data_desc_, CudnnDataType<T>::type,
         x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_, mode_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
+                                                         data_desc_, mode_));
     const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
     const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
@@ -559,76 +565,79 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
         Tensor workspace_tensor;
         auto reserve_space_size = reserve_space->memory_size();
         // --------------- cudnn batchnorm workspace ---------------
-        CUDNN_ENFORCE(platform::dynload::
-                          cudnnGetBatchNormalizationBackwardExWorkspaceSize(
-                              /*handle=*/dev_ctx.cudnn_handle(),
-                              /*mode=*/mode_,
-                              /*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
-                              /*xDesc=*/data_desc_,
-                              /*yDesc=*/data_desc_,
-                              /*dyDesc=*/data_desc_,
-                              /*dzDesc=*/nullptr,
-                              /*dxDesc=*/data_desc_,
-                              /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
-                              /*activationDesc=*/nullptr,
-                              /*sizeInBytes=*/&workspace_size));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::
+                cudnnGetBatchNormalizationBackwardExWorkspaceSize(
+                    /*handle=*/dev_ctx.cudnn_handle(),
+                    /*mode=*/mode_,
+                    /*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
+                    /*xDesc=*/data_desc_,
+                    /*yDesc=*/data_desc_,
+                    /*dyDesc=*/data_desc_,
+                    /*dzDesc=*/nullptr,
+                    /*dxDesc=*/data_desc_,
+                    /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
+                    /*activationDesc=*/nullptr,
+                    /*sizeInBytes=*/&workspace_size));
         workspace_ptr = workspace_tensor.mutable_data(
             ctx.GetPlace(), transformed_x.type(), workspace_size);
-        CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackwardEx(
-            /*handle=*/dev_ctx.cudnn_handle(),
-            /*mode=*/mode_,
-            /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
-            /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
-            /*betaDataDiff=*/CudnnDataType<T>::kZero(),
-            /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
-            /*betaParamDiff=*/CudnnDataType<T>::kZero(),
-            /*xDesc=*/data_desc_,
-            /*xData=*/transformed_x.template data<T>(),
-            /*yDesc=*/nullptr,
-            /*yData=*/nullptr,
-            /*dyDesc=*/data_desc_,
-            /*dyData=*/transformed_d_y.template data<T>(),
-            /*dzDesc=*/nullptr,
-            /*dzData=*/nullptr,
-            /*dxDesc=*/data_desc_,
-            /*dxData=*/transformed_d_x.template mutable_data<T>(
-                ctx.GetPlace()),
-            /*dBnScaleBiasDesc=*/bn_param_desc_,
-            /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
-            /*bnBiasData=*/nullptr,
-            /*dBnScaleData=*/d_scale
-                ->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-            /*dBnBiasData=*/d_bias
-                ->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-            /*epsilon=*/epsilon,
-            /*savedMean=*/saved_mean_data,
-            /*savedInvVariance=*/saved_var_data,
-            /*activationDesc=*/nullptr,
-            /*workspace=*/workspace_ptr,
-            /*workSpaceSizeInBytes=*/workspace_size,
-            /*reserveSpace=*/const_cast<T *>(
-                reserve_space->template data<T>()),
-            /*reserveSpaceSizeInBytes=*/reserve_space_size));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnBatchNormalizationBackwardEx(
+                /*handle=*/dev_ctx.cudnn_handle(),
+                /*mode=*/mode_,
+                /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
+                /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
+                /*betaDataDiff=*/CudnnDataType<T>::kZero(),
+                /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
+                /*betaParamDiff=*/CudnnDataType<T>::kZero(),
+                /*xDesc=*/data_desc_,
+                /*xData=*/transformed_x.template data<T>(),
+                /*yDesc=*/nullptr,
+                /*yData=*/nullptr,
+                /*dyDesc=*/data_desc_,
+                /*dyData=*/transformed_d_y.template data<T>(),
+                /*dzDesc=*/nullptr,
+                /*dzData=*/nullptr,
+                /*dxDesc=*/data_desc_,
+                /*dxData=*/transformed_d_x.template mutable_data<T>(
+                    ctx.GetPlace()),
+                /*dBnScaleBiasDesc=*/bn_param_desc_,
+                /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
+                /*bnBiasData=*/nullptr,
+                /*dBnScaleData=*/d_scale
+                    ->template mutable_data<BatchNormParamType<T>>(
+                        ctx.GetPlace()),
+                /*dBnBiasData=*/d_bias
+                    ->template mutable_data<BatchNormParamType<T>>(
+                        ctx.GetPlace()),
+                /*epsilon=*/epsilon,
+                /*savedMean=*/saved_mean_data,
+                /*savedInvVariance=*/saved_var_data,
+                /*activationDesc=*/nullptr,
+                /*workspace=*/workspace_ptr,
+                /*workSpaceSizeInBytes=*/workspace_size,
+                /*reserveSpace=*/const_cast<T *>(
+                    reserve_space->template data<T>()),
+                /*reserveSpaceSizeInBytes=*/reserve_space_size));
       }
 #endif
       if (!called) {
-        CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
-            dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
-            CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
-            CudnnDataType<T>::kZero(), data_desc_,
-            transformed_x.template data<T>(), data_desc_,
-            transformed_d_y.template data<T>(), data_desc_,
-            transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
-            bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
-            d_scale->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
-            d_bias->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
-            epsilon, saved_mean_data, saved_var_data));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnBatchNormalizationBackward(
+                dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
+                CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
+                CudnnDataType<T>::kZero(), data_desc_,
+                transformed_x.template data<T>(), data_desc_,
+                transformed_d_y.template data<T>(), data_desc_,
+                transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
+                bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+                d_scale->template mutable_data<BatchNormParamType<T>>(
+                    ctx.GetPlace()),
+                d_bias->template mutable_data<BatchNormParamType<T>>(
+                    ctx.GetPlace()),
+                epsilon, saved_mean_data, saved_var_data));
       }
       if (data_layout == DataLayout::kNHWC &&
@@ -658,9 +667,9 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
       }
       // clean when exit.
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
     } else {
       const auto *running_mean = ctx.Input<Tensor>("Mean");
...
@@ -133,12 +133,14 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
     } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_DEFAULT_MATH));
       VLOG(5) << "NOT use cudnn_tensor_op_math";
     }
 #endif
@@ -148,10 +150,11 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
       int perf_count;
       int best_algo_idx = 0;
      std::unique_ptr<perf_t[]> perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]);
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
-          args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(),
-          args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, &perf_count,
-          perf_results.get()));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
+              args.handle, args.idesc.desc(), args.wdesc.desc(),
+              args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS,
+              &perf_count, perf_results.get()));
       algo = (perf_results.get())[best_algo_idx].algo;
       workspace_size = GetWorkspaceSize(args, algo);
@@ -163,17 +166,20 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
                 << workspace_size_limit << ")";
       }
       if (!has_got_workspace_size) {
-        CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-            args.handle, args.idesc.desc(), args.wdesc.desc(),
-            args.cdesc.desc(), args.odesc.desc(),
-            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
-            &algo));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+                args.handle, args.idesc.desc(), args.wdesc.desc(),
+                args.cdesc.desc(), args.odesc.desc(),
+                CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &algo));
       }
 #else
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(),
-          args.odesc.desc(), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              args.handle, args.idesc.desc(), args.wdesc.desc(),
+              args.cdesc.desc(), args.odesc.desc(),
+              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &algo));
 #endif
       VLOG(3) << "choose algo " << algo;
     } else {
@@ -197,7 +203,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
     std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
     auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
               args.handle, args.idesc.desc(), args.x->data<T>(),
               args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
@@ -223,9 +229,10 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
   static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
     size_t workspace_size = 0;
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-        args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(),
-        args.odesc.desc(), algo, &workspace_size));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+            args.handle, args.idesc.desc(), args.wdesc.desc(),
+            args.cdesc.desc(), args.odesc.desc(), algo, &workspace_size));
     return workspace_size;
   }
 };
@@ -249,12 +256,14 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
     } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_DEFAULT_MATH));
       VLOG(5) << "NOT use cudnn_tensor_op_math";
     }
 #endif
@@ -265,7 +274,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
       int best_algo_idx = 0;
       std::unique_ptr<perf_t[]> perf_results(
           new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]);
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
               args.handle, args.wdesc.desc(), args.odesc.desc(),
               args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS,
@@ -294,7 +303,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
                 << workspace_size_limit << ")";
       }
       if (!has_got_workspace_size) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 args.handle, args.wdesc.desc(), args.odesc.desc(),
                 args.cdesc.desc(), args.idesc.desc(),
@@ -302,10 +311,12 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
                 workspace_size_limit, &algo));
       }
 #else
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-          args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(),
-          args.idesc.desc(), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+              args.handle, args.wdesc.desc(), args.odesc.desc(),
+              args.cdesc.desc(), args.idesc.desc(),
+              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &algo));
 #endif
     } else if (deterministic) {
       return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
@@ -330,7 +341,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
     std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
     auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::
               cudnnFindConvolutionBackwardDataAlgorithmEx(
                   args.handle, args.wdesc.desc(), args.w->data<T>(),
@@ -359,7 +370,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
   static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
     size_t workspace_size = 0;
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
             args.handle, args.wdesc.desc(), args.odesc.desc(),
             args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size));
@@ -385,12 +396,14 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
     } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(),
+                                                         CUDNN_DEFAULT_MATH));
       VLOG(5) << "NOT use cudnn_tensor_op_math";
     }
 #endif
@@ -403,7 +416,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
       int best_algo_idx = 0;
       std::unique_ptr<perf_t[]> perf_results(
           new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]);
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
              args.handle, args.idesc.desc(), args.odesc.desc(),
              args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS,
@@ -418,7 +431,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
                << workspace_size_limit << ")";
      }
      if (!has_got_workspace_size) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                args.handle, args.idesc.desc(), args.odesc.desc(),
                args.cdesc.desc(), args.wdesc.desc(),
@@ -426,7 +439,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
                workspace_size_limit, &algo));
      }
 #else
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
              args.handle, args.idesc.desc(), args.odesc.desc(),
              args.cdesc.desc(), args.wdesc.desc(),
@@ -455,7 +468,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
     int returned_algo_count;
     std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
     auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::
              cudnnFindConvolutionBackwardFilterAlgorithmEx(
                  args.handle, args.idesc.desc(), args.x->data<T>(),
@@ -483,7 +496,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
   static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
     size_t workspace_size = 0;
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
             args.handle, args.idesc.desc(), args.odesc.desc(),
             args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size));
...
@@ -237,8 +237,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // cudnn 7 can support groups, no need to do it manually
     // FIXME(typhoonzero): find a better way to disable groups
     // rather than setting it to 1.
-    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
-        args.cdesc.desc(), groups));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(),
+                                                         groups));
     groups = 1;
 #endif
     args.idesc.set(transformed_input, layout_format);
@@ -276,12 +277,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     for (int i = 0; i < groups; i++) {
       workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
-            CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-                handle, &alpha, args.idesc.desc(),
-                input_data + i * group_offset_in, args.wdesc.desc(),
-                filter_data + i * group_offset_filter, args.cdesc.desc(), algo,
-                workspace_ptr, workspace_size, &beta, args.odesc.desc(),
-                output_data + i * group_offset_out));
+            PADDLE_ENFORCE_CUDA_SUCCESS(
+                platform::dynload::cudnnConvolutionForward(
+                    handle, &alpha, args.idesc.desc(),
+                    input_data + i * group_offset_in, args.wdesc.desc(),
+                    filter_data + i * group_offset_filter, args.cdesc.desc(),
+                    algo, workspace_ptr, workspace_size, &beta,
+                    args.odesc.desc(), output_data + i * group_offset_out));
           },
           workspace_size);
     }
@@ -596,13 +598,14 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
      for (int i = 0; i < groups; i++) {
        workspace_handle.RunFunc(
            [&](void* cudnn_workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-                  handle, &alpha, args1.wdesc.desc(),
-                  filter_data + i * group_offset_filter, args1.odesc.desc(),
-                  output_grad_data + i * group_offset_out, args1.cdesc.desc(),
-                  data_algo, cudnn_workspace_ptr, workspace_size, &beta,
-                  args1.idesc.desc(),
-                  transformed_input_grad_data + i * group_offset_in));
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionBackwardData(
+                      handle, &alpha, args1.wdesc.desc(),
+                      filter_data + i * group_offset_filter, args1.odesc.desc(),
+                      output_grad_data + i * group_offset_out,
+                      args1.cdesc.desc(), data_algo, cudnn_workspace_ptr,
+                      workspace_size, &beta, args1.idesc.desc(),
+                      transformed_input_grad_data + i * group_offset_in));
            },
            workspace_size);
      }
@@ -639,13 +642,14 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
      for (int i = 0; i < groups; i++) {
        workspace_handle.RunFunc(
            [&](void* cudnn_workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-                  handle, &alpha, args2.idesc.desc(),
-                  input_data + i * group_offset_in, args2.odesc.desc(),
-                  output_grad_data + i * group_offset_out, args2.cdesc.desc(),
-                  filter_algo, cudnn_workspace_ptr, workspace_size, &beta,
-                  args2.wdesc.desc(),
-                  filter_grad_data + i * group_offset_filter));
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionBackwardFilter(
+                      handle, &alpha, args2.idesc.desc(),
+                      input_data + i * group_offset_in, args2.odesc.desc(),
+                      output_grad_data + i * group_offset_out,
+                      args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
+                      workspace_size, &beta, args2.wdesc.desc(),
+                      filter_grad_data + i * group_offset_filter));
            },
            workspace_size);
      }
@@ -993,12 +997,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
      for (int i = 0; i < groups; i++) {
        wkspace_handle.RunFunc(
            [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-                  handle, &alpha, args1.idesc.desc(),
-                  ddx + i * group_offset_in, args1.wdesc.desc(),
-                  w + i * group_offset_filter, args1.cdesc.desc(), fwd_algo1,
-                  workspace_ptr, workspace_size, &beta, args1.odesc.desc(),
-                  transformed_ddy_channel + i * group_offset_out));
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionForward(
+                      handle, &alpha, args1.idesc.desc(),
+                      ddx + i * group_offset_in, args1.wdesc.desc(),
+                      w + i * group_offset_filter, args1.cdesc.desc(),
+                      fwd_algo1, workspace_ptr, workspace_size, &beta,
+                      args1.odesc.desc(),
+                      transformed_ddy_channel + i * group_offset_out));
            },
            workspace_size);
      }
@@ -1007,12 +1013,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
      for (int i = 0; i < groups; i++) {
        wkspace_handle.RunFunc(
            [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-                  handle, &alpha, args2.idesc.desc(), x + i * group_offset_in,
-                  args2.wdesc.desc(), ddw + i * group_offset_filter,
-                  args2.cdesc.desc(), fwd_algo2, workspace_ptr,
-                  workspace_size, &alpha, args2.odesc.desc(),
-                  transformed_ddy_channel + i * group_offset_out));
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionForward(
+                      handle, &alpha, args2.idesc.desc(),
+                      x + i * group_offset_in, args2.wdesc.desc(),
+                      ddw + i * group_offset_filter, args2.cdesc.desc(),
+                      fwd_algo2, workspace_ptr, workspace_size, &alpha,
+                      args2.odesc.desc(),
+                      transformed_ddy_channel + i * group_offset_out));
            },
            workspace_size);
      }
@@ -1028,13 +1036,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
      for (int i = 0; i < groups; i++) {
        wkspace_handle.RunFunc(
            [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-                  handle, &alpha, args3.idesc.desc(), ddx + i * group_offset_in,
-                  args3.odesc.desc(),
-                  transformed_dy_channel + i * group_offset_out,
-                  args3.cdesc.desc(), filter_algo, workspace_ptr,
-                  workspace_size, &beta, args3.wdesc.desc(),
-                  dw + i * group_offset_filter));
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionBackwardFilter(
+                      handle, &alpha, args3.idesc.desc(),
+                      ddx + i * group_offset_in, args3.odesc.desc(),
+                      transformed_dy_channel + i * group_offset_out,
+                      args3.cdesc.desc(), filter_algo, workspace_ptr,
+                      workspace_size, &beta, args3.wdesc.desc(),
+                      dw + i * group_offset_filter));
            },
            workspace_size);
      }
@@ -1045,13 +1054,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
      for (int i = 0; i < groups; i++) {
        wkspace_handle.RunFunc(
            [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-                  handle, &alpha, args4.wdesc.desc(),
-                  ddw + i * group_offset_filter, args4.odesc.desc(),
-                  transformed_dy_channel + i * group_offset_out,
-                  args4.cdesc.desc(), data_algo, workspace_ptr, workspace_size,
-                  &beta, args4.idesc.desc(),
-                  transformed_dx + i * group_offset_in));
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionBackwardData(
+                      handle, &alpha, args4.wdesc.desc(),
+                      ddw + i * group_offset_filter, args4.odesc.desc(),
+                      transformed_dy_channel + i * group_offset_out,
+                      args4.cdesc.desc(), data_algo, workspace_ptr,
+                      workspace_size, &beta, args4.idesc.desc(),
+                      transformed_dx + i * group_offset_in));
            },
            workspace_size);
      }
...
@@ -236,19 +236,21 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     // Get the algorithm
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-        handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
-        // dxDesc: Handle to the previously initialized output tensor
-        // descriptor.
-        cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-        workspace_size_limit, &algo));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+            handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
+            // dxDesc: Handle to the previously initialized output tensor
+            // descriptor.
+            cudnn_output_desc,
+            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+            workspace_size_limit, &algo));
     if (algo == 0 && FLAGS_cudnn_deterministic) {
       algo = static_cast<cudnnConvolutionBwdDataAlgo_t>(1);
     }
     // get workspace size able to allocate
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
             handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
             cudnn_output_desc, algo, &workspace_size_in_bytes));
@@ -263,11 +265,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     for (int g = 0; g < groups; g++) {
       auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-            handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
-            cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
-            algo, cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_output_desc, transformed_output_data + output_offset * g));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnConvolutionBackwardData(
+                handle, &alpha, cudnn_filter_desc,
+                filter_data + filter_offset * g, cudnn_input_desc,
+                input_data + input_offset * g, cudnn_conv_desc, algo,
+                cudnn_workspace, workspace_size_in_bytes, &beta,
+                cudnn_output_desc,
+                transformed_output_data + output_offset * g));
       };
       workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
@@ -466,19 +471,21 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
       // choose backward algorithm for data
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &data_algo));
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_input_desc, data_algo, &fwd_ws_size));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
+              cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &data_algo));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+              handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
+              cudnn_input_desc, data_algo, &fwd_ws_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
     }
     if (filter_grad) {
       // choose backward algorithm for filter
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
               handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
               cudnn_filter_desc,
@@ -486,7 +493,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
              workspace_size_limit, &filter_algo));
      // get workspace for backwards filter algorithm
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
              handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
              cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
@@ -507,12 +514,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     // Because beta is zero, it is unnecessary to reset input_grad.
     for (int g = 0; g < groups; g++) {
       auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-            handle, &alpha, cudnn_output_desc,
-            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
-            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-            input_grad_data + input_offset * g));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnConvolutionForward(
+                handle, &alpha, cudnn_output_desc,
+                output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+                filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+                cudnn_workspace, workspace_size_in_bytes, &beta,
+                cudnn_input_desc, input_grad_data + input_offset * g));
       };
       workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
@@ -543,12 +551,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     // Gradient with respect to the filter
     for (int g = 0; g < groups; g++) {
       auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-            handle, &alpha, cudnn_output_desc,
-            output_grad_data + output_grad_offset * g, cudnn_input_desc,
-            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_filter_desc, filter_grad_data + filter_offset * g));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnConvolutionBackwardFilter(
+                handle, &alpha, cudnn_output_desc,
+                output_grad_data + output_grad_offset * g, cudnn_input_desc,
+                input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+                cudnn_workspace, workspace_size_in_bytes, &beta,
+                cudnn_filter_desc, filter_grad_data + filter_offset * g));
       };
       workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
...
@@ -91,7 +91,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     if (is_test) {
       // for inference
-      CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardInference(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
           handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
           cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
           init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
@@ -101,7 +101,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
           cudnn_rnn_cache->workspace_size_));
     } else {
       // for train
-      CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardTraining(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
           handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
           cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
           init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
@@ -230,7 +230,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     auto run_seq_len = input_dims[0];
     PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_,
                       "cudnn running seq_len CAN not greater max_lengh");
-    CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardData(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
         handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
         cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_,
         out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data,
@@ -242,7 +242,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
         cudnn_rnn_cache->workspace_size_, reserve_data,
         cudnn_rnn_cache->reserve_size_));
-    CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardWeights(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights(
         handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
         cudnn_rnn_cache->x_desc_, input->data<T>(), cudnn_rnn_cache->hx_desc_,
         init_h->data<T>(), cudnn_rnn_cache->y_desc_, out->data<T>(),
...
@@ -92,13 +92,13 @@ struct CudnnRNNCache {
     int stride_a[3];
     for (size_t i = 0; i < max_length_; ++i) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i]));
       dim_a[0] = batch_size_;
       dim_a[1] = input_size_;
@@ -107,9 +107,9 @@ struct CudnnRNNCache {
       stride_a[0] = dim_a[2] * dim_a[1];
       stride_a[1] = dim_a[2];
       stride_a[2] = 1;
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
       dim_a[0] = batch_size_;
@@ -120,9 +120,9 @@ struct CudnnRNNCache {
       stride_a[1] = dim_a[2];
       stride_a[2] = 1;
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
     }
@@ -134,63 +134,74 @@ struct CudnnRNNCache {
     stride_a[1] = dim_a[2];
     stride_a[2] = 1;
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_));
     size_t state_size;
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size);
-        dropout_state_.Resize({static_cast<int64_t>(state_size)}));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
+    dropout_state_.Resize({static_cast<int64_t>(state_size)});
     auto *dropout_state_data = dropout_state_.mutable_data<uint8_t>(place);
-    CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor(
         dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
         seed_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
 #if CUDNN_VERSION >= 6000
-    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6(
         handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
         CUDNN_LINEAR_INPUT,
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
         CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
 #else
-    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor(
         rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
         CUDNN_DATA_FLOAT));
 #endif
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize(
         handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT));
     PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel,
@@ -199,15 +210,16 @@ struct CudnnRNNCache {
     dim_w[0] = weights_size_ / sizeof(float);
     dim_w[1] = 1;
     dim_w[2] = 1;
-    CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
        w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
        dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
-    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize(
        handle, rnn_desc_, max_length_, x_desc_, &workspace_size_));
-    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize(
-        handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetRNNTrainingReserveSize(
+            handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
     reserve_data_.Resize({static_cast<int64_t>(reserve_size_)});
     reserve_data_.mutable_data<uint8_t>(place);
@@ -218,13 +230,13 @@ struct CudnnRNNCache {
   void release() {
     for (size_t i = 0; i < max_length_; ++i) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i]));
     }
@@ -233,21 +245,32 @@ struct CudnnRNNCache {
     delete[] dx_desc_;
     delete[] dy_desc_;
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE( platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); platform::dynload::cudnnDestroyFilterDescriptor(w_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_));
} }
}; };
......
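Editor's note on the dropout setup in CudnnRNNCache above: besides the macro swap, it fixes a long-standing parenthesis slip. On the old left-hand side the closing parenthesis of CUDNN_ENFORCE landed after the Resize call, so dropout_state_.Resize(...) was textually inside the macro argument; the new code closes the check at the cuDNN call. The sequence itself is the usual three-step protocol: query the state size, allocate a buffer of that size, then bind it to the descriptor. A minimal sketch, assuming a valid cudnnHandle_t handle; cudaMalloc stands in for Paddle's dropout_state_ tensor, and the dropout probability and seed are placeholder values:

cudnnDropoutDescriptor_t dropout_desc;
PADDLE_ENFORCE_CUDA_SUCCESS(
    platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc));
size_t state_size = 0;  // size of the RNG state cuDNN needs, filled by the query
PADDLE_ENFORCE_CUDA_SUCCESS(
    platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
void* states = nullptr;
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&states, state_size));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor(
    dropout_desc, handle, /*dropout=*/0.5f, states, state_size, /*seed=*/0));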
...@@ -164,8 +164,9 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -164,8 +164,9 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnnConvolutionDescriptor_t cudnn_conv_desc = cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(padding_common, strides, dilations); conv_desc.descriptor<T>(padding_common, strides, dilations);
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( PADDLE_ENFORCE_CUDA_SUCCESS(
cudnn_conv_desc, groups)); platform::dynload::cudnnSetConvolutionGroupCount(cudnn_conv_desc,
groups));
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>( cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize<int>(transformed_input.dims())); layout, framework::vectorize<int>(transformed_input.dims()));
...@@ -196,16 +197,17 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -196,16 +197,17 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle(); auto workspace_handle = dev_ctx.cudnn_workspace_handle();
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH)); cudnn_conv_desc, CUDNN_DEFAULT_MATH));
auto x_dims = framework::vectorize(transformed_input.dims()); auto x_dims = framework::vectorize(transformed_input.dims());
auto f_dims = framework::vectorize(filter->dims()); auto f_dims = framework::vectorize(filter->dims());
if (!exhaustive_search) { if (!exhaustive_search) {
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( PADDLE_ENFORCE_CUDA_SUCCESS(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, platform::dynload::cudnnGetConvolutionForwardAlgorithm(
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
workspace_size_limit, &algo)); cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
VLOG(3) << "cuDNN forward algo " << algo; VLOG(3) << "cuDNN forward algo " << algo;
} else { } else {
auto search_func = [&]() { auto search_func = [&]() {
...@@ -213,7 +215,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -213,7 +215,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS> std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat; fwd_perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace) { auto cudnn_find_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
handle, cudnn_input_desc, input_data, cudnn_filter_desc, handle, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
...@@ -248,9 +250,10 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -248,9 +250,10 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
VLOG(3) << "choose algo " << algo; VLOG(3) << "choose algo " << algo;
} }
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( PADDLE_ENFORCE_CUDA_SUCCESS(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
cudnn_output_desc, algo, &workspace_size_in_bytes)); handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes));
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit"); "workspace_size to be allocated exceeds the limit");
...@@ -262,13 +265,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -262,13 +265,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
// ------------- cudnn conv forward and bias add --------------------- // ------------- cudnn conv forward and bias add ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) { auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace, filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
}; };
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
CUDNN_ENFORCE(platform::dynload::cudnnAddTensor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnAddTensor(
handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
output_data)); output_data));
} else { } else {
...@@ -279,12 +282,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -279,12 +282,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
ScalingParamType<T> alpha1 = 1.0f; ScalingParamType<T> alpha1 = 1.0f;
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f; ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) { auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( PADDLE_ENFORCE_CUDA_SUCCESS(
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, platform::dynload::cudnnConvolutionBiasActivationForward(
filter_data, cudnn_conv_desc, algo, cudnn_workspace, handle, &alpha1, cudnn_input_desc, input_data,
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, algo,
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, cudnn_workspace, workspace_size_in_bytes, &alpha2,
output_data)); cudnn_output_desc, residual_data, cudnn_bias_desc, bias_data,
cudnn_act_desc, cudnn_output_desc, output_data));
}; };
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} }
......
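Editor's note on algorithm choice in the fused conv kernel above: it takes one of two paths. Without exhaustive_search it asks cuDNN's heuristic for the fastest algorithm that fits the workspace limit; with it, cudnnFindConvolutionForwardAlgorithmEx times the candidates inside the workspace lambda. A sketch of the heuristic path, assuming cuDNN 7 (cudnnGetConvolutionForwardAlgorithm was removed in cuDNN 8) and already-configured descriptors named in_desc, filter_desc, conv_desc, out_desc:

cudnnConvolutionFwdAlgo_t algo;
PADDLE_ENFORCE_CUDA_SUCCESS(
    platform::dynload::cudnnGetConvolutionForwardAlgorithm(
        handle, in_desc, filter_desc, conv_desc, out_desc,
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
        workspace_size_limit, &algo));
size_t workspace_bytes = 0;  // then size the scratch buffer for that algorithm
PADDLE_ENFORCE_CUDA_SUCCESS(
    platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
        handle, in_desc, filter_desc, conv_desc, out_desc, algo,
        &workspace_bytes));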
...@@ -95,15 +95,15 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -95,15 +95,15 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
cudnnConvolutionDescriptor_t* conv_desc = cudnnConvolutionDescriptor_t* conv_desc =
new cudnnConvolutionDescriptor_t[4]; new cudnnConvolutionDescriptor_t[4];
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i])); platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i]));
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i])); platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i]));
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i])); platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i]));
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i])); platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i]));
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i])); platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i]));
} }
...@@ -127,11 +127,11 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -127,11 +127,11 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
filter_dims.push_back(framework::vectorize<int>(filters[i]->dims())); filter_dims.push_back(framework::vectorize<int>(filters[i]->dims()));
CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data())); filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data()));
bias_dims.push_back({1, filter_dims[i][0], 1, 1}); bias_dims.push_back({1, filter_dims[i][0], 1, 1});
bias_strides.push_back({filter_dims[i][0], 1, 1, 1}); bias_strides.push_back({filter_dims[i][0], 1, 1, 1});
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(), bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(),
bias_strides[i].data())); bias_strides[i].data()));
in_dims.push_back({n, filter_dims[i][1], h, w}); in_dims.push_back({n, filter_dims[i][1], h, w});
...@@ -140,22 +140,25 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -140,22 +140,25 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
out_strides.push_back({oc * h * w, h * w, w, 1}); out_strides.push_back({oc * h * w, h * w, w, 1});
if (i < 2) { if (i < 2) {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(
conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(), platform::dynload::cudnnSetConvolutionNdDescriptor(
CUDNN_CROSS_CORRELATION, compute_type)); conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(),
CUDNN_CROSS_CORRELATION, compute_type));
} else { } else {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(
conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(), platform::dynload::cudnnSetConvolutionNdDescriptor(
CUDNN_CROSS_CORRELATION, compute_type)); conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(),
CUDNN_CROSS_CORRELATION, compute_type));
} }
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( PADDLE_ENFORCE_CUDA_SUCCESS(
conv_desc[i], CUDNN_DEFAULT_MATH)); platform::dynload::cudnnSetConvolutionMathType(conv_desc[i],
CUDNN_DEFAULT_MATH));
} }
in_dims[2][1] *= 2; in_dims[2][1] *= 2;
in_strides[2][0] = oc * h * w; in_strides[2][0] = oc * h * w;
out_strides[2][0] = filter_dims[2][0] * h * w; // this out is continuous. out_strides[2][0] = filter_dims[2][0] * h * w; // this out is continuous.
in_strides[3][0] = filter_dims[2][0] * h * w; in_strides[3][0] = filter_dims[2][0] * h * w;
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2)); platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2));
cudnnConvolutionFwdAlgo_t algo[4]; cudnnConvolutionFwdAlgo_t algo[4];
...@@ -171,19 +174,21 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -171,19 +174,21 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
} }
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data())); in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data()));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
out_desc[i], cudnn_dtype, 4, out_dims[i].data(), out_desc[i], cudnn_dtype, 4, out_dims[i].data(),
out_strides[i].data())); out_strides[i].data()));
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( PADDLE_ENFORCE_CUDA_SUCCESS(
handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], platform::dynload::cudnnGetConvolutionForwardAlgorithm(
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
&algo[i])); CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo[i]));
size_t tmp_size = 0; size_t tmp_size = 0;
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( PADDLE_ENFORCE_CUDA_SUCCESS(
handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
algo[i], &tmp_size)); handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
algo[i], &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
} }
cudnnActivationDescriptor_t cudnn_act_desc = cudnnActivationDescriptor_t cudnn_act_desc =
...@@ -196,7 +201,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -196,7 +201,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
// branch1: pool + 1x1 conv // branch1: pool + 1x1 conv
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingForward(
handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
pool_out_desc, temp_data)); pool_out_desc, temp_data));
...@@ -218,13 +223,14 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -218,13 +223,14 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
auto func = [&](void* cudnn_workspace) { auto func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( PADDLE_ENFORCE_CUDA_SUCCESS(
handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], platform::dynload::cudnnConvolutionBiasActivationForward(
static_cast<const void*>(filters[i]->data<T>()), conv_desc[i], handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
out_desc[i], out_datas[i], bias_desc[i], algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc, out_desc[i], out_datas[i], bias_desc[i],
out_desc[i], out_datas[i])); static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
out_desc[i], out_datas[i]));
}; };
auto workspace_handle = dev_ctx.cudnn_workspace_handle(); auto workspace_handle = dev_ctx.cudnn_workspace_handle();
workspace_handle.RunFunc(func, workspace_size_in_bytes); workspace_handle.RunFunc(func, workspace_size_in_bytes);
...@@ -232,31 +238,35 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -232,31 +238,35 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
cudnnTensorDescriptor_t x_desc; cudnnTensorDescriptor_t x_desc;
cudnnTensorDescriptor_t y_desc; cudnnTensorDescriptor_t y_desc;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&x_desc)); PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&y_desc)); platform::dynload::cudnnCreateTensorDescriptor(&x_desc));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&y_desc));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data())); x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data()));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data())); y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data()));
CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
handle, CudnnDataType<T>::kOne(), x_desc, handle, CudnnDataType<T>::kOne(), x_desc,
static_cast<const void*>(out_datas[2]), CudnnDataType<T>::kZero(), static_cast<const void*>(out_datas[2]), CudnnDataType<T>::kZero(),
y_desc, static_cast<void*>(output_data + (oc0 + oc1) * h * w))); y_desc, static_cast<void*>(output_data + (oc0 + oc1) * h * w)));
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i])); platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i]));
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i])); platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i]));
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i])); platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i]));
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i])); platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i]));
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i])); platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i]));
} }
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(x_desc)); PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(y_desc)); platform::dynload::cudnnDestroyTensorDescriptor(x_desc));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(y_desc));
} }
}; };
#endif #endif
......
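Editor's note on the inception fusion above: two details let it skip an explicit concat kernel. Branch outputs are written straight into slices of the final buffer by pairing each branch's logical shape with the strides of the full concatenated tensor, and branch 2 runs as a grouped convolution (cudnnSetConvolutionGroupCount(conv_desc[2], 2)). A hedged sketch of the slice trick; n, h, w, total channels oc, per-branch channels oc_i, and channel_offset are assumed known, and float stands in for the kernel's T:

int dims[4]    = {n, oc_i, h, w};            // logical shape of this branch
int strides[4] = {oc * h * w, h * w, w, 1};  // strides of the whole concat buffer
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
    out_desc, cudnn_dtype, 4, dims, strides));
// Offsetting the base pointer selects this branch's channel slice, so the
// convolution deposits its result in place and no separate concat runs.
float* branch_out = output_data + channel_offset * h * w;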
...@@ -45,8 +45,10 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> { ...@@ -45,8 +45,10 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
cudnnTensorDescriptor_t in_desc; cudnnTensorDescriptor_t in_desc;
cudnnTensorDescriptor_t out_desc; cudnnTensorDescriptor_t out_desc;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&in_desc)); PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&out_desc)); platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type; cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
...@@ -85,12 +87,12 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> { ...@@ -85,12 +87,12 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
dims_y[i] = 1; dims_y[i] = 1;
} }
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data())); in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data())); out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
handle, CudnnDataType<T>::kOne(), in_desc, handle, CudnnDataType<T>::kOne(), in_desc,
static_cast<const void*>(ins[k]->data<T>()), static_cast<const void*>(ins[k]->data<T>()),
CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata))); CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
...@@ -101,8 +103,10 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> { ...@@ -101,8 +103,10 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
odata += flat_shape[1]; odata += flat_shape[1];
} }
} }
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(in_desc)); PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(out_desc)); platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
} }
}; };
......
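Editor's note on the transpose-flatten-concat fusion above: there is no hand-written transpose kernel; cudnnTransformTensor does the permutation. Both descriptors carry the same logical sizes, but the input descriptor uses the strides of the source layout and the output descriptor the strides of the destination, so the strided copy reorders elements. A minimal NCHW-to-NHWC sketch, assuming float data, a valid handle, and already-created descriptors:

const float one = 1.0f, zero = 0.0f;
int dims[4]        = {n, h, w, c};              // destination-order sizes
int in_strides[4]  = {c * h * w, w, 1, h * w};  // where each element sits in NCHW src
int out_strides[4] = {h * w * c, w * c, c, 1};  // contiguous NHWC destination
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
    in_desc, CUDNN_DATA_FLOAT, 4, dims, in_strides));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
    out_desc, CUDNN_DATA_FLOAT, 4, dims, out_strides));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
    handle, &one, in_desc, src_data, &zero, out_desc, dst_data));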
...@@ -59,7 +59,7 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> { ...@@ -59,7 +59,7 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>( cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize<int>(output->dims())); DataLayout::kNCHW, framework::vectorize<int>(output->dims()));
CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerForward( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward(
handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc, handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc, input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
output_data)); output_data));
...@@ -111,12 +111,13 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> { ...@@ -111,12 +111,13 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
output_grad_desc.descriptor<T>( output_grad_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize<int>(output_grad->dims())); DataLayout::kNCHW, framework::vectorize<int>(output_grad->dims()));
CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerBackward( PADDLE_ENFORCE_CUDA_SUCCESS(
handle, cudnn_st_dest, CudnnDataType<T>::kOne(), cudnn_input_desc, platform::dynload::cudnnSpatialTfSamplerBackward(
input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc, handle, cudnn_st_dest, CudnnDataType<T>::kOne(), cudnn_input_desc,
input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc, input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
output_grad_data, grid_data, CudnnDataType<T>::kZero(), input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
grid_grad_data)); output_grad_data, grid_data, CudnnDataType<T>::kZero(),
grid_grad_data));
} }
}; };
......
...@@ -94,8 +94,9 @@ class InstanceNormKernel<platform::CUDADeviceContext, T> ...@@ -94,8 +94,9 @@ class InstanceNormKernel<platform::CUDADeviceContext, T>
cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_; cudnnTensorDescriptor_t in_param_desc_;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
...@@ -113,11 +114,12 @@ class InstanceNormKernel<platform::CUDADeviceContext, T> ...@@ -113,11 +114,12 @@ class InstanceNormKernel<platform::CUDADeviceContext, T>
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type, data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
const auto *scale = ctx.Input<Tensor>("Scale"); const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias"); const auto *bias = ctx.Input<Tensor>("Bias");
...@@ -152,19 +154,22 @@ class InstanceNormKernel<platform::CUDADeviceContext, T> ...@@ -152,19 +154,22 @@ class InstanceNormKernel<platform::CUDADeviceContext, T>
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0)); functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0)); functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( PADDLE_ENFORCE_CUDA_SUCCESS(
handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(), platform::dynload::cudnnBatchNormalizationForwardTraining(
CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(), handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
data_desc_, y->template mutable_data<T>(ctx.GetPlace()), in_param_desc_, CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
scale_tmp.template data<BatchNormParamType<T>>(), data_desc_, y->template mutable_data<T>(ctx.GetPlace()),
bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr, nullptr, in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>( bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr,
ctx.GetPlace()), nullptr, epsilon,
saved_variance->template mutable_data<BatchNormParamType<T>>( saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()))); ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); ctx.GetPlace())));
CUDNN_ENFORCE(
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
} }
}; };
...@@ -291,8 +296,9 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T> ...@@ -291,8 +296,9 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_; cudnnTensorDescriptor_t in_param_desc_;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than " LOG(ERROR) << "Provided epsilon is smaller than "
...@@ -301,11 +307,12 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T> ...@@ -301,11 +307,12 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
} }
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type, data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
const auto *saved_mean = ctx.Input<Tensor>("SavedMean"); const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance"); const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
...@@ -314,18 +321,19 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T> ...@@ -314,18 +321,19 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
const auto *saved_var_data = const auto *saved_var_data =
saved_var->template data<BatchNormParamType<T>>(); saved_var->template data<BatchNormParamType<T>>();
if (d_scale && d_bias) { if (d_scale && d_bias) {
CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( PADDLE_ENFORCE_CUDA_SUCCESS(
dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL, platform::dynload::cudnnBatchNormalizationBackward(
CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_, CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
x_tmp.template data<T>(), data_desc_, d_y_tmp.template data<T>(), CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
data_desc_, d_x->template mutable_data<T>(ctx.GetPlace()), x_tmp.template data<T>(), data_desc_, d_y_tmp.template data<T>(),
in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(), data_desc_, d_x->template mutable_data<T>(ctx.GetPlace()),
d_scale_tmp.template mutable_data<BatchNormParamType<T>>( in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
ctx.GetPlace()), d_scale_tmp.template mutable_data<BatchNormParamType<T>>(
d_bias_tmp.template mutable_data<BatchNormParamType<T>>( ctx.GetPlace()),
ctx.GetPlace()), d_bias_tmp.template mutable_data<BatchNormParamType<T>>(
epsilon, saved_mean_data, saved_var_data)); ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
} else { } else {
if (d_x) { if (d_x) {
GradComputeDX<T, block><<<NxC, block, 0, dev_ctx.stream()>>>( GradComputeDX<T, block><<<NxC, block, 0, dev_ctx.stream()>>>(
...@@ -342,8 +350,9 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T> ...@@ -342,8 +350,9 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
d_bias_tmp.data<T>(), d_bias->data<T>(), N, C); d_bias_tmp.data<T>(), d_bias->data<T>(), N, C);
} }
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); PADDLE_ENFORCE_CUDA_SUCCESS(
CUDNN_ENFORCE( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
} }
}; };
......
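Editor's note on why plain batch-norm entry points appear in an instance-norm kernel: the x_tmp/scale_tmp tensors and the NxC launch grid in the gradient path suggest Paddle folds the batch dimension into channels, viewing an N x C x H x W input as 1 x (N*C) x H x W, so CUDNN_BATCHNORM_SPATIAL statistics become per-(sample, channel) statistics. A sketch of the descriptor setup under that reading; this reshaping is inferred from the surrounding kernels, not spelled out in the diff:

int dims[4]    = {1, N * C, H, W};
int strides[4] = {N * C * H * W, H * W, W, 1};
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
    data_desc, CudnnDataType<T>::type, 4, dims, strides));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor(
    in_param_desc, data_desc, CUDNN_BATCHNORM_SPATIAL));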
...@@ -49,7 +49,7 @@ void SoftmaxCUDNNFunctor<T>::operator()( ...@@ -49,7 +49,7 @@ void SoftmaxCUDNNFunctor<T>::operator()(
xDesc.descriptor<T>(layout, cudnn_tensor_dims); xDesc.descriptor<T>(layout, cudnn_tensor_dims);
cudnnTensorDescriptor_t cudnn_y_desc = cudnnTensorDescriptor_t cudnn_y_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims); xDesc.descriptor<T>(layout, cudnn_tensor_dims);
CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxForward( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward(
context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_x_desc, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_x_desc,
X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc, X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
...@@ -80,7 +80,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()( ...@@ -80,7 +80,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
dxDesc.descriptor<T>(layout, cudnn_tensor_dims); dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
cudnnTensorDescriptor_t cudnn_ygrad_desc = cudnnTensorDescriptor_t cudnn_ygrad_desc =
dyDesc.descriptor<T>(layout, cudnn_tensor_dims); dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxBackward( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward(
context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_y_desc, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_y_desc,
Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(), Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
......
...@@ -156,7 +156,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> { ...@@ -156,7 +156,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
auto handle = ctx.cuda_device_context().cudnn_handle(); auto handle = ctx.cuda_device_context().cudnn_handle();
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingForward(
handle, cudnn_pool_desc, &alpha, cudnn_input_desc, handle, cudnn_pool_desc, &alpha, cudnn_input_desc,
tranformed_input_data, &beta, cudnn_output_desc, tranformed_input_data, &beta, cudnn_output_desc,
tranformed_output_data)); tranformed_output_data));
...@@ -312,7 +312,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> { ...@@ -312,7 +312,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
T *input_grad_data = transformed_input_grad.mutable_data<T>( T *input_grad_data = transformed_input_grad.mutable_data<T>(
transformed_input_grad.dims(), ctx.GetPlace()); transformed_input_grad.dims(), ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset input_grad. // Because beta is zero, it is unnecessary to reset input_grad.
CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingBackward(
handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data,
&beta, cudnn_input_desc, input_grad_data)); &beta, cudnn_input_desc, input_grad_data));
......
...@@ -83,19 +83,21 @@ class ActivationDescriptor { ...@@ -83,19 +83,21 @@ class ActivationDescriptor {
struct Deleter { struct Deleter {
void operator()(T* t) { void operator()(T* t) {
if (t != nullptr) { if (t != nullptr) {
CUDNN_ENFORCE(dynload::cudnnDestroyActivationDescriptor(t)); PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnDestroyActivationDescriptor(t));
t = nullptr; t = nullptr;
} }
} }
}; };
ActivationDescriptor() { ActivationDescriptor() {
T* raw_ptr; T* raw_ptr;
CUDNN_ENFORCE(dynload::cudnnCreateActivationDescriptor(&raw_ptr)); PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnCreateActivationDescriptor(&raw_ptr));
desc_.reset(raw_ptr); desc_.reset(raw_ptr);
} }
template <typename T> template <typename T>
void set(cudnnActivationMode_t mode, const T& coef) { void set(cudnnActivationMode_t mode, const T& coef) {
CUDNN_ENFORCE(dynload::cudnnSetActivationDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor(
desc_.get(), mode, CUDNN_NOT_PROPAGATE_NAN, static_cast<double>(coef))); desc_.get(), mode, CUDNN_NOT_PROPAGATE_NAN, static_cast<double>(coef)));
} }
...@@ -112,14 +114,14 @@ class TensorDescriptor { ...@@ -112,14 +114,14 @@ class TensorDescriptor {
struct Deleter { struct Deleter {
void operator()(T* t) { void operator()(T* t) {
if (t != nullptr) { if (t != nullptr) {
CUDNN_ENFORCE(dynload::cudnnDestroyTensorDescriptor(t)); PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(t));
t = nullptr; t = nullptr;
} }
} }
}; };
TensorDescriptor() { TensorDescriptor() {
T* raw_ptr; T* raw_ptr;
CUDNN_ENFORCE(dynload::cudnnCreateTensorDescriptor(&raw_ptr)); PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&raw_ptr));
desc_.reset(raw_ptr); desc_.reset(raw_ptr);
} }
T* desc() { return desc_.get(); } T* desc() { return desc_.get(); }
...@@ -135,7 +137,7 @@ class TensorDescriptor { ...@@ -135,7 +137,7 @@ class TensorDescriptor {
if (groups > 1) { if (groups > 1) {
dims_with_group[1] = dims_with_group[1] / groups; dims_with_group[1] = dims_with_group[1] / groups;
} }
CUDNN_ENFORCE(dynload::cudnnSetTensorNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor(
desc_.get(), ToCudnnDataType(tensor.type()), dims_with_group.size(), desc_.get(), ToCudnnDataType(tensor.type()), dims_with_group.size(),
dims_with_group.data(), strides.data())); dims_with_group.data(), strides.data()));
} }
...@@ -148,7 +150,7 @@ class TensorDescriptor { ...@@ -148,7 +150,7 @@ class TensorDescriptor {
} else { } else {
transformed_dims = dims; transformed_dims = dims;
} }
CUDNN_ENFORCE(dynload::cudnnSetTensorNdDescriptorEx( PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx(
desc_.get(), format, ToCudnnDataType(tensor.type()), desc_.get(), format, ToCudnnDataType(tensor.type()),
transformed_dims.size(), transformed_dims.data())); transformed_dims.size(), transformed_dims.data()));
} }
...@@ -163,14 +165,14 @@ class FilterDescriptor { ...@@ -163,14 +165,14 @@ class FilterDescriptor {
struct Deleter { struct Deleter {
void operator()(T* t) { void operator()(T* t) {
if (t != nullptr) { if (t != nullptr) {
CUDNN_ENFORCE(dynload::cudnnDestroyFilterDescriptor(t)); PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(t));
t = nullptr; t = nullptr;
} }
} }
}; };
FilterDescriptor() { FilterDescriptor() {
T* raw_ptr; T* raw_ptr;
CUDNN_ENFORCE(dynload::cudnnCreateFilterDescriptor(&raw_ptr)); PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&raw_ptr));
desc_.reset(raw_ptr); desc_.reset(raw_ptr);
} }
T* desc() { return desc_.get(); } T* desc() { return desc_.get(); }
...@@ -188,7 +190,7 @@ class FilterDescriptor { ...@@ -188,7 +190,7 @@ class FilterDescriptor {
if (groups > 1) { if (groups > 1) {
transformed_dims[1] = transformed_dims[1] / groups; transformed_dims[1] = transformed_dims[1] / groups;
} }
CUDNN_ENFORCE(dynload::cudnnSetFilterNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor(
desc_.get(), ToCudnnDataType(tensor.type()), format, desc_.get(), ToCudnnDataType(tensor.type()), format,
transformed_dims.size(), transformed_dims.data())); transformed_dims.size(), transformed_dims.data()));
} }
...@@ -203,14 +205,16 @@ class ConvolutionDescriptor { ...@@ -203,14 +205,16 @@ class ConvolutionDescriptor {
struct Deleter { struct Deleter {
void operator()(T* t) { void operator()(T* t) {
if (t != nullptr) { if (t != nullptr) {
CUDNN_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(t)); PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnDestroyConvolutionDescriptor(t));
t = nullptr; t = nullptr;
} }
} }
}; };
ConvolutionDescriptor() { ConvolutionDescriptor() {
T* raw_ptr; T* raw_ptr;
CUDNN_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&raw_ptr)); PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnCreateConvolutionDescriptor(&raw_ptr));
desc_.reset(raw_ptr); desc_.reset(raw_ptr);
} }
T* desc() { return desc_.get(); } T* desc() { return desc_.get(); }
...@@ -222,18 +226,19 @@ class ConvolutionDescriptor { ...@@ -222,18 +226,19 @@ class ConvolutionDescriptor {
cudnnDataType_t compute_type = cudnnDataType_t compute_type =
(dtype == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; (dtype == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
T* desc = desc_.get(); T* desc = desc_.get();
CUDNN_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor(
desc, pads.size(), pads.data(), strides.data(), dilations.data(), desc, pads.size(), pads.data(), strides.data(), dilations.data(),
CUDNN_CROSS_CORRELATION, compute_type)); CUDNN_CROSS_CORRELATION, compute_type));
#if CUDNN_VERSION_MIN(7, 0, 1) #if CUDNN_VERSION_MIN(7, 0, 1)
CUDNN_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetConvolutionGroupCount(desc, groups)); platform::dynload::cudnnSetConvolutionGroupCount(desc, groups));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
desc, CUDNN_DEFAULT_MATH)); desc, CUDNN_DEFAULT_MATH));
if (dtype == CUDNN_DATA_HALF) { if (dtype == CUDNN_DATA_HALF) {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( PADDLE_ENFORCE_CUDA_SUCCESS(
desc, CUDNN_TENSOR_OP_MATH)); platform::dynload::cudnnSetConvolutionMathType(desc,
CUDNN_TENSOR_OP_MATH));
} }
#endif #endif
#endif #endif
......
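Editor's note on the descriptor classes in this header: they all follow one RAII shape. The constructor runs cudnnCreate* and hands the raw pointer to a std::unique_ptr whose custom Deleter runs the matching cudnnDestroy*, so a throwing enforce between create and use cannot leak the descriptor. A condensed sketch of the pattern (TensorDescriptor shown; the others differ only in the create/destroy pair):

class TensorDescriptor {
 public:
  using T = cudnnTensorStruct;
  struct Deleter {
    void operator()(T* t) {
      if (t != nullptr) {
        PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(t));
      }
    }
  };
  TensorDescriptor() {
    T* raw_ptr;
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&raw_ptr));
    desc_.reset(raw_ptr);
  }
  T* desc() { return desc_.get(); }

 private:
  std::unique_ptr<T, Deleter> desc_;  // destroy runs exactly once, on scope exit
};

One trade-off worth noting: the enforce inside Deleter can throw from what is effectively a destructor, which terminates the program if it happens during stack unwinding; the code accepts that in exchange for never silently ignoring a failed destroy.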
...@@ -60,14 +60,6 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { ...@@ -60,14 +60,6 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
#define CUDNN_VERSION_MIN(major, minor, patch) \ #define CUDNN_VERSION_MIN(major, minor, patch) \
(CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
#define CUDNN_ENFORCE(condition) \
do { \
auto status = condition; \
if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) { \
PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
} \
} while (false)
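Editor's note: with the macro gone, nothing cuDNN-specific remains in the error path. PADDLE_ENFORCE_CUDA_SUCCESS is the generic success check this PR relies on, which also understands cudnnStatus_t, and the message for cuDNN failures still comes through the cudnnGetErrorString helper kept above. Equivalent usage, before and after:

// Before (macro removed above):
//   CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&desc));
// After:
cudnnTensorDescriptor_t desc;
PADDLE_ENFORCE_CUDA_SUCCESS(
    platform::dynload::cudnnCreateTensorDescriptor(&desc));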
enum class DataLayout { // Not use enum class DataLayout { // Not use
kNHWC, kNHWC,
kNCHW, kNCHW,
...@@ -467,7 +459,7 @@ class ScopedActivationDescriptor { ...@@ -467,7 +459,7 @@ class ScopedActivationDescriptor {
PADDLE_THROW("unrecognized activation mode: %d .", PADDLE_THROW("unrecognized activation mode: %d .",
static_cast<int>(activation_mode)); static_cast<int>(activation_mode));
} }
CUDNN_ENFORCE(dynload::cudnnSetActivationDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor(
desc_, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling)); desc_, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling));
return desc_; return desc_;
} }
......