From 73fcfc06ec76bae730906583b44c649a7a008bc0 Mon Sep 17 00:00:00 2001
From: Wu Yi
Date: Thu, 26 Jul 2018 13:46:08 +0800
Subject: [PATCH] refine conv cudnn enforce (#12353)

* refine conv cudnn enforce

* update

* update all cudnn ops

* fix
---
 paddle/fluid/operators/conv_cudnn_op.cu.cc    | 26 +++++++++----------
 .../operators/conv_transpose_cudnn_op.cu.cc   | 18 ++++++-------
 paddle/fluid/operators/math/softmax.cu        |  4 +--
 paddle/fluid/operators/pool_cudnn_op.cu.cc    |  4 +--
 4 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 1828be57b..b3781ded0 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -77,7 +77,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // cudnn 7 can support groups, no need to do it mannually
     // FIXME(typhoonzero): find a better way to disable groups
     // rather than setting it to 1.
-    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
         cudnn_conv_desc, groups));
     groups = 1;
 #endif
@@ -129,7 +129,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
-    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
         workspace_size_limit, &algo));
@@ -140,18 +140,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     if (dev_ctx.GetComputeCapability() >= 70 &&
         std::type_index(typeid(T)) ==
             std::type_index(typeid(platform::float16))) {
-      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
       // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
     } else {
-      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
 #endif
     // get workspace size able to allocate
-    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, algo, &workspace_size_in_bytes));
     // It is possible for float16 on Volta GPU to allocate more memory than
@@ -165,7 +165,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
           handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
           cudnn_filter_desc, filter_data + i * group_offset_filter,
           cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
@@ -218,7 +218,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     // cudnn 7 can support groups, no need to do it mannually
     // FIXME(typhoonzero): find a better way to disable groups
     // rather than setting it to 1.
-    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
         cudnn_conv_desc, groups));
     groups = 1;
 #endif
@@ -273,7 +273,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
       if (FLAGS_cudnn_deterministic) {
-        PADDLE_ENFORCE(
+        CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 handle, cudnn_filter_desc,
                 // dyDesc: Handle to the previously initialized input
@@ -289,7 +289,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
         data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
       }
-      PADDLE_ENFORCE(
+      CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
               cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
@@ -298,7 +298,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     if (filter_grad) {
       if (FLAGS_cudnn_deterministic) {
-        PADDLE_ENFORCE(
+        CUDNN_ENFORCE(
            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                handle, cudnn_input_desc, cudnn_output_grad_desc,
                cudnn_conv_desc, cudnn_filter_desc,
@@ -308,7 +308,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
         filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
       }
-      PADDLE_ENFORCE(
+      CUDNN_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
              cudnn_filter_desc, filter_algo, &tmp_size));
@@ -326,7 +326,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int i = 0; i < groups; i++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
            handle, &alpha, cudnn_filter_desc,
            filter_data + i * group_offset_filter, cudnn_output_grad_desc,
            output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
@@ -339,7 +339,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
       for (int i = 0; i < groups; i++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
            handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
            cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
            cudnn_conv_desc, filter_algo, cudnn_workspace,
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
index 038ea8999..82fff68e7 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
@@ -87,7 +87,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     // Get the algorithm
-    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
         handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
         // dxDesc: Handle to the previously initialized output tensor
         // descriptor.
@@ -95,7 +95,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
         workspace_size_limit, &algo));
     // get workspace size able to allocate
-    PADDLE_ENFORCE(
+    CUDNN_ENFORCE(
         platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
             handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
             cudnn_output_desc, algo, &workspace_size_in_bytes));
@@ -110,7 +110,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
     for (int g = 0; g < groups; g++) {
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
@@ -178,11 +178,11 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
       // choose backward algorithm for data
-      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
          cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
          workspace_size_limit, &data_algo));
-      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
          cudnn_input_desc, data_algo, &fwd_ws_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
@@ -190,7 +190,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     if (filter_grad) {
       // choose backward algorithm for filter
-      PADDLE_ENFORCE(
+      CUDNN_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
              handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
              cudnn_filter_desc,
@@ -198,7 +198,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
              workspace_size_limit, &filter_algo));
       // get workspace for backwards filter algorithm
-      PADDLE_ENFORCE(
+      CUDNN_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
              handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
              cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
@@ -222,7 +222,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int g = 0; g < groups; g++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
            handle, &alpha, cudnn_output_desc,
            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
@@ -237,7 +237,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
       for (int g = 0; g < groups; g++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
            handle, &alpha, cudnn_output_desc,
            output_grad_data + output_grad_offset * g, cudnn_input_desc,
            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index a579182ec..3effe7762 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -52,7 +52,7 @@ void SoftmaxCUDNNFunctor<T>::operator()(
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
   cudnnTensorDescriptor_t cudnn_y_desc =
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxForward(
+  CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxForward(
       context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
       CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_x_desc,
       X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
@@ -83,7 +83,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
       dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
   cudnnTensorDescriptor_t cudnn_ygrad_desc =
       dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxBackward(
+  CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxBackward(
       context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
       CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_y_desc,
       Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index be55bc43b..31f083565 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -81,7 +81,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn pool algorithm ---------------------
     auto handle = ctx.cuda_device_context().cudnn_handle();
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward(
+    CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
         handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
         cudnn_output_desc, output_data));
   }
@@ -154,7 +154,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
     T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
     // Because beta is zero, it is unnecessary to reset input_grad.
-    PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward(
+    CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward(
         handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
         cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data,
        &beta, cudnn_input_desc, input_grad_data));
--
GitLab
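
Note on the pattern used throughout this patch: every raw platform::dynload::cudnn* call returns a cudnnStatus_t, and the change routes that status through CUDNN_ENFORCE instead of the generic PADDLE_ENFORCE, presumably so a failing call is reported with a cuDNN-specific error message rather than a bare condition check. The sketch below is a minimal, standalone approximation of what such a status-checking macro typically looks like; it is not the actual CUDNN_ENFORCE definition from PaddlePaddle's platform headers, and the macro name MY_CUDNN_ENFORCE and the std::runtime_error error path are assumptions made for illustration.

    // Illustrative sketch only: a cuDNN status-checking macro.
    // The real CUDNN_ENFORCE in PaddlePaddle integrates with its own
    // enforce/exception machinery rather than std::runtime_error.
    #include <cudnn.h>

    #include <sstream>
    #include <stdexcept>

    #define MY_CUDNN_ENFORCE(call)                                           \
      do {                                                                   \
        cudnnStatus_t status = (call);                                       \
        if (status != CUDNN_STATUS_SUCCESS) {                                \
          std::ostringstream oss;                                            \
          oss << "cuDNN error at " << __FILE__ << ":" << __LINE__ << ": "    \
              << cudnnGetErrorString(status);                                \
          throw std::runtime_error(oss.str());                               \
        }                                                                    \
      } while (0)

    // Usage mirrors the call sites in the patch (variables are hypothetical):
    //   MY_CUDNN_ENFORCE(cudnnSetConvolutionGroupCount(cudnn_conv_desc, groups));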