From b721e23b25eba63bbbf2e08453dcb92ce0457f23 Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Tue, 24 Mar 2020 21:25:47 +0800 Subject: [PATCH] transpose cudnn using cudnn v7 api (#19738) * refine the transpose conv using v7 to choose algorithm --- .../operators/conv_transpose_cudnn_op.cu | 220 ++++++++---------- 1 file changed, 91 insertions(+), 129 deletions(-) diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 2907929887..e4b525981a 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/conv_cudnn_helper.h" #include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/padding.h" @@ -24,13 +25,8 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; -using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using DataLayout = platform::DataLayout; -static constexpr size_t kConvCUDNNWorkspaceLimitBytes = 1024 * 1024 * 1024; - template static void DataTranspose(const framework::ExecutionContext& ctx, const Tensor* input, Tensor* output, @@ -68,7 +64,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { // cudnn v5 does not support dilations std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); - int user_workspace_size = ctx.Attr("workspace_size_MB"); const T* filter_data = filter->data(); const std::string data_layout_str = ctx.Attr("data_format"); const paddle::operators::DataLayout data_layout = @@ -200,60 +195,44 @@ 
class CUDNNConvTransposeOpKernel : public framework::OpKernel { } T* transformed_output_data = transformed_output.data(); - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedFilterDescriptor filter_desc; - ScopedConvolutionDescriptor conv_desc; DataLayout layout; + int iwo_groups = groups; + int c_groups = 1; +#if CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + if (strides.size() == 2U) { layout = DataLayout::kNCHW; } else { layout = DataLayout::kNCDHW; } - // (N, M, H, W) or (N, M, D, H, W) - cudnnTensorDescriptor_t cudnn_input_desc = - input_desc.descriptor(layout, input_vec, groups); - // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) - cudnnTensorDescriptor_t cudnn_output_desc = - output_desc.descriptor(layout, transformed_output_vec, groups); - // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w) - cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( - layout, framework::vectorize(filter->dims()), groups); - cudnnConvolutionDescriptor_t cudnn_conv_desc = - conv_desc.descriptor(padding_common, strides, dilations); - - // ------------------- cudnn conv workspace --------------------- - size_t workspace_size_in_bytes; // final workspace to allocate. - size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; - if (user_workspace_size > 0) { - workspace_size_limit = user_workspace_size * 1024 * 1024; - } + size_t workspace_size = 0; + cudnnConvolutionBwdDataAlgo_t algo{}; // ------------------- cudnn conv algorithm --------------------- - cudnnConvolutionBwdDataAlgo_t algo; auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); - // Get the algorithm - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, - // dxDesc: Handle to the previously initialized output tensor - // descriptor. 
- cudnn_output_desc, - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); - - if (FLAGS_cudnn_deterministic) { - algo = static_cast(1); - } + auto layout_tensor = GetCudnnTensorFormat(layout); + bool deterministic = FLAGS_cudnn_deterministic; - // get workspace size able to allocate - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( - handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, - cudnn_output_desc, algo, &workspace_size_in_bytes)); + auto dtype = platform::CudnnDataType::type; + // ------------------- cudnn descriptors --------------------- + ConvArgs args{&transformed_output, filter, &transformed_input, strides, + padding_common, dilations}; + args.handle = handle; + args.idesc.set(transformed_output, iwo_groups); + args.wdesc.set(*filter, layout_tensor, iwo_groups); + args.odesc.set(transformed_input, iwo_groups); + args.cdesc.set(dtype, padding_common, strides, dilations, c_groups); + + using search = SearchAlgorithm; + algo = search::Find(args, false, deterministic, 2, ctx); + workspace_size = + std::max(workspace_size, search::GetWorkspaceSize(args, algo)); // ------------------- cudnn conv transpose forward --------------------- int input_offset = @@ -267,16 +246,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { auto cudnn_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, - filter_data + filter_offset * g, cudnn_input_desc, - input_data + input_offset * g, cudnn_conv_desc, algo, - cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_output_desc, + handle, &alpha, args.wdesc.desc(), + filter_data + filter_offset * g, args.odesc.desc(), + input_data + input_offset * g, args.cdesc.desc(), algo, + cudnn_workspace, workspace_size, &beta, args.idesc.desc(), transformed_output_data + output_offset * g)); }; - workspace_handle.RunFunc(cudnn_func, 
workspace_size_in_bytes); + workspace_handle.RunFunc(cudnn_func, workspace_size); } - if (!is_sys_pad && strides.size() == 2U) { Slice( ctx, &transformed_output, output, starts, ends, axes); @@ -432,10 +409,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { output_vec = framework::vectorize(transformed_output_grad.dims()); // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedFilterDescriptor filter_desc; - ScopedConvolutionDescriptor conv_desc; DataLayout layout; if (strides.size() == 2U) { @@ -444,68 +417,59 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { layout = DataLayout::kNCDHW; } - // Input: (N, M, H, W) or (N, M, D, H, W) - cudnnTensorDescriptor_t cudnn_input_desc = - input_desc.descriptor(layout, input_vec, groups); - // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) - cudnnTensorDescriptor_t cudnn_output_desc = - output_desc.descriptor(layout, output_vec, groups); - // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w) - cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( - layout, framework::vectorize(filter->dims()), groups); - - cudnnConvolutionDescriptor_t cudnn_conv_desc = - conv_desc.descriptor(padding_common, strides, dilations); - - // ------------------- cudnn backward algorithm --------------------- - cudnnConvolutionFwdAlgo_t data_algo; - cudnnConvolutionBwdFilterAlgo_t filter_algo; - size_t bwd_filter_ws_size, fwd_ws_size; - size_t workspace_size_in_bytes = 0; - size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; - if (user_workspace_size > 0) { - workspace_size_limit = user_workspace_size * 1024 * 1024; - } - + int iwo_groups = groups; + int c_groups = 1; +#if CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + ConvArgs args1{&transformed_output_grad, filter, + &input_transpose, strides, + padding_common, dilations}; + ConvArgs 
args2{&transformed_output_grad, filter, + &input_transpose, strides, + padding_common, dilations}; + cudnnConvolutionFwdAlgo_t data_algo{}; + cudnnConvolutionBwdFilterAlgo_t filter_algo{}; + + auto layout_tensor = GetCudnnTensorFormat(layout); + size_t workspace_size = 0; auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); + auto dtype = platform::CudnnDataType::type; + bool deterministic = FLAGS_cudnn_deterministic; + T* input_grad_data = nullptr; + T* filter_grad_data = nullptr; + if (input_grad) + input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + if (filter_grad) + filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + if (input_grad) { - // choose backward algorithm for data - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm( - handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &data_algo)); - - if (FLAGS_cudnn_deterministic) { - data_algo = static_cast(1); - } - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( - handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_input_desc, data_algo, &fwd_ws_size)); - workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size); + input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + args1.handle = handle; + args1.idesc.set(transformed_output_grad, iwo_groups); + args1.wdesc.set(*filter, layout_tensor, iwo_groups); + args1.odesc.set(input_transpose, iwo_groups); + args1.cdesc.set(dtype, padding_common, strides, dilations, c_groups); + using search1 = SearchAlgorithm; + data_algo = search1::Find(args1, false, deterministic, 0, ctx); + workspace_size = + std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); } if (filter_grad) { - // choose backward algorithm for filter - PADDLE_ENFORCE_CUDA_SUCCESS( - 
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, - cudnn_filter_desc, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &filter_algo)); - - if (FLAGS_cudnn_deterministic) { - filter_algo = static_cast(1); - } - // get workspace for backwards filter algorithm - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, - cudnn_filter_desc, filter_algo, &bwd_filter_ws_size)); - workspace_size_in_bytes = - std::max(workspace_size_in_bytes, bwd_filter_ws_size); + filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + args2.handle = handle; + args2.idesc.set(transformed_output_grad, iwo_groups); + args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups); + args2.odesc.set(input_transpose, iwo_groups); + args2.cdesc.set(dtype, padding_common, strides, dilations, c_groups); + using search2 = SearchAlgorithm; + filter_algo = search2::Find(args2, false, deterministic, 1, ctx); + workspace_size = std::max(workspace_size, + search2::GetWorkspaceSize(args2, filter_algo)); } // ------------------- cudnn conv backward data --------------------- @@ -517,19 +481,18 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T alpha = static_cast(1.0), beta = static_cast(0.0); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); if (input_grad) { - T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. 
for (int g = 0; g < groups; g++) { auto cudnn_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_filter_desc, - filter_data + filter_offset * g, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_input_desc, input_grad_data + input_offset * g)); + handle, &alpha, args1.idesc.desc(), + output_grad_data + output_grad_offset * g, args1.wdesc.desc(), + filter_data + filter_offset * g, args1.cdesc.desc(), + data_algo, cudnn_workspace, workspace_size, &beta, + args1.odesc.desc(), input_grad_data + input_offset * g)); }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + workspace_handle.RunFunc(cudnn_func, workspace_size); } if (data_layout == DataLayout::kNHWC) { @@ -553,20 +516,19 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { - T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. 
// Gradient with respect to the filter for (int g = 0; g < groups; g++) { auto cudnn_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_input_desc, - input_data + input_offset * g, cudnn_conv_desc, filter_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_filter_desc, filter_grad_data + filter_offset * g)); + handle, &alpha, args2.idesc.desc(), + output_grad_data + output_grad_offset * g, args2.odesc.desc(), + input_data + input_offset * g, args2.cdesc.desc(), + filter_algo, cudnn_workspace, workspace_size, &beta, + args2.wdesc.desc(), filter_grad_data + filter_offset * g)); }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + workspace_handle.RunFunc(cudnn_func, workspace_size); } } } -- GitLab