From c331e2ce2031d68a553bc9469a07c30d718438f3 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Mon, 7 Nov 2022 22:32:20 +0800 Subject: [PATCH] Define ConvRunner to wrapper the call of cudnn conv functions. (#47576) * Define ConvRunner to wrapper the call of cudnn conv functions. * Use ConvKind in SearchAlgorithm. --- paddle/phi/kernels/gpudnn/conv_cudnn_v7.h | 161 ++++++++++++- paddle/phi/kernels/gpudnn/conv_gpudnn_base.h | 4 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 220 +++++++---------- paddle/phi/kernels/gpudnn/conv_kernel.cu | 36 ++- .../gpudnn/conv_transpose_grad_kernel.cu | 221 ++++++++---------- .../kernels/gpudnn/conv_transpose_kernel.cu | 38 ++- 6 files changed, 367 insertions(+), 313 deletions(-) diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h index 60d5854340f..12afa223f16 100644 --- a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h @@ -115,7 +115,7 @@ void ChooseAlgoByWorkspace(const std::vector& perf_results, } } -template +template struct SearchAlgorithmBase {}; // cuDNN convolution forward algorithm searcher, consisted of three searching @@ -123,9 +123,10 @@ struct SearchAlgorithmBase {}; // As well as one workspace size acquirsition function with respect to // the chosen alogrithm. template <> -struct SearchAlgorithmBase { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionFwdAlgoPerf_t; using AlgoT = cudnnConvolutionFwdAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = phi::autotune::AlgorithmType::kConvForward; @@ -296,9 +297,10 @@ struct SearchAlgorithmBase { // As well as one workspace size acquirsition function with // respect to the chosen alogrithm. template <> -struct SearchAlgorithmBase { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; using AlgoT = cudnnConvolutionBwdDataAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = phi::autotune::AlgorithmType::kConvBackwardData; @@ -478,9 +480,10 @@ struct SearchAlgorithmBase { // exhaustive_search mode. As well as one workspace size acquirsition function // with respect to the chosen alogrithm. template <> -struct SearchAlgorithmBase { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; using AlgoT = cudnnConvolutionBwdFilterAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = phi::autotune::AlgorithmType::kConvBackwardFilter; @@ -684,9 +687,9 @@ struct SearchAlgorithmBase { } }; -template -struct SearchAlgorithm : public SearchAlgorithmBase { - using AlgoT = typename SearchAlgorithmBase::AlgoT; +template +struct SearchAlgorithm : public SearchAlgorithmBase { + using AlgoT = typename SearchAlgorithmBase::AlgoT; template static SearchResult Find(const phi::GPUContext& ctx, @@ -700,7 +703,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { SetConvMathType(ctx, dtype, args.cdesc); if (deterministic) { - result = SearchAlgorithmBase::FindAlgoDeterministic(args); + result = SearchAlgorithmBase::FindAlgoDeterministic(args); } else { // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. // 2. Once turning on auto-tune, run heuristic (default) before @@ -710,7 +713,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { // default mode for the rest. 
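For orientation, here is a minimal, self-contained sketch of how the search templates above are keyed once the specializations dispatch on ConvKind. Every cuDNN-specific member is elided, and the FwdAlgo/BwdDataAlgo/BwdFilterAlgo aliases are placeholders for the real cudnnConvolution*Algo_t enums, so the actual declarations in conv_cudnn_v7.h are richer than this outline.

// Minimal sketch of the ConvKind-keyed search dispatch; placeholder types
// stand in for the real cudnnConvolution*Algo_t enums.
enum class ConvKind { kForward = 1, kBackwardData = 2, kBackwardFilter = 3 };

// Placeholder algorithm types (the real code uses cudnnConvolutionFwdAlgo_t,
// cudnnConvolutionBwdDataAlgo_t and cudnnConvolutionBwdFilterAlgo_t).
using FwdAlgo = int;
using BwdDataAlgo = int;
using BwdFilterAlgo = int;

// Primary template stays empty; each ConvKind supplies a specialization that
// carries its AlgoT plus the FindAlgoDeterministic/Heuristic/ExhaustiveSearch
// strategies and GetWorkspaceSize.
template <ConvKind CK>
struct SearchAlgorithmBase {};

template <>
struct SearchAlgorithmBase<ConvKind::kForward> {
  using AlgoT = FwdAlgo;
};

template <>
struct SearchAlgorithmBase<ConvKind::kBackwardData> {
  using AlgoT = BwdDataAlgo;
};

template <>
struct SearchAlgorithmBase<ConvKind::kBackwardFilter> {
  using AlgoT = BwdFilterAlgo;
};

// SearchAlgorithm layers the shared cache/autotune policy (the Find() logic
// above) on top of whichever per-kind base it inherits.
template <ConvKind CK>
struct SearchAlgorithm : public SearchAlgorithmBase<CK> {
  using AlgoT = typename SearchAlgorithmBase<CK>::AlgoT;
};

int main() {
  // Call sites now name the searcher by ConvKind, e.g. for backward-data:
  using search_bwd_data = SearchAlgorithm<ConvKind::kBackwardData>;
  static_assert(sizeof(search_bwd_data) >= 1, "sketch only");
  return 0;
}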
auto key = args.ConvertToConvCacheKey(); auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv( - SearchAlgorithmBase::kAlgoType); + SearchAlgorithmBase::kAlgoType); bool find_in_cache = cache.Find(key); if (find_in_cache) { auto t = cache.Get(key); @@ -727,7 +730,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { // Once autotune is enabled, the autotuned result can rewrite the // previous result in cache found by heuristic method. result = - SearchAlgorithmBase::template FindAlgoExhaustiveSearch( + SearchAlgorithmBase::template FindAlgoExhaustiveSearch( args, ctx); cache.Set(key, phi::autotune::ConvAutoTuneResult( @@ -735,7 +738,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { result.workspace_size, true)); } else if (!find_in_cache) { - result = SearchAlgorithmBase::FindAlgoHeuristic(args, ctx); + result = SearchAlgorithmBase::FindAlgoHeuristic(args, ctx); cache.Set(key, phi::autotune::ConvAutoTuneResult( static_cast(result.algo), @@ -744,7 +747,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { } } } - VLOG(3) << "[cuDNN " << SearchAlgorithmBase::GetPerfName() + VLOG(3) << "[cuDNN " << SearchAlgorithmBase::GetPerfName() << "] exhaustive_search=" << exhaustive_search << ", use_autotune=" << use_autotune << ", deterministic=" << deterministic @@ -783,4 +786,138 @@ struct SearchAlgorithm : public SearchAlgorithmBase { } }; +template +struct ConvRunner {}; + +template +struct ConvRunner { + static void Apply( + const phi::GPUContext& ctx, + const ConvArgs& args, + const SearchResult& search_result, + const T* input_ptr, + const T* filter_ptr, + T* output_ptr, + int groups, + int group_offset_in, + int group_offset_filter, + int group_offset_out, + size_t workspace_size, + phi::DnnWorkspaceHandle* workspace_handle, + bool use_addto = false) { + ScalingParamType alpha = 1.0f; + ScalingParamType beta = use_addto ? 1.0f : 0.0f; + + auto cudnn_handle = ctx.cudnn_handle(); + for (int i = 0; i < groups; i++) { + workspace_handle->RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnConvolutionForward( + cudnn_handle, + &alpha, + args.idesc.desc(), + input_ptr + i * group_offset_in, + args.wdesc.desc(), + filter_ptr + i * group_offset_filter, + args.cdesc.desc(), + search_result.algo, + workspace_ptr, + workspace_size, + &beta, + args.odesc.desc(), + output_ptr + i * group_offset_out)); + }, + workspace_size); + } + } +}; + +template +struct ConvRunner { + static void Apply( + const phi::GPUContext& ctx, + const ConvArgs& args, + const SearchResult& search_result, + const T* output_grad_ptr, + const T* filter_ptr, + T* input_grad_ptr, + int groups, + int group_offset_in, + int group_offset_filter, + int group_offset_out, + size_t workspace_size, + phi::DnnWorkspaceHandle* workspace_handle, + bool use_addto = false) { + ScalingParamType alpha = 1.0f; + ScalingParamType beta = use_addto ? 
1.0f : 0.0f; + + auto cudnn_handle = ctx.cudnn_handle(); + for (int i = 0; i < groups; i++) { + workspace_handle->RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnConvolutionBackwardData( + cudnn_handle, + &alpha, + args.wdesc.desc(), + filter_ptr + i * group_offset_filter, + args.odesc.desc(), + output_grad_ptr + i * group_offset_out, + args.cdesc.desc(), + search_result.algo, + workspace_ptr, + workspace_size, + &beta, + args.idesc.desc(), + input_grad_ptr + i * group_offset_in)); + }, + workspace_size); + } + } +}; + +template +struct ConvRunner { + static void Apply( + const phi::GPUContext& ctx, + const ConvArgs& args, + const SearchResult& search_result, + const T* output_grad_ptr, + const T* input_ptr, + T* filter_grad_ptr, + int groups, + int group_offset_in, + int group_offset_filter, + int group_offset_out, + size_t workspace_size, + phi::DnnWorkspaceHandle* workspace_handle, + bool use_addto = false) { + ScalingParamType alpha = 1.0f; + ScalingParamType beta = use_addto ? 1.0f : 0.0f; + + auto cudnn_handle = ctx.cudnn_handle(); + for (int i = 0; i < groups; i++) { + workspace_handle->RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnConvolutionBackwardFilter( + cudnn_handle, + &alpha, + args.idesc.desc(), + input_ptr + i * group_offset_in, + args.odesc.desc(), + output_grad_ptr + i * group_offset_out, + args.cdesc.desc(), + search_result.algo, + workspace_ptr, + workspace_size, + &beta, + args.wdesc.desc(), + filter_grad_ptr + i * group_offset_filter)); + }, + workspace_size); + } + } +}; + } // namespace phi diff --git a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h index 537fee8a682..8f3604ed42e 100644 --- a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h +++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h @@ -34,7 +34,9 @@ template using ScalingParamType = typename paddle::platform::CudnnDataType::ScalingParamType; -// As the container of searchAlgorithm::Find() result. +enum class ConvKind { kForward = 1, kBackwardData = 2, kBackwardFilter = 3 }; + +// The container of SearchAlgorithm::Find() result. 
template struct SearchResult { SearchResult() {} diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index a6934596332..6d7c486a75f 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -376,7 +376,7 @@ void ConvCudnnGradKernel(const Context& ctx, bwd_result.algo = search1::Find( args1, exhaustive_search, deterministic, workspace_size, ctx); #else - using search1 = SearchAlgorithm; + using search1 = SearchAlgorithm; bwd_result = search1::Find(ctx, args1, exhaustive_search, deterministic); workspace_size = std::max(workspace_size, bwd_result.workspace_size); #endif @@ -401,7 +401,7 @@ void ConvCudnnGradKernel(const Context& ctx, filter_result.algo = search2::Find( args2, exhaustive_search, deterministic, workspace_size, ctx); #else - using search2 = SearchAlgorithm; + using search2 = SearchAlgorithm; filter_result = search2::Find(ctx, args2, exhaustive_search, deterministic); VLOG(3) << "filter algo: " << filter_result.algo << ", time " @@ -481,30 +481,22 @@ void ConvCudnnGradKernel(const Context& ctx, }, workspace_size); } - #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args1.wdesc.desc(), - filter_data + i * group_offset_filter, - args1.odesc.desc(), - output_grad_data + i * group_offset_out, - args1.cdesc.desc(), - bwd_result.algo, - cudnn_workspace_ptr, - workspace_size, - &beta, - args1.idesc.desc(), - transformed_input_grad_data + i * group_offset_in)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args1, + bwd_result, + output_grad_data, + filter_data, + transformed_input_grad_data, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + use_addto); #endif + if (!is_sys_pad) { std::vector starts(transformed_input_channel.dims().size(), 0); std::vector axes(transformed_input_channel.dims().size(), 0); @@ -536,8 +528,6 @@ void ConvCudnnGradKernel(const Context& ctx, } } - // filter_grad do not use inplace addto. - ScalingParamType beta_filter = 0.0f; // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { // Because beta is zero, it is unnecessary to reset filter_grad. 
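To make call sites such as the ConvRunner::Apply replacement above easier to follow, the sketch below outlines the wrapper's contract for the forward kind: one static Apply() that loops over groups inside the workspace callback and derives beta from use_addto. Ctx, Args, Result and WorkspaceHandle are simplified placeholders for phi::GPUContext, ConvArgs, SearchResult and DnnWorkspaceHandle, and the cuDNN call itself is stubbed with a print, so this mirrors the structure rather than the exact signatures in conv_cudnn_v7.h.

#include <cstddef>
#include <functional>
#include <iostream>

// Simplified stand-ins for phi::GPUContext, ConvArgs, SearchResult and the
// DNN workspace handle; only what the sketch needs.
struct Ctx {};
struct Args {};
template <typename AlgoT> struct Result { AlgoT algo{}; };
struct WorkspaceHandle {
  void RunFunc(const std::function<void(void*)>& fn, size_t /*bytes*/) {
    fn(nullptr);  // the real handle allocates GPU workspace before calling fn
  }
};

enum class ConvKind { kForward = 1, kBackwardData = 2, kBackwardFilter = 3 };

template <typename T, ConvKind CK>
struct ConvRunner {};

// Forward specialization; the other two kinds differ only in which backend
// call is issued and which tensor plays the "output" role.
template <typename T>
struct ConvRunner<T, ConvKind::kForward> {
  static void Apply(const Ctx& /*ctx*/,
                    const Args& /*args*/,
                    const Result<int>& search_result,
                    const T* input,
                    const T* filter,
                    T* output,
                    int groups,
                    int group_offset_in,
                    int group_offset_filter,
                    int group_offset_out,
                    size_t workspace_size,
                    WorkspaceHandle* workspace_handle,
                    bool use_addto = false) {
    const T alpha = static_cast<T>(1);
    // use_addto keeps the existing output and accumulates into it.
    const T beta = use_addto ? static_cast<T>(1) : static_cast<T>(0);
    for (int i = 0; i < groups; ++i) {
      workspace_handle->RunFunc(
          [&](void* workspace_ptr) {
            // Stub for cudnnConvolutionForward(...): report what would be run.
            std::cout << "group " << i << " algo=" << search_result.algo
                      << " alpha=" << alpha << " beta=" << beta
                      << " in+=" << i * group_offset_in
                      << " w+=" << i * group_offset_filter
                      << " out+=" << i * group_offset_out
                      << " ws=" << workspace_size
                      << (workspace_ptr ? "" : " (no workspace)") << "\n";
            (void)input; (void)filter; (void)output;
          },
          workspace_size);
    }
  }
};

int main() {
  WorkspaceHandle handle;
  float in[8] = {}, w[8] = {}, out[8] = {};
  ConvRunner<float, ConvKind::kForward>::Apply(
      Ctx{}, Args{}, Result<int>{}, in, w, out,
      /*groups=*/2, /*group_offset_in=*/4, /*group_offset_filter=*/4,
      /*group_offset_out=*/4, /*workspace_size=*/0, &handle,
      /*use_addto=*/false);
  return 0;
}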
@@ -562,27 +552,19 @@ void ConvCudnnGradKernel(const Context& ctx, }, workspace_size); #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardFilter( - handle, - &alpha, - args2.idesc.desc(), - input_data + i * group_offset_in, - args2.odesc.desc(), - output_grad_data + i * group_offset_out, - args2.cdesc.desc(), - filter_result.algo, - cudnn_workspace_ptr, - workspace_size, - &beta_filter, - args2.wdesc.desc(), - filter_grad_data + i * group_offset_filter)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args2, + filter_result, + output_grad_data, + input_data, + filter_grad_data, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + false); #endif if (compute_format == paddle::platform::DataLayout::kNHWC) { @@ -952,7 +934,7 @@ void ConvCudnnGradGradKernel( fwd_result1.algo = search1::Find( args1, exhaustive_search, false, workspace_size, ctx); #else - using search1 = SearchAlgorithm; + using search1 = SearchAlgorithm; fwd_result1 = search1::Find(ctx, args1, exhaustive_search, false); workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); #endif @@ -977,7 +959,7 @@ void ConvCudnnGradGradKernel( fwd_result2.algo = search2::Find( args2, exhaustive_search, false, workspace_size, ctx); #else - using search2 = SearchAlgorithm; + using search2 = SearchAlgorithm; fwd_result2 = search2::Find(ctx, args2, exhaustive_search, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); @@ -1003,7 +985,7 @@ void ConvCudnnGradGradKernel( filter_result.algo = search3::Find( args3, exhaustive_search, deterministic, workspace_size, ctx); #else - using search3 = SearchAlgorithm; + using search3 = SearchAlgorithm; filter_result = search3::Find(ctx, args3, exhaustive_search, deterministic); workspace_size = std::max( @@ -1030,7 +1012,7 @@ void ConvCudnnGradGradKernel( data_result.algo = search4::Find( args4, exhaustive_search, deterministic, workspace_size, ctx); #else - using search4 = SearchAlgorithm; + using search4 = SearchAlgorithm; data_result = search4::Find(ctx, args4, exhaustive_search, deterministic); workspace_size = std::max( @@ -1088,27 +1070,19 @@ void ConvCudnnGradGradKernel( }, workspace_size); #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionForward( - handle, - &alpha, - args1.idesc.desc(), - ddx + i * group_offset_in, - args1.wdesc.desc(), - w + i * group_offset_filter, - args1.cdesc.desc(), - fwd_result1.algo, - workspace_ptr, - workspace_size, - &beta, - args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args1, + fwd_result1, + ddx, + w, + transformed_ddy_channel, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + false); #endif } if (ddW) { @@ -1134,27 +1108,19 @@ void ConvCudnnGradGradKernel( }, workspace_size); #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionForward( - handle, - &alpha, - args2.idesc.desc(), - x + i * group_offset_in, - args2.wdesc.desc(), - ddw + i * group_offset_filter, - args2.cdesc.desc(), - fwd_result2.algo, - workspace_ptr, - 
workspace_size, - &alpha, - args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args2, + fwd_result2, + x, + ddw, + transformed_ddy_channel, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + true); #endif } if (channel_last) { @@ -1185,27 +1151,19 @@ void ConvCudnnGradGradKernel( }, workspace_size); #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardFilter( - handle, - &alpha, - args3.idesc.desc(), - ddx + i * group_offset_in, - args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), - filter_result.algo, - workspace_ptr, - workspace_size, - &beta, - args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args3, + filter_result, + transformed_dy_channel, + ddx, + dw, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + false); #endif } @@ -1232,27 +1190,19 @@ void ConvCudnnGradGradKernel( }, workspace_size); #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args4.wdesc.desc(), - ddw + i * group_offset_filter, - args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.cdesc.desc(), - data_result.algo, - workspace_ptr, - workspace_size, - &beta, - args4.idesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args4, + data_result, + transformed_dy_channel, + ddw, + transformed_dx, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + false); #endif if (!is_sys_pad) { diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index a393cc278f2..ba4cc129907 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -315,7 +315,7 @@ void ConvCudnnKernel(const Context& ctx, args, exhaustive_search, deterministic, workspace_size, ctx); #else SearchResult fwd_result; - using search = SearchAlgorithm; + using search = SearchAlgorithm; fwd_result = search::Find(ctx, args, exhaustive_search, deterministic); workspace_size = fwd_result.workspace_size; #endif @@ -359,27 +359,19 @@ void ConvCudnnKernel(const Context& ctx, }, workspace_size); #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionForward( - handle, - &alpha, - args.idesc.desc(), - input_data + i * group_offset_in, - args.wdesc.desc(), - filter_data + i * group_offset_filter, - args.cdesc.desc(), - fwd_result.algo, - workspace_ptr, - workspace_size, - &beta, - args.odesc.desc(), - output_data + i * group_offset_out)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args, + fwd_result, + input_data, + filter_data, + output_data, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + false); #endif if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu 
b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index f2c3cd0cc69..78961c86b07 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -227,7 +227,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, fwd_result.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); #else - using search1 = SearchAlgorithm; + using search1 = SearchAlgorithm; fwd_result = search1::Find(ctx, args1, false, deterministic, false); workspace_size = std::max( workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); @@ -252,7 +252,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, filter_result.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); #else - using search2 = SearchAlgorithm; + using search2 = SearchAlgorithm; filter_result = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); @@ -269,9 +269,9 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ScalingParamType beta = 0.0f; auto workspace_handle = ctx.cudnn_workspace_handle(); if (dx) { +#ifdef PADDLE_WITH_HIP // Because beta is zero, it is unnecessary to reset dx. for (int g = 0; g < groups; g++) { -#ifdef PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenConvolutionForward(handle, @@ -288,26 +288,23 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, cudnn_workspace, workspace_size)); }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + } #else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cudnnConvolutionForward(handle, - &alpha, - args1.idesc.desc(), - dout_data + dout_offset * g, - args1.wdesc.desc(), - filter_data + filter_offset * g, - args1.cdesc.desc(), - fwd_result.algo, - cudnn_workspace, + ConvRunner::Apply(ctx, + args1, + fwd_result, + dout_data, + filter_data, + dx_data, + groups, + dout_offset, + filter_offset, + x_offset, workspace_size, - &beta, - args1.odesc.desc(), - dx_data + x_offset * g)); - }; + &workspace_handle, + false); #endif // PADDLE_WITH_HIP - workspace_handle.RunFunc(cudnn_func, workspace_size); - } if (data_layout == GPUDNNDataLayout::kNHWC) { DenseTensor dx_transpose; @@ -330,8 +327,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, if (dfilter) { // Because beta is zero, it is unnecessary to reset dfilter. 
// Gradient with respect to the filter - for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP + for (int g = 0; g < groups; g++) { auto cudnn_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights( handle, @@ -348,26 +345,23 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, cudnn_workspace, workspace_size)); }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter( - handle, - &alpha, - args2.idesc.desc(), - dout_data + dout_offset * g, - args2.odesc.desc(), - x_data + x_offset * g, - args2.cdesc.desc(), - filter_result.algo, - cudnn_workspace, - workspace_size, - &beta, - args2.wdesc.desc(), - dfilter_data + filter_offset * g)); - }; -#endif // PADDLE_WITH_HIP workspace_handle.RunFunc(cudnn_func, workspace_size); } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(ctx, + args2, + filter_result, + x_data, + dout_data, + dfilter_data, + groups, + dout_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP } } @@ -704,7 +698,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( bwd_result1.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); #else - using search1 = SearchAlgorithm; + using search1 = SearchAlgorithm; bwd_result1 = search1::Find(ctx, args1, false, deterministic, false); workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); #endif @@ -726,7 +720,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( bwd_result2.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); #else - using search2 = SearchAlgorithm; + using search2 = SearchAlgorithm; bwd_result2 = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); @@ -751,7 +745,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( filter_result.algo = search3::Find(args3, false, deterministic, workspace_size, ctx); #else - using search3 = SearchAlgorithm; + using search3 = SearchAlgorithm; filter_result = search3::Find(ctx, args3, false, deterministic, false); workspace_size = std::max( workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); @@ -777,7 +771,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( fwd_result.algo = search4::Find(args4, false, deterministic, workspace_size, ctx); #else - using search4 = SearchAlgorithm; + using search4 = SearchAlgorithm; fwd_result = search4::Find(ctx, args4, false, deterministic, false); workspace_size = std::max( workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); @@ -815,8 +809,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( if (ddout) { ddx_ = transformed_ddx.data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( @@ -835,30 +829,25 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size)); }, workspace_size); + } #else // PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args1.wdesc.desc(), - filter_ + i * group_offset_filter, - args1.odesc.desc(), - ddx_ + i * group_offset_in, - args1.cdesc.desc(), - bwd_result1.algo, - workspace_ptr, - workspace_size, - &beta, - args1.idesc.desc(), - transformed_ddout_channel_ + i * group_offset_out)); - 
}, - workspace_size); + ConvRunner::Apply(ctx, + args1, + bwd_result1, + ddx_, + filter_, + transformed_ddout_channel_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); #endif // PADDLE_WITH_HIP - } - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { // MIOPEN ONLY support beta to be 0.0f DenseTensor conv_x_ddfilter(dout.type()); conv_x_ddfilter.Resize(transformed_ddout_channel.dims()); @@ -893,27 +882,22 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( &beta, args2.idesc.desc(), transformed_ddout_channel_ + i * group_offset_out)); + } #else // PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args2.wdesc.desc(), - ddfilter_ + i * group_offset_filter, - args2.odesc.desc(), - x_ + i * group_offset_in, - args2.cdesc.desc(), - bwd_result2.algo, - workspace_ptr, - workspace_size, - &alpha, - args2.idesc.desc(), - transformed_ddout_channel_ + i * group_offset_out)); - }, - workspace_size); + ConvRunner::Apply(ctx, + args2, + bwd_result2, + x_, + ddfilter_, + transformed_ddout_channel_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + true); #endif // PADDLE_WITH_HIP - } if ((!is_sys_pad) && (!channel_last)) { if (strides.size() == 2U) { @@ -947,8 +931,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( T* transformed_dout_channel_ = transformed_dout.data(); if (dfilter) { ddx_ = transformed_ddx_channel.data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( @@ -968,33 +952,28 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size)); }, workspace_size); + } #else // PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter( - handle, - &alpha, - args3.idesc.desc(), - transformed_dout_channel_ + i * group_offset_out, - args3.odesc.desc(), - ddx_ + i * group_offset_in, - args3.cdesc.desc(), - filter_result.algo, - workspace_ptr, - workspace_size, - &beta, - args3.wdesc.desc(), - dfilter_ + i * group_offset_filter)); - }, - workspace_size); + ConvRunner::Apply(ctx, + args3, + filter_result, + ddx_, + transformed_dout_channel_, + dfilter_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); #endif // PADDLE_WITH_HIP - } } if (dx) { ddfilter_ = ddfilter.data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward( @@ -1013,27 +992,23 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size)); }, workspace_size); + } #else // PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionForward( - handle, - &alpha, - args4.idesc.desc(), - transformed_dout_channel_ + i * group_offset_out, - args4.wdesc.desc(), - ddfilter_ + i * group_offset_filter, - args4.cdesc.desc(), - fwd_result.algo, - workspace_ptr, - workspace_size, - &beta, - args4.odesc.desc(), - transformed_dx_ + i * group_offset_in)); - }, - workspace_size); + ConvRunner::Apply(ctx, + args4, + fwd_result, + transformed_dout_channel_, + ddfilter_, + 
transformed_dx_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); #endif // PADDLE_WITH_HIP - } + if (channel_last) { TransToChannelLast(ctx, &transformed_dx_channel, dx); } diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 13e668f7602..593114ac65f 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -227,7 +227,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, search::Find(args, false, deterministic, workspace_size, ctx); #else SearchResult bwd_result; - using search = SearchAlgorithm; + using search = SearchAlgorithm; bwd_result = search::Find(ctx, args, false, deterministic, false); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); @@ -240,8 +240,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ScalingParamType alpha = 1.0f; ScalingParamType beta = 0.0f; auto workspace_handle = ctx.cudnn_workspace_handle(); - for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP + for (int g = 0; g < groups; g++) { auto cudnn_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( handle, @@ -258,26 +258,24 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, cudnn_workspace, workspace_size)); }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args.wdesc.desc(), - filter_data + filter_offset * g, - args.odesc.desc(), - x_data + x_offset * g, - args.cdesc.desc(), - bwd_result.algo, - cudnn_workspace, - workspace_size, - &beta, - args.idesc.desc(), - transformed_out_data + out_offset * g)); - }; -#endif // PADDLE_WITH_HIP workspace_handle.RunFunc(cudnn_func, workspace_size); } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(ctx, + args, + bwd_result, + x_data, + filter_data, + transformed_out_data, + groups, + out_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + if (!is_sys_pad && strides.size() == 2U) { funcs::Slice(ctx, &transformed_out, out, starts, ends, axes); } else if (!is_sys_pad && strides.size() == 3U) { -- GitLab
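As a closing illustration of the policy that SearchAlgorithm::Find() layers on top of the per-kind searchers, the self-contained sketch below reproduces the decision order described in the header comments: deterministic mode bypasses the cache, the exhaustive flag or an active autotune pass forces exhaustive search and upgrades any heuristic entry, and everything else is served from the cache or filled by the heuristic. The Cache, AlgoResult and Find* helpers are simplified placeholders for phi::autotune's types, so treat this as an outline of the control flow, not the real API.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <unordered_map>

// Simplified stand-ins for phi::autotune's cache entry and conv cache key.
struct AlgoResult {
  int algo;
  size_t workspace_size;
  bool exhaustive_search;  // true once the entry came from exhaustive tuning
};
using Cache = std::unordered_map<uint64_t, AlgoResult>;

// Placeholder strategies; the real ones live in SearchAlgorithmBase<CK>.
AlgoResult FindDeterministic() { return AlgoResult{1, 0, false}; }
AlgoResult FindHeuristic() { return AlgoResult{2, 1u << 20, false}; }
AlgoResult FindExhaustive() { return AlgoResult{3, 4u << 20, true}; }

AlgoResult Find(uint64_t key,
                bool exhaustive_flag,
                bool use_autotune,
                bool deterministic,
                Cache* cache) {
  if (deterministic) {
    return FindDeterministic();  // ignores the cache and tuning entirely
  }
  auto it = cache->find(key);
  const bool found = (it != cache->end());
  // Exhaustive search runs when requested and the cached entry (if any)
  // was only produced by the heuristic; its result overwrites the cache.
  const bool need_exhaustive = (exhaustive_flag || use_autotune) &&
                               (!found || !it->second.exhaustive_search);
  if (need_exhaustive) {
    AlgoResult r = FindExhaustive();
    (*cache)[key] = r;
    return r;
  }
  if (!found) {
    AlgoResult r = FindHeuristic();  // default mode, cached without upgrade
    (*cache)[key] = r;
    return r;
  }
  return it->second;  // cache hit (heuristic or previously tuned)
}

int main() {
  Cache cache;
  // First call, autotune off: the heuristic result is cached.
  AlgoResult a = Find(/*key=*/42, false, false, false, &cache);
  // Later call during an autotune pass: the cached entry is upgraded once.
  AlgoResult b = Find(/*key=*/42, false, true, false, &cache);
  std::cout << "first algo=" << a.algo << ", tuned algo=" << b.algo << "\n";
  return 0;
}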