From bc47e7ac2c63e2ba12984f71a82f2d312eb6ef18 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Mon, 24 Oct 2022 16:21:38 +0800 Subject: [PATCH] Enhance the implementation of some conv functions. (#47281) --- paddle/phi/kernels/gpudnn/conv_cudnn_v7.h | 83 +++++++------------ paddle/phi/kernels/gpudnn/conv_gpudnn_base.h | 17 ++-- .../kernels/gpudnn/conv_grad_grad_kernel.cu | 34 ++++---- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 50 +++++------ paddle/phi/kernels/gpudnn/conv_kernel.cu | 11 ++- .../gpudnn/conv_transpose_grad_kernel.cu | 45 +++++----- .../kernels/gpudnn/conv_transpose_kernel.cu | 4 +- 7 files changed, 112 insertions(+), 132 deletions(-) diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h index 7010365f0d..60d5854340 100644 --- a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h @@ -75,9 +75,9 @@ void ChooseAlgoByWorkspace(const std::vector& perf_results, SearchResult* search_result) { int best_algo_idx = -1; for (size_t i = 0; i < perf_results.size(); ++i) { - auto result = perf_results[i]; + const auto& result = perf_results[i]; if (result.status == CUDNN_STATUS_SUCCESS && - result.memory < workspace_limit) { + result.memory <= workspace_limit) { if (best_algo_idx == -1) { // The algorithm which has minimize time cost and need a workspace_size // fitting the workspace_limit constraint. @@ -87,8 +87,10 @@ void ChooseAlgoByWorkspace(const std::vector& perf_results, break; } } else { - float best_algo_time = perf_results[best_algo_idx].time; - if ((result.time - best_algo_time) / best_algo_time < 0.01) { + // Compared to the next suboptimal algorithm, if the best one only has + // 1% performance difference, we'd like to pick the one which need less + // memory. + if (result.time < 1.01 * perf_results[best_algo_idx].time) { best_algo_idx = (result.memory < perf_results[best_algo_idx].memory) ? i : best_algo_idx; @@ -98,9 +100,15 @@ void ChooseAlgoByWorkspace(const std::vector& perf_results, } } if (best_algo_idx != -1) { - search_result->algo = perf_results[best_algo_idx].algo; - search_result->time = perf_results[best_algo_idx].time; - search_result->workspace_size = perf_results[best_algo_idx].memory; + const auto& result = perf_results[best_algo_idx]; + search_result->algo = result.algo; + search_result->time = result.time; + search_result->workspace_size = result.memory; + auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F"; + VLOG(3) << "Choose algo=" << result.algo + << ", tensor_core=" << math_type_str << ", time=" << result.time + << " ms, memory=" << ToMegaBytes(result.memory) + << " MB, status=" << result.status; } else { VLOG(3) << "Can not find an algorithm that requires memory < " << ToMegaBytes(workspace_limit) << " MB"; @@ -626,7 +634,8 @@ struct SearchAlgorithmBase { perf_results, perf_results.size(), workspace_size_limit); - ChooseAlgo(perf_results, workspace_size_limit, &result); + ChooseAlgoByWorkspace( + perf_results, workspace_size_limit, &result); } result.workspace_size = GetWorkspaceSize(args, result.algo); @@ -673,42 +682,6 @@ struct SearchAlgorithmBase { return workspace_size_limit; } } - - static void ChooseAlgo(const std::vector& perf_results, - size_t workspace_limit, - SearchResult* algo_result) { - for (size_t i = 0; i != perf_results.size(); ++i) { - const auto& result = perf_results[i]; - if (result.status == CUDNN_STATUS_SUCCESS && - (result.memory <= workspace_limit)) { - if ((result.mathType == CUDNN_TENSOR_OP_MATH) && - (i != perf_results.size() - 1)) { - const auto& next_result = perf_results[i + 1]; - if (next_result.status == CUDNN_STATUS_SUCCESS && - next_result.algo == result.algo && - next_result.memory == result.memory && - next_result.mathType != CUDNN_TENSOR_OP_MATH && - next_result.time < 1.01 * result.time) { - // Skip over this result- it's not really a Tensor Core algo. - // Because it is only 1% performance difference. - // Prefer to choose the next equivalent non-Tensor Core algo. - continue; - } - } - algo_result->algo = result.algo; - algo_result->time = result.time; - auto math_type_str = "0"; - if (result.mathType == CUDNN_TENSOR_OP_MATH) { - math_type_str = "1"; - } - VLOG(3) << " choose algo: " << result.algo - << ", TC: " << math_type_str << ", time: " << result.time - << " ms, wksp = " << result.memory - << ", status = " << result.status; - break; - } - } - } }; template @@ -735,7 +708,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { // Auto tune is only enabled between specified range. // 3. After auto-tune process, run cached algorithm if cached, run // default mode for the rest. - auto key = args.Convert2ConvCacheKey(); + auto key = args.ConvertToConvCacheKey(); auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv( SearchAlgorithmBase::kAlgoType); bool find_in_cache = cache.Find(key); @@ -746,7 +719,6 @@ struct SearchAlgorithm : public SearchAlgorithmBase { result.exhaustive_search = t.exhaustive_search; } if (!result.exhaustive_search) { - bool need_update_cache = false; // In conv2d_tranpose, enable_autotune is set to false because some // algorithm picked by exhaustive search method produce wrong result. use_autotune = enable_autotune && @@ -757,17 +729,18 @@ struct SearchAlgorithm : public SearchAlgorithmBase { result = SearchAlgorithmBase::template FindAlgoExhaustiveSearch( args, ctx); - need_update_cache = true; + cache.Set(key, + phi::autotune::ConvAutoTuneResult( + static_cast(result.algo), + result.workspace_size, + true)); } else if (!find_in_cache) { result = SearchAlgorithmBase::FindAlgoHeuristic(args, ctx); - need_update_cache = true; - } - if (need_update_cache) { - phi::autotune::ConvAutoTuneResult node( - static_cast(result.algo), - result.workspace_size, - exhaustive_search || use_autotune); - cache.Set(key, node); + cache.Set(key, + phi::autotune::ConvAutoTuneResult( + static_cast(result.algo), + result.workspace_size, + false)); } } } diff --git a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h index 7d29e64c68..537fee8a68 100644 --- a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h +++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h @@ -69,10 +69,15 @@ static std::ostream& operator<<(std::ostream& out, const std::vector& v) { template struct ConvArgsBase { HandleT handle; - paddle::platform::TensorDescriptor idesc, odesc; + paddle::platform::TensorDescriptor idesc; + paddle::platform::TensorDescriptor odesc; paddle::platform::FilterDescriptor wdesc; paddle::platform::ConvolutionDescriptor cdesc; - const phi::DenseTensor *x, *w, *o; + + const phi::DenseTensor* x = nullptr; + const phi::DenseTensor* w = nullptr; + const phi::DenseTensor* o = nullptr; + DataT cudnn_dtype; // strides @@ -88,7 +93,8 @@ struct ConvArgsBase { // data foramt GPUDNNDataLayout data_layout; - ConvArgsBase(const phi::DenseTensor* x, + ConvArgsBase(const HandleT& h, + const phi::DenseTensor* x, const phi::DenseTensor* w, const phi::DenseTensor* o, const std::vector s, @@ -97,7 +103,8 @@ struct ConvArgsBase { DataT dtype, int g, GPUDNNDataLayout layout) - : x(x), + : handle(h), + x(x), w(w), o(o), s(s), @@ -108,7 +115,7 @@ struct ConvArgsBase { data_layout(layout) {} template - phi::autotune::ConvCacheKey Convert2ConvCacheKey() const { + phi::autotune::ConvCacheKey ConvertToConvCacheKey() const { auto x_shape = phi::vectorize(x->dims()); auto w_shape = phi::vectorize(w->dims()); VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index 7b4bd05991..de1360a596 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -257,7 +257,8 @@ void ConvCudnnGradGradKernel( auto layout = paddle::platform::GetCudnnTensorFormat( paddle::platform::DataLayout::kNCHW); - ConvArgs args1{&transformed_ddX, + ConvArgs args1{handle, + &transformed_ddX, W, &transformed_ddO_channel, strides, @@ -266,7 +267,8 @@ void ConvCudnnGradGradKernel( dtype, groups, paddle::platform::DataLayout::kNCHW}; - ConvArgs args2{&transformed_X, + ConvArgs args2{handle, + &transformed_X, ddW, &transformed_ddO_channel, strides, @@ -275,7 +277,8 @@ void ConvCudnnGradGradKernel( dtype, groups, paddle::platform::DataLayout::kNCHW}; - ConvArgs args3{&transformed_ddX, + ConvArgs args3{handle, + &transformed_ddX, dW, &transformed_dO_channel, strides, @@ -284,7 +287,8 @@ void ConvCudnnGradGradKernel( dtype, groups, paddle::platform::DataLayout::kNCHW}; - ConvArgs args4{&transformed_dX, + ConvArgs args4{handle, + &transformed_dX, ddW, &transformed_dO_channel, strides, @@ -314,7 +318,6 @@ void ConvCudnnGradGradKernel( ddy = ddO->data(); transformed_ddy_channel = transformed_ddO_channel.data(); if (ddX) { - args1.handle = handle; args1.idesc.set(transformed_ddX, iwo_group); args1.wdesc.set(*W, layout, iwo_group); args1.odesc.set(transformed_ddO_channel, iwo_group); @@ -339,7 +342,6 @@ void ConvCudnnGradGradKernel( if (ddW) { ddw = ddW->data(); - args2.handle = handle; args2.idesc.set(transformed_X, iwo_group); args2.wdesc.set(*ddW, layout, iwo_group); args2.odesc.set(transformed_ddO_channel, iwo_group); @@ -367,7 +369,6 @@ void ConvCudnnGradGradKernel( if (dW && ddX) { dw = dW->data(); - args3.handle = handle; args3.idesc.set(transformed_ddX, iwo_group); args3.wdesc.set(*dW, layout, iwo_group); args3.odesc.set(transformed_dO_channel, iwo_group); @@ -395,7 +396,6 @@ void ConvCudnnGradGradKernel( if (ddW && dX) { transformed_dx = transformed_dX.data(); - args4.handle = handle; args4.idesc.set(transformed_dX, iwo_group); args4.wdesc.set(*ddW, layout, iwo_group); args4.odesc.set(transformed_dO_channel, iwo_group); @@ -444,13 +444,13 @@ void ConvCudnnGradGradKernel( // ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : // 0.0f; // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); - auto wkspace_handle = ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); if (ddO) { if (ddX) { ddx = transformed_ddX.data(); #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::miopenConvolutionForward( @@ -471,7 +471,7 @@ void ConvCudnnGradGradKernel( workspace_size); #else for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cudnnConvolutionForward( @@ -496,7 +496,7 @@ void ConvCudnnGradGradKernel( if (ddW) { #ifdef PADDLE_WITH_HIP // MIOPEN ONLY support beta to be 0.0f - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::miopenConvolutionForward( @@ -517,7 +517,7 @@ void ConvCudnnGradGradKernel( workspace_size); #else for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cudnnConvolutionForward( @@ -547,7 +547,7 @@ void ConvCudnnGradGradKernel( if (dW && ddX) { ddx = transformed_ddX.data(); #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::miopenConvolutionBackwardWeights( @@ -568,7 +568,7 @@ void ConvCudnnGradGradKernel( workspace_size); #else for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cudnnConvolutionBackwardFilter( @@ -594,7 +594,7 @@ void ConvCudnnGradGradKernel( if (dX && ddW) { ddw = ddW->data(); #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::miopenConvolutionBackwardData( @@ -615,7 +615,7 @@ void ConvCudnnGradGradKernel( workspace_size); #else for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cudnnConvolutionBackwardData( diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index aa8cbd6c87..9a5bd1c5bc 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -251,12 +251,14 @@ void ConvCudnnGradKernel(const Context& ctx, T* input_grad_data = nullptr; T* transformed_input_grad_data = nullptr; + auto handle = ctx.cudnn_handle(); paddle::platform::DataLayout layout = compute_format == paddle::platform::DataLayout::kNHWC ? paddle::platform::DataLayout::kNHWC : paddle::platform::DataLayout::kNCHW; - ConvArgs args1{&transformed_input_grad, + ConvArgs args1{handle, + &transformed_input_grad, &transformed_filter_channel, &transformed_output_grad_channel, strides, @@ -265,7 +267,8 @@ void ConvCudnnGradKernel(const Context& ctx, dtype, groups, layout}; - ConvArgs args2{&transformed_input, + ConvArgs args2{handle, + &transformed_input, &transformed_filter_grad_channel, &transformed_output_grad_channel, strides, @@ -275,7 +278,6 @@ void ConvCudnnGradKernel(const Context& ctx, groups, layout}; - auto handle = ctx.cudnn_handle(); // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout if (transformed_input.dims().size() == 5) { @@ -332,10 +334,7 @@ void ConvCudnnGradKernel(const Context& ctx, SearchResult bwd_result; SearchResult filter_result; #endif - // input data workspace_size - size_t workspace_size_d = 0; - // weight workspace_size - size_t workspace_size_w = 0; + size_t workspace_size = 0; int iwo_groups = groups; int c_groups = 1; @@ -350,7 +349,6 @@ void ConvCudnnGradKernel(const Context& ctx, input_grad_data = input_grad->data(); transformed_input_grad_data = transformed_input_grad.data(); - args1.handle = handle; args1.idesc.set(transformed_input_grad, layout_tensor); args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups); args1.odesc.set(transformed_output_grad_channel, layout_tensor); @@ -363,21 +361,20 @@ void ConvCudnnGradKernel(const Context& ctx, #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; - workspace_size_d = - std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); + workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); bwd_result.algo = search1::Find( - args1, exhaustive_search, deterministic, workspace_size_d, ctx); + args1, exhaustive_search, deterministic, workspace_size, ctx); #else using search1 = SearchAlgorithm; bwd_result = search1::Find(ctx, args1, exhaustive_search, deterministic); - workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size); + workspace_size = std::max(workspace_size, bwd_result.workspace_size); #endif } if (filter_grad) { // ------------------- cudnn descriptors --------------------- filter_grad_data = transformed_filter_grad_channel.data(); - args2.handle = handle; + args2.idesc.set(transformed_input, layout_tensor); args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, iwo_groups); args2.odesc.set(transformed_output_grad_channel, layout_tensor); @@ -389,17 +386,16 @@ void ConvCudnnGradKernel(const Context& ctx, c_groups); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; - workspace_size_w = - std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); + workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); filter_result.algo = search2::Find( - args2, exhaustive_search, deterministic, workspace_size_w, ctx); + args2, exhaustive_search, deterministic, workspace_size, ctx); #else using search2 = SearchAlgorithm; filter_result = search2::Find(ctx, args2, exhaustive_search, deterministic); VLOG(3) << "filter algo: " << filter_result.algo << ", time " << filter_result.time; - workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size); + workspace_size = std::max(workspace_size, filter_result.workspace_size); #endif } @@ -438,9 +434,9 @@ void ConvCudnnGradKernel(const Context& ctx, args1.idesc.desc(), temp_tensor_data, cudnn_workspace_ptr, - workspace_size_d)); + workspace_size)); }, - workspace_size_d); + workspace_size); PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor( handle, miopenTensorOpAdd, @@ -470,9 +466,9 @@ void ConvCudnnGradKernel(const Context& ctx, args1.idesc.desc(), transformed_input_grad_data, cudnn_workspace_ptr, - workspace_size_d)); + workspace_size)); }, - workspace_size_d); + workspace_size); } #else @@ -490,12 +486,12 @@ void ConvCudnnGradKernel(const Context& ctx, args1.cdesc.desc(), bwd_result.algo, cudnn_workspace_ptr, - workspace_size_d, + workspace_size, &beta, args1.idesc.desc(), transformed_input_grad_data + i * group_offset_in)); }, - workspace_size_d); + workspace_size); } #endif if (!is_sys_pad) { @@ -551,9 +547,9 @@ void ConvCudnnGradKernel(const Context& ctx, args2.wdesc.desc(), filter_grad_data, cudnn_workspace_ptr, - workspace_size_w)); + workspace_size)); }, - workspace_size_w); + workspace_size); #else for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( @@ -569,12 +565,12 @@ void ConvCudnnGradKernel(const Context& ctx, args2.cdesc.desc(), filter_result.algo, cudnn_workspace_ptr, - workspace_size_w, + workspace_size, &beta_filter, args2.wdesc.desc(), filter_grad_data + i * group_offset_filter)); }, - workspace_size_w); + workspace_size); } #endif diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index a44a98450d..bbac834755 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -201,11 +201,14 @@ void ConvCudnnKernel(const Context& ctx, } const T* input_data = transformed_input.data(); - const T* filter_data = transformed_filter_channel.data(); + auto handle = ctx.cudnn_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + // ------------------- cudnn descriptors --------------------- - ConvArgs args{&transformed_input, + ConvArgs args{handle, + &transformed_input, &transformed_filter_channel, &transformed_output, strides, @@ -215,8 +218,6 @@ void ConvCudnnKernel(const Context& ctx, groups, compute_format}; - auto handle = ctx.cudnn_handle(); - auto workspace_handle = ctx.cudnn_workspace_handle(); paddle::platform::DataLayout layout = compute_format == paddle::platform::DataLayout::kNHWC ? paddle::platform::DataLayout::kNHWC @@ -228,8 +229,6 @@ void ConvCudnnKernel(const Context& ctx, } auto layout_format = paddle::platform::GetCudnnTensorFormat(layout); - args.handle = handle; - #ifdef PADDLE_WITH_HIP // MIOPEN need to set groups in cdesc in miopen_desc.h args.cdesc.set(dtype, diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index 640ebd23d9..f2c3cd0cc6 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -172,8 +172,10 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, #endif auto dtype = paddle::platform::CudnnDataType::type; + auto handle = ctx.cudnn_handle(); - ConvArgs args1{&transformed_dout, + ConvArgs args1{handle, + &transformed_dout, &filter, &x_transpose, strides, @@ -182,7 +184,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, dtype, groups, layout}; - ConvArgs args2{&transformed_dout, + ConvArgs args2{handle, + &transformed_dout, &filter, &x_transpose, strides, @@ -202,14 +205,13 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); size_t workspace_size = 0; - auto handle = ctx.cudnn_handle(); bool deterministic = FLAGS_cudnn_deterministic; T* dx_data = nullptr; T* dfilter_data = nullptr; if (dx) { dx_data = ctx.template Alloc(dx); - args1.handle = handle; + args1.idesc.set(transformed_dout, iwo_groups); args1.wdesc.set(filter, layout_tensor, iwo_groups); args1.odesc.set(x_transpose, iwo_groups); @@ -234,7 +236,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, if (dfilter) { dfilter_data = ctx.template Alloc(dfilter); - args2.handle = handle; + args2.idesc.set(transformed_dout, iwo_groups); args2.wdesc.set(*dfilter, layout_tensor, iwo_groups); args2.odesc.set(x_transpose, iwo_groups); @@ -625,7 +627,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( auto handle = ctx.cudnn_handle(); auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); - ConvArgs args1{&transformed_ddout_channel, + ConvArgs args1{handle, + &transformed_ddout_channel, &filter, &transformed_ddx, strides, @@ -634,7 +637,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( dtype, groups, GPUDNNDataLayout::kNCHW}; - ConvArgs args2{&transformed_ddout_channel, + ConvArgs args2{handle, + &transformed_ddout_channel, &ddfilter, &transformed_x, strides, @@ -644,7 +648,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( groups, GPUDNNDataLayout::kNCHW}; - ConvArgs args3{&transformed_dout, + ConvArgs args3{handle, + &transformed_dout, dfilter, &transformed_ddx_channel, strides, @@ -653,7 +658,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( dtype, groups, GPUDNNDataLayout::kNCHW}; - ConvArgs args4{&transformed_dout, + ConvArgs args4{handle, + &transformed_dout, &ddfilter, &transformed_dx_channel, strides, @@ -683,7 +689,6 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ddout_ = ddout->data(); transformed_ddout_channel_ = transformed_ddout_channel.data(); - args1.handle = handle; args1.idesc.set(transformed_ddout_channel, iwo_group); args1.wdesc.set(filter, layout, iwo_group); args1.odesc.set(transformed_ddx, iwo_group); @@ -730,7 +735,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( if (dfilter) { dfilter_ = dfilter->data(); - args3.handle = handle; + args3.idesc.set(transformed_dout, iwo_group); args3.wdesc.set(*dfilter, layout, iwo_group); args3.odesc.set(transformed_ddx_channel, iwo_group); @@ -806,13 +811,13 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ScalingParamType alpha = 1.0f; ScalingParamType beta = 0.0f; - auto wkspace_handle = ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); if (ddout) { ddx_ = transformed_ddx.data(); for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( handle, @@ -831,7 +836,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( }, workspace_size); #else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( handle, @@ -858,7 +863,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( DenseTensor conv_x_ddfilter(dout.type()); conv_x_ddfilter.Resize(transformed_ddout_channel.dims()); T* conv_x_ddfilter_data = ctx.template Alloc(&conv_x_ddfilter); - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( handle, @@ -889,7 +894,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args2.idesc.desc(), transformed_ddout_channel_ + i * group_offset_out)); #else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( handle, @@ -944,7 +949,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ddx_ = transformed_ddx_channel.data(); for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenConvolutionBackwardWeights( @@ -964,7 +969,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( }, workspace_size); #else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter( handle, @@ -990,7 +995,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ddfilter_ = ddfilter.data(); for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward( handle, @@ -1009,7 +1014,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( }, workspace_size); #else // PADDLE_WITH_HIP - wkspace_handle.RunFunc( + workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionForward( handle, diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index b98a6d6ae4..13e668f760 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -199,7 +199,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, auto dtype = paddle::platform::CudnnDataType::type; // ------------------- cudnn descriptors --------------------- - ConvArgs args{&transformed_out, + ConvArgs args{handle, + &transformed_out, &filter, &transformed_x, strides, @@ -208,7 +209,6 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, dtype, groups, data_layout}; - args.handle = handle; args.idesc.set(transformed_out, iwo_groups); args.wdesc.set(filter, layout_tensor, iwo_groups); args.odesc.set(transformed_x, iwo_groups); -- GitLab