From f4ec1563be117df485d451f87b6eb5a0912f69d1 Mon Sep 17 00:00:00 2001
From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com>
Date: Mon, 19 Sep 2022 12:39:57 +0800
Subject: [PATCH] convfusion_cache (#46054)

---
 paddle/fluid/framework/conv_search_cache.h    |  6 +-
 .../fluid/framework/operator_kernel_configs.h | 10 ++
 .../fluid/operators/fused/conv_fusion_op.cu   | 96 ++++++++++---------
 3 files changed, 67 insertions(+), 45 deletions(-)

diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h
index 4da2aeb4d04..1620c99ce85 100644
--- a/paddle/fluid/framework/conv_search_cache.h
+++ b/paddle/fluid/framework/conv_search_cache.h
@@ -55,7 +55,8 @@ class ConvSearchCache {
   AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* GetBackwardFilter() {
     return &backward_filter_cache_;
   }
-  AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetConvFusion() {
+  AlgorithmsCache<SearchFuseResult<cudnnConvolutionFwdAlgo_t>>*
+  GetConvFusion() {
     return &fusion_forward_cache_;
   }
 #endif
@@ -75,7 +76,8 @@ class ConvSearchCache {
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t> forward_cache_;
   AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t> backward_data_cache_;
   AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t> backward_filter_cache_;
-  AlgorithmsCache<cudnnConvolutionFwdAlgo_t> fusion_forward_cache_;
+  AlgorithmsCache<SearchFuseResult<cudnnConvolutionFwdAlgo_t>>
+      fusion_forward_cache_;
 #endif
 };
 
diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h
index dc798868dc6..6a6419042f1 100644
--- a/paddle/fluid/framework/operator_kernel_configs.h
+++ b/paddle/fluid/framework/operator_kernel_configs.h
@@ -24,6 +24,16 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+template <typename AlgoT>
+struct SearchFuseResult {
+  SearchFuseResult() {}
+  explicit SearchFuseResult(AlgoT a) : algo(a) {}
+
+  AlgoT algo = static_cast<AlgoT>(0);
+  float time = -1.f;
+  size_t workspace_size = 0;
+};
+
 // thread-safe.
 template <typename TAlgorithm>
 class AlgorithmsCache {
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu
index 4f05e6b6e2f..6f0ebc2c7eb 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -35,6 +35,7 @@ using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
 using DataLayout = platform::DataLayout;
 using framework::AlgorithmsCache;
 using framework::ConvSearchCache;
+using framework::SearchFuseResult;
 
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
@@ -348,34 +349,35 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
               &perf_count,
               perf_results.get()));
       algo = (perf_results.get())[best_algo_idx].algo;
+#else
       PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
               handle,
               cudnn_input_desc,
               cudnn_filter_desc,
               cudnn_conv_desc,
               cudnn_output_desc,
-              algo,
-              &workspace_size_in_bytes));
-      if (workspace_size_in_bytes > workspace_size_limit)
-        workspace_size_limit = workspace_size_in_bytes;
-#else
+              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit,
+              &algo));
+#endif
       PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
               handle,
               cudnn_input_desc,
               cudnn_filter_desc,
               cudnn_conv_desc,
               cudnn_output_desc,
-              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit,
-              &algo));
+              algo,
+              &workspace_size_in_bytes));
+      if (workspace_size_in_bytes > workspace_size_limit)
+        workspace_size_limit = workspace_size_in_bytes;
       VLOG(3) << "cuDNN forward algo " << algo;
-#endif
     } else {
-      std::function<cudnnConvolutionFwdAlgo_t()> search_func =
-          [&]() -> cudnnConvolutionFwdAlgo_t {
+      std::function<SearchFuseResult<cudnnConvolutionFwdAlgo_t>()> search_func =
+          [&]() -> SearchFuseResult<cudnnConvolutionFwdAlgo_t> {
         int returned_algo_count;
+        SearchFuseResult<cudnnConvolutionFwdAlgo_t> fwd_result;
         std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
             fwd_perf_stat;
         auto cudnn_find_func = [&](void* cudnn_workspace) {
@@ -402,11 +404,34 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
           VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
                   << " " << stat.memory;
         }
-        return fwd_perf_stat[0].algo;
+
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+                handle,
+                cudnn_input_desc,
+                cudnn_filter_desc,
+                cudnn_conv_desc,
+                cudnn_output_desc,
+                fwd_perf_stat[0].algo,
+                &workspace_size_in_bytes));
+        // PADDLE_ENFORCE_LE(
+        //     workspace_size_in_bytes,
+        //     workspace_size_limit,
+        //     platform::errors::InvalidArgument(
+        //         "The actual workspace size to be allocated for cuDNN is
+        //         expected " "to be less than the limit. But received: the
+        //         actual workspace " "size = %d, limit = %d.",
+        //         workspace_size_in_bytes,
+        //         workspace_size_limit));
+
+        fwd_result.algo = fwd_perf_stat[0].algo;
+        fwd_result.workspace_size = workspace_size_in_bytes;
+        return fwd_result;
       };
-      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>& algo_cache =
+      AlgorithmsCache<SearchFuseResult<cudnnConvolutionFwdAlgo_t>>& algo_cache =
           *(framework::ConvSearchCache::Instance().GetConvFusion());
       int search_times = ctx.Attr<int>("search_times");
+      SearchFuseResult<cudnnConvolutionFwdAlgo_t> algo_result;
       search_times = std::max(
           static_cast<int>(FLAGS_cudnn_exhaustive_search_times), search_times);
       // TODO(dangqingqing): Unify this if-else.
@@ -414,39 +439,24 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
         // The searched algo will be cached by `search_times` times for
         // different input dimension. For other dimensions, select the algo
         // of closest area.
-        algo = algo_cache.GetAlgorithm(
+        algo_result = algo_cache.GetAlgorithm(
             x_dims[2] * x_dims[3], search_times, 0, search_func);
+        algo = algo_result.algo;
+        workspace_size_in_bytes = algo_result.workspace_size;
       } else {
-        algo = algo_cache.GetAlgorithm(x_dims,
-                                       f_dims,
-                                       strides,
-                                       paddings,
-                                       dilations,
-                                       0,
-                                       dtype,
-                                       search_func);
+        algo_result = algo_cache.GetAlgorithm(x_dims,
+                                              f_dims,
+                                              strides,
+                                              paddings,
+                                              dilations,
+                                              0,
+                                              dtype,
+                                              search_func);
+        algo = algo_result.algo;
+        workspace_size_in_bytes = algo_result.workspace_size;
       }
       VLOG(3) << "choose algo " << algo;
     }
-
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-            handle,
-            cudnn_input_desc,
-            cudnn_filter_desc,
-            cudnn_conv_desc,
-            cudnn_output_desc,
-            algo,
-            &workspace_size_in_bytes));
-    // PADDLE_ENFORCE_LE(
-    //     workspace_size_in_bytes,
-    //     workspace_size_limit,
-    //     platform::errors::InvalidArgument(
-    //         "The actual workspace size to be allocated for cuDNN is expected
-    //         " "to be less than the limit. But received: the actual workspace
-    //         " "size = %d, limit = %d.", workspace_size_in_bytes,
-    //         workspace_size_limit));
-
     if ((activation == "identity") && (!residual)) {
       // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
      // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
-- 
GitLab
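
For reference, below is a minimal standalone sketch (not part of the patch) of the caching pattern the change introduces: the exhaustive-search result is stored as a single SearchFuseResult value holding both the chosen algorithm and its workspace size, so a cache hit returns both without re-querying cuDNN. FusionAlgoCache, FwdAlgo, and the hard-coded numbers are simplified stand-ins for illustration only, not Paddle's real AlgorithmsCache or the cuDNN types.

// Standalone illustration of caching an algorithm together with its
// workspace size, keyed by input area, as SearchFuseResult does in the patch.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <unordered_map>

template <typename AlgoT>
struct SearchFuseResult {            // mirrors the struct added by the patch
  AlgoT algo = static_cast<AlgoT>(0);
  float time = -1.f;
  size_t workspace_size = 0;
};

using FwdAlgo = int;                 // stand-in for cudnnConvolutionFwdAlgo_t

class FusionAlgoCache {              // simplified stand-in for AlgorithmsCache
 public:
  SearchFuseResult<FwdAlgo> GetAlgorithm(
      int64_t area, std::function<SearchFuseResult<FwdAlgo>()> search_func) {
    auto it = map_.find(area);
    if (it != map_.end()) return it->second;  // hit: algo AND workspace size
    SearchFuseResult<FwdAlgo> result = search_func();  // miss: search once
    map_[area] = result;
    return result;
  }

 private:
  std::unordered_map<int64_t, SearchFuseResult<FwdAlgo>> map_;
};

int main() {
  FusionAlgoCache cache;
  auto search_func = []() -> SearchFuseResult<FwdAlgo> {
    // The real kernel calls cudnnFindConvolutionForwardAlgorithmEx and
    // cudnnGetConvolutionForwardWorkspaceSize here; values are made up.
    SearchFuseResult<FwdAlgo> r;
    r.algo = 1;
    r.workspace_size = 4096;
    return r;
  };
  for (int i = 0; i < 2; ++i) {      // the second call is served from the cache
    auto result = cache.GetAlgorithm(/*area=*/224 * 224, search_func);
    std::printf("algo=%d workspace=%zu\n", result.algo, result.workspace_size);
  }
  return 0;
}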