diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index c70e3cc3c9198008d9eca5f462000aa67ff7e5ba..cf410c3ca16955620610634b99ee9111106ef99f 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -20,6 +20,11 @@ limitations under the License. */ #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" +DEFINE_bool(cudnn_algo_use_autotune, true, + "Whether allow using an autotuning algorithm for convolution " + "operator. The autotuning algorithm may be non-deterministic. If " + "false, the algorithm is deterministic."); + namespace paddle { namespace operators { @@ -267,17 +272,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); if (input_grad) { - PADDLE_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - handle, cudnn_filter_desc, - // dyDesc: Handle to the previously initialized input differential - // tensor descriptor. - cudnn_output_grad_desc, cudnn_conv_desc, - // dxDesc: Handle to the previously initialized output tensor - // descriptor. - cudnn_input_desc, - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &data_algo)); + if (FLAGS_cudnn_algo_use_autotune) { + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + handle, cudnn_filter_desc, + // dyDesc: Handle to the previously initialized input + // differential + // tensor descriptor. + cudnn_output_grad_desc, cudnn_conv_desc, + // dxDesc: Handle to the previously initialized output tensor + // descriptor. + cudnn_input_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &data_algo)); + } else { + data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } + PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_output_grad_desc, @@ -286,12 +297,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } if (filter_grad) { - PADDLE_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, - cudnn_filter_desc, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &filter_algo)); + if (FLAGS_cudnn_algo_use_autotune) { + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + handle, cudnn_input_desc, cudnn_output_grad_desc, + cudnn_conv_desc, cudnn_filter_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &filter_algo)); + } else { + filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + } PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index e2502990d5b78eb0db7bdfd0c8ef9fb6688016df..c940f474787359667ccf9ffdb71170fdf358cad9 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -107,8 +107,12 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) read_env_flags = [ - 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope' + 'use_pinned_memory', + 'check_nan_inf', + 'benchmark', + 'warpctc_dir', + 'eager_delete_scope', + 'cudnn_algo_use_autotune', ] if core.is_compiled_with_cuda(): read_env_flags += ['fraction_of_gpu_memory_to_use']