diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index 7a65ee62c9bfc0dad2ebee3be21de825fa405d73..1cd3113030086104e7fc5c4ba3364a5ff027632b 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -46,6 +46,7 @@ void ScaleLossGradOpHandle::RunImpl() {
                       ->stream();
       memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                    platform::CPUPlace(), &coeff_, sizeof(float), stream);
+      VLOG(1) << place_ << " Run scale loss grad op";
     });
 #endif
 }
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index c70e3cc3c9198008d9eca5f462000aa67ff7e5ba..cf410c3ca16955620610634b99ee9111106ef99f 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -20,6 +20,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
 
+DEFINE_bool(cudnn_algo_use_autotune, true,
+            "Whether to allow the use of an autotuning algorithm for the "
+            "convolution operator. The autotuning algorithm may be "
+            "non-deterministic; if false, a deterministic algorithm is used.");
+
 namespace paddle {
 namespace operators {
 
@@ -267,17 +272,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      PADDLE_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-              handle, cudnn_filter_desc,
-              // dyDesc: Handle to the previously initialized input differential
-              // tensor descriptor.
-              cudnn_output_grad_desc, cudnn_conv_desc,
-              // dxDesc: Handle to the previously initialized output tensor
-              // descriptor.
-              cudnn_input_desc,
-              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &data_algo));
+      if (FLAGS_cudnn_algo_use_autotune) {
+        PADDLE_ENFORCE(
+            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+                handle, cudnn_filter_desc,
+                // dyDesc: Handle to the previously initialized input
+                // differential
+                // tensor descriptor.
+                cudnn_output_grad_desc, cudnn_conv_desc,
+                // dxDesc: Handle to the previously initialized output tensor
+                // descriptor.
+                cudnn_input_desc,
+                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &data_algo));
+      } else {
+        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+      }
+
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
@@ -286,12 +297,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     if (filter_grad) {
-      PADDLE_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
-              cudnn_filter_desc,
-              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &filter_algo));
+      if (FLAGS_cudnn_algo_use_autotune) {
+        PADDLE_ENFORCE(
+            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+                handle, cudnn_input_desc, cudnn_output_grad_desc,
+                cudnn_conv_desc, cudnn_filter_desc,
+                CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &filter_algo));
+      } else {
+        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+      }
 
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index e2502990d5b78eb0db7bdfd0c8ef9fb6688016df..04f6905ce68d96556c4e037791d6e9fb80c03f37 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -111,7 +111,9 @@ def __bootstrap__():
         'eager_delete_scope'
     ]
     if core.is_compiled_with_cuda():
-        read_env_flags += ['fraction_of_gpu_memory_to_use']
+        read_env_flags += [
+            'fraction_of_gpu_memory_to_use', 'cudnn_algo_use_autotune'
+        ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
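
For reference, a minimal usage sketch (not part of the patch itself): since __bootstrap__ passes --tryfromenv to gflags, every flag listed in read_env_flags is read from a FLAGS_<name> environment variable at import time, so cudnn_algo_use_autotune must be set before paddle.fluid is first imported. Everything below that is not in the diff (the import ordering, the string value 'false') is an assumption about the standard gflags environment convention.

    import os

    # Assumption: --tryfromenv reads FLAGS_<name> from the environment
    # when paddle.fluid bootstraps, so set the flag before the import.
    os.environ['FLAGS_cudnn_algo_use_autotune'] = 'false'

    import paddle.fluid as fluid  # bootstrap now picks up the flag

    # With autotuning disabled, the conv-grad kernels above skip
    # cudnnGetConvolutionBackward{Data,Filter}Algorithm and fall back to
    # CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 / CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
    # which cuDNN documents as deterministic, trading some speed for
    # run-to-run reproducibility.

Equivalently, the variable can be set from the shell, e.g. FLAGS_cudnn_algo_use_autotune=false python train.py, where train.py stands in for any training script.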