提交 c5774e32 编写于 作者: C chengduoZH

add FLAGS_cudnn_algo_use_autotune (titled "add FLAGS_use_deterministic_algo" in the original commit message)

上级 2f53cd0a
...@@ -20,6 +20,11 @@ limitations under the License. */ ...@@ -20,6 +20,11 @@ limitations under the License. */
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
// Command-line flag (default: true) that selects how the cuDNN convolution
// gradient kernel picks its backward algorithms. In the hunks of this file
// visible here: when true, the kernel asks cuDNN to choose the backward-data
// and backward-filter algorithms under the workspace size limit; when false,
// it pins them to CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 and
// CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 respectively.
DEFINE_bool(cudnn_algo_use_autotune, true,
"Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If "
"false, the algorithm is deterministic.");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -267,17 +272,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -267,17 +272,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
if (input_grad) { if (input_grad) {
PADDLE_ENFORCE( if (FLAGS_cudnn_algo_use_autotune) {
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( PADDLE_ENFORCE(
handle, cudnn_filter_desc, platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
// dyDesc: Handle to the previously initialized input differential handle, cudnn_filter_desc,
// tensor descriptor. // dyDesc: Handle to the previously initialized input
cudnn_output_grad_desc, cudnn_conv_desc, // differential
// dxDesc: Handle to the previously initialized output tensor // tensor descriptor.
// descriptor. cudnn_output_grad_desc, cudnn_conv_desc,
cudnn_input_desc, // dxDesc: Handle to the previously initialized output tensor
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, // descriptor.
workspace_size_limit, &data_algo)); cudnn_input_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &data_algo));
} else {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
PADDLE_ENFORCE( PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_output_grad_desc, handle, cudnn_filter_desc, cudnn_output_grad_desc,
...@@ -286,12 +297,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -286,12 +297,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} }
if (filter_grad) { if (filter_grad) {
PADDLE_ENFORCE( if (FLAGS_cudnn_algo_use_autotune) {
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( PADDLE_ENFORCE(
handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
cudnn_filter_desc, handle, cudnn_input_desc, cudnn_output_grad_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, cudnn_conv_desc, cudnn_filter_desc,
workspace_size_limit, &filter_algo)); CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &filter_algo));
} else {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
PADDLE_ENFORCE( PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
......
...@@ -107,8 +107,12 @@ def __bootstrap__(): ...@@ -107,8 +107,12 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads) os.environ['OMP_NUM_THREADS'] = str(num_threads)
read_env_flags = [ read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'use_pinned_memory',
'eager_delete_scope' 'check_nan_inf',
'benchmark',
'warpctc_dir',
'eager_delete_scope',
'cudnn_algo_use_autotune',
] ]
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += ['fraction_of_gpu_memory_to_use'] read_env_flags += ['fraction_of_gpu_memory_to_use']
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册