diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 2c09ee7394ad605f7a324d021ce0468a79bb71ca..3235ad52b999e1ca3f992034781edaab9921a300 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto x_dims = framework::vectorize(input->dims()); auto f_dims = framework::vectorize(filter->dims()); - if (activation == "identity") { - // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is - // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. - algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - } else if (!exhaustive_search) { + if (!exhaustive_search) { CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, @@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - // ------------------- cudnn conv+bias+act forward -------------------- - ScalingParamType alpha1 = 1.0f; - ScalingParamType alpha2 = residual ? 1.0f : 0.0f; - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace, - workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, - cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + if ((activation == "identity") && + (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) && + (!residual)) { + // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is + // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. + // But test in some case, the speed is slower, change to use + // cudnnConvolutionForward and cudnnAddTensor + // ------------- cudnn conv forward and bias add --------------------- + ScalingParamType alpha = 1.0f, beta = 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + CUDNN_ENFORCE(platform::dynload::cudnnAddTensor( + handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, output_data)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + } else { + if (activation == "identity") { + algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + } + // ------------------- cudnn conv+bias+act forward -------------------- + ScalingParamType alpha1 = 1.0f; + ScalingParamType alpha2 = residual ? 1.0f : 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, + cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + } } }; #endif diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py index 9f3f2f348166864be9583855fcd1949fd4ac818c..6cd71e39e41dae5d07e5761fc9caeca113f3b47e 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py @@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp): self.activation = 'identity' +class TestIdentityActivation(TestConv2dFusionOp): + def init_activation(self): + self.activation = 'identity' + self.add_residual_data = False + + class TestWithGroup(TestConv2dFusionOp): def init_group(self): self.groups = 3