未验证 提交 549f165b 编写于 作者: Q qingqing01 提交者: GitHub

Speed up conv_fusion_op for identity activation. (#14744)

* Refine conv_fusion_op for identity activation.
* Fix the unit tests.
test=develop
上级 c6b39a00
...@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto x_dims = framework::vectorize(input->dims()); auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims()); auto f_dims = framework::vectorize(filter->dims());
if (activation == "identity") { if (!exhaustive_search) {
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else if (!exhaustive_search) {
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
...@@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit"); "workspace_size to be allocated exceeds the limit");
// ------------------- cudnn conv+bias+act forward -------------------- if ((activation == "identity") &&
ScalingParamType<T> alpha1 = 1.0f; (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) &&
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f; (!residual)) {
auto cudnn_func = [&](void* cudnn_workspace) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, // But test in some case, the speed is slower, change to use
filter_data, cudnn_conv_desc, algo, cudnn_workspace, // cudnnConvolutionForward and cudnnAddTensor
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, // ------------- cudnn conv forward and bias add ---------------------
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
output_data)); output_data));
}; } else {
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); if (activation == "identity") {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
}
// ------------------- cudnn conv+bias+act forward --------------------
ScalingParamType<T> alpha1 = 1.0f;
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
} }
}; };
#endif #endif
......
...@@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp): ...@@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp):
self.activation = 'identity' self.activation = 'identity'
class TestIdentityActivation(TestConv2dFusionOp):
def init_activation(self):
self.activation = 'identity'
self.add_residual_data = False
class TestWithGroup(TestConv2dFusionOp): class TestWithGroup(TestConv2dFusionOp):
def init_group(self): def init_group(self):
self.groups = 3 self.groups = 3
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册