From 2e5b4a216cc7eb95f0968faeb2882439511b1aa7 Mon Sep 17 00:00:00 2001
From: LoveAn
Date: Mon, 21 Dec 2020 11:56:16 +0800
Subject: [PATCH] Optimize compilation time with Unity Build (#29733)

* Test compilation time with less parallel count, notest, test=windows_ci

* optimize rules of Unity Build, notest, test=windows_ci, test=windows_op

* limit parallel counts used only on GPU, test=develop

* remove limit of argument /m:8 on Windows, test=develop
---
 CMakeLists.txt                                |  8 ++++-
 .../controlflow/unity_build_rule.cmake        |  4 +++
 .../optimizers/proximal_adagrad_op.h          | 16 +++++-----
 .../operators/optimizers/proximal_gd_op.h     | 16 +++++-----
 .../fluid/operators/optimizers/rmsprop_op.h   | 22 ++++++--------
 .../optimizers/unity_build_rule.cmake         | 17 ++++-------
 .../sequence_ops/sequence_concat_op.cc        | 22 +++++++-------
 .../sequence_ops/sequence_concat_op.cu.cc     | 30 ++++++++++++-------
 .../sequence_ops/sequence_expand_as_op.cu     |  2 +-
 .../sequence_ops/sequence_expand_as_op.h      |  6 ++--
 .../sequence_ops/unity_build_rule.cmake       |  9 ++----
 11 files changed, 78 insertions(+), 74 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36c5bc5fbe5..4cbbe44a89b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,7 +87,13 @@ if(WIN32)
         CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
         CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
         string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
-        set(${flag_var} "${${flag_var}} /MP")
+        # NOTE(Avin0323): Less parallel count result in faster compilation with
+        # Unity Build on GPU.
+        if(WITH_UNITY_BUILD AND WITH_GPU)
+            set(${flag_var} "${${flag_var}} /MP8")
+        else()
+            set(${flag_var} "${${flag_var}} /MP")
+        endif()
     endforeach(flag_var)
     foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
         set(${flag_var} "${${flag_var}} /w")
diff --git a/paddle/fluid/operators/controlflow/unity_build_rule.cmake b/paddle/fluid/operators/controlflow/unity_build_rule.cmake
index 027e32a9e42..6ed8f8a7537 100644
--- a/paddle/fluid/operators/controlflow/unity_build_rule.cmake
+++ b/paddle/fluid/operators/controlflow/unity_build_rule.cmake
@@ -14,3 +14,7 @@ register_unity_group(cc
     logical_op.cc
     tensor_array_read_write_op.cc
     while_op.cc)
+register_unity_group(cu
+    logical_op.cu
+    compare_op.cu
+    compare_all_op.cu)
diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h
index 91416450a60..3faf8ea7659 100644
--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h
+++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h
@@ -20,9 +20,6 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T>
 class ProximalAdagradOpKernel : public framework::OpKernel<T> {
@@ -38,13 +35,14 @@ class ProximalAdagradOpKernel : public framework::OpKernel<T> {
     auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
     auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
     auto grad = ctx.Input<Tensor>("Grad");
-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto m = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+    auto p = framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto m = framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
+    auto g = framework::EigenVector<T>::Flatten(*grad);
+    auto lr =
+        framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
 
-    auto p_out = EigenVector<T>::Flatten(*param_out);
-    auto m_out = EigenVector<T>::Flatten(*moment_out);
+    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
+    auto m_out = framework::EigenVector<T>::Flatten(*moment_out);
     auto* place = ctx.template device_context<DeviceContext>().eigen_device();
 
     Eigen::DSizes<int, 1> grad_dsize(grad->numel());
diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.h b/paddle/fluid/operators/optimizers/proximal_gd_op.h
index d49badf16d5..7caa8421f04 100644
--- a/paddle/fluid/operators/optimizers/proximal_gd_op.h
+++ b/paddle/fluid/operators/optimizers/proximal_gd_op.h
@@ -20,9 +20,6 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T>
 class ProximalGDOpKernel : public framework::OpKernel<T> {
@@ -37,11 +34,12 @@ class ProximalGDOpKernel : public framework::OpKernel<T> {
     auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
     auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
 
-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+    auto p = framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto g = framework::EigenVector<T>::Flatten(*grad);
+    auto lr =
+        framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
 
-    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     Eigen::DSizes<int, 1> grad_dsize(grad->numel());
@@ -52,10 +50,10 @@ class ProximalGDOpKernel : public framework::OpKernel<T> {
           prox_param.sign() *
               (((prox_param.abs() - (lr * l1).broadcast(grad_dsize))
                     .cwiseMax(T(0.0))) /
-               (1.0 + (lr * l2).broadcast(grad_dsize)));
+               (1.0f + (lr * l2).broadcast(grad_dsize)));
     } else {
       p_out.device(place) =
-          prox_param / (1.0 + (lr * l2).broadcast(grad_dsize));
+          prox_param / (1.0f + (lr * l2).broadcast(grad_dsize));
     }
   }
 };
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h
index 1ec712a1431..9971cb92306 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op.h
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.h
@@ -23,10 +23,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
 template <typename T>
 struct DenseRmspropGradFunctor {
   inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}
@@ -169,25 +165,25 @@ class RmspropOpKernel : public framework::OpKernel<T> {
           *ctx.template device_context<DeviceContext>().eigen_device();
       auto lr_value = lr_tensor.data<T>()[0];
 
-      auto p = EigenVector<T>::Flatten(p_tensor);
-      auto ms = EigenVector<T>::Flatten(ms_tensor);
-      auto g = EigenVector<T>::Flatten(grad_tensor);
-      auto mom = EigenVector<T>::Flatten(mom_tensor);
+      auto p = framework::EigenVector<T>::Flatten(p_tensor);
+      auto ms = framework::EigenVector<T>::Flatten(ms_tensor);
+      auto g = framework::EigenVector<T>::Flatten(grad_tensor);
+      auto mom = framework::EigenVector<T>::Flatten(mom_tensor);
 
-      auto p_out = EigenVector<T>::Flatten(*param_out);
-      auto mom_out = EigenVector<T>::Flatten(*moment_out);
-      auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
+      auto p_out = framework::EigenVector<T>::Flatten(*param_out);
+      auto mom_out = framework::EigenVector<T>::Flatten(*moment_out);
+      auto ms_out = framework::EigenVector<T>::Flatten(*mean_square_out);
 
       ms_out.device(place) = rho * ms + (1 - rho) * g * g;
       if (centered) {
         auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
-        auto mg = EigenVector<T>::Flatten(mg_tensor);
+        auto mg = framework::EigenVector<T>::Flatten(mg_tensor);
         auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
         PADDLE_ENFORCE_EQ(
             &mg_tensor, mean_grad_out,
             platform::errors::InvalidArgument(
                 "MeanGrad and MeanGradOut must be the same Tensor"));
-        auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
+        auto mg_out = framework::EigenVector<T>::Flatten(*mean_grad_out);
         mg_out.device(place) = rho * mg + (1 - rho) * g;
 
         mom_out.device(place) =
diff --git a/paddle/fluid/operators/optimizers/unity_build_rule.cmake b/paddle/fluid/operators/optimizers/unity_build_rule.cmake
index 5b4ec175ef8..769bb781d6e 100644
--- a/paddle/fluid/operators/optimizers/unity_build_rule.cmake
+++ b/paddle/fluid/operators/optimizers/unity_build_rule.cmake
@@ -8,14 +8,13 @@ register_unity_group(cc
     ftrl_op.cc
     lars_momentum_op.cc
     momentum_op.cc
-    sgd_op.cc)
-register_unity_group(cc
+    sgd_op.cc
+    proximal_adagrad_op.cc
     adagrad_op.cc
     adam_op.cc
     adamax_op.cc
     dgc_momentum_op.cc
-    proximal_gd_op.cc)
-register_unity_group(cc
+    proximal_gd_op.cc
     decayed_adagrad_op.cc
     adadelta_op.cc
     lamb_op.cc
@@ -25,16 +24,12 @@ register_unity_group(cu
     ftrl_op.cu
     lars_momentum_op.cu
     momentum_op.cu
-    sgd_op.cu)
-register_unity_group(cu
+    sgd_op.cu
+    proximal_adagrad_op.cu
     adagrad_op.cu
     adam_op.cu
-    adamax_op.cu)
-register_unity_group(cu
+    adamax_op.cu
     decayed_adagrad_op.cu
     adadelta_op.cu
     lamb_op.cu
     rmsprop_op.cu)
-# The following groups are to make better use of `/MP` which MSVC's parallel
-# compilation instruction when compiling in Unity Build.
-register_unity_group(cu proximal_adagrad_op.cu)
diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc
index 0d3be48b763..fa626076782 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc
@@ -133,16 +133,18 @@ namespace op = paddle::operators;
 
 REGISTER_OPERATOR(sequence_concat, op::SequenceConcatOp, op::SeqConcatOpMaker,
                   op::SeqConcatGradOpMaker<paddle::framework::OpDesc>,
                   op::SeqConcatGradOpMaker<paddle::imperative::OpBase>);
-template <typename T>
-using Kernel = op::SeqConcatKernel<paddle::platform::CPUDeviceContext, T>;
-REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
-                       Kernel<int>, Kernel<int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_concat,
+    op::SeqConcatKernel<paddle::platform::CPUDeviceContext, float>,
+    op::SeqConcatKernel<paddle::platform::CPUDeviceContext, double>,
+    op::SeqConcatKernel<paddle::platform::CPUDeviceContext, int>,
+    op::SeqConcatKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OPERATOR(sequence_concat_grad, op::SeqConcatGradOp,
                   op::SeqConcatGradNoNeedBufferVarsInferer);
-template <typename T>
-using GradKernel =
-    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, T>;
-REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel<float>,
-                       GradKernel<double>, GradKernel<int>,
-                       GradKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_concat_grad,
+    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, float>,
+    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, double>,
+    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, int>,
+    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
index 6eda8595b17..d58a2da29c9 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
@@ -21,15 +21,23 @@ class CUDADeviceContext;
 }  // namespace platform
 }  // namespace paddle
 
-template <typename T>
-using Kernel =
-    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext, T>;
-REGISTER_OP_CUDA_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
-                        Kernel<int>, Kernel<int64_t>);
-template <typename T>
-using GradKernel =
+REGISTER_OP_CUDA_KERNEL(
+    sequence_concat,
+    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
+                                       float>,
+    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
+                                       double>,
+    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
+                                       int>,
+    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
+                                       int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_concat_grad,
     paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
-                                           T>;
-REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, GradKernel<float>,
-                        GradKernel<double>, GradKernel<int>,
-                        GradKernel<int64_t>);
+                                           float>,
+    paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
+                                           double>,
+    paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
+                                           int>,
+    paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
+                                           int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
index a7fdf39340c..c8b6156881c 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
@@ -62,7 +62,7 @@ static __global__ void sequence_expand_as_grad_kernel(
 }
 
 template <typename T>
-struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
+struct SequenceExpandAsFunctor<platform::CUDADeviceContext, T> {
   void operator()(
       const platform::CUDADeviceContext &context, const LoDTensor &x,
       const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
index 6afcc72763d..d2f07599811 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
@@ -24,7 +24,7 @@ namespace paddle {
 namespace operators {
 
 template <typename DeviceContext, typename T>
-struct SequenceExpandFunctor {
+struct SequenceExpandAsFunctor {
   void operator()(
       const DeviceContext &ctx, const framework::LoDTensor &x,
       const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
@@ -40,7 +40,7 @@ struct SequenceExpandAsGradFunctor {
 };
 
 template <typename T>
-struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
+struct SequenceExpandAsFunctor<platform::CPUDeviceContext, T> {
   void operator()(
       const platform::CPUDeviceContext &context, const framework::LoDTensor &x,
       const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
@@ -97,7 +97,7 @@ class SequenceExpandAsKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(context.GetPlace());
 
     auto &dev_ctx = context.template device_context<DeviceContext>();
-    SequenceExpandFunctor<DeviceContext, T> seq_espand_functor;
+    SequenceExpandAsFunctor<DeviceContext, T> seq_espand_functor;
     seq_espand_functor(dev_ctx, *x, y_lod[0], out);
   }
 };
diff --git a/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake b/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake
index c29eea70c49..9ccc4432df5 100644
--- a/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake
+++ b/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake
@@ -12,8 +12,7 @@ register_unity_group(cc
     sequence_expand_op.cc
     sequence_mask_op.cc
     sequence_pad_op.cc
-    sequence_pool_op.cc)
-register_unity_group(cc
+    sequence_pool_op.cc
     sequence_expand_as_op.cc
     sequence_reshape_op.cc
     sequence_reverse_op.cc
@@ -21,8 +20,7 @@ register_unity_group(cc
     sequence_slice_op.cc
     sequence_softmax_op.cc
     sequence_topk_avg_pooling_op.cc
-    sequence_unpad_op.cc)
-register_unity_group(cc
+    sequence_unpad_op.cc
     sequence_concat_op.cu.cc
     sequence_conv_op.cu.cc)
 register_unity_group(cu
@@ -31,8 +29,7 @@ register_unity_group(cu
     sequence_expand_op.cu
     sequence_mask_op.cu
     sequence_pad_op.cu
-    sequence_pool_op.cu)
-register_unity_group(cu
+    sequence_pool_op.cu
     sequence_expand_as_op.cu
     sequence_reshape_op.cu
     sequence_reverse_op.cu
-- 
GitLab