diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36c5bc5fbe54f5915982ac2ea219f8c61cf842c9..4cbbe44a89b15177001d6a5ca9642c1fefd6fee7 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,7 +87,13 @@ if(WIN32)
         CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
         CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
         string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
-        set(${flag_var} "${${flag_var}} /MP")
+        # NOTE(Avin0323): A lower parallel count results in faster compilation
+        # with Unity Build on GPU.
+        if(WITH_UNITY_BUILD AND WITH_GPU)
+            set(${flag_var} "${${flag_var}} /MP8")
+        else()
+            set(${flag_var} "${${flag_var}} /MP")
+        endif()
     endforeach(flag_var)
     foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
         set(${flag_var} "${${flag_var}} /w")
diff --git a/paddle/fluid/operators/controlflow/unity_build_rule.cmake b/paddle/fluid/operators/controlflow/unity_build_rule.cmake
index 027e32a9e4292640128bda72b56c810c1f4c6375..6ed8f8a75374eaba122e7a3b3d935079a81756ee 100644
--- a/paddle/fluid/operators/controlflow/unity_build_rule.cmake
+++ b/paddle/fluid/operators/controlflow/unity_build_rule.cmake
@@ -14,3 +14,7 @@ register_unity_group(cc
     logical_op.cc
     tensor_array_read_write_op.cc
     while_op.cc)
+register_unity_group(cu
+    logical_op.cu
+    compare_op.cu
+    compare_all_op.cu)
diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h
index 91416450a60d683eeb33462258ff8325dae76e6e..3faf8ea76594456ee0afb7235f3df7c0e95bcf16 100644
--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h
+++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h
@@ -20,9 +20,6 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T>
 class ProximalAdagradOpKernel : public framework::OpKernel<T> {
@@ -38,13 +35,14 @@ class ProximalAdagradOpKernel : public framework::OpKernel<T> {
     auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
     auto grad = ctx.Input<Tensor>("Grad");
 
-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto m = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+    auto p = framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto m = framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
+    auto g = framework::EigenVector<T>::Flatten(*grad);
+    auto lr =
+        framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
 
-    auto p_out = EigenVector<T>::Flatten(*param_out);
-    auto m_out = EigenVector<T>::Flatten(*moment_out);
+    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
+    auto m_out = framework::EigenVector<T>::Flatten(*moment_out);
     auto* place = ctx.template device_context<DeviceContext>().eigen_device();
 
     Eigen::DSizes<int, 1> grad_dsize(grad->numel());
diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.h b/paddle/fluid/operators/optimizers/proximal_gd_op.h
index d49badf16d510c13847b8921360544bbb6078e05..7caa8421f041c199edf6487814ec5de22b0d1286 100644
--- a/paddle/fluid/operators/optimizers/proximal_gd_op.h
+++ b/paddle/fluid/operators/optimizers/proximal_gd_op.h
@@ -20,9 +20,6 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T>
 class ProximalGDOpKernel : public framework::OpKernel<T> {
@@ -37,11 +34,12 @@ class ProximalGDOpKernel : public framework::OpKernel<T> {
     auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
     auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
 
-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+    auto p = framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto g = framework::EigenVector<T>::Flatten(*grad);
+    auto lr =
+        framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
 
-    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     Eigen::DSizes<int, 1> grad_dsize(grad->numel());
@@ -52,10 +50,10 @@ class ProximalGDOpKernel : public framework::OpKernel<T> {
           prox_param.sign() *
               (((prox_param.abs() - (lr * l1).broadcast(grad_dsize))
                     .cwiseMax(T(0.0))) /
-               (1.0 + (lr * l2).broadcast(grad_dsize)));
+               (1.0f + (lr * l2).broadcast(grad_dsize)));
     } else {
       p_out.device(place) =
-          prox_param / (1.0 + (lr * l2).broadcast(grad_dsize));
+          prox_param / (1.0f + (lr * l2).broadcast(grad_dsize));
     }
   }
 };
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h
index 1ec712a1431a4657cf1c1456da91fe5369914438..9971cb92306a2710e02998fade05c8a498e88627 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op.h
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.h
@@ -23,10 +23,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
 template <typename T>
 struct DenseRmspropGradFunctor {
   inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}
@@ -169,25 +165,25 @@ class RmspropOpKernel : public framework::OpKernel<T> {
           *ctx.template device_context<DeviceContext>().eigen_device();
       auto lr_value = lr_tensor.data<T>()[0];
 
-      auto p = EigenVector<T>::Flatten(p_tensor);
-      auto ms = EigenVector<T>::Flatten(ms_tensor);
-      auto g = EigenVector<T>::Flatten(grad_tensor);
-      auto mom = EigenVector<T>::Flatten(mom_tensor);
+      auto p = framework::EigenVector<T>::Flatten(p_tensor);
+      auto ms = framework::EigenVector<T>::Flatten(ms_tensor);
+      auto g = framework::EigenVector<T>::Flatten(grad_tensor);
+      auto mom = framework::EigenVector<T>::Flatten(mom_tensor);
 
-      auto p_out = EigenVector<T>::Flatten(*param_out);
-      auto mom_out = EigenVector<T>::Flatten(*moment_out);
-      auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
+      auto p_out = framework::EigenVector<T>::Flatten(*param_out);
+      auto mom_out = framework::EigenVector<T>::Flatten(*moment_out);
+      auto ms_out = framework::EigenVector<T>::Flatten(*mean_square_out);
       ms_out.device(place) = rho * ms + (1 - rho) * g * g;
 
       if (centered) {
        auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
-        auto mg = EigenVector<T>::Flatten(mg_tensor);
+        auto mg = framework::EigenVector<T>::Flatten(mg_tensor);
         auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
         PADDLE_ENFORCE_EQ(
             &mg_tensor, mean_grad_out,
             platform::errors::InvalidArgument(
                 "MeanGrad and MeanGradOut must be the same Tensor"));
-        auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
+        auto mg_out = framework::EigenVector<T>::Flatten(*mean_grad_out);
         mg_out.device(place) = rho * mg + (1 - rho) * g;
 
         mom_out.device(place) =
diff --git a/paddle/fluid/operators/optimizers/unity_build_rule.cmake b/paddle/fluid/operators/optimizers/unity_build_rule.cmake
index 5b4ec175ef87b53486c2460445bdba1bdb0aa44f..769bb781d6e724a36a29d8b7fb881d6c5468472c 100644
--- a/paddle/fluid/operators/optimizers/unity_build_rule.cmake
+++ b/paddle/fluid/operators/optimizers/unity_build_rule.cmake
@@ -8,14 +8,13 @@ register_unity_group(cc
     ftrl_op.cc
     lars_momentum_op.cc
     momentum_op.cc
-    sgd_op.cc)
-register_unity_group(cc
+    sgd_op.cc
+    proximal_adagrad_op.cc
     adagrad_op.cc
     adam_op.cc
     adamax_op.cc
     dgc_momentum_op.cc
-    proximal_gd_op.cc)
-register_unity_group(cc
+    proximal_gd_op.cc
     decayed_adagrad_op.cc
     adadelta_op.cc
     lamb_op.cc
@@ -25,16 +24,12 @@ register_unity_group(cu
     ftrl_op.cu
     lars_momentum_op.cu
     momentum_op.cu
-    sgd_op.cu)
-register_unity_group(cu
+    sgd_op.cu
+    proximal_adagrad_op.cu
     adagrad_op.cu
     adam_op.cu
-    adamax_op.cu)
-register_unity_group(cu
+    adamax_op.cu
     decayed_adagrad_op.cu
     adadelta_op.cu
     lamb_op.cu
     rmsprop_op.cu)
-# The following groups are to make better use of `/MP` which MSVC's parallel
-# compilation instruction when compiling in Unity Build.
-register_unity_group(cu proximal_adagrad_op.cu)
diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc
index 0d3be48b7637b35ae75c4f425dff1ab1f86a83de..fa6260767829cd00243134f6c3f14338334b42b6 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc
@@ -133,16 +133,18 @@ namespace op = paddle::operators;
 REGISTER_OPERATOR(sequence_concat, op::SequenceConcatOp, op::SeqConcatOpMaker,
                   op::SeqConcatGradOpMaker<paddle::framework::OpDesc>,
                   op::SeqConcatGradOpMaker<paddle::imperative::OpBase>);
-template <typename T>
-using Kernel = op::SeqConcatKernel<paddle::platform::CPUDeviceContext, T>;
-REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
-                       Kernel<int>, Kernel<int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_concat,
+    op::SeqConcatKernel<paddle::platform::CPUDeviceContext, float>,
+    op::SeqConcatKernel<paddle::platform::CPUDeviceContext, double>,
+    op::SeqConcatKernel<paddle::platform::CPUDeviceContext, int>,
+    op::SeqConcatKernel<paddle::platform::CPUDeviceContext, int64_t>);
 
 REGISTER_OPERATOR(sequence_concat_grad, op::SeqConcatGradOp,
                   op::SeqConcatGradNoNeedBufferVarsInferer);
-template <typename T>
-using GradKernel =
-    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, T>;
-REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel<float>,
-                       GradKernel<double>, GradKernel<int>,
-                       GradKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_concat_grad,
+    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, float>,
+    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, double>,
+    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, int>,
+    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
index 6eda8595b1769b0891213124ac47a3b1cf27fb7c..d58a2da29c941d270c6be08355648aeaf582ae61 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
@@ -21,15 +21,23 @@ class CUDADeviceContext;
 }  // namespace platform
 }  // namespace paddle
 
-template <typename T>
-using Kernel =
-    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext, T>;
-REGISTER_OP_CUDA_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
-                        Kernel<int>, Kernel<int64_t>);
-template <typename T>
-using GradKernel =
+REGISTER_OP_CUDA_KERNEL(
+    sequence_concat,
+    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
+                                       float>,
+    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
+                                       double>,
+    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
+                                       int>,
+    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
+                                       int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_concat_grad,
     paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
-                                           T>;
-REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, GradKernel<float>,
-                        GradKernel<double>, GradKernel<int>,
-                        GradKernel<int64_t>);
+                                           float>,
+    paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
+                                           double>,
+    paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
+                                           int>,
+    paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
+                                           int64_t>);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
index a7fdf39340c28b4cab7f64753eeb5de28a968750..c8b6156881c96ff37fd3208dad789d8fc1589ddd 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
@@ -62,7 +62,7 @@ static __global__ void sequence_expand_as_grad_kernel(
 }
 
 template <typename T>
-struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
+struct SequenceExpandAsFunctor<platform::CUDADeviceContext, T> {
   void operator()(
       const platform::CUDADeviceContext &context, const LoDTensor &x,
       const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
index 6afcc72763d32222bc71a45c5b5d354a1ace680d..d2f07599811ad19cf4cad338acb5861a3b7bebf8 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
@@ -24,7 +24,7 @@ namespace paddle {
 namespace operators {
 
 template <typename DeviceContext, typename T>
-struct SequenceExpandFunctor {
+struct SequenceExpandAsFunctor {
   void operator()(
       const DeviceContext &ctx, const framework::LoDTensor &x,
       const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
@@ -40,7 +40,7 @@ struct SequenceExpandAsGradFunctor {
 };
 
 template <typename T>
-struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
+struct SequenceExpandAsFunctor<platform::CPUDeviceContext, T> {
   void operator()(
       const platform::CPUDeviceContext &context, const framework::LoDTensor &x,
       const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
@@ -97,7 +97,7 @@ class SequenceExpandAsKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(context.GetPlace());
 
     auto &dev_ctx = context.template device_context<DeviceContext>();
-    SequenceExpandFunctor<DeviceContext, T> seq_espand_functor;
+    SequenceExpandAsFunctor<DeviceContext, T> seq_espand_functor;
     seq_espand_functor(dev_ctx, *x, y_lod[0], out);
   }
 };
diff --git a/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake b/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake
index c29eea70c496d47163b35955feab1c8455c65d67..9ccc4432df5cd8a2b007b747daadbe0323e64c01 100644
--- a/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake
+++ b/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake
@@ -12,8 +12,7 @@ register_unity_group(cc
     sequence_expand_op.cc
     sequence_mask_op.cc
     sequence_pad_op.cc
-    sequence_pool_op.cc)
-register_unity_group(cc
+    sequence_pool_op.cc
     sequence_expand_as_op.cc
     sequence_reshape_op.cc
     sequence_reverse_op.cc
@@ -21,8 +20,7 @@ register_unity_group(cc
     sequence_slice_op.cc
     sequence_softmax_op.cc
     sequence_topk_avg_pooling_op.cc
-    sequence_unpad_op.cc)
-register_unity_group(cc
+    sequence_unpad_op.cc
     sequence_concat_op.cu.cc
     sequence_conv_op.cu.cc)
 register_unity_group(cu
@@ -31,8 +29,7 @@ register_unity_group(cu
     sequence_expand_op.cu
     sequence_mask_op.cu
     sequence_pad_op.cu
-    sequence_pool_op.cu)
-register_unity_group(cu
+    sequence_pool_op.cu
     sequence_expand_as_op.cu
     sequence_reshape_op.cu
     sequence_reverse_op.cu