Unverified commit 2e5b4a21, authored by LoveAn, committed by GitHub

Optimize compilation time with Unity Build (#29733)

* Test compilation time with less parallel count, notest, test=windows_ci

* optimize rules of Unity Build, notest, test=windows_ci, test=windows_op

* limit parallel counts used only on GPU, test=develop

* remove limit of argument /m:8 on Windows, test=develop
Parent 0c23ba95
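Background for the diff below: under Unity Build, Paddle's register_unity_group helper has CMake effectively concatenate the listed sources of each group into one generated translation unit, so shared headers and template instantiations are compiled once per group rather than once per file. Each unity translation unit is therefore much larger, and the patch caps MSVC's parallel-compilation switch at /MP8 when both WITH_UNITY_BUILD and WITH_GPU are set, presumably because a handful of such large cl.exe/nvcc jobs already saturates the build machine, while other configurations keep the plain /MP. The snippet below only illustrates the unity idea; it is not a file Paddle's tooling actually generates, and the two "operator" bodies and their names are invented so the example stays self-contained.

// unity_sketch.cc: illustrative stand-in for a generated unity source.
// A real unity source simply #includes the group members (for example
// logical_op.cu and compare_op.cu); here the two "members" are pasted
// inline so the sketch compiles on its own.
#include <cstdio>
#include <vector>  // stands in for the heavy shared headers, parsed once per group

// ---- contents that would come from "a_op.cc" ----
namespace ops {
int RunAOp(const std::vector<int>& x) { return static_cast<int>(x.size()); }
}  // namespace ops

// ---- contents that would come from "b_op.cc" ----
namespace ops {
int RunBOp(const std::vector<int>& x) { return x.empty() ? 0 : x.front(); }
}  // namespace ops

int main() {
  std::vector<int> v{7, 8};
  // Both "files" now live in one translation unit, so any file-local name
  // defined in both would clash; the kernel changes below work around
  // exactly that.
  std::printf("%d %d\n", ops::RunAOp(v), ops::RunBOp(v));
  return 0;
}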
@@ -87,7 +87,13 @@ if(WIN32)
         CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
         CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
       string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
-      set(${flag_var} "${${flag_var}} /MP")
+      # NOTE(Avin0323): Less parallel count result in faster compilation with
+      # Unity Build on GPU.
+      if(WITH_UNITY_BUILD AND WITH_GPU)
+        set(${flag_var} "${${flag_var}} /MP8")
+      else()
+        set(${flag_var} "${${flag_var}} /MP")
+      endif()
     endforeach(flag_var)
     foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
       set(${flag_var} "${${flag_var}} /w")
......
@@ -14,3 +14,7 @@ register_unity_group(cc
     logical_op.cc
     tensor_array_read_write_op.cc
     while_op.cc)
+register_unity_group(cu
+    logical_op.cu
+    compare_op.cu
+    compare_all_op.cu)
@@ -20,9 +20,6 @@ namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

 template <typename DeviceContext, typename T>
 class ProximalAdagradOpKernel : public framework::OpKernel<T> {
@@ -38,13 +35,14 @@ class ProximalAdagradOpKernel : public framework::OpKernel<T> {
     auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
     auto grad = ctx.Input<Tensor>("Grad");

-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto m = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+    auto p = framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto m = framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
+    auto g = framework::EigenVector<T>::Flatten(*grad);
+    auto lr =
+        framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));

-    auto p_out = EigenVector<T>::Flatten(*param_out);
-    auto m_out = EigenVector<T>::Flatten(*moment_out);
+    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
+    auto m_out = framework::EigenVector<T>::Flatten(*moment_out);
     auto* place = ctx.template device_context<DeviceContext>().eigen_device();

     Eigen::DSizes<int, 1> grad_dsize(grad->numel());
......
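A likely reason the file-local EigenVector alias template is removed here, and again in the similar kernels below: when Unity Build folds several operator sources into one translation unit, every copy of that identically named alias template in namespace paddle::operators after the first becomes a redefinition, and C++ forbids redefining an alias template even with identical text. Spelling out framework::EigenVector<T> avoids owning a name that another member of the group might also define. A minimal sketch of the failure mode, with invented names rather than Paddle code:

// alias_clash_sketch.cc: why duplicated alias templates break a unity TU.
#include <cstddef>

namespace fw {
// stand-in for framework::EigenVector
template <typename T, int Major = 0, typename Index = std::ptrdiff_t>
struct EigenVector {
  T value{};
};
}  // namespace fw

namespace ops {
// as it would appear in "a_op.h"
template <typename T, int Major = 0, typename Index = std::ptrdiff_t>
using EigenVector = fw::EigenVector<T, Major, Index>;

// as it would appear in "b_op.h"; uncommenting this in the same (unity)
// translation unit is a hard error, because an alias template cannot be
// redefined, not even with identical text:
// template <typename T, int Major = 0, typename Index = std::ptrdiff_t>
// using EigenVector = fw::EigenVector<T, Major, Index>;
}  // namespace ops

int main() {
  ops::EigenVector<float> v;
  v.value = 1.0f;
  return static_cast<int>(v.value) - 1;  // 0 on success
}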
@@ -20,9 +20,6 @@ namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

 template <typename DeviceContext, typename T>
 class ProximalGDOpKernel : public framework::OpKernel<T> {
@@ -37,11 +34,12 @@ class ProximalGDOpKernel : public framework::OpKernel<T> {
     auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
     auto l2 = static_cast<T>(ctx.Attr<float>("l2"));

-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+    auto p = framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto g = framework::EigenVector<T>::Flatten(*grad);
+    auto lr =
+        framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));

-    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();

     Eigen::DSizes<int, 1> grad_dsize(grad->numel());
@@ -52,10 +50,10 @@ class ProximalGDOpKernel : public framework::OpKernel<T> {
           prox_param.sign() *
           (((prox_param.abs() - (lr * l1).broadcast(grad_dsize))
                .cwiseMax(T(0.0))) /
-           (1.0 + (lr * l2).broadcast(grad_dsize)));
+           (1.0f + (lr * l2).broadcast(grad_dsize)));
     } else {
       p_out.device(place) =
-          prox_param / (1.0 + (lr * l2).broadcast(grad_dsize));
+          prox_param / (1.0f + (lr * l2).broadcast(grad_dsize));
     }
   }
 };
......
@@ -23,10 +23,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
 template <typename T>
 struct DenseRmspropGradFunctor {
   inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}
@@ -169,25 +165,25 @@ class RmspropOpKernel : public framework::OpKernel<T> {
           *ctx.template device_context<DeviceContext>().eigen_device();
       auto lr_value = lr_tensor.data<T>()[0];

-      auto p = EigenVector<T>::Flatten(p_tensor);
-      auto ms = EigenVector<T>::Flatten(ms_tensor);
-      auto g = EigenVector<T>::Flatten(grad_tensor);
-      auto mom = EigenVector<T>::Flatten(mom_tensor);
+      auto p = framework::EigenVector<T>::Flatten(p_tensor);
+      auto ms = framework::EigenVector<T>::Flatten(ms_tensor);
+      auto g = framework::EigenVector<T>::Flatten(grad_tensor);
+      auto mom = framework::EigenVector<T>::Flatten(mom_tensor);

-      auto p_out = EigenVector<T>::Flatten(*param_out);
-      auto mom_out = EigenVector<T>::Flatten(*moment_out);
-      auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
+      auto p_out = framework::EigenVector<T>::Flatten(*param_out);
+      auto mom_out = framework::EigenVector<T>::Flatten(*moment_out);
+      auto ms_out = framework::EigenVector<T>::Flatten(*mean_square_out);

       ms_out.device(place) = rho * ms + (1 - rho) * g * g;

       if (centered) {
         auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
-        auto mg = EigenVector<T>::Flatten(mg_tensor);
+        auto mg = framework::EigenVector<T>::Flatten(mg_tensor);
         auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
         PADDLE_ENFORCE_EQ(
             &mg_tensor, mean_grad_out,
             platform::errors::InvalidArgument(
                 "MeanGrad and MeanGradOut must be the same Tensor"));
-        auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
+        auto mg_out = framework::EigenVector<T>::Flatten(*mean_grad_out);
         mg_out.device(place) = rho * mg + (1 - rho) * g;
         mom_out.device(place) =
......
@@ -8,14 +8,13 @@ register_unity_group(cc
     ftrl_op.cc
     lars_momentum_op.cc
     momentum_op.cc
-    sgd_op.cc)
-register_unity_group(cc
+    sgd_op.cc
+    proximal_adagrad_op.cc
     adagrad_op.cc
     adam_op.cc
     adamax_op.cc
     dgc_momentum_op.cc
-    proximal_gd_op.cc)
-register_unity_group(cc
+    proximal_gd_op.cc
     decayed_adagrad_op.cc
     adadelta_op.cc
     lamb_op.cc
@@ -25,16 +24,12 @@ register_unity_group(cu
     ftrl_op.cu
     lars_momentum_op.cu
     momentum_op.cu
-    sgd_op.cu)
-register_unity_group(cu
+    sgd_op.cu
+    proximal_adagrad_op.cu
     adagrad_op.cu
     adam_op.cu
-    adamax_op.cu)
-register_unity_group(cu
+    adamax_op.cu
     decayed_adagrad_op.cu
     adadelta_op.cu
     lamb_op.cu
     rmsprop_op.cu)
-# The following groups are to make better use of `/MP` which MSVC's parallel
-# compilation instruction when compiling in Unity Build.
-register_unity_group(cu proximal_adagrad_op.cu)
@@ -133,16 +133,18 @@ namespace op = paddle::operators;
 REGISTER_OPERATOR(sequence_concat, op::SequenceConcatOp, op::SeqConcatOpMaker,
                   op::SeqConcatGradOpMaker<paddle::framework::OpDesc>,
                   op::SeqConcatGradOpMaker<paddle::imperative::OpBase>);
-template <typename T>
-using Kernel = op::SeqConcatKernel<paddle::platform::CPUDeviceContext, T>;
-REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
-                       Kernel<int>, Kernel<int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_concat,
+    op::SeqConcatKernel<paddle::platform::CPUDeviceContext, float>,
+    op::SeqConcatKernel<paddle::platform::CPUDeviceContext, double>,
+    op::SeqConcatKernel<paddle::platform::CPUDeviceContext, int>,
+    op::SeqConcatKernel<paddle::platform::CPUDeviceContext, int64_t>);

 REGISTER_OPERATOR(sequence_concat_grad, op::SeqConcatGradOp,
                   op::SeqConcatGradNoNeedBufferVarsInferer);
-template <typename T>
-using GradKernel =
-    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, T>;
-REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel<float>,
-                       GradKernel<double>, GradKernel<int>,
-                       GradKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_concat_grad,
+    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, float>,
+    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, double>,
+    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, int>,
+    op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -21,15 +21,23 @@ class CUDADeviceContext;
 }  // namespace platform
 }  // namespace paddle

-template <typename T>
-using Kernel =
-    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext, T>;
-REGISTER_OP_CUDA_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
-                        Kernel<int>, Kernel<int64_t>);
-template <typename T>
-using GradKernel =
-    paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
-                                           T>;
-REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, GradKernel<float>,
-                        GradKernel<double>, GradKernel<int>,
-                        GradKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_concat,
+    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
+                                       float>,
+    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
+                                       double>,
+    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
+                                       int>,
+    paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
+                                       int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_concat_grad,
+    paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
+                                           float>,
+    paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
+                                           double>,
+    paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
+                                           int>,
+    paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
+                                           int64_t>);
@@ -62,7 +62,7 @@ static __global__ void sequence_expand_as_grad_kernel(
 }

 template <typename T>
-struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
+struct SequenceExpandAsFunctor<platform::CUDADeviceContext, T> {
   void operator()(
       const platform::CUDADeviceContext &context, const LoDTensor &x,
       const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
......
@@ -24,7 +24,7 @@ namespace paddle {
 namespace operators {

 template <typename DeviceContext, typename T>
-struct SequenceExpandFunctor {
+struct SequenceExpandAsFunctor {
   void operator()(
       const DeviceContext &ctx, const framework::LoDTensor &x,
       const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
@@ -40,7 +40,7 @@ struct SequenceExpandAsGradFunctor {
 };

 template <typename T>
-struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
+struct SequenceExpandAsFunctor<platform::CPUDeviceContext, T> {
   void operator()(
       const platform::CPUDeviceContext &context, const framework::LoDTensor &x,
       const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
@@ -97,7 +97,7 @@ class SequenceExpandAsKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(context.GetPlace());

     auto &dev_ctx = context.template device_context<DeviceContext>();
-    SequenceExpandFunctor<DeviceContext, T> seq_espand_functor;
+    SequenceExpandAsFunctor<DeviceContext, T> seq_espand_functor;
     seq_espand_functor(dev_ctx, *x, y_lod[0], out);
   }
 };
......
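The rename from SequenceExpandFunctor to SequenceExpandAsFunctor fits the same pattern: presumably the sibling sequence_expand_op defines its own SequenceExpandFunctor, and the regrouped rules below place the sequence_expand and sequence_expand_as sources in one unity group, so two unrelated class templates with that name would collide inside paddle::operators. The primary template, its CPU specialization, and the CUDA specialization in the .cu file above all have to move to the new name in the same patch, since an explicit specialization must name an existing primary template. A small compilable sketch of that primary-plus-specialization shape, with hypothetical names:

// rename_sketch.cc: hypothetical names, showing why the header and the
// .cu file must switch to the new functor name together.
#include <cstdio>

namespace demo {
struct CPUCtx {};

// primary template, as declared in the op header
template <typename Ctx, typename T>
struct ExpandAsFunctor;

// CPU specialization living next to the primary template
template <typename T>
struct ExpandAsFunctor<CPUCtx, T> {
  void operator()(const CPUCtx&, const T* in, T* out, int n) const {
    for (int i = 0; i < n; ++i) out[i] = in[i];  // trivial stand-in body
  }
};
// A GPU specialization in a separate .cu file must spell the same (new)
// name; specializing a template name that no longer exists is a compile
// error, which is why the .h and .cu hunks change in lockstep.
}  // namespace demo

int main() {
  demo::CPUCtx ctx;
  float in[2] = {1.0f, 2.0f};
  float out[2] = {0.0f, 0.0f};
  demo::ExpandAsFunctor<demo::CPUCtx, float>()(ctx, in, out, 2);
  std::printf("%g %g\n", out[0], out[1]);
  return 0;
}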
@@ -12,8 +12,7 @@ register_unity_group(cc
     sequence_expand_op.cc
     sequence_mask_op.cc
     sequence_pad_op.cc
-    sequence_pool_op.cc)
-register_unity_group(cc
+    sequence_pool_op.cc
     sequence_expand_as_op.cc
     sequence_reshape_op.cc
     sequence_reverse_op.cc
@@ -21,8 +20,7 @@ register_unity_group(cc
     sequence_slice_op.cc
     sequence_softmax_op.cc
     sequence_topk_avg_pooling_op.cc
-    sequence_unpad_op.cc)
-register_unity_group(cc
+    sequence_unpad_op.cc
     sequence_concat_op.cu.cc
     sequence_conv_op.cu.cc)
 register_unity_group(cu
@@ -31,8 +29,7 @@ register_unity_group(cu
     sequence_expand_op.cu
     sequence_mask_op.cu
     sequence_pad_op.cu
-    sequence_pool_op.cu)
-register_unity_group(cu
+    sequence_pool_op.cu
     sequence_expand_as_op.cu
     sequence_reshape_op.cu
     sequence_reverse_op.cu
......