未验证 提交 2e5b4a21 编写于 作者: L LoveAn 提交者: GitHub

Optimize compilation time with Unity Build (#29733)

* Test compilation time with less parallel count, notest, test=windows_ci

* optimize rules of Unity Build, notest, test=windows_ci, test=windows_op

* limit parallel counts used only on GPU, test=develop

* remove limit of argument /m:8 on Windows, test=develop
上级 0c23ba95
......@@ -87,7 +87,13 @@ if(WIN32)
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
# NOTE(Avin0323): A lower parallel count results in faster compilation with
# Unity Build on GPU.
if(WITH_UNITY_BUILD AND WITH_GPU)
set(${flag_var} "${${flag_var}} /MP8")
else()
set(${flag_var} "${${flag_var}} /MP")
endif()
endforeach(flag_var)
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
set(${flag_var} "${${flag_var}} /w")
......
......@@ -14,3 +14,7 @@ register_unity_group(cc
logical_op.cc
tensor_array_read_write_op.cc
while_op.cc)
register_unity_group(cu
logical_op.cu
compare_op.cu
compare_all_op.cu)
......@@ -20,9 +20,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
class ProximalAdagradOpKernel : public framework::OpKernel<T> {
......@@ -38,13 +35,14 @@ class ProximalAdagradOpKernel : public framework::OpKernel<T> {
auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
auto grad = ctx.Input<Tensor>("Grad");
auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
auto m = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
auto g = EigenVector<T>::Flatten(*grad);
auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
auto p = framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
auto m = framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
auto g = framework::EigenVector<T>::Flatten(*grad);
auto lr =
framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
auto p_out = EigenVector<T>::Flatten(*param_out);
auto m_out = EigenVector<T>::Flatten(*moment_out);
auto p_out = framework::EigenVector<T>::Flatten(*param_out);
auto m_out = framework::EigenVector<T>::Flatten(*moment_out);
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 1> grad_dsize(grad->numel());
......
......@@ -20,9 +20,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
class ProximalGDOpKernel : public framework::OpKernel<T> {
......@@ -37,11 +34,12 @@ class ProximalGDOpKernel : public framework::OpKernel<T> {
auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
auto g = EigenVector<T>::Flatten(*grad);
auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
auto p = framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
auto g = framework::EigenVector<T>::Flatten(*grad);
auto lr =
framework::EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
auto p_out = EigenVector<T>::Flatten(*param_out);
auto p_out = framework::EigenVector<T>::Flatten(*param_out);
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 1> grad_dsize(grad->numel());
......@@ -52,10 +50,10 @@ class ProximalGDOpKernel : public framework::OpKernel<T> {
prox_param.sign() *
(((prox_param.abs() - (lr * l1).broadcast(grad_dsize))
.cwiseMax(T(0.0))) /
(1.0 + (lr * l2).broadcast(grad_dsize)));
(1.0f + (lr * l2).broadcast(grad_dsize)));
} else {
p_out.device(place) =
prox_param / (1.0 + (lr * l2).broadcast(grad_dsize));
prox_param / (1.0f + (lr * l2).broadcast(grad_dsize));
}
}
};
......
......@@ -23,10 +23,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T>
struct DenseRmspropGradFunctor {
inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}
......@@ -169,25 +165,25 @@ class RmspropOpKernel : public framework::OpKernel<T> {
*ctx.template device_context<DeviceContext>().eigen_device();
auto lr_value = lr_tensor.data<T>()[0];
auto p = EigenVector<T>::Flatten(p_tensor);
auto ms = EigenVector<T>::Flatten(ms_tensor);
auto g = EigenVector<T>::Flatten(grad_tensor);
auto mom = EigenVector<T>::Flatten(mom_tensor);
auto p = framework::EigenVector<T>::Flatten(p_tensor);
auto ms = framework::EigenVector<T>::Flatten(ms_tensor);
auto g = framework::EigenVector<T>::Flatten(grad_tensor);
auto mom = framework::EigenVector<T>::Flatten(mom_tensor);
auto p_out = EigenVector<T>::Flatten(*param_out);
auto mom_out = EigenVector<T>::Flatten(*moment_out);
auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
auto p_out = framework::EigenVector<T>::Flatten(*param_out);
auto mom_out = framework::EigenVector<T>::Flatten(*moment_out);
auto ms_out = framework::EigenVector<T>::Flatten(*mean_square_out);
ms_out.device(place) = rho * ms + (1 - rho) * g * g;
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto mg = EigenVector<T>::Flatten(mg_tensor);
auto mg = framework::EigenVector<T>::Flatten(mg_tensor);
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(
&mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
auto mg_out = framework::EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) =
......
......@@ -8,14 +8,13 @@ register_unity_group(cc
ftrl_op.cc
lars_momentum_op.cc
momentum_op.cc
sgd_op.cc)
register_unity_group(cc
sgd_op.cc
proximal_adagrad_op.cc
adagrad_op.cc
adam_op.cc
adamax_op.cc
dgc_momentum_op.cc
proximal_gd_op.cc)
register_unity_group(cc
proximal_gd_op.cc
decayed_adagrad_op.cc
adadelta_op.cc
lamb_op.cc
......@@ -25,16 +24,12 @@ register_unity_group(cu
ftrl_op.cu
lars_momentum_op.cu
momentum_op.cu
sgd_op.cu)
register_unity_group(cu
sgd_op.cu
proximal_adagrad_op.cu
adagrad_op.cu
adam_op.cu
adamax_op.cu)
register_unity_group(cu
adamax_op.cu
decayed_adagrad_op.cu
adadelta_op.cu
lamb_op.cu
rmsprop_op.cu)
# The following groups are split out to make better use of `/MP`, MSVC's
# parallel-compilation option, when compiling in Unity Build mode.
register_unity_group(cu proximal_adagrad_op.cu)
......@@ -133,16 +133,18 @@ namespace op = paddle::operators;
REGISTER_OPERATOR(sequence_concat, op::SequenceConcatOp, op::SeqConcatOpMaker,
op::SeqConcatGradOpMaker<paddle::framework::OpDesc>,
op::SeqConcatGradOpMaker<paddle::imperative::OpBase>);
template <typename T>
using Kernel = op::SeqConcatKernel<paddle::platform::CPUDeviceContext, T>;
REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
Kernel<int>, Kernel<int64_t>);
REGISTER_OP_CPU_KERNEL(
sequence_concat,
op::SeqConcatKernel<paddle::platform::CPUDeviceContext, float>,
op::SeqConcatKernel<paddle::platform::CPUDeviceContext, double>,
op::SeqConcatKernel<paddle::platform::CPUDeviceContext, int>,
op::SeqConcatKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OPERATOR(sequence_concat_grad, op::SeqConcatGradOp,
op::SeqConcatGradNoNeedBufferVarsInferer);
template <typename T>
using GradKernel =
op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, T>;
REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel<float>,
GradKernel<double>, GradKernel<int>,
GradKernel<int64_t>);
REGISTER_OP_CPU_KERNEL(
sequence_concat_grad,
op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, float>,
op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, double>,
op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, int>,
op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -21,15 +21,23 @@ class CUDADeviceContext;
} // namespace platform
} // namespace paddle
template <typename T>
using Kernel =
paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext, T>;
REGISTER_OP_CUDA_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
Kernel<int>, Kernel<int64_t>);
template <typename T>
using GradKernel =
REGISTER_OP_CUDA_KERNEL(
sequence_concat,
paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
float>,
paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
double>,
paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
int>,
paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
int64_t>);
REGISTER_OP_CUDA_KERNEL(
sequence_concat_grad,
paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
T>;
REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, GradKernel<float>,
GradKernel<double>, GradKernel<int>,
GradKernel<int64_t>);
float>,
paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
double>,
paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
int>,
paddle::operators::SeqConcatGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
......@@ -62,7 +62,7 @@ static __global__ void sequence_expand_as_grad_kernel(
}
template <typename T>
struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
struct SequenceExpandAsFunctor<platform::CUDADeviceContext, T> {
void operator()(
const platform::CUDADeviceContext &context, const LoDTensor &x,
const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
......
......@@ -24,7 +24,7 @@ namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
struct SequenceExpandFunctor {
struct SequenceExpandAsFunctor {
void operator()(
const DeviceContext &ctx, const framework::LoDTensor &x,
const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
......@@ -40,7 +40,7 @@ struct SequenceExpandAsGradFunctor {
};
template <typename T>
struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
struct SequenceExpandAsFunctor<platform::CPUDeviceContext, T> {
void operator()(
const platform::CPUDeviceContext &context, const framework::LoDTensor &x,
const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
......@@ -97,7 +97,7 @@ class SequenceExpandAsKernel : public framework::OpKernel<T> {
out->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>();
SequenceExpandFunctor<DeviceContext, T> seq_espand_functor;
SequenceExpandAsFunctor<DeviceContext, T> seq_espand_functor;
seq_espand_functor(dev_ctx, *x, y_lod[0], out);
}
};
......
......@@ -12,8 +12,7 @@ register_unity_group(cc
sequence_expand_op.cc
sequence_mask_op.cc
sequence_pad_op.cc
sequence_pool_op.cc)
register_unity_group(cc
sequence_pool_op.cc
sequence_expand_as_op.cc
sequence_reshape_op.cc
sequence_reverse_op.cc
......@@ -21,8 +20,7 @@ register_unity_group(cc
sequence_slice_op.cc
sequence_softmax_op.cc
sequence_topk_avg_pooling_op.cc
sequence_unpad_op.cc)
register_unity_group(cc
sequence_unpad_op.cc
sequence_concat_op.cu.cc
sequence_conv_op.cu.cc)
register_unity_group(cu
......@@ -31,8 +29,7 @@ register_unity_group(cu
sequence_expand_op.cu
sequence_mask_op.cu
sequence_pad_op.cu
sequence_pool_op.cu)
register_unity_group(cu
sequence_pool_op.cu
sequence_expand_as_op.cu
sequence_reshape_op.cu
sequence_reverse_op.cu
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册