From 7e671c07b6cdd5b284804e078fc569dc0a08dfa6 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Wed, 20 Jan 2021 19:43:01 +0800 Subject: [PATCH] optimize unity build (#30195) * optimize unity build, test=develop * fix code style error, test=develop * fix code style error and test /MP settings, test=develop --- CMakeLists.txt | 10 +-- .../fluid/operators/average_accumulates_op.h | 18 ++--- .../operators/conv_transpose_cudnn_op.cu | 41 +++++----- paddle/fluid/operators/correlation_op.cu | 1 - paddle/fluid/operators/dot_op.h | 30 +++---- paddle/fluid/operators/meshgrid_op.h | 18 +---- paddle/fluid/operators/rank_loss_op.h | 7 +- .../operators/softmax_with_cross_entropy_op.h | 9 +-- .../fluid/operators/squared_l2_distance_op.h | 32 +++----- paddle/fluid/operators/unity_build_rule.cmake | 81 ++++++++++++------- paddle/scripts/paddle_build.bat | 2 +- 11 files changed, 119 insertions(+), 130 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 487aa200d7..6c2848d0b1 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,6 +84,8 @@ if(WIN32) endforeach(flag_var) endif() + # NOTE(Avin0323): Less parallel count result in faster compilation. + math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") # windows build turn off warnings, use parallel compiling. foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE @@ -91,13 +93,7 @@ if(WIN32) CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") - # NOTE(Avin0323): Less parallel count result in faster compilation with - # Unity Build on GPU. - if(WITH_UNITY_BUILD AND WITH_GPU) - set(${flag_var} "${${flag_var}} /MP8") - else() - set(${flag_var} "${${flag_var}} /MP") - endif() + set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") endforeach(flag_var) foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) set(${flag_var} "${${flag_var}} /w") diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index 338e46111f..6813f56675 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -23,10 +23,6 @@ namespace operators { using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - template void GetAccumulators(const framework::ExecutionContext& ctx, int64_t* num_updates, int64_t* num_accumulates, @@ -67,18 +63,18 @@ class AverageAccumulatesKernel : public framework::OpKernel { auto* in_sum_1 = ctx.Input("in_sum_1"); auto* in_sum_2 = ctx.Input("in_sum_2"); auto* in_sum_3 = ctx.Input("in_sum_3"); - auto param_tensor = EigenVector::Flatten(*param); - auto in_sum_1_tensor = EigenVector::Flatten(*in_sum_1); - auto in_sum_2_tensor = EigenVector::Flatten(*in_sum_2); - auto in_sum_3_tensor = EigenVector::Flatten(*in_sum_3); + auto param_tensor = framework::EigenVector::Flatten(*param); + auto in_sum_1_tensor = framework::EigenVector::Flatten(*in_sum_1); + auto in_sum_2_tensor = framework::EigenVector::Flatten(*in_sum_2); + auto in_sum_3_tensor = framework::EigenVector::Flatten(*in_sum_3); // Get outputs auto* out_sum_1 = ctx.Output("out_sum_1"); auto* out_sum_2 = ctx.Output("out_sum_2"); auto* out_sum_3 = ctx.Output("out_sum_3"); - auto out_sum_1_tensor = EigenVector::Flatten(*out_sum_1); - auto out_sum_2_tensor = EigenVector::Flatten(*out_sum_2); - auto out_sum_3_tensor = EigenVector::Flatten(*out_sum_3); + auto out_sum_1_tensor = framework::EigenVector::Flatten(*out_sum_1); + auto out_sum_2_tensor = framework::EigenVector::Flatten(*out_sum_2); + auto out_sum_3_tensor = framework::EigenVector::Flatten(*out_sum_3); // Compute auto& place = *ctx.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index a12629b7a4..edf00eb2ba 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -25,7 +25,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using DataLayout = platform::DataLayout; template static void DataTranspose(const framework::ExecutionContext& ctx, @@ -67,14 +66,15 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { int groups = ctx.Attr("groups"); const T* filter_data = filter->data(); const std::string data_layout_str = ctx.Attr("data_format"); - const paddle::operators::DataLayout data_layout = - (data_layout_str != "NHWC" ? DataLayout::kNCHW : DataLayout::kNHWC); + const paddle::platform::DataLayout data_layout = + (data_layout_str != "NHWC" ? platform::DataLayout::kNCHW + : platform::DataLayout::kNHWC); // if channel_last, transpose to channel_first Tensor input_transpose; std::vector input_vec = framework::vectorize(input->dims()); std::vector output_vec = framework::vectorize(output->dims()); - if (data_layout == DataLayout::kNHWC) { + if (data_layout == platform::DataLayout::kNHWC) { if (strides.size() == 2U) { std::vector axis = {0, 3, 1, 2}; for (size_t i = 0; i < axis.size(); ++i) { @@ -195,7 +195,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { } T* transformed_output_data = transformed_output.data(); - DataLayout layout; + platform::DataLayout layout; int iwo_groups = groups; int c_groups = 1; @@ -206,9 +206,9 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { #endif if (strides.size() == 2U) { - layout = DataLayout::kNCHW; + layout = platform::DataLayout::kNCHW; } else { - layout = DataLayout::kNCDHW; + layout = platform::DataLayout::kNCDHW; } size_t workspace_size = 0; @@ -269,7 +269,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { ctx, &transformed_output, output, starts, ends, axes); } - if (data_layout == DataLayout::kNHWC) { + if (data_layout == platform::DataLayout::kNHWC) { Tensor output_transpose; Tensor output_nchw; output_nchw.ShareDataWith(*output); @@ -309,8 +309,9 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { std::string padding_algorithm = ctx.Attr("padding_algorithm"); int user_workspace_size = ctx.Attr("workspace_size_MB"); const std::string data_layout_str = ctx.Attr("data_format"); - const paddle::operators::DataLayout data_layout = - (data_layout_str != "NHWC" ? DataLayout::kNCHW : DataLayout::kNHWC); + const paddle::platform::DataLayout data_layout = + (data_layout_str != "NHWC" ? platform::DataLayout::kNCHW + : platform::DataLayout::kNHWC); // if channel_last, transpose to channel_first Tensor input_transpose; @@ -318,7 +319,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { std::vector input_vec = framework::vectorize(input->dims()); std::vector output_vec = framework::vectorize(output_grad->dims()); - if (data_layout == DataLayout::kNHWC) { + if (data_layout == platform::DataLayout::kNHWC) { if (strides.size() == 2U) { std::vector axis = {0, 3, 1, 2}; for (size_t i = 0; i < axis.size(); ++i) { @@ -416,12 +417,12 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { output_vec = framework::vectorize(transformed_output_grad.dims()); // ------------------- cudnn descriptors --------------------- - DataLayout layout; + platform::DataLayout layout; if (strides.size() == 2U) { - layout = DataLayout::kNCHW; + layout = platform::DataLayout::kNCHW; } else { - layout = DataLayout::kNCDHW; + layout = platform::DataLayout::kNCDHW; } int iwo_groups = groups; @@ -515,7 +516,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { workspace_handle.RunFunc(cudnn_func, workspace_size); } - if (data_layout == DataLayout::kNHWC) { + if (data_layout == platform::DataLayout::kNHWC) { Tensor input_grad_transpose; Tensor input_grad_nchw; input_grad_nchw.ShareDataWith(*input_grad); @@ -849,7 +850,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { cudnnConvolutionBwdFilterAlgo_t filter_algo = static_cast(0); - auto layout = GetCudnnTensorFormat(DataLayout::kNCHW); + auto layout = GetCudnnTensorFormat(platform::DataLayout::kNCHW); // ddo = conv(ddI, W) + conv(I, ddW) size_t workspace_size = 0; @@ -916,12 +917,12 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { } int i_n, i_c, i_d, i_h, i_w; - GetNCDHW(transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, - &i_w); + GetNCDHW(transformed_X.dims(), platform::DataLayout::kNCHW, &i_n, &i_c, + &i_d, &i_h, &i_w); int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, &o_h, - &o_w); + GetNCDHW(transformed_dO.dims(), platform::DataLayout::kNCHW, &o_n, &o_c, + &o_d, &o_h, &o_w); int group_offset_in = transformed_X.numel() / transformed_X.dims()[0] / groups; diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index 0d177f653e..6cf1ff5e72 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -23,7 +23,6 @@ namespace operators { #define FULL_MASK 0xffffffff using framework::Tensor; -using DataLayout = framework::DataLayout; template __forceinline__ __device__ T warpReduceSum(T val) { diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index c78ac87084..a197e2149e 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -26,10 +26,6 @@ using Tensor = framework::Tensor; using complex64 = platform::complex64; using complex128 = platform::complex128; -template -using EigenMatrix = framework::EigenMatrix; - template struct P { void operator()(T a, R b); @@ -85,11 +81,11 @@ struct DotGradFunction> { dy.device(dev) = dy * dout.broadcast(size); } } else { - auto dout = EigenMatrix::From(*tensor_dout); + auto dout = framework::EigenMatrix::From(*tensor_dout); if (tensor_dx) { tensor_dx->mutable_data(ctx.GetPlace()); - auto y = EigenMatrix::From(*tensor_y); + auto y = framework::EigenMatrix::From(*tensor_y); auto& dev_raw = ctx.template device_context(); auto& dev = *dev_raw.eigen_device(); Eigen::DSizes size(1, tensor_dx->dims()[1]); @@ -99,14 +95,14 @@ struct DotGradFunction> { math::ConjFunctor functor(tensor_y->data(), tensor_y->numel(), tensor_dx->data()); for_range(functor); - auto dx = EigenMatrix::From(*tensor_dx); + auto dx = framework::EigenMatrix::From(*tensor_dx); dx.device(dev) = dx * dout.broadcast(size); } if (tensor_dy) { tensor_dy->mutable_data(ctx.GetPlace()); - auto x = EigenMatrix::From(*tensor_x); + auto x = framework::EigenMatrix::From(*tensor_x); auto& dev_raw = ctx.template device_context(); auto& dev = *dev_raw.eigen_device(); Eigen::DSizes size(1, tensor_dy->dims()[1]); @@ -117,7 +113,7 @@ struct DotGradFunction> { tensor_dy->data()); for_range(functor); - auto dy = EigenMatrix::From(*tensor_dy); + auto dy = framework::EigenMatrix::From(*tensor_dy); dy.device(dev) = dy * dout.broadcast(size); } @@ -186,12 +182,12 @@ struct DotGradFunction> { dy.device(dev) = x * dout.broadcast(size); } } else { - auto dout = EigenMatrix::From(*tensor_dout); + auto dout = framework::EigenMatrix::From(*tensor_dout); if (tensor_dx) { tensor_dx->mutable_data(ctx.GetPlace()); - auto y = EigenMatrix::From(*tensor_y); - auto dx = EigenMatrix::From(*tensor_dx); + auto y = framework::EigenMatrix::From(*tensor_y); + auto dx = framework::EigenMatrix::From(*tensor_dx); auto& dev = *ctx.template device_context().eigen_device(); Eigen::DSizes size(1, tensor_dx->dims()[1]); @@ -200,8 +196,8 @@ struct DotGradFunction> { if (tensor_dy) { tensor_dy->mutable_data(ctx.GetPlace()); - auto x = EigenMatrix::From(*tensor_x); - auto dy = EigenMatrix::From(*tensor_dy); + auto x = framework::EigenMatrix::From(*tensor_x); + auto dy = framework::EigenMatrix::From(*tensor_dy); auto& dev = *ctx.template device_context().eigen_device(); Eigen::DSizes size(1, tensor_dy->dims()[1]); @@ -262,9 +258,9 @@ class DotKernel : public framework::OpKernel { auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = (x * y).sum(); } else { - auto out = EigenMatrix::From(*tensor_out); - auto x = EigenMatrix::From(*tensor_x); - auto y = EigenMatrix::From(*tensor_y); + auto out = framework::EigenMatrix::From(*tensor_out); + auto x = framework::EigenMatrix::From(*tensor_x); + auto y = framework::EigenMatrix::From(*tensor_y); auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = (x * y).sum(Eigen::DSizes(1)); diff --git a/paddle/fluid/operators/meshgrid_op.h b/paddle/fluid/operators/meshgrid_op.h index d591912bef..11cd43b220 100644 --- a/paddle/fluid/operators/meshgrid_op.h +++ b/paddle/fluid/operators/meshgrid_op.h @@ -50,16 +50,6 @@ namespace paddle { namespace operators { -template -using EigenMatrix = framework::EigenMatrix; -template -using EigenVector = framework::EigenVector; -template -using EigenTensor = framework::EigenTensor; - template class MeshgridKernel : public framework::OpKernel { public: @@ -120,9 +110,9 @@ class MeshgridKernel : public framework::OpKernel { bcast_dims[i] = 1; outs[i]->Resize(out_dims); - auto x = EigenTensor::From(reshape_ins_tensor); + auto x = framework::EigenTensor::From(reshape_ins_tensor); outs[i]->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*outs[i]); + auto y = framework::EigenTensor::From(*outs[i]); auto& place = *context.template device_context().eigen_device(); y.device(place) = x.broadcast(bcast_dims); @@ -159,8 +149,8 @@ class MeshgridGradKernel : public framework::OpKernel { for (int i = 0; i < n; i++) { outs[i]->mutable_data(context.GetPlace()); - auto out_grad_tmp = EigenVector::Flatten(*out_grad[i]); - auto in_grad = EigenVector::Flatten(*outs[i]); + auto out_grad_tmp = framework::EigenVector::Flatten(*out_grad[i]); + auto in_grad = framework::EigenVector::Flatten(*outs[i]); std::vector reduce_dims_vec; std::vector reshape_dims_vec; diff --git a/paddle/fluid/operators/rank_loss_op.h b/paddle/fluid/operators/rank_loss_op.h index 28626c0e2e..8609958476 100644 --- a/paddle/fluid/operators/rank_loss_op.h +++ b/paddle/fluid/operators/rank_loss_op.h @@ -37,7 +37,7 @@ class RankLossKernel : public framework::OpKernel { auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = - (1. + (left - right).exp()).log() - label * (left - right); + (1.0f + (left - right).exp()).log() - label * (left - right); } }; @@ -65,14 +65,15 @@ class RankLossGradKernel : public framework::OpKernel { if (d_left_t) { d_left_t->mutable_data(ctx.GetPlace()); auto d_left = framework::EigenVector::Flatten(*d_left_t); - d_left.device(dev) = d_out * (1. / (1. + (right - left).exp()) - label); + d_left.device(dev) = + d_out * (1.0f / (1.0f + (right - left).exp()) - label); } // compute d_right if (d_right_t) { d_right_t->mutable_data(ctx.GetPlace()); auto d_right = framework::EigenVector::Flatten(*d_right_t); d_right.device(dev) = - -d_out * (1.0 / (1. + (right - left).exp()) - label); + -d_out * (1.0f / (1.0f + (right - left).exp()) - label); } } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index 93f2552c3c..35663bd9b7 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -23,9 +23,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; template class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { @@ -95,12 +92,12 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n}); out_grad_2d.ShareDataWith(*out_grad).Resize({n, d / axis_dim}); - auto out_grad_mat = EigenMatrix::From(out_grad_2d); - auto logit_grad_mat = EigenMatrix::From(logit_grad_2d); + auto out_grad_mat = framework::EigenMatrix::From(out_grad_2d); + auto logit_grad_mat = framework::EigenMatrix::From(logit_grad_2d); auto& place = *context.template device_context() .eigen_device(); if (soft_label) { - auto lbl_mat = EigenMatrix::From(labels_2d); + auto lbl_mat = framework::EigenMatrix::From(labels_2d); logit_grad_mat.device(place) = out_grad_mat.broadcast(Eigen::DSizes(1, axis_dim)) * (logit_grad_mat - lbl_mat); diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h index b8735a69c4..5472ecaf99 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.h +++ b/paddle/fluid/operators/squared_l2_distance_op.h @@ -20,12 +20,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; -template -using EigenMatrix = framework::EigenMatrix; template class SquaredL2DistanceKernel : public framework::OpKernel { @@ -41,15 +35,15 @@ class SquaredL2DistanceKernel : public framework::OpKernel { int cols = in0->numel() / in0_dims[0]; // reduce dimensions except the first - auto x = - EigenMatrix::From(*in0, framework::make_ddim({in0_dims[0], cols})); - auto y = - EigenMatrix::From(*in1, framework::make_ddim({in1_dims[0], cols})); + auto x = framework::EigenMatrix::From( + *in0, framework::make_ddim({in0_dims[0], cols})); + auto y = framework::EigenMatrix::From( + *in1, framework::make_ddim({in1_dims[0], cols})); out0->mutable_data(context.GetPlace()); out1->mutable_data(context.GetPlace()); - auto sub_result = EigenMatrix::From(*out0); - auto z = EigenVector::Flatten(*out1); + auto sub_result = framework::EigenMatrix::From(*out0); + auto z = framework::EigenVector::Flatten(*out1); auto& place = *context.template device_context().eigen_device(); @@ -88,8 +82,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel { "in scope for operator 'squared_l2_distance_grad'.", framework::GradVarName("Y"))); - auto sub_result = EigenMatrix::From(*in0); - auto out_grad = EigenMatrix::From(*in1); + auto sub_result = framework::EigenMatrix::From(*in0); + auto out_grad = framework::EigenMatrix::From(*in1); auto x_dims = x_g->dims(); auto y_dims = y_g->dims(); @@ -106,8 +100,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel { x_g->mutable_data(context.GetPlace()); // eigen matrix - auto x_grad = - EigenMatrix::From(*x_g, framework::make_ddim({x_dims[0], cols})); + auto x_grad = framework::EigenMatrix::From( + *x_g, framework::make_ddim({x_dims[0], cols})); // dimensions are same with subResult x_grad.device(eigen_place) = grad_mat; @@ -121,12 +115,12 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel { sub_result.dimensions()[0], y_dims[0])); if (sub_result.dimensions()[0] == y_dims[0]) { - auto y_grad = - EigenMatrix::From(*y_g, framework::make_ddim({y_dims[0], cols})); + auto y_grad = framework::EigenMatrix::From( + *y_g, framework::make_ddim({y_dims[0], cols})); y_grad.device(eigen_place) = -1 * grad_mat; } else { auto col_sum_res = -1 * (grad_mat.sum(Eigen::array({{0}}))); - auto y_grad = EigenVector::Flatten(*y_g); + auto y_grad = framework::EigenVector::Flatten(*y_g); y_grad.device(eigen_place) = col_sum_res; } } diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index c59a239c4b..d21f6b2d69 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -307,32 +307,36 @@ register_unity_group(cc spp_op.cu.cc squeeze_op.cu.cc unbind_op.cu.cc - unique_op.cu unpool_op.cu.cc unsqueeze_op.cu.cc) register_unity_group(cu addmm_op.cu affine_channel_op.cu allclose_op.cu - argsort_op.cu assign_value_op.cu bce_loss_op.cu bernoulli_op.cu - bilateral_slice_op.cu) + bilateral_slice_op.cu + batch_norm_op.cu) register_unity_group(cu bilinear_tensor_product_op.cu bmm_op.cu cast_op.cu cholesky_op.cu clip_by_norm_op.cu - clip_op.cu) + clip_op.cu + conv_cudnn_op.cu + affine_grid_op.cu) register_unity_group(cu center_loss_op.cu conv_op.cu conv_transpose_cudnn_op.cu conv_transpose_op.cu cos_sim_op.cu - crop_op.cu) + crop_op.cu + average_accumulates_op.cu + conj_op.cu + correlation_op.cu) register_unity_group(cu cross_entropy_op.cu cross_op.cu @@ -349,7 +353,9 @@ register_unity_group(cu diag_op.cu diag_v2_op.cu edit_distance_op.cu - erf_op.cu) + erf_op.cu + meshgrid_op.cu + imag_op.cu) register_unity_group(cu expand_v2_op.cu fake_dequantize_op.cu @@ -377,10 +383,8 @@ register_unity_group(cu inplace_abn_op.cu interpolate_v2_op.cu isfinite_op.cu - kron_op.cu l1_norm_op.cu label_smooth_op.cu - layer_norm_op.cu linspace_op.cu load_combine_op.cu load_op.cu) @@ -388,20 +392,30 @@ register_unity_group(cu lod_reset_op.cu log_softmax_op.cu lrn_op.cu - lstm_unit_op.cu) + lstm_unit_op.cu + dot_op.cu + psroi_pool_op.cu + rank_loss_op.cu + real_op.cu) register_unity_group(cu log_loss_op.cu lookup_table_v2_op.cu margin_rank_loss_op.cu masked_select_op.cu - merge_selected_rows_op.cu) + merge_selected_rows_op.cu + lstmp_op.cu + shuffle_channel_op.cu + softmax_cudnn_op.cu + squared_l2_distance_op.cu) register_unity_group(cu conv_shift_op.cu dequantize_log_op.cu dropout_op.cu fake_quantize_op.cu gelu_op.cu - lookup_table_op.cu) + lookup_table_op.cu + sigmoid_cross_entropy_with_logits_op.cu + softmax_with_cross_entropy_op.cu) register_unity_group(cu mean_iou_op.cu mean_op.cu @@ -430,7 +444,10 @@ register_unity_group(cu random_crop_op.cu randperm_op.cu range_op.cu - reverse_op.cu) + reverse_op.cu + partial_concat_op.cu + kldiv_loss_op.cu + instance_norm_op.cu) register_unity_group(cu roi_align_op.cu roll_op.cu @@ -457,40 +474,42 @@ register_unity_group(cu split_op.cu split_selected_rows_op.cu squared_l2_norm_op.cu - stack_op.cu - strided_slice_op.cu sum_op.cu - temporal_shift_op.cu) + temporal_shift_op.cu + arg_max_op.cu) register_unity_group(cu row_conv_op.cu - tile_op.cu - trace_op.cu - transpose_op.cu tree_conv_op.cu tril_triu_op.cu truncated_gaussian_random_op.cu - unfold_op.cu) + unfold_op.cu + arg_min_op.cu + crop_tensor_op.cu) register_unity_group(cu smooth_l1_loss_op.cu uniform_random_op.cu - unique_op.cu unstack_op.cu where_index_op.cu - where_op.cu) + where_op.cu + layer_norm_op.cu) +register_unity_group(cu + expand_as_op.cu + stack_op.cu) # The following groups are to make better use of `/MP` which MSVC's parallel # compilation instruction when compiling in Unity Build. register_unity_group(cu activation_op.cu) -register_unity_group(cu arg_max_op.cu) -register_unity_group(cu arg_min_op.cu) -register_unity_group(cu batch_norm_op.cu) -register_unity_group(cu crop_tensor_op.cu) register_unity_group(cu dist_op.cu) -register_unity_group(cu expand_as_op.cu) register_unity_group(cu expand_as_v2_op.cu) register_unity_group(cu gru_unit_op.cu) -register_unity_group(cu instance_norm_op.cu) -register_unity_group(cu kldiv_loss_op.cu) -register_unity_group(cu partial_concat_op.cu) -register_unity_group(cu softmax_with_cross_entropy_op.cu) -register_unity_group(cu squared_l2_distance_op.cu) register_unity_group(cu top_k_op.cu) +register_unity_group(cu argsort_op.cu) +register_unity_group(cu kron_op.cu) +register_unity_group(cu unique_op.cu) +register_unity_group(cu tile_op.cu) +register_unity_group(cu trace_op.cu) +register_unity_group(cu transpose_op.cu) +register_unity_group(cu strided_slice_op.cu) +register_unity_group(cu expand_op.cu) +register_unity_group(cu matmul_v2_op.cu) +register_unity_group(cu top_k_v2_op.cu) +register_unity_group(cu set_value_op.cu) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index a6c6a065d2..eb356b5869 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -262,7 +262,7 @@ echo ======================================== echo Step 2. Buile Paddle ... echo ======================================== -for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*9/10 +for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*2/3 set build_times=1 :build_tp echo Build third_party the %build_times% time: -- GitLab