未验证 提交 7e671c07 编写于 作者: W wuhuanzhou 提交者: GitHub

optimize unity build (#30195)

* optimize unity build, test=develop

* fix code style error, test=develop

* fix code style error and test /MP settings, test=develop
上级 e5b0d9e1
......@@ -84,6 +84,8 @@ if(WIN32)
endforeach(flag_var)
endif()
# NOTE(Avin0323): A lower parallel count results in faster compilation.
math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
# windows build turn off warnings, use parallel compiling.
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
......@@ -91,13 +93,7 @@ if(WIN32)
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
# NOTE(Avin0323): A lower parallel count results in faster compilation with
# Unity Build on GPU.
if(WITH_UNITY_BUILD AND WITH_GPU)
set(${flag_var} "${${flag_var}} /MP8")
else()
set(${flag_var} "${${flag_var}} /MP")
endif()
set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
endforeach(flag_var)
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
set(${flag_var} "${${flag_var}} /w")
......
......@@ -23,10 +23,6 @@ namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext>
void GetAccumulators(const framework::ExecutionContext& ctx,
int64_t* num_updates, int64_t* num_accumulates,
......@@ -67,18 +63,18 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
auto param_tensor = EigenVector<T>::Flatten(*param);
auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);
auto param_tensor = framework::EigenVector<T>::Flatten(*param);
auto in_sum_1_tensor = framework::EigenVector<T>::Flatten(*in_sum_1);
auto in_sum_2_tensor = framework::EigenVector<T>::Flatten(*in_sum_2);
auto in_sum_3_tensor = framework::EigenVector<T>::Flatten(*in_sum_3);
// Get outputs
auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
auto out_sum_1_tensor = framework::EigenVector<T>::Flatten(*out_sum_1);
auto out_sum_2_tensor = framework::EigenVector<T>::Flatten(*out_sum_2);
auto out_sum_3_tensor = framework::EigenVector<T>::Flatten(*out_sum_3);
// Compute
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
......
......@@ -25,7 +25,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DataLayout = platform::DataLayout;
template <typename T, int D>
static void DataTranspose(const framework::ExecutionContext& ctx,
......@@ -67,14 +66,15 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
int groups = ctx.Attr<int>("groups");
const T* filter_data = filter->data<T>();
const std::string data_layout_str = ctx.Attr<std::string>("data_format");
const paddle::operators::DataLayout data_layout =
(data_layout_str != "NHWC" ? DataLayout::kNCHW : DataLayout::kNHWC);
const paddle::platform::DataLayout data_layout =
(data_layout_str != "NHWC" ? platform::DataLayout::kNCHW
: platform::DataLayout::kNHWC);
// if channel_last, transpose to channel_first
Tensor input_transpose;
std::vector<int> input_vec = framework::vectorize<int>(input->dims());
std::vector<int> output_vec = framework::vectorize<int>(output->dims());
if (data_layout == DataLayout::kNHWC) {
if (data_layout == platform::DataLayout::kNHWC) {
if (strides.size() == 2U) {
std::vector<int> axis = {0, 3, 1, 2};
for (size_t i = 0; i < axis.size(); ++i) {
......@@ -195,7 +195,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
}
T* transformed_output_data = transformed_output.data<T>();
DataLayout layout;
platform::DataLayout layout;
int iwo_groups = groups;
int c_groups = 1;
......@@ -206,9 +206,9 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
#endif
if (strides.size() == 2U) {
layout = DataLayout::kNCHW;
layout = platform::DataLayout::kNCHW;
} else {
layout = DataLayout::kNCDHW;
layout = platform::DataLayout::kNCDHW;
}
size_t workspace_size = 0;
......@@ -269,7 +269,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
ctx, &transformed_output, output, starts, ends, axes);
}
if (data_layout == DataLayout::kNHWC) {
if (data_layout == platform::DataLayout::kNHWC) {
Tensor output_transpose;
Tensor output_nchw;
output_nchw.ShareDataWith(*output);
......@@ -309,8 +309,9 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
const std::string data_layout_str = ctx.Attr<std::string>("data_format");
const paddle::operators::DataLayout data_layout =
(data_layout_str != "NHWC" ? DataLayout::kNCHW : DataLayout::kNHWC);
const paddle::platform::DataLayout data_layout =
(data_layout_str != "NHWC" ? platform::DataLayout::kNCHW
: platform::DataLayout::kNHWC);
// if channel_last, transpose to channel_first
Tensor input_transpose;
......@@ -318,7 +319,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
std::vector<int> input_vec = framework::vectorize<int>(input->dims());
std::vector<int> output_vec =
framework::vectorize<int>(output_grad->dims());
if (data_layout == DataLayout::kNHWC) {
if (data_layout == platform::DataLayout::kNHWC) {
if (strides.size() == 2U) {
std::vector<int> axis = {0, 3, 1, 2};
for (size_t i = 0; i < axis.size(); ++i) {
......@@ -416,12 +417,12 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
output_vec = framework::vectorize<int>(transformed_output_grad.dims());
// ------------------- cudnn descriptors ---------------------
DataLayout layout;
platform::DataLayout layout;
if (strides.size() == 2U) {
layout = DataLayout::kNCHW;
layout = platform::DataLayout::kNCHW;
} else {
layout = DataLayout::kNCDHW;
layout = platform::DataLayout::kNCDHW;
}
int iwo_groups = groups;
......@@ -515,7 +516,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
workspace_handle.RunFunc(cudnn_func, workspace_size);
}
if (data_layout == DataLayout::kNHWC) {
if (data_layout == platform::DataLayout::kNHWC) {
Tensor input_grad_transpose;
Tensor input_grad_nchw;
input_grad_nchw.ShareDataWith(*input_grad);
......@@ -849,7 +850,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
auto layout = GetCudnnTensorFormat(DataLayout::kNCHW);
auto layout = GetCudnnTensorFormat(platform::DataLayout::kNCHW);
// ddo = conv(ddI, W) + conv(I, ddW)
size_t workspace_size = 0;
......@@ -916,12 +917,12 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
}
int i_n, i_c, i_d, i_h, i_w;
GetNCDHW(transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h,
&i_w);
GetNCDHW(transformed_X.dims(), platform::DataLayout::kNCHW, &i_n, &i_c,
&i_d, &i_h, &i_w);
int o_n, o_c, o_d, o_h, o_w;
GetNCDHW(transformed_dO.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, &o_h,
&o_w);
GetNCDHW(transformed_dO.dims(), platform::DataLayout::kNCHW, &o_n, &o_c,
&o_d, &o_h, &o_w);
int group_offset_in =
transformed_X.numel() / transformed_X.dims()[0] / groups;
......
......@@ -23,7 +23,6 @@ namespace operators {
#define FULL_MASK 0xffffffff
using framework::Tensor;
using DataLayout = framework::DataLayout;
template <typename T>
__forceinline__ __device__ T warpReduceSum(T val) {
......
......@@ -26,10 +26,6 @@ using Tensor = framework::Tensor;
using complex64 = platform::complex64;
using complex128 = platform::complex128;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T, typename R>
struct P {
void operator()(T a, R b);
......@@ -85,11 +81,11 @@ struct DotGradFunction<DeviceContext, T, math::EnableComplex<T>> {
dy.device(dev) = dy * dout.broadcast(size);
}
} else {
auto dout = EigenMatrix<T>::From(*tensor_dout);
auto dout = framework::EigenMatrix<T>::From(*tensor_dout);
if (tensor_dx) {
tensor_dx->mutable_data<T>(ctx.GetPlace());
auto y = EigenMatrix<T>::From(*tensor_y);
auto y = framework::EigenMatrix<T>::From(*tensor_y);
auto& dev_raw = ctx.template device_context<DeviceContext>();
auto& dev = *dev_raw.eigen_device();
Eigen::DSizes<int, 2> size(1, tensor_dx->dims()[1]);
......@@ -99,14 +95,14 @@ struct DotGradFunction<DeviceContext, T, math::EnableComplex<T>> {
math::ConjFunctor<T> functor(tensor_y->data<T>(), tensor_y->numel(),
tensor_dx->data<T>());
for_range(functor);
auto dx = EigenMatrix<T>::From(*tensor_dx);
auto dx = framework::EigenMatrix<T>::From(*tensor_dx);
dx.device(dev) = dx * dout.broadcast(size);
}
if (tensor_dy) {
tensor_dy->mutable_data<T>(ctx.GetPlace());
auto x = EigenMatrix<T>::From(*tensor_x);
auto x = framework::EigenMatrix<T>::From(*tensor_x);
auto& dev_raw = ctx.template device_context<DeviceContext>();
auto& dev = *dev_raw.eigen_device();
Eigen::DSizes<int, 2> size(1, tensor_dy->dims()[1]);
......@@ -117,7 +113,7 @@ struct DotGradFunction<DeviceContext, T, math::EnableComplex<T>> {
tensor_dy->data<T>());
for_range(functor);
auto dy = EigenMatrix<T>::From(*tensor_dy);
auto dy = framework::EigenMatrix<T>::From(*tensor_dy);
dy.device(dev) = dy * dout.broadcast(size);
}
......@@ -186,12 +182,12 @@ struct DotGradFunction<DeviceContext, T, math::DisableComplex<T>> {
dy.device(dev) = x * dout.broadcast(size);
}
} else {
auto dout = EigenMatrix<T>::From(*tensor_dout);
auto dout = framework::EigenMatrix<T>::From(*tensor_dout);
if (tensor_dx) {
tensor_dx->mutable_data<T>(ctx.GetPlace());
auto y = EigenMatrix<T>::From(*tensor_y);
auto dx = EigenMatrix<T>::From(*tensor_dx);
auto y = framework::EigenMatrix<T>::From(*tensor_y);
auto dx = framework::EigenMatrix<T>::From(*tensor_dx);
auto& dev =
*ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 2> size(1, tensor_dx->dims()[1]);
......@@ -200,8 +196,8 @@ struct DotGradFunction<DeviceContext, T, math::DisableComplex<T>> {
if (tensor_dy) {
tensor_dy->mutable_data<T>(ctx.GetPlace());
auto x = EigenMatrix<T>::From(*tensor_x);
auto dy = EigenMatrix<T>::From(*tensor_dy);
auto x = framework::EigenMatrix<T>::From(*tensor_x);
auto dy = framework::EigenMatrix<T>::From(*tensor_dy);
auto& dev =
*ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 2> size(1, tensor_dy->dims()[1]);
......@@ -262,9 +258,9 @@ class DotKernel : public framework::OpKernel<T> {
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
out.device(dev) = (x * y).sum();
} else {
auto out = EigenMatrix<T>::From(*tensor_out);
auto x = EigenMatrix<T>::From(*tensor_x);
auto y = EigenMatrix<T>::From(*tensor_y);
auto out = framework::EigenMatrix<T>::From(*tensor_out);
auto x = framework::EigenMatrix<T>::From(*tensor_x);
auto y = framework::EigenMatrix<T>::From(*tensor_y);
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
out.device(dev) = (x * y).sum(Eigen::DSizes<int, 1>(1));
......
......@@ -50,16 +50,6 @@
namespace paddle {
namespace operators {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename DeviceContext, typename T>
class MeshgridKernel : public framework::OpKernel<T> {
public:
......@@ -120,9 +110,9 @@ class MeshgridKernel : public framework::OpKernel<T> {
bcast_dims[i] = 1;
outs[i]->Resize(out_dims);
auto x = EigenTensor<T, Rank>::From(reshape_ins_tensor);
auto x = framework::EigenTensor<T, Rank>::From(reshape_ins_tensor);
outs[i]->mutable_data<T>(context.GetPlace());
auto y = EigenTensor<T, Rank>::From(*outs[i]);
auto y = framework::EigenTensor<T, Rank>::From(*outs[i]);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
y.device(place) = x.broadcast(bcast_dims);
......@@ -159,8 +149,8 @@ class MeshgridGradKernel : public framework::OpKernel<T> {
for (int i = 0; i < n; i++) {
outs[i]->mutable_data<T>(context.GetPlace());
auto out_grad_tmp = EigenVector<T>::Flatten(*out_grad[i]);
auto in_grad = EigenVector<T>::Flatten(*outs[i]);
auto out_grad_tmp = framework::EigenVector<T>::Flatten(*out_grad[i]);
auto in_grad = framework::EigenVector<T>::Flatten(*outs[i]);
std::vector<int> reduce_dims_vec;
std::vector<int> reshape_dims_vec;
......
......@@ -37,7 +37,7 @@ class RankLossKernel : public framework::OpKernel<T> {
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
out.device(dev) =
(1. + (left - right).exp()).log() - label * (left - right);
(1.0f + (left - right).exp()).log() - label * (left - right);
}
};
......@@ -65,14 +65,15 @@ class RankLossGradKernel : public framework::OpKernel<T> {
if (d_left_t) {
d_left_t->mutable_data<T>(ctx.GetPlace());
auto d_left = framework::EigenVector<T>::Flatten(*d_left_t);
d_left.device(dev) = d_out * (1. / (1. + (right - left).exp()) - label);
d_left.device(dev) =
d_out * (1.0f / (1.0f + (right - left).exp()) - label);
}
// compute d_right
if (d_right_t) {
d_right_t->mutable_data<T>(ctx.GetPlace());
auto d_right = framework::EigenVector<T>::Flatten(*d_right_t);
d_right.device(dev) =
-d_out * (1.0 / (1. + (right - left).exp()) - label);
-d_out * (1.0f / (1.0f + (right - left).exp()) - label);
}
}
};
......
......@@ -23,9 +23,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
......@@ -95,12 +92,12 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n});
out_grad_2d.ShareDataWith(*out_grad).Resize({n, d / axis_dim});
auto out_grad_mat = EigenMatrix<T>::From(out_grad_2d);
auto logit_grad_mat = EigenMatrix<T>::From(logit_grad_2d);
auto out_grad_mat = framework::EigenMatrix<T>::From(out_grad_2d);
auto logit_grad_mat = framework::EigenMatrix<T>::From(logit_grad_2d);
auto& place = *context.template device_context<platform::CPUDeviceContext>()
.eigen_device();
if (soft_label) {
auto lbl_mat = EigenMatrix<T>::From(labels_2d);
auto lbl_mat = framework::EigenMatrix<T>::From(labels_2d);
logit_grad_mat.device(place) =
out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, axis_dim)) *
(logit_grad_mat - lbl_mat);
......
......@@ -20,12 +20,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
class SquaredL2DistanceKernel : public framework::OpKernel<T> {
......@@ -41,15 +35,15 @@ class SquaredL2DistanceKernel : public framework::OpKernel<T> {
int cols = in0->numel() / in0_dims[0];
// reduce dimensions except the first
auto x =
EigenMatrix<T>::From(*in0, framework::make_ddim({in0_dims[0], cols}));
auto y =
EigenMatrix<T>::From(*in1, framework::make_ddim({in1_dims[0], cols}));
auto x = framework::EigenMatrix<T>::From(
*in0, framework::make_ddim({in0_dims[0], cols}));
auto y = framework::EigenMatrix<T>::From(
*in1, framework::make_ddim({in1_dims[0], cols}));
out0->mutable_data<T>(context.GetPlace());
out1->mutable_data<T>(context.GetPlace());
auto sub_result = EigenMatrix<T>::From(*out0);
auto z = EigenVector<T>::Flatten(*out1);
auto sub_result = framework::EigenMatrix<T>::From(*out0);
auto z = framework::EigenVector<T>::Flatten(*out1);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
......@@ -88,8 +82,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
"in scope for operator 'squared_l2_distance_grad'.",
framework::GradVarName("Y")));
auto sub_result = EigenMatrix<T>::From(*in0);
auto out_grad = EigenMatrix<T>::From(*in1);
auto sub_result = framework::EigenMatrix<T>::From(*in0);
auto out_grad = framework::EigenMatrix<T>::From(*in1);
auto x_dims = x_g->dims();
auto y_dims = y_g->dims();
......@@ -106,8 +100,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
x_g->mutable_data<T>(context.GetPlace());
// eigen matrix
auto x_grad =
EigenMatrix<T>::From(*x_g, framework::make_ddim({x_dims[0], cols}));
auto x_grad = framework::EigenMatrix<T>::From(
*x_g, framework::make_ddim({x_dims[0], cols}));
// dimensions are same with subResult
x_grad.device(eigen_place) = grad_mat;
......@@ -121,12 +115,12 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
sub_result.dimensions()[0], y_dims[0]));
if (sub_result.dimensions()[0] == y_dims[0]) {
auto y_grad =
EigenMatrix<T>::From(*y_g, framework::make_ddim({y_dims[0], cols}));
auto y_grad = framework::EigenMatrix<T>::From(
*y_g, framework::make_ddim({y_dims[0], cols}));
y_grad.device(eigen_place) = -1 * grad_mat;
} else {
auto col_sum_res = -1 * (grad_mat.sum(Eigen::array<int, 1>({{0}})));
auto y_grad = EigenVector<T>::Flatten(*y_g);
auto y_grad = framework::EigenVector<T>::Flatten(*y_g);
y_grad.device(eigen_place) = col_sum_res;
}
}
......
......@@ -307,32 +307,36 @@ register_unity_group(cc
spp_op.cu.cc
squeeze_op.cu.cc
unbind_op.cu.cc
unique_op.cu
unpool_op.cu.cc
unsqueeze_op.cu.cc)
register_unity_group(cu
addmm_op.cu
affine_channel_op.cu
allclose_op.cu
argsort_op.cu
assign_value_op.cu
bce_loss_op.cu
bernoulli_op.cu
bilateral_slice_op.cu)
bilateral_slice_op.cu
batch_norm_op.cu)
register_unity_group(cu
bilinear_tensor_product_op.cu
bmm_op.cu
cast_op.cu
cholesky_op.cu
clip_by_norm_op.cu
clip_op.cu)
clip_op.cu
conv_cudnn_op.cu
affine_grid_op.cu)
register_unity_group(cu
center_loss_op.cu
conv_op.cu
conv_transpose_cudnn_op.cu
conv_transpose_op.cu
cos_sim_op.cu
crop_op.cu)
crop_op.cu
average_accumulates_op.cu
conj_op.cu
correlation_op.cu)
register_unity_group(cu
cross_entropy_op.cu
cross_op.cu
......@@ -349,7 +353,9 @@ register_unity_group(cu
diag_op.cu
diag_v2_op.cu
edit_distance_op.cu
erf_op.cu)
erf_op.cu
meshgrid_op.cu
imag_op.cu)
register_unity_group(cu
expand_v2_op.cu
fake_dequantize_op.cu
......@@ -377,10 +383,8 @@ register_unity_group(cu
inplace_abn_op.cu
interpolate_v2_op.cu
isfinite_op.cu
kron_op.cu
l1_norm_op.cu
label_smooth_op.cu
layer_norm_op.cu
linspace_op.cu
load_combine_op.cu
load_op.cu)
......@@ -388,20 +392,30 @@ register_unity_group(cu
lod_reset_op.cu
log_softmax_op.cu
lrn_op.cu
lstm_unit_op.cu)
lstm_unit_op.cu
dot_op.cu
psroi_pool_op.cu
rank_loss_op.cu
real_op.cu)
register_unity_group(cu
log_loss_op.cu
lookup_table_v2_op.cu
margin_rank_loss_op.cu
masked_select_op.cu
merge_selected_rows_op.cu)
merge_selected_rows_op.cu
lstmp_op.cu
shuffle_channel_op.cu
softmax_cudnn_op.cu
squared_l2_distance_op.cu)
register_unity_group(cu
conv_shift_op.cu
dequantize_log_op.cu
dropout_op.cu
fake_quantize_op.cu
gelu_op.cu
lookup_table_op.cu)
lookup_table_op.cu
sigmoid_cross_entropy_with_logits_op.cu
softmax_with_cross_entropy_op.cu)
register_unity_group(cu
mean_iou_op.cu
mean_op.cu
......@@ -430,7 +444,10 @@ register_unity_group(cu
random_crop_op.cu
randperm_op.cu
range_op.cu
reverse_op.cu)
reverse_op.cu
partial_concat_op.cu
kldiv_loss_op.cu
instance_norm_op.cu)
register_unity_group(cu
roi_align_op.cu
roll_op.cu
......@@ -457,40 +474,42 @@ register_unity_group(cu
split_op.cu
split_selected_rows_op.cu
squared_l2_norm_op.cu
stack_op.cu
strided_slice_op.cu
sum_op.cu
temporal_shift_op.cu)
temporal_shift_op.cu
arg_max_op.cu)
register_unity_group(cu
row_conv_op.cu
tile_op.cu
trace_op.cu
transpose_op.cu
tree_conv_op.cu
tril_triu_op.cu
truncated_gaussian_random_op.cu
unfold_op.cu)
unfold_op.cu
arg_min_op.cu
crop_tensor_op.cu)
register_unity_group(cu
smooth_l1_loss_op.cu
uniform_random_op.cu
unique_op.cu
unstack_op.cu
where_index_op.cu
where_op.cu)
where_op.cu
layer_norm_op.cu)
register_unity_group(cu
expand_as_op.cu
stack_op.cu)
# The following single-file groups make better use of `/MP`, MSVC's
# parallel compilation option, when compiling with Unity Build.
register_unity_group(cu activation_op.cu)
register_unity_group(cu arg_max_op.cu)
register_unity_group(cu arg_min_op.cu)
register_unity_group(cu batch_norm_op.cu)
register_unity_group(cu crop_tensor_op.cu)
register_unity_group(cu dist_op.cu)
register_unity_group(cu expand_as_op.cu)
register_unity_group(cu expand_as_v2_op.cu)
register_unity_group(cu gru_unit_op.cu)
register_unity_group(cu instance_norm_op.cu)
register_unity_group(cu kldiv_loss_op.cu)
register_unity_group(cu partial_concat_op.cu)
register_unity_group(cu softmax_with_cross_entropy_op.cu)
register_unity_group(cu squared_l2_distance_op.cu)
register_unity_group(cu top_k_op.cu)
register_unity_group(cu argsort_op.cu)
register_unity_group(cu kron_op.cu)
register_unity_group(cu unique_op.cu)
register_unity_group(cu tile_op.cu)
register_unity_group(cu trace_op.cu)
register_unity_group(cu transpose_op.cu)
register_unity_group(cu strided_slice_op.cu)
register_unity_group(cu expand_op.cu)
register_unity_group(cu matmul_v2_op.cu)
register_unity_group(cu top_k_v2_op.cu)
register_unity_group(cu set_value_op.cu)
......@@ -262,7 +262,7 @@ echo ========================================
echo Step 2. Buile Paddle ...
echo ========================================
for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*9/10
for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*2/3
set build_times=1
:build_tp
echo Build third_party the %build_times% time:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册