Unverified commit a13f1d69, authored by W wuhuanzhou, committed by GitHub

optimize unity build (#31119)

* optimize unity build, test=develop

* fix compilation error on Windows, test=develop

* fix compilation error, test=develop

* fix code style error, test=develop
Parent 8f4ac6b5
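
Background for the header changes below: a unity (jumbo) build concatenates several operator sources into one generated translation unit, so a namespace-scope alias template that two grouped headers both define (such as the local `EigenMatrix`/`EigenVector` aliases removed here) collides with a redefinition error once they are compiled together. The commit therefore drops the per-header aliases and qualifies the shared templates as `framework::EigenMatrix<T>` / `framework::EigenVector<T>` at each use site. A minimal sketch of the mechanism, with hypothetical file names that are not part of this commit:

    // unity_demo.cc -- hypothetical generated unity source; the build system
    // emits a file like this for each register_unity_group() rule below.
    #include "a_op.h"  // defines: template <...> using EigenMatrix = framework::EigenMatrix<...>;
    #include "b_op.h"  // defines the same alias again -> redefinition error in the combined TU

    // With the aliases removed from the headers, kernels spell the shared
    // template out directly, which is safe to repeat across grouped sources:
    //   auto x = framework::EigenMatrix<T>::From(*input);
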
@@ -24,13 +24,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
- template <typename T, int MajorType = Eigen::RowMajor,
- typename IndexType = Eigen::DenseIndex>
- using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
- template <typename T, int MajorType = Eigen::RowMajor,
- typename IndexType = Eigen::DenseIndex>
- using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
@@ -73,17 +66,17 @@ class GRUUnitKernel : public framework::OpKernel<T> {
int batch_size = input->dims()[0];
int frame_size = hidden_prev->dims()[1];
- auto x = EigenMatrix<T>::From(*input);
- auto h_p = EigenMatrix<T>::From(*hidden_prev);
- auto g = EigenMatrix<T>::From(*gate);
- auto r_h_p = EigenMatrix<T>::From(*reset_hidden_prev);
- auto h = EigenMatrix<T>::From(*hidden);
+ auto x = framework::EigenMatrix<T>::From(*input);
+ auto h_p = framework::EigenMatrix<T>::From(*hidden_prev);
+ auto g = framework::EigenMatrix<T>::From(*gate);
+ auto r_h_p = framework::EigenMatrix<T>::From(*reset_hidden_prev);
+ auto h = framework::EigenMatrix<T>::From(*hidden);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
// calculate unactivated gate outputs
if (bias) {
- auto b = EigenMatrix<T>::From(*bias);
+ auto b = framework::EigenMatrix<T>::From(*bias);
g.device(place) = x +
b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
.broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
@@ -177,11 +170,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
reset_hidden_prev->dims(), context.GetPlace());
- auto h_p = EigenMatrix<T>::From(*hidden_prev);
- auto g = EigenMatrix<T>::From(*gate);
- auto d_h = EigenMatrix<T>::From(*hidden_grad);
- auto d_g = EigenMatrix<T>::From(gate_grad);
- auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
+ auto h_p = framework::EigenMatrix<T>::From(*hidden_prev);
+ auto g = framework::EigenMatrix<T>::From(*gate);
+ auto d_h = framework::EigenMatrix<T>::From(*hidden_grad);
+ auto d_g = framework::EigenMatrix<T>::From(gate_grad);
+ auto d_r_h_p = framework::EigenMatrix<T>::From(reset_hidden_prev_grad);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
@@ -237,7 +230,7 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
if (hidden_prev_grad) {
T* hidden_prev_grad_data =
hidden_prev_grad->mutable_data<T>(context.GetPlace());
- auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
+ auto d_h_p = framework::EigenMatrix<T>::From(*hidden_prev_grad);
if (context.Attr<bool>("origin_mode")) {
d_h_p.device(place) = d_r_h_p * r + d_h * u;
} else {
@@ -250,13 +243,13 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
// backward for input
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
- auto d_x = EigenMatrix<T>::From(*input_grad);
+ auto d_x = framework::EigenMatrix<T>::From(*input_grad);
d_x.device(place) = d_g;
}
// backward for bias
if (bias_grad) {
bias_grad->mutable_data<T>(context.GetPlace());
- auto d_b = EigenVector<T>::Flatten(*bias_grad);
+ auto d_b = framework::EigenVector<T>::Flatten(*bias_grad);
d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
}
}
......
@@ -19,10 +19,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
- template <typename T, int MajorType = Eigen::RowMajor,
- typename IndexType = Eigen::DenseIndex>
- using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
using Array1 = Eigen::DSizes<int64_t, 1>;
template <typename T>
@@ -64,9 +60,9 @@ class KLDivLossKernel : public framework::OpKernel<T> {
const int n = input->dims()[0];
loss->mutable_data<T>(ctx.GetPlace());
- auto input_t = EigenVector<T>::Flatten(*input);
- auto target_t = EigenVector<T>::Flatten(*target);
- auto loss_t = EigenVector<T>::Flatten(*loss);
+ auto input_t = framework::EigenVector<T>::Flatten(*input);
+ auto target_t = framework::EigenVector<T>::Flatten(*target);
+ auto loss_t = framework::EigenVector<T>::Flatten(*loss);
auto output = target_t.binaryExpr(input_t, KLDivLossForward<T>());
if ("none" == reduction) {
loss_t.device(place) = output;
@@ -101,10 +97,10 @@ class KLDivLossGradKernel : public framework::OpKernel<T> {
input_grad->mutable_data<T>(ctx.GetPlace());
- auto target_t = EigenVector<T>::Flatten(*target);
+ auto target_t = framework::EigenVector<T>::Flatten(*target);
- auto input_grad_t = EigenVector<T>::Flatten(*input_grad);
- auto loss_grad_t = EigenVector<T>::Flatten(*loss_grad);
+ auto input_grad_t = framework::EigenVector<T>::Flatten(*input_grad);
+ auto loss_grad_t = framework::EigenVector<T>::Flatten(*loss_grad);
auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand));
auto grad_t = target_t * loss_grad_expand;
......
@@ -47,9 +47,6 @@ struct ScalarMul {
using framework::LoDTensor;
using framework::LoD;
using framework::Tensor;
- template <typename T, int MajorType = Eigen::RowMajor,
- typename IndexType = Eigen::DenseIndex>
- using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
class LinearChainCRFOpKernel : public framework::OpKernel<T> {
@@ -127,16 +124,16 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
platform::CPUPlace());
auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
- auto x = EigenMatrix<T>::From(emission_weights_tmp);
- auto x_row_max = EigenMatrix<T>::From(emission_row_max);
+ auto x = framework::EigenMatrix<T>::From(emission_weights_tmp);
+ auto x_row_max = framework::EigenMatrix<T>::From(emission_row_max);
x_row_max.device(place) =
x.maximum(Eigen::DSizes<int, 1>(1))
.reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
- auto x_exps = EigenMatrix<T>::From(emission_exps_tmp);
+ auto x_exps = framework::EigenMatrix<T>::From(emission_exps_tmp);
x_exps.device(place) =
(x - x_row_max.broadcast(Eigen::DSizes<int, 2>(1, tag_num))).exp();
- auto w = EigenMatrix<T>::From(*transition_weights);
- auto w_exps = EigenMatrix<T>::From(*transition_exps);
+ auto w = framework::EigenMatrix<T>::From(*transition_weights);
+ auto w_exps = framework::EigenMatrix<T>::From(*transition_exps);
w_exps.device(place) = w.exp();
T* log_likelihood = ll->data<T>();
for (int64_t i = 0; i < seq_num; ++i) {
@@ -355,9 +352,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
NormalizeL1<T>(beta_value + k * tag_num, tag_num);
}
- auto x_grad_mat = EigenMatrix<T>::From(*emission_grad);
- auto alpha_mat = EigenMatrix<T>::From(alpha);
- auto beta_mat = EigenMatrix<T>::From(*beta);
+ auto x_grad_mat = framework::EigenMatrix<T>::From(*emission_grad);
+ auto alpha_mat = framework::EigenMatrix<T>::From(alpha);
+ auto beta_mat = framework::EigenMatrix<T>::From(*beta);
auto* place = ctx.eigen_device();
auto prob = alpha_mat * beta_mat;
@@ -381,13 +378,13 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
x_grad_mat(/*to end state*/ seq_length - 1, k);
}
- auto x_exps_mat = EigenMatrix<T>::From(emission_exps);
+ auto x_exps_mat = framework::EigenMatrix<T>::From(emission_exps);
// TODO(caoying): Fix this to avoid using this local variable if we can
// profile the training process.
Tensor tmp;
tmp.mutable_data<T>(beta->dims(), platform::CPUPlace());
- auto tmp_mat = EigenMatrix<T>::From(tmp);
+ auto tmp_mat = framework::EigenMatrix<T>::From(tmp);
auto prob = beta_mat * x_exps_mat;
auto row_sum = prob.sum(Eigen::DSizes<int, 1>(1))
.reshape(Eigen::DSizes<int, 2>(seq_length, 1))
......
@@ -500,13 +500,14 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
// copy sliced data to output.
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
- auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
- auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
+ auto e_indices =
+ framework::EigenMatrix<int64_t>::From(*indices_tensor, dim);
+ auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(temp_indices);
std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
auto dim = framework::make_ddim(odims);
- auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
- auto e_tmp_values = EigenMatrix<T>::From(temp_values);
+ auto e_values = framework::EigenMatrix<T>::From(*out_tensor, dim);
+ auto e_tmp_values = framework::EigenMatrix<T>::From(temp_values);
e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
......
@@ -25,14 +25,6 @@ namespace operators {
using Tensor = framework::Tensor;
- template <typename T, int MajorType = Eigen::RowMajor,
- typename IndexType = Eigen::DenseIndex>
- using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
- template <typename T, int MajorType = Eigen::RowMajor,
- typename IndexType = Eigen::DenseIndex>
- using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
class TopkKernel : public framework::OpKernel<T> {
public:
@@ -70,12 +62,13 @@ class TopkKernel : public framework::OpKernel<T> {
vec.reserve(col);
// 1D vector
if (inputdims.size() == 1) {
- auto eg_input = EigenVector<T>::Flatten(*input);
+ auto eg_input = framework::EigenVector<T>::Flatten(*input);
for (size_t j = 0; j < col; j++) {
vec.push_back(std::pair<T, size_t>(eg_input(j), j));
}
} else {
- auto eg_input = EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
+ auto eg_input =
+ framework::EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
for (size_t j = 0; j < col; j++) {
vec.push_back(std::pair<T, size_t>(eg_input(i, j), j));
}
......
@@ -61,12 +61,12 @@ static void FullTopK(Type input_height, Type input_width, int input_dim,
std::vector<std::pair<T, Type>> col_vec;
col_vec.reserve(input_width);
if (input_dim == 1) {
- auto e_input = EigenVector<T>::Flatten(*input);
+ auto e_input = framework::EigenVector<T>::Flatten(*input);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
}
} else {
- auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+ auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
}
@@ -142,14 +142,15 @@ static void FullTopKAssign(const Type& input_height, const Type& input_width,
#endif
for (Type i = 0; i < input_height; ++i) {
if (input_dim == 1) {
- auto e_input = EigenVector<T>::Flatten(*input);
- auto e_indices = EigenVector<Type>::Flatten(*indices);
+ auto e_input = framework::EigenVector<T>::Flatten(*input);
+ auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
for (Type j = 0; j < k; ++j) {
output_data[i * input_width + e_indices(j)] = e_input(j);
}
} else {
- auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
- auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
+ auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
+ auto e_indices =
+ framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
for (Type j = 0; j < k; ++j) {
output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
}
......
@@ -5,6 +5,7 @@
# If there are some redefined error in compiling with the source file which
# in combination rule, you can remove the source file from the following rules.
register_unity_group(cc
+ abs_op.cc
add_position_encoding_op.cc
addmm_op.cc
affine_channel_op.cc
@@ -33,7 +34,11 @@ register_unity_group(cc
chunk_eval_op.cc
clip_by_norm_op.cc
clip_op.cc
- coalesce_tensor_op.cc)
+ coalesce_tensor_op.cc
+ mkldnn/activation_mkldnn_op.cc
+ mkldnn/interpolate_mkldnn_op.cc
+ mkldnn/pool_mkldnn_op.cc
+ mkldnn/softmax_mkldnn_op.cc)
register_unity_group(cc
center_loss_op.cc
mkldnn/concat_mkldnn_op.cc
@@ -42,7 +47,12 @@ register_unity_group(cc
correlation_op.cc
cos_sim_op.cc
crf_decoding_op.cc
- crop_op.cc)
+ crop_op.cc
+ ascend_trigger_op.cc
+ conj_op.cc
+ imag_op.cc
+ kldiv_loss_op.cc
+ memcpy_op.cc)
register_unity_group(cc
cross_entropy_op.cc
cross_op.cc
@@ -69,7 +79,14 @@ register_unity_group(cc
edit_distance_op.cc
empty_op.cc
enqueue_op.cc
- erf_op.cc)
+ erf_op.cc
+ py_func_op.cc
+ real_op.cc
+ sync_batch_norm_op.cc
+ top_k_op.cc
+ conv_op.cc
+ conv_transpose_op.cc
+ gru_unit_op.cc)
register_unity_group(cc
expand_v2_op.cc
fake_dequantize_op.cc
@@ -309,6 +326,29 @@ register_unity_group(cc
unbind_op.cu.cc
unpool_op.cu.cc
unsqueeze_op.cu.cc)
+ register_unity_group(cc
+ arg_max_op.cc
+ arg_min_op.cc
+ squared_l2_distance_op.cc)
+ register_unity_group(cc
+ linear_chain_crf_op.cc
+ lstm_op.cc
+ partial_concat_op.cc
+ pyramid_hash_op.cc
+ recurrent_op.cc
+ run_program_op.cc
+ softmax_with_cross_entropy_op.cc
+ warpctc_op.cc)
+ register_unity_group(cc
+ conv_op.cu.cc
+ lstm_op.cu.cc
+ rnn_op.cu.cc
+ split_op.cu.cc
+ activation_cudnn_op.cu.cc
+ assign_value_op.cu.cc
+ merge_selected_rows_op.cu.cc
+ run_program_op.cu.cc
+ warpctc_op.cu.cc)
register_unity_group(cu
addmm_op.cu
affine_channel_op.cu
......
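
For reference, each register_unity_group(cc ...) rule above maps to one generated unity source that #includes the listed .cc files so they build as a single translation unit. A hedged sketch of such a generated file follows; the real file name, location, and include paths are produced by the build scripts and are assumptions here:

    // Hypothetical generated unity source for the new group that starts with
    // linear_chain_crf_op.cc; paths assume the usual paddle/fluid/operators layout.
    #include "paddle/fluid/operators/linear_chain_crf_op.cc"
    #include "paddle/fluid/operators/lstm_op.cc"
    #include "paddle/fluid/operators/partial_concat_op.cc"
    #include "paddle/fluid/operators/pyramid_hash_op.cc"
    #include "paddle/fluid/operators/recurrent_op.cc"
    #include "paddle/fluid/operators/run_program_op.cc"
    #include "paddle/fluid/operators/softmax_with_cross_entropy_op.cc"
    #include "paddle/fluid/operators/warpctc_op.cc"

Grouping sources this way trades per-file incremental rebuilds for fewer, larger compiler invocations, which is the build-time optimization the commit title refers to.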