diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h
index 2d1a89f9ae471017d30f3278955498de71c053aa..b727da4ae0cd3c2c97be08ebcaed0128ce9bcdf5 100644
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
@@ -24,13 +24,6 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
@@ -73,17 +66,17 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     int batch_size = input->dims()[0];
     int frame_size = hidden_prev->dims()[1];
 
-    auto x = EigenMatrix<T>::From(*input);
-    auto h_p = EigenMatrix<T>::From(*hidden_prev);
-    auto g = EigenMatrix<T>::From(*gate);
-    auto r_h_p = EigenMatrix<T>::From(*reset_hidden_prev);
-    auto h = EigenMatrix<T>::From(*hidden);
+    auto x = framework::EigenMatrix<T>::From(*input);
+    auto h_p = framework::EigenMatrix<T>::From(*hidden_prev);
+    auto g = framework::EigenMatrix<T>::From(*gate);
+    auto r_h_p = framework::EigenMatrix<T>::From(*reset_hidden_prev);
+    auto h = framework::EigenMatrix<T>::From(*hidden);
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
 
     // calculate unactivated gate outputs
     if (bias) {
-      auto b = EigenMatrix<T>::From(*bias);
+      auto b = framework::EigenMatrix<T>::From(*bias);
       g.device(place) =
           x + b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
                   .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
@@ -177,11 +170,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
         reset_hidden_prev->dims(), context.GetPlace());
 
-    auto h_p = EigenMatrix<T>::From(*hidden_prev);
-    auto g = EigenMatrix<T>::From(*gate);
-    auto d_h = EigenMatrix<T>::From(*hidden_grad);
-    auto d_g = EigenMatrix<T>::From(gate_grad);
-    auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
+    auto h_p = framework::EigenMatrix<T>::From(*hidden_prev);
+    auto g = framework::EigenMatrix<T>::From(*gate);
+    auto d_h = framework::EigenMatrix<T>::From(*hidden_grad);
+    auto d_g = framework::EigenMatrix<T>::From(gate_grad);
+    auto d_r_h_p = framework::EigenMatrix<T>::From(reset_hidden_prev_grad);
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
 
@@ -237,7 +230,7 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     if (hidden_prev_grad) {
       T* hidden_prev_grad_data =
           hidden_prev_grad->mutable_data<T>(context.GetPlace());
-      auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
+      auto d_h_p = framework::EigenMatrix<T>::From(*hidden_prev_grad);
       if (context.Attr<bool>("origin_mode")) {
         d_h_p.device(place) = d_r_h_p * r + d_h * u;
       } else {
@@ -250,13 +243,13 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     // backward for input
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
-      auto d_x = EigenMatrix<T>::From(*input_grad);
+      auto d_x = framework::EigenMatrix<T>::From(*input_grad);
       d_x.device(place) = d_g;
     }
     // backward for bias
     if (bias_grad) {
       bias_grad->mutable_data<T>(context.GetPlace());
-      auto d_b = EigenVector<T>::Flatten(*bias_grad);
+      auto d_b = framework::EigenVector<T>::Flatten(*bias_grad);
       d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
     }
   }
diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h
index 857ecda303c2607b1b6fb9a5d2ec132b335d6c29..0bc53d7dd7b3b1f3cb8545003efadecb252cf74c 100644
--- a/paddle/fluid/operators/kldiv_loss_op.h
+++ b/paddle/fluid/operators/kldiv_loss_op.h
@@ -19,10 +19,6 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
 using Array1 = Eigen::DSizes<int64_t, 1>;
 
 template <typename T>
@@ -64,9 +60,9 @@ class KLDivLossKernel : public framework::OpKernel<T> {
     const int n = input->dims()[0];
 
     loss->mutable_data<T>(ctx.GetPlace());
-    auto input_t = EigenVector<T>::Flatten(*input);
-    auto target_t = EigenVector<T>::Flatten(*target);
-    auto loss_t = EigenVector<T>::Flatten(*loss);
+    auto input_t = framework::EigenVector<T>::Flatten(*input);
+    auto target_t = framework::EigenVector<T>::Flatten(*target);
+    auto loss_t = framework::EigenVector<T>::Flatten(*loss);
     auto output = target_t.binaryExpr(input_t, KLDivLossForward<T>());
     if ("none" == reduction) {
       loss_t.device(place) = output;
@@ -101,10 +97,10 @@ class KLDivLossGradKernel : public framework::OpKernel<T> {
 
     input_grad->mutable_data<T>(ctx.GetPlace());
 
-    auto target_t = EigenVector<T>::Flatten(*target);
+    auto target_t = framework::EigenVector<T>::Flatten(*target);
 
-    auto input_grad_t = EigenVector<T>::Flatten(*input_grad);
-    auto loss_grad_t = EigenVector<T>::Flatten(*loss_grad);
+    auto input_grad_t = framework::EigenVector<T>::Flatten(*input_grad);
+    auto loss_grad_t = framework::EigenVector<T>::Flatten(*loss_grad);
 
     auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand));
     auto grad_t = target_t * loss_grad_expand;
diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h
index d4f3fc5d7a622f27c99c4a1e343f0a7390b6839c..eacc5f467d22977202601cfeac6b968bc7065370 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -47,9 +47,6 @@ struct ScalarMul {
 using framework::LoDTensor;
 using framework::LoD;
 using framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T>
 class LinearChainCRFOpKernel : public framework::OpKernel<T> {
@@ -127,16 +124,16 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
                                  platform::CPUPlace());
     auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
                        .eigen_device();
-    auto x = EigenMatrix<T>::From(emission_weights_tmp);
-    auto x_row_max = EigenMatrix<T>::From(emission_row_max);
+    auto x = framework::EigenMatrix<T>::From(emission_weights_tmp);
+    auto x_row_max = framework::EigenMatrix<T>::From(emission_row_max);
     x_row_max.device(place) =
         x.maximum(Eigen::DSizes<int, 1>(1))
             .reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
-    auto x_exps = EigenMatrix<T>::From(emission_exps_tmp);
+    auto x_exps = framework::EigenMatrix<T>::From(emission_exps_tmp);
     x_exps.device(place) =
         (x - x_row_max.broadcast(Eigen::DSizes<int, 2>(1, tag_num))).exp();
-    auto w = EigenMatrix<T>::From(*transition_weights);
-    auto w_exps = EigenMatrix<T>::From(*transition_exps);
+    auto w = framework::EigenMatrix<T>::From(*transition_weights);
+    auto w_exps = framework::EigenMatrix<T>::From(*transition_exps);
     w_exps.device(place) = w.exp();
     T* log_likelihood = ll->data<T>();
     for (int64_t i = 0; i < seq_num; ++i) {
@@ -355,9 +352,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
       NormalizeL1<T>(beta_value + k * tag_num, tag_num);
     }
 
-    auto x_grad_mat = EigenMatrix<T>::From(*emission_grad);
-    auto alpha_mat = EigenMatrix<T>::From(alpha);
-    auto beta_mat = EigenMatrix<T>::From(*beta);
+    auto x_grad_mat = framework::EigenMatrix<T>::From(*emission_grad);
+    auto alpha_mat = framework::EigenMatrix<T>::From(alpha);
+    auto beta_mat = framework::EigenMatrix<T>::From(*beta);
 
     auto* place = ctx.eigen_device();
     auto prob = alpha_mat * beta_mat;
@@ -381,13 +378,13 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
             x_grad_mat(/*to end state*/ seq_length - 1, k);
       }
 
-      auto x_exps_mat = EigenMatrix<T>::From(emission_exps);
+      auto x_exps_mat = framework::EigenMatrix<T>::From(emission_exps);
 
       // TODO(caoying): Fix this to avoid using this local variable if we can
       // profile the training process.
       Tensor tmp;
       tmp.mutable_data<T>(beta->dims(), platform::CPUPlace());
-      auto tmp_mat = EigenMatrix<T>::From(tmp);
+      auto tmp_mat = framework::EigenMatrix<T>::From(tmp);
       auto prob = beta_mat * x_exps_mat;
       auto row_sum = prob.sum(Eigen::DSizes<int, 1>(1))
                          .reshape(Eigen::DSizes<int, 2>(seq_length, 1))
diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h
index 0fd5f2ac01df3f96e537978603f260b212e7ba56..41df6f488f1925273ac85c560b0639d162b3ac73 100644
--- a/paddle/fluid/operators/top_k_function_cuda.h
+++ b/paddle/fluid/operators/top_k_function_cuda.h
@@ -500,13 +500,14 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
     // copy sliced data to output.
     const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
     const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
-    auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
-    auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
+    auto e_indices =
+        framework::EigenMatrix<int64_t>::From(*indices_tensor, dim);
+    auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(temp_indices);
 
     std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
     auto dim = framework::make_ddim(odims);
-    auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
-    auto e_tmp_values = EigenMatrix<T>::From(temp_values);
+    auto e_values = framework::EigenMatrix<T>::From(*out_tensor, dim);
+    auto e_tmp_values = framework::EigenMatrix<T>::From(temp_values);
 
     e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
     e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index 1ba01d93acc3efa88e73947594fca36f0b22bf74..f279b9529cc09810a1fac7fc5874e1fc8b9870af 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -25,14 +25,6 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
 template <typename DeviceContext, typename T>
 class TopkKernel : public framework::OpKernel<T> {
  public:
@@ -70,12 +62,13 @@ class TopkKernel : public framework::OpKernel<T> {
       vec.reserve(col);
       // 1D vector
       if (inputdims.size() == 1) {
-        auto eg_input = EigenVector<T>::Flatten(*input);
+        auto eg_input = framework::EigenVector<T>::Flatten(*input);
         for (size_t j = 0; j < col; j++) {
           vec.push_back(std::pair<T, size_t>(eg_input(j), j));
         }
       } else {
-        auto eg_input = EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
+        auto eg_input =
+            framework::EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
         for (size_t j = 0; j < col; j++) {
           vec.push_back(std::pair<T, size_t>(eg_input(i, j), j));
         }
diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h
index 89b5d36b1b3f915e8719c8791e8c12c2e0348f26..c836c993b2910b55d4dcc2c4b7b250855216f2dd 100644
--- a/paddle/fluid/operators/top_k_v2_op.h
+++ b/paddle/fluid/operators/top_k_v2_op.h
@@ -61,12 +61,12 @@ static void FullTopK(Type input_height, Type input_width, int input_dim,
     std::vector<std::pair<T, Type>> col_vec;
     col_vec.reserve(input_width);
     if (input_dim == 1) {
-      auto e_input = EigenVector<T>::Flatten(*input);
+      auto e_input = framework::EigenVector<T>::Flatten(*input);
       for (Type j = 0; j < input_width; ++j) {
         col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
       }
     } else {
-      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
       for (Type j = 0; j < input_width; ++j) {
         col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
       }
@@ -142,14 +142,15 @@ static void FullTopKAssign(const Type& input_height, const Type& input_width,
 #endif
   for (Type i = 0; i < input_height; ++i) {
     if (input_dim == 1) {
-      auto e_input = EigenVector<T>::Flatten(*input);
-      auto e_indices = EigenVector<Type>::Flatten(*indices);
+      auto e_input = framework::EigenVector<T>::Flatten(*input);
+      auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
       for (Type j = 0; j < k; ++j) {
         output_data[i * input_width + e_indices(j)] = e_input(j);
       }
     } else {
-      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
+      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_indices =
+          framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
       for (Type j = 0; j < k; ++j) {
         output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
       }
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index d21f6b2d69d84f4894fd74e260bb77ac47781d72..cd8b31d72e72adba6232b703e9d2513c90e46cdf 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -5,6 +5,7 @@
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
 register_unity_group(cc
+    abs_op.cc
     add_position_encoding_op.cc
     addmm_op.cc
     affine_channel_op.cc
@@ -33,7 +34,11 @@ register_unity_group(cc
     chunk_eval_op.cc
     clip_by_norm_op.cc
     clip_op.cc
-    coalesce_tensor_op.cc)
+    coalesce_tensor_op.cc
+    mkldnn/activation_mkldnn_op.cc
+    mkldnn/interpolate_mkldnn_op.cc
+    mkldnn/pool_mkldnn_op.cc
+    mkldnn/softmax_mkldnn_op.cc)
 register_unity_group(cc
     center_loss_op.cc
     mkldnn/concat_mkldnn_op.cc
@@ -42,7 +47,12 @@ register_unity_group(cc
     correlation_op.cc
     cos_sim_op.cc
     crf_decoding_op.cc
-    crop_op.cc)
+    crop_op.cc
+    ascend_trigger_op.cc
+    conj_op.cc
+    imag_op.cc
+    kldiv_loss_op.cc
+    memcpy_op.cc)
 register_unity_group(cc
     cross_entropy_op.cc
     cross_op.cc
@@ -69,7 +79,14 @@ register_unity_group(cc
     edit_distance_op.cc
     empty_op.cc
     enqueue_op.cc
-    erf_op.cc)
+    erf_op.cc
+    py_func_op.cc
+    real_op.cc
+    sync_batch_norm_op.cc
+    top_k_op.cc
+    conv_op.cc
+    conv_transpose_op.cc
+    gru_unit_op.cc)
 register_unity_group(cc
     expand_v2_op.cc
     fake_dequantize_op.cc
@@ -309,6 +326,29 @@ register_unity_group(cc
     unbind_op.cu.cc
     unpool_op.cu.cc
     unsqueeze_op.cu.cc)
+register_unity_group(cc
+    arg_max_op.cc
+    arg_min_op.cc
+    squared_l2_distance_op.cc)
+register_unity_group(cc
+    linear_chain_crf_op.cc
+    lstm_op.cc
+    partial_concat_op.cc
+    pyramid_hash_op.cc
+    recurrent_op.cc
+    run_program_op.cc
+    softmax_with_cross_entropy_op.cc
+    warpctc_op.cc)
+register_unity_group(cc
+    conv_op.cu.cc
+    lstm_op.cu.cc
+    rnn_op.cu.cc
+    split_op.cu.cc
+    activation_cudnn_op.cu.cc
+    assign_value_op.cu.cc
+    merge_selected_rows_op.cu.cc
+    run_program_op.cu.cc
+    warpctc_op.cu.cc)
 register_unity_group(cu
     addmm_op.cu
     affine_channel_op.cu