From a13f1d6930f4190877dbdd21e5b085b919f1f2a8 Mon Sep 17 00:00:00 2001
From: wuhuanzhou
Date: Mon, 1 Mar 2021 17:26:57 +0800
Subject: [PATCH] optimize unity build (#31119)

* optimize unity build, test=develop

* fix compilation error on Windows, test=develop

* fix compilation error, test=develop

* fix code style error, test=develop
---
 paddle/fluid/operators/gru_unit_op.h          | 35 ++++++--------
 paddle/fluid/operators/kldiv_loss_op.h        | 16 +++----
 paddle/fluid/operators/linear_chain_crf_op.h  | 23 ++++------
 paddle/fluid/operators/top_k_function_cuda.h  |  9 ++--
 paddle/fluid/operators/top_k_op.h             | 13 ++----
 paddle/fluid/operators/top_k_v2_op.h          | 13 +++---
 paddle/fluid/operators/unity_build_rule.cmake | 46 +++++++++++++++++--
 7 files changed, 88 insertions(+), 67 deletions(-)

diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h
index 2d1a89f9ae4..b727da4ae0c 100644
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
@@ -24,13 +24,6 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
 
@@ -73,17 +66,17 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     int batch_size = input->dims()[0];
     int frame_size = hidden_prev->dims()[1];
 
-    auto x = EigenMatrix<T>::From(*input);
-    auto h_p = EigenMatrix<T>::From(*hidden_prev);
-    auto g = EigenMatrix<T>::From(*gate);
-    auto r_h_p = EigenMatrix<T>::From(*reset_hidden_prev);
-    auto h = EigenMatrix<T>::From(*hidden);
+    auto x = framework::EigenMatrix<T>::From(*input);
+    auto h_p = framework::EigenMatrix<T>::From(*hidden_prev);
+    auto g = framework::EigenMatrix<T>::From(*gate);
+    auto r_h_p = framework::EigenMatrix<T>::From(*reset_hidden_prev);
+    auto h = framework::EigenMatrix<T>::From(*hidden);
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
 
     // calculate unactivated gate outputs
     if (bias) {
-      auto b = EigenMatrix<T>::From(*bias);
+      auto b = framework::EigenMatrix<T>::From(*bias);
       g.device(place) =
           x + b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
                   .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
@@ -177,11 +170,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
         reset_hidden_prev->dims(), context.GetPlace());
 
-    auto h_p = EigenMatrix<T>::From(*hidden_prev);
-    auto g = EigenMatrix<T>::From(*gate);
-    auto d_h = EigenMatrix<T>::From(*hidden_grad);
-    auto d_g = EigenMatrix<T>::From(gate_grad);
-    auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
+    auto h_p = framework::EigenMatrix<T>::From(*hidden_prev);
+    auto g = framework::EigenMatrix<T>::From(*gate);
+    auto d_h = framework::EigenMatrix<T>::From(*hidden_grad);
+    auto d_g = framework::EigenMatrix<T>::From(gate_grad);
+    auto d_r_h_p = framework::EigenMatrix<T>::From(reset_hidden_prev_grad);
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
 
@@ -237,7 +230,7 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     if (hidden_prev_grad) {
       T* hidden_prev_grad_data =
           hidden_prev_grad->mutable_data<T>(context.GetPlace());
-      auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
+      auto d_h_p = framework::EigenMatrix<T>::From(*hidden_prev_grad);
       if (context.Attr<bool>("origin_mode")) {
         d_h_p.device(place) = d_r_h_p * r + d_h * u;
       } else {
@@ -250,13 +243,13 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     // backward for input
     if (input_grad) {
      input_grad->mutable_data<T>(context.GetPlace());
-      auto d_x = EigenMatrix<T>::From(*input_grad);
+      auto d_x = framework::EigenMatrix<T>::From(*input_grad);
       d_x.device(place) = d_g;
     }
     // backward for bias
     if (bias_grad) {
       bias_grad->mutable_data<T>(context.GetPlace());
-      auto d_b = EigenVector<T>::Flatten(*bias_grad);
+      auto d_b = framework::EigenVector<T>::Flatten(*bias_grad);
       d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
     }
   }
diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h
index 857ecda303c..0bc53d7dd7b 100644
--- a/paddle/fluid/operators/kldiv_loss_op.h
+++ b/paddle/fluid/operators/kldiv_loss_op.h
@@ -19,10 +19,6 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
 using Array1 = Eigen::DSizes<int64_t, 1>;
 
 template <typename T>
@@ -64,9 +60,9 @@ class KLDivLossKernel : public framework::OpKernel<T> {
     const int n = input->dims()[0];
 
     loss->mutable_data<T>(ctx.GetPlace());
-    auto input_t = EigenVector<T>::Flatten(*input);
-    auto target_t = EigenVector<T>::Flatten(*target);
-    auto loss_t = EigenVector<T>::Flatten(*loss);
+    auto input_t = framework::EigenVector<T>::Flatten(*input);
+    auto target_t = framework::EigenVector<T>::Flatten(*target);
+    auto loss_t = framework::EigenVector<T>::Flatten(*loss);
     auto output = target_t.binaryExpr(input_t, KLDivLossForward<T>());
     if ("none" == reduction) {
       loss_t.device(place) = output;
@@ -101,10 +97,10 @@ class KLDivLossGradKernel : public framework::OpKernel<T> {
 
     input_grad->mutable_data<T>(ctx.GetPlace());
 
-    auto target_t = EigenVector<T>::Flatten(*target);
+    auto target_t = framework::EigenVector<T>::Flatten(*target);
 
-    auto input_grad_t = EigenVector<T>::Flatten(*input_grad);
-    auto loss_grad_t = EigenVector<T>::Flatten(*loss_grad);
+    auto input_grad_t = framework::EigenVector<T>::Flatten(*input_grad);
+    auto loss_grad_t = framework::EigenVector<T>::Flatten(*loss_grad);
 
     auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand));
     auto grad_t = target_t * loss_grad_expand;
diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h
index d4f3fc5d7a6..eacc5f467d2 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -47,9 +47,6 @@ struct ScalarMul {
 using framework::LoDTensor;
 using framework::LoD;
 using framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T>
 class LinearChainCRFOpKernel : public framework::OpKernel<T> {
@@ -127,16 +124,16 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
                                 platform::CPUPlace());
     auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
                        .eigen_device();
-    auto x = EigenMatrix<T>::From(emission_weights_tmp);
-    auto x_row_max = EigenMatrix<T>::From(emission_row_max);
+    auto x = framework::EigenMatrix<T>::From(emission_weights_tmp);
+    auto x_row_max = framework::EigenMatrix<T>::From(emission_row_max);
     x_row_max.device(place) =
         x.maximum(Eigen::DSizes<int, 1>(1))
             .reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
-    auto x_exps = EigenMatrix<T>::From(emission_exps_tmp);
+    auto x_exps = framework::EigenMatrix<T>::From(emission_exps_tmp);
     x_exps.device(place) =
         (x - x_row_max.broadcast(Eigen::DSizes<int, 2>(1, tag_num))).exp();
-    auto w = EigenMatrix<T>::From(*transition_weights);
-    auto w_exps = EigenMatrix<T>::From(*transition_exps);
+    auto w = framework::EigenMatrix<T>::From(*transition_weights);
+    auto w_exps = framework::EigenMatrix<T>::From(*transition_exps);
     w_exps.device(place) = w.exp();
     T* log_likelihood = ll->data<T>();
     for (int64_t i = 0; i < seq_num; ++i) {
@@ -355,9 +352,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
       NormalizeL1<T>(beta_value + k * tag_num, tag_num);
     }
 
-    auto x_grad_mat = EigenMatrix<T>::From(*emission_grad);
-    auto alpha_mat = EigenMatrix<T>::From(alpha);
-    auto beta_mat = EigenMatrix<T>::From(*beta);
+    auto x_grad_mat = framework::EigenMatrix<T>::From(*emission_grad);
+    auto alpha_mat = framework::EigenMatrix<T>::From(alpha);
+    auto beta_mat = framework::EigenMatrix<T>::From(*beta);
 
     auto* place = ctx.eigen_device();
     auto prob = alpha_mat * beta_mat;
@@ -381,13 +378,13 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
           x_grad_mat(/*to end state*/ seq_length - 1, k);
     }
 
-    auto x_exps_mat = EigenMatrix<T>::From(emission_exps);
+    auto x_exps_mat = framework::EigenMatrix<T>::From(emission_exps);
 
     // TODO(caoying): Fix this to avoid using this local variable if we can
     // profile the training process.
     Tensor tmp;
     tmp.mutable_data<T>(beta->dims(), platform::CPUPlace());
-    auto tmp_mat = EigenMatrix<T>::From(tmp);
+    auto tmp_mat = framework::EigenMatrix<T>::From(tmp);
     auto prob = beta_mat * x_exps_mat;
     auto row_sum = prob.sum(Eigen::DSizes<int, 1>(1))
                        .reshape(Eigen::DSizes<int, 2>(seq_length, 1))
diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h
index 0fd5f2ac01d..41df6f488f1 100644
--- a/paddle/fluid/operators/top_k_function_cuda.h
+++ b/paddle/fluid/operators/top_k_function_cuda.h
@@ -500,13 +500,14 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
     // copy sliced data to output.
     const Eigen::DSizes<int, 2> slice_indices{0, 0};
     const Eigen::DSizes<int, 2> slice_sizes{num_rows, k};
-    auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
-    auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
+    auto e_indices =
+        framework::EigenMatrix<int64_t>::From(*indices_tensor, dim);
+    auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(temp_indices);
 
     std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
     auto dim = framework::make_ddim(odims);
-    auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
-    auto e_tmp_values = EigenMatrix<T>::From(temp_values);
+    auto e_values = framework::EigenMatrix<T>::From(*out_tensor, dim);
+    auto e_tmp_values = framework::EigenMatrix<T>::From(temp_values);
 
     e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
     e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index 1ba01d93acc..f279b9529cc 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -25,14 +25,6 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
 template <typename DeviceContext, typename T>
 class TopkKernel : public framework::OpKernel<T> {
  public:
@@ -70,12 +62,13 @@ class TopkKernel : public framework::OpKernel<T> {
      vec.reserve(col);
      // 1D vector
      if (inputdims.size() == 1) {
-        auto eg_input = EigenVector<T>::Flatten(*input);
+        auto eg_input = framework::EigenVector<T>::Flatten(*input);
        for (size_t j = 0; j < col; j++) {
          vec.push_back(std::pair<T, size_t>(eg_input(j), j));
        }
      } else {
-        auto eg_input = EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
+        auto eg_input =
+            framework::EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
        for (size_t j = 0; j < col; j++) {
          vec.push_back(std::pair<T, size_t>(eg_input(i, j), j));
        }
diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h
index 89b5d36b1b3..c836c993b29 100644
--- a/paddle/fluid/operators/top_k_v2_op.h
+++ b/paddle/fluid/operators/top_k_v2_op.h
@@ -61,12 +61,12 @@ static void FullTopK(Type input_height, Type input_width, int input_dim,
     std::vector<std::pair<T, Type>> col_vec;
     col_vec.reserve(input_width);
     if (input_dim == 1) {
-      auto e_input = EigenVector<T>::Flatten(*input);
+      auto e_input = framework::EigenVector<T>::Flatten(*input);
       for (Type j = 0; j < input_width; ++j) {
         col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
       }
     } else {
-      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
       for (Type j = 0; j < input_width; ++j) {
         col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
       }
@@ -142,14 +142,15 @@ static void FullTopKAssign(const Type& input_height, const Type& input_width,
 #endif
   for (Type i = 0; i < input_height; ++i) {
     if (input_dim == 1) {
-      auto e_input = EigenVector<T>::Flatten(*input);
-      auto e_indices = EigenVector<Type>::Flatten(*indices);
+      auto e_input = framework::EigenVector<T>::Flatten(*input);
+      auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
       for (Type j = 0; j < k; ++j) {
         output_data[i * input_width + e_indices(j)] = e_input(j);
       }
     } else {
-      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
+      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_indices =
+          framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
       for (Type j = 0; j < k; ++j) {
         output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
       }
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index d21f6b2d69d..cd8b31d72e7 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -5,6 +5,7 @@
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
 register_unity_group(cc
+    abs_op.cc
     add_position_encoding_op.cc
     addmm_op.cc
     affine_channel_op.cc
@@ -33,7 +34,11 @@ register_unity_group(cc
     chunk_eval_op.cc
     clip_by_norm_op.cc
     clip_op.cc
-    coalesce_tensor_op.cc)
+    coalesce_tensor_op.cc
+    mkldnn/activation_mkldnn_op.cc
+    mkldnn/interpolate_mkldnn_op.cc
+    mkldnn/pool_mkldnn_op.cc
+    mkldnn/softmax_mkldnn_op.cc)
 register_unity_group(cc
     center_loss_op.cc
     mkldnn/concat_mkldnn_op.cc
@@ -42,7 +47,12 @@ register_unity_group(cc
     correlation_op.cc
     cos_sim_op.cc
     crf_decoding_op.cc
-    crop_op.cc)
+    crop_op.cc
+    ascend_trigger_op.cc
+    conj_op.cc
+    imag_op.cc
+    kldiv_loss_op.cc
+    memcpy_op.cc)
 register_unity_group(cc
     cross_entropy_op.cc
     cross_op.cc
@@ -69,7 +79,14 @@ register_unity_group(cc
     edit_distance_op.cc
     empty_op.cc
     enqueue_op.cc
-    erf_op.cc)
+    erf_op.cc
+    py_func_op.cc
+    real_op.cc
+    sync_batch_norm_op.cc
+    top_k_op.cc
+    conv_op.cc
+    conv_transpose_op.cc
+    gru_unit_op.cc)
 register_unity_group(cc
     expand_v2_op.cc
     fake_dequantize_op.cc
@@ -309,6 +326,29 @@ register_unity_group(cc
     unbind_op.cu.cc
     unpool_op.cu.cc
     unsqueeze_op.cu.cc)
+register_unity_group(cc
+    arg_max_op.cc
+    arg_min_op.cc
+    squared_l2_distance_op.cc)
+register_unity_group(cc
+    linear_chain_crf_op.cc
+    lstm_op.cc
+    partial_concat_op.cc
+    pyramid_hash_op.cc
+    recurrent_op.cc
+    run_program_op.cc
+    softmax_with_cross_entropy_op.cc
+    warpctc_op.cc)
+register_unity_group(cc
+    conv_op.cu.cc
+    lstm_op.cu.cc
+    rnn_op.cu.cc
+    split_op.cu.cc
+    activation_cudnn_op.cu.cc
+    assign_value_op.cu.cc
+    merge_selected_rows_op.cu.cc
+    run_program_op.cu.cc
+    warpctc_op.cu.cc)
 register_unity_group(cu
     addmm_op.cu
     affine_channel_op.cu
-- 
GitLab
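
Why the change helps the unity build: register_unity_group(...) concatenates the listed .cc files into a single jumbo source, so every header those files include ends up in one translation unit. Before this patch, several op headers (gru_unit_op.h, kldiv_loss_op.h, linear_chain_crf_op.h, top_k_op.h) each declared the alias templates EigenMatrix/EigenVector at namespace scope; an alias template cannot be redefined, even with an identical definition, so combining two such op sources produced the "redefined" errors that the comment at the top of unity_build_rule.cmake warns about. Dropping the per-header aliases and spelling out framework::EigenMatrix<T> / framework::EigenVector<T> directly makes the headers safe to mix, which is what allows the additional files to join the groups above.

The sketch below is illustrative only and is not code from the patch: framework::EigenMatrix here is a trivial stand-in, and FakeGruKernel/FakeTopkKernel are hypothetical names that mimic two op headers merged into one unity translation unit.

// unity_sketch.cc -- hypothetical example, not part of the patch.
#include <iostream>

namespace framework {
// Trivial stand-in for paddle::framework::EigenMatrix<T, ...>.
template <typename T>
struct EigenMatrix {
  static T From(T value) { return value; }
};
}  // namespace framework

// --- body of a first "op header", as seen after unity concatenation ---
namespace operators {
// Before the patch, each header re-declared here:
//   template <typename T> using EigenMatrix = framework::EigenMatrix<T>;
// A second such declaration later in the same translation unit is a
// redefinition error, which is exactly what unity builds triggered.
template <typename T>
T FakeGruKernel(T x) {
  return framework::EigenMatrix<T>::From(x);  // fully qualified, no local alias
}
}  // namespace operators

// --- body of a second "op header" merged into the same translation unit ---
namespace operators {
template <typename T>
T FakeTopkKernel(T x) {
  return framework::EigenMatrix<T>::From(x);  // repeating the pattern is safe
}
}  // namespace operators

int main() {
  // Both "kernels" coexist in one translation unit without redefinitions.
  std::cout << operators::FakeGruKernel(1) + operators::FakeTopkKernel(2)
            << std::endl;  // prints 3
  return 0;
}

Compiling this file as-is works; re-adding a second "template <typename T> using EigenMatrix = framework::EigenMatrix<T>;" declaration inside the second operators block reproduces the kind of redefinition error the patch eliminates.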