Unverified · Commit a13f1d69, authored by wuhuanzhou, committed by GitHub

optimize unity build (#31119)

* optimize unity build, test=develop

* fix compilation error on Windows, test=develop

* fix compilation error, test=develop

* fix code style error, test=develop
Parent 8f4ac6b5
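Background for the header edits in this diff: several op headers each declared their own EigenMatrix/EigenVector alias templates inside namespace operators. Under a unity ("jumbo") build, multiple .cc files are compiled as one translation unit, so two such headers end up declaring the same alias template in the same scope, and an alias template, unlike a typedef, may not be redefined. The commit therefore deletes the per-header aliases and qualifies each use as framework::EigenMatrix / framework::EigenVector. Below is a minimal standalone sketch of that language rule; the file name and identifiers are hypothetical, not from this PR.

// alias_redefinition_sketch.cc -- hypothetical standalone example, not part of this PR.
// Check with: g++ -std=c++14 -fsyntax-only alias_redefinition_sketch.cc
namespace operators {

// What each op header used to declare for itself:
template <typename T>
using LocalAlias = T*;

#if 0
// A second, textually identical declaration -- which is exactly what a unity
// build produces when two such headers land in one translation unit -- is
// ill-formed; compilers reject it with "redefinition of alias template".
template <typename T>
using LocalAlias = T*;
#endif

// A plain typedef, by contrast, tolerates an identical redeclaration:
typedef int FrameSize;
typedef int FrameSize;  // OK

}  // namespace operators

int main() {
  operators::LocalAlias<operators::FrameSize> p = nullptr;
  return p == nullptr ? 0 : 1;
}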
@@ -24,13 +24,6 @@ namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
@@ -73,17 +66,17 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     int batch_size = input->dims()[0];
     int frame_size = hidden_prev->dims()[1];
-    auto x = EigenMatrix<T>::From(*input);
-    auto h_p = EigenMatrix<T>::From(*hidden_prev);
-    auto g = EigenMatrix<T>::From(*gate);
-    auto r_h_p = EigenMatrix<T>::From(*reset_hidden_prev);
-    auto h = EigenMatrix<T>::From(*hidden);
+    auto x = framework::EigenMatrix<T>::From(*input);
+    auto h_p = framework::EigenMatrix<T>::From(*hidden_prev);
+    auto g = framework::EigenMatrix<T>::From(*gate);
+    auto r_h_p = framework::EigenMatrix<T>::From(*reset_hidden_prev);
+    auto h = framework::EigenMatrix<T>::From(*hidden);
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
     // calculate unactivated gate outputs
     if (bias) {
-      auto b = EigenMatrix<T>::From(*bias);
+      auto b = framework::EigenMatrix<T>::From(*bias);
       g.device(place) = x +
                         b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
                             .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
@@ -177,11 +170,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
         reset_hidden_prev->dims(), context.GetPlace());
-    auto h_p = EigenMatrix<T>::From(*hidden_prev);
-    auto g = EigenMatrix<T>::From(*gate);
-    auto d_h = EigenMatrix<T>::From(*hidden_grad);
-    auto d_g = EigenMatrix<T>::From(gate_grad);
-    auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
+    auto h_p = framework::EigenMatrix<T>::From(*hidden_prev);
+    auto g = framework::EigenMatrix<T>::From(*gate);
+    auto d_h = framework::EigenMatrix<T>::From(*hidden_grad);
+    auto d_g = framework::EigenMatrix<T>::From(gate_grad);
+    auto d_r_h_p = framework::EigenMatrix<T>::From(reset_hidden_prev_grad);
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
@@ -237,7 +230,7 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     if (hidden_prev_grad) {
       T* hidden_prev_grad_data =
           hidden_prev_grad->mutable_data<T>(context.GetPlace());
-      auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
+      auto d_h_p = framework::EigenMatrix<T>::From(*hidden_prev_grad);
       if (context.Attr<bool>("origin_mode")) {
         d_h_p.device(place) = d_r_h_p * r + d_h * u;
       } else {
@@ -250,13 +243,13 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     // backward for input
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
-      auto d_x = EigenMatrix<T>::From(*input_grad);
+      auto d_x = framework::EigenMatrix<T>::From(*input_grad);
       d_x.device(place) = d_g;
     }
     // backward for bias
     if (bias_grad) {
       bias_grad->mutable_data<T>(context.GetPlace());
-      auto d_b = EigenVector<T>::Flatten(*bias_grad);
+      auto d_b = framework::EigenVector<T>::Flatten(*bias_grad);
       d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
     }
   }
......
@@ -19,10 +19,6 @@ namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 using Array1 = Eigen::DSizes<int64_t, 1>;
 template <typename T>
@@ -64,9 +60,9 @@ class KLDivLossKernel : public framework::OpKernel<T> {
     const int n = input->dims()[0];
     loss->mutable_data<T>(ctx.GetPlace());
-    auto input_t = EigenVector<T>::Flatten(*input);
-    auto target_t = EigenVector<T>::Flatten(*target);
-    auto loss_t = EigenVector<T>::Flatten(*loss);
+    auto input_t = framework::EigenVector<T>::Flatten(*input);
+    auto target_t = framework::EigenVector<T>::Flatten(*target);
+    auto loss_t = framework::EigenVector<T>::Flatten(*loss);
     auto output = target_t.binaryExpr(input_t, KLDivLossForward<T>());
     if ("none" == reduction) {
       loss_t.device(place) = output;
@@ -101,10 +97,10 @@ class KLDivLossGradKernel : public framework::OpKernel<T> {
     input_grad->mutable_data<T>(ctx.GetPlace());
-    auto target_t = EigenVector<T>::Flatten(*target);
-    auto input_grad_t = EigenVector<T>::Flatten(*input_grad);
-    auto loss_grad_t = EigenVector<T>::Flatten(*loss_grad);
+    auto target_t = framework::EigenVector<T>::Flatten(*target);
+    auto input_grad_t = framework::EigenVector<T>::Flatten(*input_grad);
+    auto loss_grad_t = framework::EigenVector<T>::Flatten(*loss_grad);
     auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand));
     auto grad_t = target_t * loss_grad_expand;
......
@@ -47,9 +47,6 @@ struct ScalarMul {
 using framework::LoDTensor;
 using framework::LoD;
 using framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename DeviceContext, typename T>
 class LinearChainCRFOpKernel : public framework::OpKernel<T> {
@@ -127,16 +124,16 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
                                       platform::CPUPlace());
     auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
                        .eigen_device();
-    auto x = EigenMatrix<T>::From(emission_weights_tmp);
-    auto x_row_max = EigenMatrix<T>::From(emission_row_max);
+    auto x = framework::EigenMatrix<T>::From(emission_weights_tmp);
+    auto x_row_max = framework::EigenMatrix<T>::From(emission_row_max);
     x_row_max.device(place) =
         x.maximum(Eigen::DSizes<int, 1>(1))
             .reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
-    auto x_exps = EigenMatrix<T>::From(emission_exps_tmp);
+    auto x_exps = framework::EigenMatrix<T>::From(emission_exps_tmp);
     x_exps.device(place) =
         (x - x_row_max.broadcast(Eigen::DSizes<int, 2>(1, tag_num))).exp();
-    auto w = EigenMatrix<T>::From(*transition_weights);
-    auto w_exps = EigenMatrix<T>::From(*transition_exps);
+    auto w = framework::EigenMatrix<T>::From(*transition_weights);
+    auto w_exps = framework::EigenMatrix<T>::From(*transition_exps);
     w_exps.device(place) = w.exp();
     T* log_likelihood = ll->data<T>();
     for (int64_t i = 0; i < seq_num; ++i) {
@@ -355,9 +352,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
       NormalizeL1<T>(beta_value + k * tag_num, tag_num);
     }
-    auto x_grad_mat = EigenMatrix<T>::From(*emission_grad);
-    auto alpha_mat = EigenMatrix<T>::From(alpha);
-    auto beta_mat = EigenMatrix<T>::From(*beta);
+    auto x_grad_mat = framework::EigenMatrix<T>::From(*emission_grad);
+    auto alpha_mat = framework::EigenMatrix<T>::From(alpha);
+    auto beta_mat = framework::EigenMatrix<T>::From(*beta);
     auto* place = ctx.eigen_device();
     auto prob = alpha_mat * beta_mat;
@@ -381,13 +378,13 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
           x_grad_mat(/*to end state*/ seq_length - 1, k);
     }
-    auto x_exps_mat = EigenMatrix<T>::From(emission_exps);
+    auto x_exps_mat = framework::EigenMatrix<T>::From(emission_exps);
     // TODO(caoying): Fix this to avoid using this local variable if we can
     // profile the training process.
     Tensor tmp;
     tmp.mutable_data<T>(beta->dims(), platform::CPUPlace());
-    auto tmp_mat = EigenMatrix<T>::From(tmp);
+    auto tmp_mat = framework::EigenMatrix<T>::From(tmp);
     auto prob = beta_mat * x_exps_mat;
     auto row_sum = prob.sum(Eigen::DSizes<int, 1>(1))
                        .reshape(Eigen::DSizes<int, 2>(seq_length, 1))
......
@@ -500,13 +500,14 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
   // copy sliced data to output.
   const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
   const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
-  auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
-  auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
+  auto e_indices =
+      framework::EigenMatrix<int64_t>::From(*indices_tensor, dim);
+  auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(temp_indices);
   std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
   auto dim = framework::make_ddim(odims);
-  auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
-  auto e_tmp_values = EigenMatrix<T>::From(temp_values);
+  auto e_values = framework::EigenMatrix<T>::From(*out_tensor, dim);
+  auto e_tmp_values = framework::EigenMatrix<T>::From(temp_values);
   e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
   e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
......
@@ -25,14 +25,6 @@ namespace operators {
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename DeviceContext, typename T>
 class TopkKernel : public framework::OpKernel<T> {
  public:
@@ -70,12 +62,13 @@ class TopkKernel : public framework::OpKernel<T> {
       vec.reserve(col);
       // 1D vector
       if (inputdims.size() == 1) {
-        auto eg_input = EigenVector<T>::Flatten(*input);
+        auto eg_input = framework::EigenVector<T>::Flatten(*input);
         for (size_t j = 0; j < col; j++) {
           vec.push_back(std::pair<T, size_t>(eg_input(j), j));
         }
       } else {
-        auto eg_input = EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
+        auto eg_input =
+            framework::EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
         for (size_t j = 0; j < col; j++) {
           vec.push_back(std::pair<T, size_t>(eg_input(i, j), j));
         }
......
@@ -61,12 +61,12 @@ static void FullTopK(Type input_height, Type input_width, int input_dim,
     std::vector<std::pair<T, Type>> col_vec;
     col_vec.reserve(input_width);
     if (input_dim == 1) {
-      auto e_input = EigenVector<T>::Flatten(*input);
+      auto e_input = framework::EigenVector<T>::Flatten(*input);
       for (Type j = 0; j < input_width; ++j) {
         col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
       }
     } else {
-      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
       for (Type j = 0; j < input_width; ++j) {
         col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
       }
@@ -142,14 +142,15 @@ static void FullTopKAssign(const Type& input_height, const Type& input_width,
 #endif
   for (Type i = 0; i < input_height; ++i) {
     if (input_dim == 1) {
-      auto e_input = EigenVector<T>::Flatten(*input);
-      auto e_indices = EigenVector<Type>::Flatten(*indices);
+      auto e_input = framework::EigenVector<T>::Flatten(*input);
+      auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
       for (Type j = 0; j < k; ++j) {
         output_data[i * input_width + e_indices(j)] = e_input(j);
       }
     } else {
-      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
+      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_indices =
+          framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
       for (Type j = 0; j < k; ++j) {
         output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
       }
......
@@ -5,6 +5,7 @@
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
 register_unity_group(cc
+    abs_op.cc
     add_position_encoding_op.cc
     addmm_op.cc
     affine_channel_op.cc
@@ -33,7 +34,11 @@ register_unity_group(cc
     chunk_eval_op.cc
     clip_by_norm_op.cc
     clip_op.cc
-    coalesce_tensor_op.cc)
+    coalesce_tensor_op.cc
+    mkldnn/activation_mkldnn_op.cc
+    mkldnn/interpolate_mkldnn_op.cc
+    mkldnn/pool_mkldnn_op.cc
+    mkldnn/softmax_mkldnn_op.cc)
 register_unity_group(cc
     center_loss_op.cc
     mkldnn/concat_mkldnn_op.cc
@@ -42,7 +47,12 @@ register_unity_group(cc
     correlation_op.cc
     cos_sim_op.cc
     crf_decoding_op.cc
-    crop_op.cc)
+    crop_op.cc
+    ascend_trigger_op.cc
+    conj_op.cc
+    imag_op.cc
+    kldiv_loss_op.cc
+    memcpy_op.cc)
 register_unity_group(cc
     cross_entropy_op.cc
     cross_op.cc
@@ -69,7 +79,14 @@ register_unity_group(cc
     edit_distance_op.cc
     empty_op.cc
     enqueue_op.cc
-    erf_op.cc)
+    erf_op.cc
+    py_func_op.cc
+    real_op.cc
+    sync_batch_norm_op.cc
+    top_k_op.cc
+    conv_op.cc
+    conv_transpose_op.cc
+    gru_unit_op.cc)
 register_unity_group(cc
     expand_v2_op.cc
     fake_dequantize_op.cc
@@ -309,6 +326,29 @@ register_unity_group(cc
     unbind_op.cu.cc
     unpool_op.cu.cc
     unsqueeze_op.cu.cc)
+register_unity_group(cc
+    arg_max_op.cc
+    arg_min_op.cc
+    squared_l2_distance_op.cc)
+register_unity_group(cc
+    linear_chain_crf_op.cc
+    lstm_op.cc
+    partial_concat_op.cc
+    pyramid_hash_op.cc
+    recurrent_op.cc
+    run_program_op.cc
+    softmax_with_cross_entropy_op.cc
+    warpctc_op.cc)
+register_unity_group(cc
+    conv_op.cu.cc
+    lstm_op.cu.cc
+    rnn_op.cu.cc
+    split_op.cu.cc
+    activation_cudnn_op.cu.cc
+    assign_value_op.cu.cc
+    merge_selected_rows_op.cu.cc
+    run_program_op.cu.cc
+    warpctc_op.cu.cc)
 register_unity_group(cu
     addmm_op.cu
     affine_channel_op.cu
......
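A note on how the cmake groups above take effect: each register_unity_group(cc ...) call batches the listed sources so they are compiled as one jumbo translation unit, which behaves roughly like a generated source file that simply includes its members, as sketched below. The file name is hypothetical, the include list is a reduced selection from one of the groups edited above, and such a file only builds inside the Paddle source tree; the real jumbo sources are produced by the unity-build CMake helpers, not written by hand. This is also why the comment at the head of the rule file advises removing a source from its group if the combination produces redefinition errors.

// paddle_unity_group_sketch.cc -- illustrative only, not generated by this PR.
// Every namespace-scope symbol declared in the headers of these ops now shares
// a single translation unit, which is why the duplicated EigenMatrix/EigenVector
// alias templates had to be removed from the op headers grouped here.
#include "paddle/fluid/operators/erf_op.cc"
#include "paddle/fluid/operators/gru_unit_op.cc"
#include "paddle/fluid/operators/top_k_op.cc"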