Unverified commit a13f1d69, authored by W wuhuanzhou, committed by GitHub

optimize unity build (#31119)

* optimize unity build, test=develop

* fix compilation error on Windows, test=develop

* fix compilation error, test=develop

* fix code style error, test=develop
Parent 8f4ac6b5
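
Background for the header changes below: a unity (jumbo) build concatenates several operator sources into one generated translation unit, so a namespace-scope alias template that two grouped headers both define (such as the local `EigenMatrix`/`EigenVector` aliases removed here) collides with a redefinition error once they are compiled together. The commit therefore drops the per-header aliases and qualifies the shared templates as `framework::EigenMatrix<T>` / `framework::EigenVector<T>` at each use site. A minimal sketch of the mechanism, with hypothetical file names that are not part of this commit:

    // unity_demo.cc -- hypothetical generated unity source; the build system
    // emits a file like this for each register_unity_group() rule below.
    #include "a_op.h"  // defines: template <...> using EigenMatrix = framework::EigenMatrix<...>;
    #include "b_op.h"  // defines the same alias again -> redefinition error in the combined TU

    // With the aliases removed from the headers, kernels spell the shared
    // template out directly, which is safe to repeat across grouped sources:
    //   auto x = framework::EigenMatrix<T>::From(*input);
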
@@ -24,13 +24,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
- template <typename T, int MajorType = Eigen::RowMajor,
- typename IndexType = Eigen::DenseIndex>
- using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
- template <typename T, int MajorType = Eigen::RowMajor,
- typename IndexType = Eigen::DenseIndex>
- using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
@@ -73,17 +66,17 @@ class GRUUnitKernel : public framework::OpKernel<T> {
int batch_size = input->dims()[0];
int frame_size = hidden_prev->dims()[1];
- auto x = EigenMatrix<T>::From(*input);
- auto h_p = EigenMatrix<T>::From(*hidden_prev);
- auto g = EigenMatrix<T>::From(*gate);
- auto r_h_p = EigenMatrix<T>::From(*reset_hidden_prev);
- auto h = EigenMatrix<T>::From(*hidden);
+ auto x = framework::EigenMatrix<T>::From(*input);
+ auto h_p = framework::EigenMatrix<T>::From(*hidden_prev);
+ auto g = framework::EigenMatrix<T>::From(*gate);
+ auto r_h_p = framework::EigenMatrix<T>::From(*reset_hidden_prev);
+ auto h = framework::EigenMatrix<T>::From(*hidden);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
// calculate unactivated gate outputs
if (bias) {
- auto b = EigenMatrix<T>::From(*bias);
+ auto b = framework::EigenMatrix<T>::From(*bias);
g.device(place) = x +
b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
.broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
@@ -177,11 +170,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
reset_hidden_prev->dims(), context.GetPlace());
- auto h_p = EigenMatrix<T>::From(*hidden_prev);
- auto g = EigenMatrix<T>::From(*gate);
- auto d_h = EigenMatrix<T>::From(*hidden_grad);
- auto d_g = EigenMatrix<T>::From(gate_grad);
- auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
+ auto h_p = framework::EigenMatrix<T>::From(*hidden_prev);
+ auto g = framework::EigenMatrix<T>::From(*gate);
+ auto d_h = framework::EigenMatrix<T>::From(*hidden_grad);
+ auto d_g = framework::EigenMatrix<T>::From(gate_grad);
+ auto d_r_h_p = framework::EigenMatrix<T>::From(reset_hidden_prev_grad);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
@@ -237,7 +230,7 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
if (hidden_prev_grad) {
T* hidden_prev_grad_data =
hidden_prev_grad->mutable_data<T>(context.GetPlace());
- auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
+ auto d_h_p = framework::EigenMatrix<T>::From(*hidden_prev_grad);
if (context.Attr<bool>("origin_mode")) {
d_h_p.device(place) = d_r_h_p * r + d_h * u;
} else {
@@ -250,13 +243,13 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
// backward for input
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
- auto d_x = EigenMatrix<T>::From(*input_grad);
+ auto d_x = framework::EigenMatrix<T>::From(*input_grad);
d_x.device(place) = d_g;
}
// backward for bias
if (bias_grad) {
bias_grad->mutable_data<T>(context.GetPlace());
- auto d_b = EigenVector<T>::Flatten(*bias_grad);
+ auto d_b = framework::EigenVector<T>::Flatten(*bias_grad);
d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
}
}
......
@@ -19,10 +19,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
- template <typename T, int MajorType = Eigen::RowMajor,
- typename IndexType = Eigen::DenseIndex>
- using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
using Array1 = Eigen::DSizes<int64_t, 1>;
template <typename T>
@@ -64,9 +60,9 @@ class KLDivLossKernel : public framework::OpKernel<T> {
const int n = input->dims()[0];
loss->mutable_data<T>(ctx.GetPlace());
- auto input_t = EigenVector<T>::Flatten(*input);
- auto target_t = EigenVector<T>::Flatten(*target);
- auto loss_t = EigenVector<T>::Flatten(*loss);
+ auto input_t = framework::EigenVector<T>::Flatten(*input);
+ auto target_t = framework::EigenVector<T>::Flatten(*target);
+ auto loss_t = framework::EigenVector<T>::Flatten(*loss);
auto output = target_t.binaryExpr(input_t, KLDivLossForward<T>());
if ("none" == reduction) {
loss_t.device(place) = output;
@@ -101,10 +97,10 @@ class KLDivLossGradKernel : public framework::OpKernel<T> {
input_grad->mutable_data<T>(ctx.GetPlace());
- auto target_t = EigenVector<T>::Flatten(*target);
+ auto target_t = framework::EigenVector<T>::Flatten(*target);
- auto input_grad_t = EigenVector<T>::Flatten(*input_grad);
- auto loss_grad_t = EigenVector<T>::Flatten(*loss_grad);
+ auto input_grad_t = framework::EigenVector<T>::Flatten(*input_grad);
+ auto loss_grad_t = framework::EigenVector<T>::Flatten(*loss_grad);
auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand));
auto grad_t = target_t * loss_grad_expand;
......
@@ -47,9 +47,6 @@ struct ScalarMul {
using framework::LoDTensor;
using framework::LoD;
using framework::Tensor;
- template <typename T, int MajorType = Eigen::RowMajor,
- typename IndexType = Eigen::DenseIndex>
- using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
class LinearChainCRFOpKernel : public framework::OpKernel<T> {
@@ -127,16 +124,16 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
platform::CPUPlace());
auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
- auto x = EigenMatrix<T>::From(emission_weights_tmp);
- auto x_row_max = EigenMatrix<T>::From(emission_row_max);
+ auto x = framework::EigenMatrix<T>::From(emission_weights_tmp);
+ auto x_row_max = framework::EigenMatrix<T>::From(emission_row_max);
x_row_max.device(place) =
x.maximum(Eigen::DSizes<int, 1>(1))
.reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
- auto x_exps = EigenMatrix<T>::From(emission_exps_tmp);
+ auto x_exps = framework::EigenMatrix<T>::From(emission_exps_tmp);
x_exps.device(place) =
(x - x_row_max.broadcast(Eigen::DSizes<int, 2>(1, tag_num))).exp();
- auto w = EigenMatrix<T>::From(*transition_weights);
- auto w_exps = EigenMatrix<T>::From(*transition_exps);
+ auto w = framework::EigenMatrix<T>::From(*transition_weights);
+ auto w_exps = framework::EigenMatrix<T>::From(*transition_exps);
w_exps.device(place) = w.exp();
T* log_likelihood = ll->data<T>();
for (int64_t i = 0; i < seq_num; ++i) {
@@ -355,9 +352,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
NormalizeL1<T>(beta_value + k * tag_num, tag_num);
}
- auto x_grad_mat = EigenMatrix<T>::From(*emission_grad);
- auto alpha_mat = EigenMatrix<T>::From(alpha);
- auto beta_mat = EigenMatrix<T>::From(*beta);
+ auto x_grad_mat = framework::EigenMatrix<T>::From(*emission_grad);
+ auto alpha_mat = framework::EigenMatrix<T>::From(alpha);
+ auto beta_mat = framework::EigenMatrix<T>::From(*beta);
auto* place = ctx.eigen_device();
auto prob = alpha_mat * beta_mat;
@@ -381,13 +378,13 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
x_grad_mat(/*to end state*/ seq_length - 1, k);
}
- auto x_exps_mat = EigenMatrix<T>::From(emission_exps);
+ auto x_exps_mat = framework::EigenMatrix<T>::From(emission_exps);
// TODO(caoying): Fix this to avoid using this local variable if we can
// profile the training process.
Tensor tmp;
tmp.mutable_data<T>(beta->dims(), platform::CPUPlace());
- auto tmp_mat = EigenMatrix<T>::From(tmp);
+ auto tmp_mat = framework::EigenMatrix<T>::From(tmp);
auto prob = beta_mat * x_exps_mat;
auto row_sum = prob.sum(Eigen::DSizes<int, 1>(1))
.reshape(Eigen::DSizes<int, 2>(seq_length, 1))
......
@@ -500,13 +500,14 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
// copy sliced data to output.
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
- auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
- auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
+ auto e_indices =
+ framework::EigenMatrix<int64_t>::From(*indices_tensor, dim);
+ auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(temp_indices);
std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
auto dim = framework::make_ddim(odims);
- auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
- auto e_tmp_values = EigenMatrix<T>::From(temp_values);
+ auto e_values = framework::EigenMatrix<T>::From(*out_tensor, dim);
+ auto e_tmp_values = framework::EigenMatrix<T>::From(temp_values);
e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
......
@@ -25,14 +25,6 @@ namespace operators {
using Tensor = framework::Tensor;
- template <typename T, int MajorType = Eigen::RowMajor,
- typename IndexType = Eigen::DenseIndex>
- using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
- template <typename T, int MajorType = Eigen::RowMajor,
- typename IndexType = Eigen::DenseIndex>
- using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
class TopkKernel : public framework::OpKernel<T> {
public:
@@ -70,12 +62,13 @@ class TopkKernel : public framework::OpKernel<T> {
vec.reserve(col);
// 1D vector
if (inputdims.size() == 1) {
- auto eg_input = EigenVector<T>::Flatten(*input);
+ auto eg_input = framework::EigenVector<T>::Flatten(*input);
for (size_t j = 0; j < col; j++) {
vec.push_back(std::pair<T, size_t>(eg_input(j), j));
}
} else {
- auto eg_input = EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
+ auto eg_input =
+ framework::EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
for (size_t j = 0; j < col; j++) {
vec.push_back(std::pair<T, size_t>(eg_input(i, j), j));
}
......
@@ -61,12 +61,12 @@ static void FullTopK(Type input_height, Type input_width, int input_dim,
std::vector<std::pair<T, Type>> col_vec;
col_vec.reserve(input_width);
if (input_dim == 1) {
- auto e_input = EigenVector<T>::Flatten(*input);
+ auto e_input = framework::EigenVector<T>::Flatten(*input);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
}
} else {
- auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+ auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
}
@@ -142,14 +142,15 @@ static void FullTopKAssign(const Type& input_height, const Type& input_width,
#endif
for (Type i = 0; i < input_height; ++i) {
if (input_dim == 1) {
- auto e_input = EigenVector<T>::Flatten(*input);
- auto e_indices = EigenVector<Type>::Flatten(*indices);
+ auto e_input = framework::EigenVector<T>::Flatten(*input);
+ auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
for (Type j = 0; j < k; ++j) {
output_data[i * input_width + e_indices(j)] = e_input(j);
}
} else {
- auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
- auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
+ auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
+ auto e_indices =
+ framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
for (Type j = 0; j < k; ++j) {
output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
}
......
@@ -5,6 +5,7 @@
# If there are some redefined error in compiling with the source file which
# in combination rule, you can remove the source file from the following rules.
register_unity_group(cc
+ abs_op.cc
add_position_encoding_op.cc
addmm_op.cc
affine_channel_op.cc
@@ -33,7 +34,11 @@ register_unity_group(cc
chunk_eval_op.cc
clip_by_norm_op.cc
clip_op.cc
- coalesce_tensor_op.cc)
+ coalesce_tensor_op.cc
+ mkldnn/activation_mkldnn_op.cc
+ mkldnn/interpolate_mkldnn_op.cc
+ mkldnn/pool_mkldnn_op.cc
+ mkldnn/softmax_mkldnn_op.cc)
register_unity_group(cc
center_loss_op.cc
mkldnn/concat_mkldnn_op.cc
@@ -42,7 +47,12 @@ register_unity_group(cc
correlation_op.cc
cos_sim_op.cc
crf_decoding_op.cc
- crop_op.cc)
+ crop_op.cc
+ ascend_trigger_op.cc
+ conj_op.cc
+ imag_op.cc
+ kldiv_loss_op.cc
+ memcpy_op.cc)
register_unity_group(cc
cross_entropy_op.cc
cross_op.cc
@@ -69,7 +79,14 @@ register_unity_group(cc
edit_distance_op.cc
empty_op.cc
enqueue_op.cc
- erf_op.cc)
+ erf_op.cc
+ py_func_op.cc
+ real_op.cc
+ sync_batch_norm_op.cc
+ top_k_op.cc
+ conv_op.cc
+ conv_transpose_op.cc
+ gru_unit_op.cc)
register_unity_group(cc
expand_v2_op.cc
fake_dequantize_op.cc
@@ -309,6 +326,29 @@ register_unity_group(cc
unbind_op.cu.cc
unpool_op.cu.cc
unsqueeze_op.cu.cc)
+ register_unity_group(cc
+ arg_max_op.cc
+ arg_min_op.cc
+ squared_l2_distance_op.cc)
+ register_unity_group(cc
+ linear_chain_crf_op.cc
+ lstm_op.cc
+ partial_concat_op.cc
+ pyramid_hash_op.cc
+ recurrent_op.cc
+ run_program_op.cc
+ softmax_with_cross_entropy_op.cc
+ warpctc_op.cc)
+ register_unity_group(cc
+ conv_op.cu.cc
+ lstm_op.cu.cc
+ rnn_op.cu.cc
+ split_op.cu.cc
+ activation_cudnn_op.cu.cc
+ assign_value_op.cu.cc
+ merge_selected_rows_op.cu.cc
+ run_program_op.cu.cc
+ warpctc_op.cu.cc)
register_unity_group(cu
addmm_op.cu
affine_channel_op.cu
......
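
For reference, each register_unity_group(cc ...) rule above maps to one generated unity source that #includes the listed .cc files so they build as a single translation unit. A hedged sketch of such a generated file follows; the real file name, location, and include paths are produced by the build scripts and are assumptions here:

    // Hypothetical generated unity source for the new group that starts with
    // linear_chain_crf_op.cc; paths assume the usual paddle/fluid/operators layout.
    #include "paddle/fluid/operators/linear_chain_crf_op.cc"
    #include "paddle/fluid/operators/lstm_op.cc"
    #include "paddle/fluid/operators/partial_concat_op.cc"
    #include "paddle/fluid/operators/pyramid_hash_op.cc"
    #include "paddle/fluid/operators/recurrent_op.cc"
    #include "paddle/fluid/operators/run_program_op.cc"
    #include "paddle/fluid/operators/softmax_with_cross_entropy_op.cc"
    #include "paddle/fluid/operators/warpctc_op.cc"

Grouping sources this way trades per-file incremental rebuilds for fewer, larger compiler invocations, which is the build-time optimization the commit title refers to.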