Unverified · Commit 53604bac · authored by Yiqun Liu · committed by GitHub

Optimize the compute implementation of several operators. (#2843)

* Optimize the transform from Paddle's Tensor to EigenVector, avoiding defining multiple DDim objects.

* Optimize the compute implementation of several operators.
test=develop
Parent 800f5ce6
@@ -24,12 +24,12 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
  public:
   void operator()(const lite::Context<lite::TargetType::kX86>& context,
                   const lite::Tensor& src,
-                  std::vector<size_t> index_lod,
+                  const std::vector<size_t>& index_lod,
                   lite::Tensor* dst,
                   bool is_src_index) {
-    size_t* index = index_lod.data();
-    auto src_dims = src.dims();
-    auto dst_dims = dst->dims();
+    const size_t* index = index_lod.data();
+    const auto& src_dims = src.dims();
+    const auto& dst_dims = dst->dims();
     PADDLE_ENFORCE_EQ(
         src_dims.size(), 2UL, "The src must be matrix with rank 2.");
     PADDLE_ENFORCE_EQ(
...
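The pattern recurring throughout this commit shows up first in this hunk: the index vector was passed by value and copied on every call. A minimal standalone sketch of the convention the new code adopts (not the lite API itself):

```cpp
#include <cstddef>
#include <vector>

// Passing the vector by const reference borrows the caller's buffer instead
// of copying it; the element pointer is const-qualified because the callee
// only reads the row indices.
void CopyRows(const std::vector<size_t>& index_lod) {
  const size_t* index = index_lod.data();  // view, no allocation
  (void)index;  // ... index rows of src/dst here ...
}
```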
@@ -19,7 +19,6 @@ limitations under the License. */
 #include "lite/core/context.h"
 #include "lite/core/tensor.h"
 #include "lite/fluid/eigen.h"
-// #include "lite/fluid/lod.h"
 #include "lite/utils/paddle_enforce.h"
 
 namespace paddle {
@@ -27,11 +26,6 @@ namespace lite {
 namespace x86 {
 namespace math {
 
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = lite::fluid::EigenMatrix<T, MajorType, IndexType>;
-
 template <lite::TargetType Target, typename T>
 class CopyMatrixRowsFunctor {
  public:
@@ -42,7 +36,7 @@ class CopyMatrixRowsFunctor {
   // The indexed rows are based on the input index.
   void operator()(const lite::Context<Target>& context,
                   const lite::Tensor& src,
-                  std::vector<size_t> index_lod,
+                  const std::vector<size_t>& index_lod,
                   lite::Tensor* dst,
                   bool is_src_index);
 };
@@ -56,6 +50,7 @@ class LoDTensor2BatchFunctor {
   // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
   //
   struct SeqInfo {
+    SeqInfo() = default;
     SeqInfo(int start, int length, int seq_idx)
         : start(start), length(length), seq_idx(seq_idx) {}
     int start;
@@ -89,10 +84,12 @@ class LoDTensor2BatchFunctor {
     const auto& lod = lods[0];
 
-    std::vector<SeqInfo> seq_info;
+    std::vector<SeqInfo> seq_info(lod.size() - 1);
     for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
       int length = lod[seq_id + 1] - lod[seq_id];
-      seq_info.emplace_back(lod[seq_id], length, seq_id);
+      seq_info[seq_id].start = lod[seq_id];
+      seq_info[seq_id].length = length;
+      seq_info[seq_id].seq_idx = seq_id;
     }
 
     std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) {
@@ -122,21 +119,19 @@ class LoDTensor2BatchFunctor {
     // The max_seqlen represents batch size after rearranging the
     // input LodTensor. It is also the maximum length of input sequence.
-    lite::LoD batch_lods;
-    batch_lods.emplace_back(std::vector<size_t>{0});
-    batch_lods.emplace_back(std::vector<size_t>{0});
-    batch_lods.emplace_back(std::vector<size_t>{0});
+    LoD* batch_lods = batch->mutable_lod();
+    batch_lods->resize(3);
 
     // batch_lods[0] is the start positions for batch LoDTensor
     int max_seqlen = seq_info[0].length;
-    batch_lods[0].resize(static_cast<size_t>(max_seqlen + 1));
+    batch_lods->at(0).resize(static_cast<size_t>(max_seqlen + 1));
     // batch_lods[1] is the raw index in the input LoDTensor
-    batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
+    batch_lods->at(1).resize(static_cast<size_t>(lod_tensor.dims()[0]));
     // batch_lods[2] is the sort order for the input LoDTensor.
-    batch_lods[2].resize(seq_info.size());
+    batch_lods->at(2).resize(seq_info.size());
 
-    size_t* batch_starts = batch_lods[0].data();
-    size_t* seq2batch_idx = batch_lods[1].data();
+    size_t* batch_starts = batch_lods->at(0).data();
+    size_t* seq2batch_idx = batch_lods->at(1).data();
     batch_starts[0] = 0;
     for (int n = 0; n < max_seqlen; n++) {
       auto batch_id = static_cast<int>(batch_starts[n]);
@@ -153,14 +148,13 @@ class LoDTensor2BatchFunctor {
       }
       batch_starts[n + 1] = static_cast<size_t>(batch_id);
     }
-    size_t* seq_order = batch_lods[2].data();
+    size_t* seq_order = batch_lods->at(2).data();
     for (size_t i = 0; i < seq_info.size(); ++i) {
       seq_order[i] = seq_info[i].seq_idx;
     }
-    batch->set_lod(batch_lods);
 
     CopyMatrixRowsFunctor<Target, T> to_batch;
-    to_batch(context, lod_tensor, batch_lods[1], batch, true);
+    to_batch(context, lod_tensor, batch_lods->at(1), batch, true);
   }
 };
...
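The LoD change above replaces a build-then-copy pattern (a local `lite::LoD` filled and handed to `set_lod()`) with in-place writes through `batch->mutable_lod()`. A simplified sketch of the new shape of this pattern, with a stand-in `Batch` type (assumed here, not the lite API):

```cpp
#include <cstddef>
#include <vector>

using LoD = std::vector<std::vector<size_t>>;

struct Batch {  // stand-in for the tensor's LoD surface
  LoD lod;
  LoD* mutable_lod() { return &lod; }
};

// Resize the tensor-owned LoD once and write through raw pointers, so no
// temporary LoD is constructed and no copy happens in a final set_lod().
void FillLoDInPlace(Batch* batch, size_t starts, size_t indices, size_t order) {
  LoD* batch_lods = batch->mutable_lod();
  batch_lods->resize(3);
  batch_lods->at(0).resize(starts);   // batch start positions
  batch_lods->at(1).resize(indices);  // raw index into the input LoDTensor
  batch_lods->at(2).resize(order);    // sort order of sequences
  size_t* batch_starts = batch_lods->at(0).data();
  batch_starts[0] = 0;
  // ... fill the remaining entries as in the hunk above ...
}
```

This is also why `SeqInfo() = default;` was added above: pre-sizing `std::vector<SeqInfo>` requires the element type to be default-constructible, which the user-declared three-argument constructor had suppressed.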
@@ -99,7 +99,7 @@ class SoftmaxFunctor<Target, T, is_test, enable_if_CPU<Target>> {
                   const int axis_dim,
                   const lite::Tensor* X,
                   lite::Tensor* Y) {
-    auto in_dims = X->dims();
+    const auto& in_dims = X->dims();
     constexpr int kBatchDim = 0;
     constexpr int kClassDim = 1;
@@ -140,7 +140,7 @@ class SoftmaxFunctor<Target, float, true, enable_if_CPU<Target>> {
                   const int axis_dim,
                   const lite::Tensor* X,
                   lite::Tensor* Y) {
-    auto in_dims = X->dims();
+    const auto& in_dims = X->dims();
     const float* in_data = X->data<float>();
     float* out_data = Y->mutable_data<float>();
     const int kBatchDim = 0;
...
@@ -30,13 +30,20 @@ struct EigenDim {
   using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
 
   static Type From(const lite::DDim& dims) {
-    PADDLE_ENFORCE(dims.size() == D, "D must match DDim::size");
+    PADDLE_ENFORCE_EQ(dims.size(), D, "D must match DDim::size");
     Type ret;
     for (size_t d = 0; d < dims.size(); d++) {
       ret[d] = dims[d];
     }
     return ret;
   }
+
+  static Type From(const DDim::value_type length) {
+    PADDLE_ENFORCE_EQ(D, 1, "D must be 1.");
+    Type ret;
+    ret[0] = length;
+    return ret;
+  }
 };
 
 // Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor.
@@ -52,7 +59,7 @@ struct EigenTensor {
   using ConstType =
       Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
 
-  static Type From(Tensor& tensor, lite::DDim dims) {  // NOLINT
+  static Type From(Tensor& tensor, const lite::DDim& dims) {  // NOLINT
     return Type(const_cast<T*>(tensor.data<T>()),
                 EigenDim<D>::From(dims));  // NOLINT
   }
@@ -61,7 +68,7 @@ struct EigenTensor {
     return From(tensor, tensor.dims());
   }  // NOLINT
 
-  static ConstType From(const Tensor& tensor, lite::DDim dims) {
+  static ConstType From(const Tensor& tensor, const lite::DDim& dims) {
     return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
   }
@@ -97,14 +104,15 @@ template <typename T,
 struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
   // Flatten reshapes a Tensor into an EigenVector.
   static typename EigenVector::Type Flatten(Tensor& tensor) {  // NOLINT
-    return EigenVector::From(
-        tensor, lite::DDim(std::vector<int64_t>({tensor.dims().production()})));
+    return typename EigenVector::Type(
+        const_cast<T*>(tensor.data<T>()),
+        EigenDim<1>::From(tensor.dims().production()));
   }
 
   static typename EigenVector::ConstType Flatten(
       const Tensor& tensor) {  // NOLINT
-    return EigenVector::From(
-        tensor, lite::DDim(std::vector<int64_t>({tensor.dims().production()})));
+    return typename EigenVector::ConstType(
+        tensor.data<T>(), EigenDim<1>::From(tensor.dims().production()));
  }
 };
...
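This hunk is the optimization named in the commit message: `Flatten()` previously built a one-element `std::vector<int64_t>`, wrapped it in a `lite::DDim`, and routed through `From()`; now it constructs the `Eigen::TensorMap` directly from the raw pointer and a rank-1 `DSizes`. A plain-Eigen sketch of the same transform (no lite wrappers assumed):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

// Map `numel` contiguous floats as a rank-1 tensor straight from the
// pointer; the only shape object is the stack-allocated DSizes, which is
// what the new EigenDim<1>::From(length) overload builds.
Eigen::TensorMap<Eigen::Tensor<float, 1, Eigen::RowMajor, Eigen::DenseIndex>>
FlattenRaw(float* data, Eigen::DenseIndex numel) {
  Eigen::DSizes<Eigen::DenseIndex, 1> dims;
  dims[0] = numel;
  return {data, dims};
}
```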
@@ -39,26 +39,28 @@ class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
   void Run() override {
     auto& param = *param_.get_mutable<param_t>();
+    if (param.x.size() == 1) {
+      param.output->ShareDataWith(*param.x[0]);
+      return;
+    }
+
     int64_t axis = static_cast<int64_t>(param.axis);
     auto* axis_tensor = param.axis_tensor;
     if (axis_tensor != nullptr) {
       auto* axis_tensor_data = axis_tensor->data<int>();
       axis = static_cast<int64_t>(axis_tensor_data[0]);
     }
 
-    auto x_dims = param.x[0]->dims();
-    auto out = param.output;
-    if (param.x.size() == 1) {
-      param.output->ShareDataWith(*param.x[0]);
-      return;
-    }
-    auto output_data = param.output->template mutable_data<T>();
+    const auto& x_dims = param.x[0]->dims();
+    auto* out = param.output;
+    T* output_data = param.output->template mutable_data<T>();
     int offset_concat_axis = 0;
     int num_concat = count(0, axis, x_dims);
     int concat_input_size = count(axis + 1, x_dims.size(), x_dims);
     const int top_concat_axis = out->dims()[axis];
     for (size_t i = 0; i < param.x.size(); ++i) {
-      auto bottom_data = param.x[i]->data<T>();
+      const T* bottom_data = param.x[i]->data<T>();
       const int64_t bottom_concat_axis = param.x[i]->dims()[axis];
       for (int n = 0; n < num_concat; ++n) {
         std::memcpy(
...
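The concat hunk hoists the single-input case to the top of `Run()`, so a trivial concat shares the input buffer and returns before the axis tensor is read or any strides are computed. A schematic of the reordering (illustrative `Tensor` interface, not the lite one):

```cpp
#include <vector>

// Fast path first: with one input there is nothing to concatenate, so the
// output aliases the input and all remaining setup work is skipped.
template <typename Tensor>
void ConcatRun(const std::vector<Tensor*>& inputs, Tensor* output) {
  if (inputs.size() == 1) {
    output->ShareDataWith(*inputs[0]);  // zero-copy
    return;
  }
  // ... resolve axis, compute num_concat / strides, memcpy each input ...
}
```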
@@ -33,15 +33,15 @@ class LookupTableCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     auto *ids_t = param.Ids;
     auto *output_t = param.Out;
     int64_t padding_idx = param.padding_idx;
-    auto *ids = ids_t->data<int64_t>();
+    const int64_t *ids = ids_t->data<int64_t>();
     int64_t ids_numel = ids_t->dims().production();
     auto *table_t = param.W;
     int64_t row_number = table_t->dims()[0];
     int64_t row_width = table_t->dims()[1];
-    auto *table = table_t->data<T>();
-    auto *output = output_t->mutable_data<T>();
+    const T *table = table_t->data<T>();
+    T *output = output_t->mutable_data<T>();
     memset(output, 0, output_t->dims().production() * sizeof(T));
     for (int64_t i = 0; i < ids_numel; ++i) {
       if (padding_idx != -1 && ids[i] == padding_idx) {
...
@@ -51,7 +51,7 @@ class ReduceSumCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     auto* output = param.output;
     param.output->mutable_data<T>();
 
-    auto dims = param.dim;
+    const auto& dims = param.dim;
     bool keep_dim = param.keep_dim;
     if (reduce_all) {
       // Flatten and reduce 1-D tensor
...
@@ -47,33 +47,23 @@ void ReduceFunctor(const lite::Tensor& input,
                    const std::vector<int>& dims,
                    bool keep_dim) {
   auto x = EigenTensor<T, D>::From(input);
-  auto x_rank = static_cast<int>(x.dimensions().size());
   auto reduce_dim = Eigen::array<int, R_D>();
-  std::vector<int> dims_ref = dims;
-  for (size_t i = 0; i < dims_ref.size(); ++i) {
-    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
-    reduce_dim[i] = dims_ref[i];
-  }
-  // construct the squeezed output tensor
-  lite::DDim out_dims = output->dims();
-  if (keep_dim && x_rank > 1) {
-    const int kDelFlag = -2;
-    auto dims_vector = out_dims.Vectorize();
-    for (size_t i = 0; i < dims_ref.size(); ++i) {
-      dims_vector[dims_ref[i]] = kDelFlag;
-    }
-    dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
-                      dims_vector.end());
-    out_dims = lite::DDim(dims_vector);
+  auto x_rank = static_cast<int>(x.dimensions().size());
+  for (size_t i = 0; i < dims.size(); ++i) {
+    if (dims[i] < 0) {
+      reduce_dim[i] = x_rank + dims[i];
+    } else {
+      reduce_dim[i] = dims[i];
+    }
   }
-  // auto& place = *context.eigen_device();
-  Functor functor;
 
+  Functor functor;
   if (D == 1) {
     auto out = EigenScalar<T>::From(output);
     functor(&x, &out, reduce_dim);
   } else {
-    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
+    auto out = EigenTensor<T, (D - R_D)>::From(*output, output->dims());
     functor(&x, &out, reduce_dim);
   }
 }
...
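The rewritten loop folds negative-axis handling directly into the Eigen reduce-dimension array (axis `d < 0` counts from the end, so it maps to `rank + d`), dropping both the `dims_ref` copy and the locally squeezed `out_dims` in favor of `output->dims()`. A standalone sketch of the canonicalization step:

```cpp
#include <array>
#include <cstddef>
#include <vector>

// Map possibly-negative reduce axes to their canonical non-negative form.
template <size_t R_D>
std::array<int, R_D> CanonicalReduceDims(const std::vector<int>& dims,
                                         int rank) {
  std::array<int, R_D> reduce_dim{};
  for (size_t i = 0; i < dims.size() && i < R_D; ++i) {
    reduce_dim[i] = dims[i] < 0 ? rank + dims[i] : dims[i];
  }
  return reduce_dim;
}
```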
@@ -36,11 +36,10 @@ class SequenceReshapeCompute
     auto* out = param.output;
     int out_width = param.new_dim;
 
-    auto in_dims = in->dims();
+    const auto& in_dims = in->dims();
     int64_t in_width = in_dims[1];
-    // LOG(INFO)<<"sequence_reshape in tensor:"<<*in;
     auto& in_lod = in->lod();
 
     CHECK_EQ(in_lod.size(), 1UL);
     CHECK_EQ((uint64_t)in_dims[0], in_lod[0].back());
@@ -63,13 +62,11 @@ class SequenceReshapeCompute
       }
     }
 
-    out->Resize(in_dims);
+    out->Resize(std::vector<int64_t>{static_cast<int64_t>(out->lod()[0].back()),
+                                     out_width});
     auto* dst_ptr = out->mutable_data<T>();
     auto size = in->numel() * sizeof(T);
     std::memcpy(dst_ptr, in->data<T>(), size);
-    std::vector<int64_t> out_shape{static_cast<int64_t>(out->lod()[0].back()),
-                                   out_width};
-    out->Resize(lite::DDim(out_shape));
   }
 
   virtual ~SequenceReshapeCompute() = default;
...
@@ -29,7 +29,7 @@ static inline int CanonicalAxis(const int axis, const int rank) {
   return axis;
 }
 
-static inline int SizeToAxis(const int axis, lite::DDim dims) {
+static inline int SizeToAxis(const int axis, const DDim& dims) {
   int size = 1;
   for (int i = 0; i < axis; i++) {
     size *= dims[i];
@@ -37,7 +37,7 @@ static inline int SizeToAxis(const int axis, lite::DDim dims) {
   return size;
 }
 
-static inline int SizeFromAxis(const int axis, lite::DDim dims) {
+static inline int SizeFromAxis(const int axis, const DDim& dims) {
   int size = 1;
   for (size_t i = axis; i < dims.size(); i++) {
     size *= dims[i];
@@ -61,13 +61,15 @@ class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     int axis_dim = param.x->dims()[axis];
     const int n = SizeToAxis(axis, param.x->dims());
     const int d = SizeFromAxis(axis, param.x->dims());
-    std::vector<int64_t> shape{n, d};
-    lite::Tensor input_2d, out_2d;
+
+    DDim shape(std::vector<DDim::value_type>{n, d});
+    Tensor input_2d;
+    Tensor out_2d;
     input_2d.ShareDataWith(*param.x);
-    input_2d.Resize(lite::DDim(shape));
+    input_2d.Resize(shape);
     out_2d.ShareDataWith(*param.output);
-    out_2d.Resize(lite::DDim(shape));
+    out_2d.Resize(shape);
 
     lite::x86::math::SoftmaxFunctor<lite::TargetType::kX86, T, true>()(
         context, axis_dim, &input_2d, &out_2d);
...
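The softmax kernel views the input as a 2-D `[n, d]` matrix: `n` is the product of the dimensions before the softmax axis and `d` the product from the axis onward, and the new code builds that `DDim` once and reuses it for both the input and output views. A standalone sketch of the two size helpers over a plain shape vector (illustrative, not the lite `DDim`):

```cpp
#include <cstdint>
#include <vector>

// Product of dimensions strictly before `axis`: the row count n.
int64_t SizeToAxis(int axis, const std::vector<int64_t>& dims) {
  int64_t size = 1;
  for (int i = 0; i < axis; ++i) size *= dims[i];
  return size;
}

// Product of dimensions from `axis` to the end: the row width d.
int64_t SizeFromAxis(int axis, const std::vector<int64_t>& dims) {
  int64_t size = 1;
  for (int i = axis; i < static_cast<int>(dims.size()); ++i) size *= dims[i];
  return size;
}
```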