Commit 4c1bf784 authored by Yiqun Liu, committed by GitHub

Optimize the compute implementation of several operators. (#2843)

* Optimize the transform from Paddle's Tensor to EigenVector, avoiding defining multiple DDim.

* Optimize the compute implementation of several operators.
test=develop
Parent 87e30f5c
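A note on the first bullet: the old EigenVector::Flatten wrapped the element count in a std::vector<int64_t> and then a lite::DDim just to describe a 1-D shape, while the new code (see the lite/fluid/eigen.h hunks below) builds the Eigen map straight from the data pointer and element count. A standalone sketch of that idea in plain Eigen, with a hypothetical data/size pair standing in for the Lite tensor internals:

#include <unsupported/Eigen/CXX11/Tensor>

using EigenVec = Eigen::TensorMap<
    Eigen::Tensor<float, 1, Eigen::RowMajor, Eigen::DenseIndex>>;

// Map an existing buffer as a 1-D Eigen tensor without materializing any
// intermediate shape object; only a stack-allocated DSizes is involved.
EigenVec Flatten(float* data, Eigen::DenseIndex size) {
  Eigen::DSizes<Eigen::DenseIndex, 1> dim;
  dim[0] = size;  // the single dimension is the total element count
  return EigenVec(data, dim);
}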
@@ -24,12 +24,12 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
public:
void operator()(const lite::Context<lite::TargetType::kX86>& context,
const lite::Tensor& src,
- std::vector<size_t> index_lod,
+ const std::vector<size_t>& index_lod,
lite::Tensor* dst,
bool is_src_index) {
- size_t* index = index_lod.data();
- auto src_dims = src.dims();
- auto dst_dims = dst->dims();
+ const size_t* index = index_lod.data();
+ const auto& src_dims = src.dims();
+ const auto& dst_dims = dst->dims();
PADDLE_ENFORCE_EQ(
src_dims.size(), 2UL, "The src must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(
......
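The index_lod change above is the standard pass-by-const-reference fix: a std::vector<size_t> parameter taken by value deep-copies the whole LoD on every call, while a const reference reads the caller's buffer in place (which is also why index becomes a const size_t*). A minimal self-contained illustration, with placeholder names rather than the Lite API:

#include <cstddef>
#include <vector>

// By value: the vector is deep-copied before the function body runs.
size_t SumByValue(std::vector<size_t> index_lod) {
  size_t total = 0;
  for (size_t v : index_lod) total += v;
  return total;
}

// By const reference: the caller's storage is read in place, no copy,
// and data() now yields a pointer to const.
size_t SumByRef(const std::vector<size_t>& index_lod) {
  const size_t* index = index_lod.data();
  size_t total = 0;
  for (size_t i = 0; i < index_lod.size(); ++i) total += index[i];
  return total;
}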
@@ -19,7 +19,6 @@ limitations under the License. */
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/fluid/eigen.h"
- // #include "lite/fluid/lod.h"
#include "lite/utils/paddle_enforce.h"
namespace paddle {
@@ -27,11 +26,6 @@ namespace lite {
namespace x86 {
namespace math {
- template <typename T,
-           int MajorType = Eigen::RowMajor,
-           typename IndexType = Eigen::DenseIndex>
- using EigenMatrix = lite::fluid::EigenMatrix<T, MajorType, IndexType>;
template <lite::TargetType Target, typename T>
class CopyMatrixRowsFunctor {
public:
@@ -42,7 +36,7 @@ class CopyMatrixRowsFunctor {
// The indexed rows are based on the input index.
void operator()(const lite::Context<Target>& context,
const lite::Tensor& src,
- std::vector<size_t> index_lod,
+ const std::vector<size_t>& index_lod,
lite::Tensor* dst,
bool is_src_index);
};
@@ -56,6 +50,7 @@ class LoDTensor2BatchFunctor {
// seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
//
struct SeqInfo {
+ SeqInfo() = default;
SeqInfo(int start, int length, int seq_idx)
: start(start), length(length), seq_idx(seq_idx) {}
int start;
@@ -89,10 +84,12 @@ class LoDTensor2BatchFunctor {
const auto& lod = lods[0];
- std::vector<SeqInfo> seq_info;
+ std::vector<SeqInfo> seq_info(lod.size() - 1);
for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
int length = lod[seq_id + 1] - lod[seq_id];
- seq_info.emplace_back(lod[seq_id], length, seq_id);
+ seq_info[seq_id].start = lod[seq_id];
+ seq_info[seq_id].length = length;
+ seq_info[seq_id].seq_idx = seq_id;
}
std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) {
@@ -122,21 +119,19 @@ class LoDTensor2BatchFunctor {
// The max_seqlen represents batch size after rearranging the
// input LodTensor. It is also the maximum length of input sequence.
- lite::LoD batch_lods;
- batch_lods.emplace_back(std::vector<size_t>{0});
- batch_lods.emplace_back(std::vector<size_t>{0});
- batch_lods.emplace_back(std::vector<size_t>{0});
+ LoD* batch_lods = batch->mutable_lod();
+ batch_lods->resize(3);
// batch_lods[0] is the start positions for batch LoDTensor
int max_seqlen = seq_info[0].length;
- batch_lods[0].resize(static_cast<size_t>(max_seqlen + 1));
+ batch_lods->at(0).resize(static_cast<size_t>(max_seqlen + 1));
// batch_lods[1] is the raw index in the input LoDTensor
- batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
+ batch_lods->at(1).resize(static_cast<size_t>(lod_tensor.dims()[0]));
// batch_lods[2] is the sort order for the input LoDTensor.
- batch_lods[2].resize(seq_info.size());
+ batch_lods->at(2).resize(seq_info.size());
- size_t* batch_starts = batch_lods[0].data();
- size_t* seq2batch_idx = batch_lods[1].data();
+ size_t* batch_starts = batch_lods->at(0).data();
+ size_t* seq2batch_idx = batch_lods->at(1).data();
batch_starts[0] = 0;
for (int n = 0; n < max_seqlen; n++) {
auto batch_id = static_cast<int>(batch_starts[n]);
@@ -153,14 +148,13 @@ class LoDTensor2BatchFunctor {
}
batch_starts[n + 1] = static_cast<size_t>(batch_id);
}
- size_t* seq_order = batch_lods[2].data();
+ size_t* seq_order = batch_lods->at(2).data();
for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx;
}
- batch->set_lod(batch_lods);
CopyMatrixRowsFunctor<Target, T> to_batch;
- to_batch(context, lod_tensor, batch_lods[1], batch, true);
+ to_batch(context, lod_tensor, batch_lods->at(1), batch, true);
}
};
......
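Two ideas drive the LoDTensor2BatchFunctor changes: seq_info is sized once up front so the loop assigns fields instead of growing the vector (hence the new default constructor for SeqInfo), and the batch LoD is built in place through batch->mutable_lod(), which is why the trailing batch->set_lod(batch_lods) copy disappears. A rough sketch of the pre-sizing pattern, assuming a non-empty lod and standalone types:

#include <cstddef>
#include <vector>

struct SeqInfo {
  SeqInfo() = default;  // needed once elements are default-constructed
  int start{0};
  int length{0};
  int seq_idx{0};
};

// Size the vector once and write fields in place; emplace_back in a loop
// would re-check capacity and possibly reallocate on each iteration.
std::vector<SeqInfo> BuildSeqInfo(const std::vector<size_t>& lod) {
  std::vector<SeqInfo> seq_info(lod.size() - 1);  // assumes lod.size() >= 1
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    seq_info[i].start = static_cast<int>(lod[i]);
    seq_info[i].length = static_cast<int>(lod[i + 1] - lod[i]);
    seq_info[i].seq_idx = static_cast<int>(i);
  }
  return seq_info;
}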
@@ -99,7 +99,7 @@ class SoftmaxFunctor<Target, T, is_test, enable_if_CPU<Target>> {
const int axis_dim,
const lite::Tensor* X,
lite::Tensor* Y) {
- auto in_dims = X->dims();
+ const auto& in_dims = X->dims();
constexpr int kBatchDim = 0;
constexpr int kClassDim = 1;
@@ -140,7 +140,7 @@ class SoftmaxFunctor<Target, float, true, enable_if_CPU<Target>> {
const int axis_dim,
const lite::Tensor* X,
lite::Tensor* Y) {
- auto in_dims = X->dims();
+ const auto& in_dims = X->dims();
const float* in_data = X->data<float>();
float* out_data = Y->mutable_data<float>();
const int kBatchDim = 0;
......
@@ -30,13 +30,20 @@ struct EigenDim {
using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
static Type From(const lite::DDim& dims) {
- PADDLE_ENFORCE(dims.size() == D, "D must match DDim::size");
+ PADDLE_ENFORCE_EQ(dims.size(), D, "D must match DDim::size");
Type ret;
for (size_t d = 0; d < dims.size(); d++) {
ret[d] = dims[d];
}
return ret;
}
+ static Type From(const DDim::value_type length) {
+   PADDLE_ENFORCE_EQ(D, 1, "D must be 1.");
+   Type ret;
+   ret[0] = length;
+   return ret;
+ }
};
// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor.
@@ -52,7 +59,7 @@ struct EigenTensor {
using ConstType =
Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
- static Type From(Tensor& tensor, lite::DDim dims) {  // NOLINT
+ static Type From(Tensor& tensor, const lite::DDim& dims) {  // NOLINT
return Type(const_cast<T*>(tensor.data<T>()),
EigenDim<D>::From(dims)); // NOLINT
}
@@ -61,7 +68,7 @@ struct EigenTensor {
return From(tensor, tensor.dims());
} // NOLINT
- static ConstType From(const Tensor& tensor, lite::DDim dims) {
+ static ConstType From(const Tensor& tensor, const lite::DDim& dims) {
return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
}
@@ -97,14 +104,15 @@ template <typename T,
struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
// Flatten reshapes a Tensor into an EigenVector.
static typename EigenVector::Type Flatten(Tensor& tensor) { // NOLINT
- return EigenVector::From(
-     tensor, lite::DDim(std::vector<int64_t>({tensor.dims().production()})));
+ return typename EigenVector::Type(
+     const_cast<T*>(tensor.data<T>()),
+     EigenDim<1>::From(tensor.dims().production()));
}
static typename EigenVector::ConstType Flatten(
const Tensor& tensor) { // NOLINT
- return EigenVector::From(
-     tensor, lite::DDim(std::vector<int64_t>({tensor.dims().production()})));
+ return typename EigenVector::ConstType(
+     tensor.data<T>(), EigenDim<1>::From(tensor.dims().production()));
}
};
......
@@ -39,26 +39,28 @@ class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
void Run() override {
auto& param = *param_.get_mutable<param_t>();
+ if (param.x.size() == 1) {
+   param.output->ShareDataWith(*param.x[0]);
+   return;
+ }
int64_t axis = static_cast<int64_t>(param.axis);
auto* axis_tensor = param.axis_tensor;
if (axis_tensor != nullptr) {
auto* axis_tensor_data = axis_tensor->data<int>();
axis = static_cast<int64_t>(axis_tensor_data[0]);
}
- auto x_dims = param.x[0]->dims();
- auto out = param.output;
- if (param.x.size() == 1) {
-   param.output->ShareDataWith(*param.x[0]);
-   return;
- }
- auto output_data = param.output->template mutable_data<T>();
+ const auto& x_dims = param.x[0]->dims();
+ auto* out = param.output;
+ T* output_data = param.output->template mutable_data<T>();
int offset_concat_axis = 0;
int num_concat = count(0, axis, x_dims);
int concat_input_size = count(axis + 1, x_dims.size(), x_dims);
const int top_concat_axis = out->dims()[axis];
for (size_t i = 0; i < param.x.size(); ++i) {
- auto bottom_data = param.x[i]->data<T>();
+ const T* bottom_data = param.x[i]->data<T>();
const int64_t bottom_concat_axis = param.x[i]->dims()[axis];
for (int n = 0; n < num_concat; ++n) {
std::memcpy(
......
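Hoisting the param.x.size() == 1 case to the top of Run() makes single-input concat a zero-copy ShareDataWith before any axis bookkeeping runs. The main loop is the usual strided row copy; a self-contained sketch of that layout arithmetic, with hypothetical names and float data only:

#include <cstring>
#include <vector>

// Concatenate along an axis: for each of the num_concat outer slices, each
// input contributes dims[axis] * inner contiguous elements to the output.
void ConcatAlongAxis(const std::vector<const float*>& inputs,
                     const std::vector<int>& axis_extents,  // dims[axis] per input
                     int num_concat,       // product of dims before axis
                     int inner,            // product of dims after axis
                     int out_axis_extent,  // sum of axis_extents
                     float* out) {
  int offset = 0;  // running position along the concat axis
  for (size_t i = 0; i < inputs.size(); ++i) {
    const int rows = axis_extents[i];
    for (int n = 0; n < num_concat; ++n) {
      std::memcpy(out + (n * out_axis_extent + offset) * inner,
                  inputs[i] + n * rows * inner,
                  sizeof(float) * rows * inner);
    }
    offset += rows;
  }
}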
@@ -33,15 +33,15 @@ class LookupTableCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto *ids_t = param.Ids;
auto *output_t = param.Out;
int64_t padding_idx = param.padding_idx;
- auto *ids = ids_t->data<int64_t>();
+ const int64_t *ids = ids_t->data<int64_t>();
int64_t ids_numel = ids_t->dims().production();
auto *table_t = param.W;
int64_t row_number = table_t->dims()[0];
int64_t row_width = table_t->dims()[1];
- auto *table = table_t->data<T>();
- auto *output = output_t->mutable_data<T>();
+ const T *table = table_t->data<T>();
+ T *output = output_t->mutable_data<T>();
memset(output, 0, output_t->dims().production() * sizeof(T));
for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx != -1 && ids[i] == padding_idx) {
......
@@ -51,7 +51,7 @@ class ReduceSumCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto* output = param.output;
param.output->mutable_data<T>();
- auto dims = param.dim;
+ const auto& dims = param.dim;
bool keep_dim = param.keep_dim;
if (reduce_all) {
// Flatten and reduce 1-D tensor
......
@@ -47,33 +47,23 @@ void ReduceFunctor(const lite::Tensor& input,
const std::vector<int>& dims,
bool keep_dim) {
auto x = EigenTensor<T, D>::From(input);
- auto x_rank = static_cast<int>(x.dimensions().size());
auto reduce_dim = Eigen::array<int, R_D>();
- std::vector<int> dims_ref = dims;
- for (size_t i = 0; i < dims_ref.size(); ++i) {
-   if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
-   reduce_dim[i] = dims_ref[i];
- }
- // construct the squeezed output tensor
- lite::DDim out_dims = output->dims();
- if (keep_dim && x_rank > 1) {
-   const int kDelFlag = -2;
-   auto dims_vector = out_dims.Vectorize();
-   for (size_t i = 0; i < dims_ref.size(); ++i) {
-     dims_vector[dims_ref[i]] = kDelFlag;
-   }
-   dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
-                     dims_vector.end());
-   out_dims = lite::DDim(dims_vector);
- }
- // auto& place = *context.eigen_device();
- Functor functor;
+ auto x_rank = static_cast<int>(x.dimensions().size());
+ for (size_t i = 0; i < dims.size(); ++i) {
+   if (dims[i] < 0) {
+     reduce_dim[i] = x_rank + dims[i];
+   } else {
+     reduce_dim[i] = dims[i];
+   }
+ }
+ Functor functor;
if (D == 1) {
auto out = EigenScalar<T>::From(output);
functor(&x, &out, reduce_dim);
} else {
- auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
+ auto out = EigenTensor<T, (D - R_D)>::From(*output, output->dims());
functor(&x, &out, reduce_dim);
}
}
......
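The ReduceFunctor rewrite drops the kDelFlag pass that rebuilt a squeezed DDim under keep_dim: the output tensor already carries its final shape, so the Eigen map takes output->dims() directly, and negative reduce axes are folded in a single loop. For reference, a minimal Eigen reduction over one possibly-negative axis, under an assumed rank-2 input:

#include <unsupported/Eigen/CXX11/Tensor>

// Sum a rank-2 tensor over one axis; a negative axis counts from the back,
// mirroring the canonicalization in the diff.
Eigen::Tensor<float, 1, Eigen::RowMajor> SumOver(
    const Eigen::Tensor<float, 2, Eigen::RowMajor>& x, int axis) {
  constexpr int kRank = 2;
  Eigen::array<int, 1> reduce_dim;
  reduce_dim[0] = axis < 0 ? kRank + axis : axis;
  return x.sum(reduce_dim);
}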
@@ -36,11 +36,10 @@ class SequenceReshapeCompute
auto* out = param.output;
int out_width = param.new_dim;
- auto in_dims = in->dims();
+ const auto& in_dims = in->dims();
int64_t in_width = in_dims[1];
- // LOG(INFO)<<"sequence_reshape in tensor:"<<*in;
- auto& in_lod = in->lod();
+ auto& in_lod = in->lod();
CHECK_EQ(in_lod.size(), 1UL);
CHECK_EQ((uint64_t)in_dims[0], in_lod[0].back());
@@ -63,13 +62,11 @@
}
}
out->Resize(in_dims);
- out->Resize(std::vector<int64_t>{static_cast<int64_t>(out->lod()[0].back()),
-                                  out_width});
auto* dst_ptr = out->mutable_data<T>();
auto size = in->numel() * sizeof(T);
std::memcpy(dst_ptr, in->data<T>(), size);
+ std::vector<int64_t> out_shape{static_cast<int64_t>(out->lod()[0].back()),
+                                out_width};
+ out->Resize(lite::DDim(out_shape));
}
virtual ~SequenceReshapeCompute() = default;
......
@@ -29,7 +29,7 @@ static inline int CanonicalAxis(const int axis, const int rank) {
return axis;
}
- static inline int SizeToAxis(const int axis, lite::DDim dims) {
+ static inline int SizeToAxis(const int axis, const DDim& dims) {
int size = 1;
for (int i = 0; i < axis; i++) {
size *= dims[i];
@@ -37,7 +37,7 @@ static inline int SizeToAxis(const int axis, lite::DDim dims) {
return size;
}
- static inline int SizeFromAxis(const int axis, lite::DDim dims) {
+ static inline int SizeFromAxis(const int axis, const DDim& dims) {
int size = 1;
for (size_t i = axis; i < dims.size(); i++) {
size *= dims[i];
@@ -61,13 +61,15 @@ class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
int axis_dim = param.x->dims()[axis];
const int n = SizeToAxis(axis, param.x->dims());
const int d = SizeFromAxis(axis, param.x->dims());
- std::vector<int64_t> shape{n, d};
- lite::Tensor input_2d, out_2d;
+ DDim shape(std::vector<DDim::value_type>{n, d});
+ Tensor input_2d;
+ Tensor out_2d;
input_2d.ShareDataWith(*param.x);
- input_2d.Resize(lite::DDim(shape));
+ input_2d.Resize(shape);
out_2d.ShareDataWith(*param.output);
- out_2d.Resize(lite::DDim(shape));
+ out_2d.Resize(shape);
lite::x86::math::SoftmaxFunctor<lite::TargetType::kX86, T, true>()(
context, axis_dim, &input_2d, &out_2d);
......