Merge branch 'develop' into step_rnn/opt_ddim_lite

c3df3f2d · Liu Yiqun · ef2616b6 · 53604bac · c3df3f2d · c3df3f2d
10 changed files
--- a/lite/backends/x86/math/sequence2batch.cc
+++ b/lite/backends/x86/math/sequence2batch.cc
@@ -24,12 +24,12 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
 public:
  void operator()(const lite::Context<lite::TargetType::kX86>& context,
                  const lite::Tensor& src,
-                  std::vector<size_t> index_lod,
+                  const std::vector<size_t>& index_lod,
                  lite::Tensor* dst,
                  bool is_src_index) {
-    size_t* index = index_lod.data();
+    const size_t* index = index_lod.data();
-    auto src_dims = src.dims();
+    const auto& src_dims = src.dims();
-    auto dst_dims = dst->dims();
+    const auto& dst_dims = dst->dims();
    PADDLE_ENFORCE_EQ(
        src_dims.size(), 2UL, "The src must be matrix with rank 2.");
    PADDLE_ENFORCE_EQ(

--- a/lite/backends/x86/math/sequence2batch.h
+++ b/lite/backends/x86/math/sequence2batch.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include "lite/core/context.h"
 #include "lite/core/tensor.h"
 #include "lite/fluid/eigen.h"
-// #include "lite/fluid/lod.h"
 #include "lite/utils/paddle_enforce.h"
 namespace paddle {
@@ -27,11 +26,6 @@ namespace lite {
 namespace x86 {
 namespace math {
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = lite::fluid::EigenMatrix<T, MajorType, IndexType>;
 template <lite::TargetType Target, typename T>
 class CopyMatrixRowsFunctor {
 public:
@@ -42,7 +36,7 @@ class CopyMatrixRowsFunctor {
  // The indexed rows are based on the input index.
  void operator()(const lite::Context<Target>& context,
                  const lite::Tensor& src,
-                  std::vector<size_t> index_lod,
+                  const std::vector<size_t>& index_lod,
                  lite::Tensor* dst,
                  bool is_src_index);
 };
@@ -56,6 +50,7 @@ class LoDTensor2BatchFunctor {
  //           seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
  //
  struct SeqInfo {
+    SeqInfo() = default;
    SeqInfo(int start, int length, int seq_idx)
        : start(start), length(length), seq_idx(seq_idx) {}
    int start;
@@ -89,10 +84,12 @@ class LoDTensor2BatchFunctor {
    const auto& lod = lods[0];
-    std::vector<SeqInfo> seq_info;
+    std::vector<SeqInfo> seq_info(lod.size() - 1);
    for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
      int length = lod[seq_id + 1] - lod[seq_id];
-      seq_info.emplace_back(lod[seq_id], length, seq_id);
+      seq_info[seq_id].start = lod[seq_id];
+      seq_info[seq_id].length = length;
+      seq_info[seq_id].seq_idx = seq_id;
    }
    std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) {
@@ -122,21 +119,19 @@ class LoDTensor2BatchFunctor {
    // The max_seqlen represents batch size after rearranging the
    // input LodTensor. It is also the maximum length of input sequence.
-    lite::LoD batch_lods;
+    LoD* batch_lods = batch->mutable_lod();
-    batch_lods.emplace_back(std::vector<size_t>{0});
+    batch_lods->resize(3);
-    batch_lods.emplace_back(std::vector<size_t>{0});
-    batch_lods.emplace_back(std::vector<size_t>{0});
    // batch_lods[0] is the start positions for batch LoDTensor
    int max_seqlen = seq_info[0].length;
-    batch_lods[0].resize(static_cast<size_t>(max_seqlen + 1));
+    batch_lods->at(0).resize(static_cast<size_t>(max_seqlen + 1));
    // batch_lods[1] is the raw index in the input LoDTensor
-    batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
+    batch_lods->at(1).resize(static_cast<size_t>(lod_tensor.dims()[0]));
    // batch_lods[2] is the sort order for the input LoDTensor.
-    batch_lods[2].resize(seq_info.size());
+    batch_lods->at(2).resize(seq_info.size());
-    size_t* batch_starts = batch_lods[0].data();
+    size_t* batch_starts = batch_lods->at(0).data();
-    size_t* seq2batch_idx = batch_lods[1].data();
+    size_t* seq2batch_idx = batch_lods->at(1).data();
    batch_starts[0] = 0;
    for (int n = 0; n < max_seqlen; n++) {
      auto batch_id = static_cast<int>(batch_starts[n]);
@@ -153,14 +148,13 @@ class LoDTensor2BatchFunctor {
      }
      batch_starts[n + 1] = static_cast<size_t>(batch_id);
    }
-    size_t* seq_order = batch_lods[2].data();
+    size_t* seq_order = batch_lods->at(2).data();
    for (size_t i = 0; i < seq_info.size(); ++i) {
      seq_order[i] = seq_info[i].seq_idx;
    }
-    batch->set_lod(batch_lods);
    CopyMatrixRowsFunctor<Target, T> to_batch;
-    to_batch(context, lod_tensor, batch_lods[1], batch, true);
+    to_batch(context, lod_tensor, batch_lods->at(1), batch, true);
  }
 };

--- a/lite/backends/x86/math/softmax_impl.h
+++ b/lite/backends/x86/math/softmax_impl.h
@@ -99,7 +99,7 @@ class SoftmaxFunctor<Target, T, is_test, enable_if_CPU<Target>> {
                  const int axis_dim,
                  const lite::Tensor* X,
                  lite::Tensor* Y) {
-    auto in_dims = X->dims();
+    const auto& in_dims = X->dims();
    constexpr int kBatchDim = 0;
    constexpr int kClassDim = 1;
@@ -140,7 +140,7 @@ class SoftmaxFunctor<Target, float, true, enable_if_CPU<Target>> {
                  const int axis_dim,
                  const lite::Tensor* X,
                  lite::Tensor* Y) {
-    auto in_dims = X->dims();
+    const auto& in_dims = X->dims();
    const float* in_data = X->data<float>();
    float* out_data = Y->mutable_data<float>();
    const int kBatchDim = 0;

--- a/lite/fluid/eigen.h
+++ b/lite/fluid/eigen.h
@@ -30,13 +30,20 @@ struct EigenDim {
  using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
  static Type From(const lite::DDim& dims) {
-    PADDLE_ENFORCE(dims.size() == D, "D must match DDim::size");
+    PADDLE_ENFORCE_EQ(dims.size(), D, "D must match DDim::size");
    Type ret;
    for (size_t d = 0; d < dims.size(); d++) {
      ret[d] = dims[d];
    }
    return ret;
  }
+  static Type From(const DDim::value_type length) {
+    PADDLE_ENFORCE_EQ(D, 1, "D must be 1.");
+    Type ret;
+    ret[0] = length;
+    return ret;
+  }
 };
 // Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor.
@@ -52,7 +59,7 @@ struct EigenTensor {
  using ConstType =
      Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
-  static Type From(Tensor& tensor, lite::DDim dims) {  // NOLINT
+  static Type From(Tensor& tensor, const lite::DDim& dims) {  // NOLINT
    return Type(const_cast<T*>(tensor.data<T>()),
                EigenDim<D>::From(dims));  // NOLINT
  }
@@ -61,7 +68,7 @@ struct EigenTensor {
    return From(tensor, tensor.dims());
  }  // NOLINT
-  static ConstType From(const Tensor& tensor, lite::DDim dims) {
+  static ConstType From(const Tensor& tensor, const lite::DDim& dims) {
    return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
  }
@@ -97,14 +104,15 @@ template <typename T,
 struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
  // Flatten reshapes a Tensor into an EigenVector.
  static typename EigenVector::Type Flatten(Tensor& tensor) {  // NOLINT
-    return EigenVector::From(
+    return typename EigenVector::Type(
-        tensor, lite::DDim(std::vector<int64_t>({tensor.dims().production()})));
+        const_cast<T*>(tensor.data<T>()),
+        EigenDim<1>::From(tensor.dims().production()));
  }
  static typename EigenVector::ConstType Flatten(
      const Tensor& tensor) {  // NOLINT
-    return EigenVector::From(
+    return typename EigenVector::ConstType(
-        tensor, lite::DDim(std::vector<int64_t>({tensor.dims().production()})));
+        tensor.data<T>(), EigenDim<1>::From(tensor.dims().production()));
  }
 };

--- a/lite/kernels/x86/concat_compute.h
+++ b/lite/kernels/x86/concat_compute.h
@@ -39,26 +39,28 @@ class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
+    if (param.x.size() == 1) {
+      param.output->ShareDataWith(*param.x[0]);
+      return;
+    }
    int64_t axis = static_cast<int64_t>(param.axis);
    auto* axis_tensor = param.axis_tensor;
    if (axis_tensor != nullptr) {
      auto* axis_tensor_data = axis_tensor->data<int>();
      axis = static_cast<int64_t>(axis_tensor_data[0]);
    }
-    auto x_dims = param.x[0]->dims();
-    auto out = param.output;
-    if (param.x.size() == 1) {
-      param.output->ShareDataWith(*param.x[0]);
-      return;
-    }
-    auto output_data = param.output->template mutable_data<T>();
+    const auto& x_dims = param.x[0]->dims();
+    auto* out = param.output;
+    T* output_data = param.output->template mutable_data<T>();
    int offset_concat_axis = 0;
    int num_concat = count(0, axis, x_dims);
    int concat_input_size = count(axis + 1, x_dims.size(), x_dims);
    const int top_concat_axis = out->dims()[axis];
    for (size_t i = 0; i < param.x.size(); ++i) {
-      auto bottom_data = param.x[i]->data<T>();
+      const T* bottom_data = param.x[i]->data<T>();
      const int64_t bottom_concat_axis = param.x[i]->dims()[axis];
      for (int n = 0; n < num_concat; ++n) {
        std::memcpy(

--- a/lite/kernels/x86/lookup_table_compute.h
+++ b/lite/kernels/x86/lookup_table_compute.h
@@ -33,17 +33,15 @@ class LookupTableCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
    auto *ids_t = param.Ids;
    auto *output_t = param.Out;
    int64_t padding_idx = param.padding_idx;
-    auto *ids = ids_t->data<int64_t>();
+    const int64_t *ids = ids_t->data<int64_t>();
-    //    LOG(INFO) << "ids->dims: " << ids_t->dims();
    int64_t ids_numel = ids_t->dims().production();
    auto *table_t = param.W;
-    //    LOG(INFO) << "W->dims: " << table_t->dims();
    int64_t row_number = table_t->dims()[0];
    int64_t row_width = table_t->dims()[1];
-    auto *table = table_t->data<T>();
+    const T *table = table_t->data<T>();
-    auto *output = output_t->mutable_data<T>();
+    T *output = output_t->mutable_data<T>();
    memset(output, 0, output_t->dims().production() * sizeof(T));
    for (int64_t i = 0; i < ids_numel; ++i) {
      if (padding_idx != -1 && ids[i] == padding_idx) {

--- a/lite/kernels/x86/reduce_compute.h
+++ b/lite/kernels/x86/reduce_compute.h
@@ -51,7 +51,7 @@ class ReduceSumCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
    auto* output = param.output;
    param.output->mutable_data<T>();
-    auto dims = param.dim;
+    const auto& dims = param.dim;
    bool keep_dim = param.keep_dim;
    if (reduce_all) {
      // Flatten and reduce 1-D tensor

--- a/lite/kernels/x86/reduce_op_function.h
+++ b/lite/kernels/x86/reduce_op_function.h
@@ -47,33 +47,23 @@ void ReduceFunctor(const lite::Tensor& input,
                   const std::vector<int>& dims,
                   bool keep_dim) {
  auto x = EigenTensor<T, D>::From(input);
-  auto x_rank = static_cast<int>(x.dimensions().size());
  auto reduce_dim = Eigen::array<int, R_D>();
-  std::vector<int> dims_ref = dims;
+  auto x_rank = static_cast<int>(x.dimensions().size());
-  for (size_t i = 0; i < dims_ref.size(); ++i) {
+  for (size_t i = 0; i < dims.size(); ++i) {
-    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
+    if (dims[i] < 0) {
-    reduce_dim[i] = dims_ref[i];
+      reduce_dim[i] = x_rank + dims[i];
-  }
+    } else {
-  // construct the squeezed output tensor
+      reduce_dim[i] = dims[i];
-  lite::DDim out_dims = output->dims();
-  if (keep_dim && x_rank > 1) {
-    const int kDelFlag = -2;
-    auto dims_vector = out_dims.Vectorize();
-    for (size_t i = 0; i < dims_ref.size(); ++i) {
-      dims_vector[dims_ref[i]] = kDelFlag;
    }
-    dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
-                      dims_vector.end());
-    out_dims = lite::DDim(dims_vector);
  }
-  // auto& place = *context.eigen_device();
-  Functor functor;
+  Functor functor;
  if (D == 1) {
    auto out = EigenScalar<T>::From(output);
    functor(&x, &out, reduce_dim);
  } else {
-    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
+    auto out = EigenTensor<T, (D - R_D)>::From(*output, output->dims());
    functor(&x, &out, reduce_dim);
  }
 }

--- a/lite/kernels/x86/sequence_reshape_compute.h
+++ b/lite/kernels/x86/sequence_reshape_compute.h
@@ -37,10 +37,9 @@ class SequenceReshapeCompute
    int out_width = param.new_dim;
    const auto& in_dims = in->dims();
-    //    LOG(INFO) << "in_dims: " << in_dims;
    int64_t in_width = in_dims[1];
-    auto& in_lod = in->lod();
+    auto& in_lod = in->lod();
    CHECK_EQ(in_lod.size(), 1UL);
    CHECK_EQ((uint64_t)in_dims[0], in_lod[0].back());
@@ -63,13 +62,11 @@ class SequenceReshapeCompute
      }
    }
-    out->Resize(in_dims);
+    out->Resize(std::vector<int64_t>{static_cast<int64_t>(out->lod()[0].back()),
+                                     out_width});
    auto* dst_ptr = out->mutable_data<T>();
    auto size = in->numel() * sizeof(T);
    std::memcpy(dst_ptr, in->data<T>(), size);
-    std::vector<int64_t> out_shape{static_cast<int64_t>(out->lod()[0].back()),
-                                   out_width};
-    out->Resize(lite::DDim(out_shape));
  }
  virtual ~SequenceReshapeCompute() = default;

--- a/lite/kernels/x86/softmax_compute.h
+++ b/lite/kernels/x86/softmax_compute.h
@@ -29,7 +29,7 @@ static inline int CanonicalAxis(const int axis, const int rank) {
  return axis;
 }
-static inline int SizeToAxis(const int axis, lite::DDim dims) {
+static inline int SizeToAxis(const int axis, const DDim& dims) {
  int size = 1;
  for (int i = 0; i < axis; i++) {
    size *= dims[i];
@@ -37,7 +37,7 @@ static inline int SizeToAxis(const int axis, lite::DDim dims) {
  return size;
 }
-static inline int SizeFromAxis(const int axis, lite::DDim dims) {
+static inline int SizeFromAxis(const int axis, const DDim& dims) {
  int size = 1;
  for (size_t i = axis; i < dims.size(); i++) {
    size *= dims[i];
@@ -61,13 +61,15 @@ class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
    int axis_dim = param.x->dims()[axis];
    const int n = SizeToAxis(axis, param.x->dims());
    const int d = SizeFromAxis(axis, param.x->dims());
-    std::vector<int64_t> shape{n, d};
-    lite::Tensor input_2d, out_2d;
+    DDim shape(std::vector<DDim::value_type>{n, d});
+    Tensor input_2d;
+    Tensor out_2d;
    input_2d.ShareDataWith(*param.x);
-    input_2d.Resize(lite::DDim(shape));
+    input_2d.Resize(shape);
    out_2d.ShareDataWith(*param.output);
-    out_2d.Resize(lite::DDim(shape));
+    out_2d.Resize(shape);
    lite::x86::math::SoftmaxFunctor<lite::TargetType::kX86, T, true>()(
        context, axis_dim, &input_2d, &out_2d);