Commit 104ed75f authored by qingqing01, committed by GitHub

Merge pull request #3958 from qingqing01/tensor_numel

Add function to get element count from tensor.
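A minimal usage sketch of the change (hypothetical snippet, not part of this diff; it assumes a framework::Tensor t that has already been resized): the element count that callers previously recomputed with framework::product(t.dims()) can now be read from the new accessor.

  // Before: recompute the element count from the dims on every call.
  int64_t count_before = framework::product(t.dims());
  // After: Tensor::numel() returns the count cached by Tensor::Resize().
  int64_t count_after = t.numel();  // equal to count_before for an initialized tensor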
......@@ -81,6 +81,9 @@ class Tensor {
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
+ /*! Return the numel of the memory block. */
+ inline int64_t numel() const;
/*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims);
......@@ -162,6 +165,12 @@ class Tensor {
/*! points to dimensions of memory block. */
DDim dims_;
+ /**
+  * A cache of the number of elements in a tensor.
+  * Would be 0 for an uninitialized tensor.
+  */
+ int64_t numel_;
/**
* @brief A PlaceHolder may be shared by more than one tensor.
*
......
......@@ -24,7 +24,7 @@ inline void Tensor::check_memory_size() const {
PADDLE_ENFORCE_NOT_NULL(
holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
PADDLE_ENFORCE_GE(
- holder_->size(), product(dims_) * sizeof(T) + offset_,
+ holder_->size(), numel() * sizeof(T) + offset_,
"Tensor's dims_ is out of bound. Call Tensor::mutable_data "
"first to re-allocate memory.\n"
"or maybe the required data-type mismatches the data already stored.");
......@@ -54,11 +54,11 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
template <typename T>
inline T* Tensor::mutable_data(platform::Place place) {
static_assert(std::is_pod<T>::value, "T must be POD");
- PADDLE_ENFORCE_GT(product(dims_), 0,
+ PADDLE_ENFORCE_GT(numel(), 0,
"Tensor's numel must be larger than zero to call "
"Tensor::mutable_data. Call Tensor::set_dim first.");
/* some versions of boost::variant don't have operator!= */
- int64_t size = product(dims_) * sizeof(T);
+ int64_t size = numel() * sizeof(T);
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
......@@ -97,7 +97,7 @@ inline void Tensor::CopyFrom(const Tensor& src,
auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
- auto size = product(src.dims_) * sizeof(T);
+ auto size = src.numel() * sizeof(T);
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
......@@ -131,7 +131,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
PADDLE_ENFORCE_LT(begin_idx, end_idx,
"Begin index must be less than end index.");
PADDLE_ENFORCE_NE(dims_[0], 1, "Can not slice a tensor with dims_[0] = 1.");
- size_t base = product(dims_) / dims_[0];
+ size_t base = numel() / dims_[0];
Tensor dst;
dst.holder_ = holder_;
DDim dst_dims = dims_;
......@@ -143,11 +143,14 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
inline Tensor& Tensor::Resize(const DDim& dims) {
dims_ = dims;
+ numel_ = product(dims_);
return *this;
}
inline const DDim& Tensor::dims() const { return dims_; }
+ inline int64_t Tensor::numel() const { return numel_; }
template <typename T>
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res;
......
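As the tensor_impl.h hunk above shows, Resize() refreshes the cached count, so numel() stays consistent with dims(). A small hedged illustration (hypothetical shapes, not taken from this diff; the header path is assumed from this repository's layout):

  #include "paddle/framework/tensor.h"  // assumed header location

  paddle::framework::Tensor t;
  t.Resize(paddle::framework::make_ddim({2, 3}));  // Resize recomputes numel_ = product(dims_)
  // t.numel() == 6, matching framework::product(t.dims())
  t.Resize(paddle::framework::make_ddim({4, 5}));
  // t.numel() == 20 after the second Resize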
......@@ -42,7 +42,7 @@ class CosSimKernel : public framework::OpKernel {
output_y_norm->mutable_data<T>(context.GetPlace());
auto dims = input_x->dims();
- int size = static_cast<int>(framework::product(dims));
+ int64_t size = input_x->numel();
auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
auto x = EigenMatrix<T>::From(*input_x, new_dims);
auto y = EigenMatrix<T>::From(*input_y, new_dims);
......@@ -72,7 +72,7 @@ class CosSimGradKernel : public framework::OpKernel {
auto* input_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));
auto dims = input_x->dims();
- int size = static_cast<int>(framework::product(dims));
+ int64_t size = input_x->numel();
auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
auto x = EigenMatrix<T>::From(*input_x, new_dims);
auto y = EigenMatrix<T>::From(*input_y, new_dims);
......
......@@ -31,7 +31,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel {
}
engine.seed(seed);
std::normal_distribution<T> dist(mean, std);
- int64_t size = framework::product(tensor->dims());
+ int64_t size = tensor->numel();
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine);
}
......
......@@ -50,8 +50,8 @@ class GPUGaussianRandomKernel : public framework::OpKernel {
T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
- ssize_t N = framework::product(tensor->dims());
- thrust::transform(index_sequence_begin, index_sequence_begin + N,
+ int64_t size = tensor->numel();
+ thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data),
GaussianGenerator<T>(mean, std, seed));
}
......
......@@ -70,7 +70,7 @@ class LookupTableCUDAKernel : public framework::OpKernel {
size_t N = table_t->dims()[0];
size_t D = table_t->dims()[1];
- size_t K = product(ids_t->dims());
+ size_t K = ids_t->numel();
auto ids = ids_t->data<int32_t>();
auto table = table_t->data<T>();
auto output = output_t->mutable_data<T>(context.GetPlace());
......@@ -91,7 +91,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel {
int N = d_table_t->dims()[0];
int D = d_table_t->dims()[1];
- int K = product(ids_t->dims());
+ int K = ids_t->numel();
const int32_t* ids = ids_t->data<int32_t>();
const T* d_output = d_output_t->data<T>();
T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
......
......@@ -35,7 +35,7 @@ class LookupTableKernel : public framework::OpKernel {
auto ids = ids_t->data<int32_t>();
auto table = table_t->data<T>();
auto output = output_t->mutable_data<T>(context.GetPlace());
- for (ssize_t i = 0; i < product(ids_t->dims()); ++i) {
+ for (int64_t i = 0; i < ids_t->numel(); ++i) {
PADDLE_ENFORCE_LT(ids[i], N);
PADDLE_ENFORCE_GE(ids[i], 0);
memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
......@@ -61,7 +61,7 @@ class LookupTableGradKernel : public framework::OpKernel {
t.device(context.GetEigenDevice<platform::CPUPlace>()) =
t.constant(static_cast<T>(0));
- for (ssize_t i = 0; i < product(ids_t->dims()); ++i) {
+ for (int64_t i = 0; i < ids_t->numel(); ++i) {
PADDLE_ENFORCE_LT(ids[i], N);
PADDLE_ENFORCE_GE(ids[i], 0);
for (int j = 0; j < D; ++j) {
......
......@@ -49,12 +49,11 @@ class MeanGradKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto OG = context.Input<Tensor>(framework::GradVarName("Out"));
- PADDLE_ENFORCE(framework::product(OG->dims()) == 1,
-                "Mean Gradient should be scalar");
+ PADDLE_ENFORCE(OG->numel() == 1, "Mean Gradient should be scalar");
auto IG = context.Output<Tensor>(framework::GradVarName("X"));
IG->mutable_data<T>(context.GetPlace());
- T ig_size = (T)framework::product(IG->dims());
+ T ig_size = static_cast<T>(IG->numel());
Eigen::DSizes<int, 1> bcast(ig_size);
EigenVector<T>::Flatten(*IG).device(context.GetEigenDevice<Place>()) =
......
......@@ -31,8 +31,7 @@ class MinusOp : public framework::OperatorWithKernel {
auto *right_tensor = ctx.Input<framework::Tensor>("Y");
PADDLE_ENFORCE_EQ(
- framework::product(left_tensor->dims()),
- framework::product(right_tensor->dims()),
+ left_tensor->numel(), right_tensor->numel(),
"Minus operator must take two tensor with same num of elements");
ctx.Output<framework::Tensor>("Out")->Resize(left_tensor->dims());
}
......
......@@ -41,8 +41,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
int rank = framework::arity(x_dims);
PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2.");
- PADDLE_ENFORCE_EQ(framework::product(x_dims) / x_dims[0],
-                   framework::product(y_dims) / y_dims[0],
+ PADDLE_ENFORCE_EQ(x->numel() / x_dims[0], y->numel() / y_dims[0],
"Product of dimensions except the first dimension of "
"input and target must be equal.");
PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0],
......@@ -50,8 +49,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
"or to 1.");
ctx.Output<Tensor>("sub_result")
- ->Resize({static_cast<int>(x_dims[0]),
-           static_cast<int>(framework::product(x_dims) / x_dims[0])});
+ ->Resize({x_dims[0], x->numel() / x_dims[0]});
ctx.Output<Tensor>("Out")->Resize({x_dims[0], 1});
}
};
......
......@@ -39,7 +39,7 @@ class SquaredL2DistanceKernel : public framework::OpKernel {
auto in0_dims = in0->dims();
auto in1_dims = in1->dims();
- int cols = framework::product(in0_dims) / in0_dims[0];
+ int cols = in0->numel() / in0_dims[0];
// reduce dimensions except the first
auto x =
EigenMatrix<T>::From(*in0, framework::make_ddim({in0_dims[0], cols}));
......@@ -82,7 +82,7 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel {
auto x_dims = x_g->dims();
auto y_dims = y_g->dims();
- int cols = framework::product(x_dims) / x_dims[0];
+ int cols = x_g->numel() / x_dims[0];
// calculate gradient
auto grad_mat = 2 *
(out_grad.broadcast(Eigen::array<int, 2>({{1, cols}}))) *
......
......@@ -35,7 +35,7 @@ class CPUUniformRandomKernel : public framework::OpKernel {
std::uniform_real_distribution<T> dist(
static_cast<T>(context.Attr<float>("min")),
static_cast<T>(context.Attr<float>("max")));
- int64_t size = framework::product(tensor->dims());
+ int64_t size = tensor->numel();
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine);
}
......
......@@ -53,8 +53,8 @@ class GPUUniformRandomKernel : public framework::OpKernel {
T min = static_cast<T>(context.Attr<float>("min"));
T max = static_cast<T>(context.Attr<float>("max"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
- ssize_t N = framework::product(tensor->dims());
- thrust::transform(index_sequence_begin, index_sequence_begin + N,
+ int64_t size = tensor->numel();
+ thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data),
UniformGenerator<T>(min, max, seed));
}
......