diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 643f875491724bf443bd7727391734377ee6180c..fc54ed697f685f048ee542aa2bebe91e03c6f76c 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -78,6 +78,9 @@ class Tensor {
   /*! Return the dimensions of the memory block. */
   inline const DDim& dims() const;
 
+  /*! Return the number of elements in the tensor (product of its dims). */
+  inline int64_t numel() const;
+
   /*! Resize the dimensions of the memory block. */
   inline Tensor& Resize(const DDim& dims);
 
@@ -159,6 +162,9 @@ class Tensor {
   /*! points to dimensions of memory block. */
   DDim dims_;
 
+  /*! Cached element count, refreshed by Resize(); 0 until dims are set. */
+  int64_t numel_ = 0;
+
   /**
    * @brief   A PlaceHolder may be shared by more than one tensor.
    *
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index 94f436294f350e2a39785a09959efb3b17bd00a5..03678784b460d44a592a2dbfdff5f058d0d42bf8 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -24,7 +24,7 @@ inline void Tensor::check_memory_size() const {
   PADDLE_ENFORCE_NOT_NULL(
       holder_, "Tenosr holds no memory. Call Tensor::mutable_data first.");
   PADDLE_ENFORCE_GE(
-      holder_->size(), product(dims_) * sizeof(T) + offset_,
+      holder_->size(), numel_ * sizeof(T) + offset_,
       "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
       "first to re-allocate memory.\n"
       "or maybe the required data-type mismatches the data already stored.");
@@ -54,11 +54,11 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
 template <typename T>
 inline T* Tensor::mutable_data(platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
-  PADDLE_ENFORCE_GT(product(dims_), 0,
+  PADDLE_ENFORCE_GT(numel_, 0,
                     "Tensor's numel must be larger than zero to call "
                     "Tensor::mutable_data. Call Tensor::set_dim first.");
   /* some versions of boost::variant don't have operator!= */
-  int64_t size = product(dims_) * sizeof(T);
+  int64_t size = numel_ * sizeof(T);
   if (holder_ == nullptr || !(holder_->place() == place) ||
       holder_->size() < size + offset_) {
     if (platform::is_cpu_place(place)) {
@@ -97,7 +97,7 @@ inline void Tensor::CopyFrom(const Tensor& src,
 
   auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
 
-  auto size = product(src.dims_) * sizeof(T);
+  auto size = src.numel() * sizeof(T);
 
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@@ -131,7 +131,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
   PADDLE_ENFORCE_LT(begin_idx, end_idx,
                     "Begin index must be less than end index.");
   PADDLE_ENFORCE_NE(dims_[0], 1, "Can not slice a tensor with dims_[0] = 1.");
-  size_t base = product(dims_) / dims_[0];
+  size_t base = numel_ / dims_[0];
   Tensor dst;
   dst.holder_ = holder_;
   DDim dst_dims = dims_;
@@ -143,10 +143,13 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
 
 inline Tensor& Tensor::Resize(const DDim& dims) {
   dims_ = dims;
+  numel_ = product(dims_);  // keep cached element count in sync with dims_
   return *this;
 }
 
 inline const DDim& Tensor::dims() const { return dims_; }
 
+inline int64_t Tensor::numel() const { return numel_; }  // cached in Resize()
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h
index 9e2bcebe3b5432c157fac895a9bbab5164193dbb..0dc509952578497671a128374f77ce616a520909 100644
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -42,7 +42,7 @@ class CosSimKernel : public framework::OpKernel {
     output_y_norm->mutable_data<T>(context.GetPlace());
 
     auto dims = input_x->dims();
-    int size = static_cast<int>(framework::product(dims));
+    int64_t size = input_x->numel();
     auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
     auto x = EigenMatrix<T>::From(*input_x, new_dims);
     auto y = EigenMatrix<T>::From(*input_y, new_dims);
@@ -72,7 +72,7 @@ class CosSimGradKernel : public framework::OpKernel {
     auto* input_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));
 
     auto dims = input_x->dims();
-    int size = static_cast<int>(framework::product(dims));
+    int64_t size = input_x->numel();
     auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
     auto x = EigenMatrix<T>::From(*input_x, new_dims);
     auto y = EigenMatrix<T>::From(*input_y, new_dims);
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index 6574880c0eb6324b2dd175e39a364d2ef46e735e..3d76516405960c502a46997108049b2db5cab6bf 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -31,7 +31,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel {
     }
     engine.seed(seed);
     std::normal_distribution<T> dist(mean, std);
-    int64_t size = framework::product(tensor->dims());
+    int64_t size = tensor->numel();
     for (int64_t i = 0; i < size; ++i) {
       data[i] = dist(engine);
     }
diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu
index d9dbc1dcfe6a6676938d64be93c879ea69148018..2d63b3049988cfc3135a87a57dad56b970df3eab 100644
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
@@ -50,8 +50,8 @@ class GPUGaussianRandomKernel : public framework::OpKernel {
     T mean = static_cast<T>(context.Attr<float>("mean"));
     T std = static_cast<T>(context.Attr<float>("std"));
     thrust::counting_iterator<unsigned int> index_sequence_begin(0);
-    ssize_t N = framework::product(tensor->dims());
-    thrust::transform(index_sequence_begin, index_sequence_begin + N,
+    int64_t size = tensor->numel();
+    thrust::transform(index_sequence_begin, index_sequence_begin + size,
                       thrust::device_ptr<T>(data),
                       GaussianGenerator<T>(mean, std, seed));
   }
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
index 27eee3436af8107cef2aa3577ea238be49edf1af..708344046760691aa2da562eb1ee3d8b130c5f18 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -70,7 +70,7 @@ class LookupTableCUDAKernel : public framework::OpKernel {
 
     size_t N = table_t->dims()[0];
     size_t D = table_t->dims()[1];
-    size_t K = product(ids_t->dims());
+    size_t K = ids_t->numel();
     auto ids = ids_t->data<int32_t>();
     auto table = table_t->data<T>();
     auto output = output_t->mutable_data<T>(context.GetPlace());
@@ -91,7 +91,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel {
 
     int N = d_table_t->dims()[0];
     int D = d_table_t->dims()[1];
-    int K = product(ids_t->dims());
+    int K = ids_t->numel();
     const int32_t* ids = ids_t->data<int32_t>();
     const T* d_output = d_output_t->data<T>();
     T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h
index 877b36cef4ea9cdaaaf37c97d5e5bfce55b91436..a1298906dd4b4209644fe06584f70169519de01c 100644
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
@@ -35,7 +35,7 @@ class LookupTableKernel : public framework::OpKernel {
     auto ids = ids_t->data<int32_t>();
     auto table = table_t->data<T>();
     auto output = output_t->mutable_data<T>(context.GetPlace());
-    for (ssize_t i = 0; i < product(ids_t->dims()); ++i) {
+    for (int64_t i = 0; i < ids_t->numel(); ++i) {
       PADDLE_ENFORCE_LT(ids[i], N);
       PADDLE_ENFORCE_GE(ids[i], 0);
       memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
@@ -61,7 +61,7 @@ class LookupTableGradKernel : public framework::OpKernel {
     t.device(context.GetEigenDevice<platform::CPUPlace>()) =
         t.constant(static_cast<T>(0));
 
-    for (ssize_t i = 0; i < product(ids_t->dims()); ++i) {
+    for (int64_t i = 0; i < ids_t->numel(); ++i) {
       PADDLE_ENFORCE_LT(ids[i], N);
       PADDLE_ENFORCE_GE(ids[i], 0);
       for (int j = 0; j < D; ++j) {
diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h
index 9848af280b62729bef9243052ceae0b7d8f4c6f5..ce31e178d8e375dc59be80a6c05133201308da70 100644
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@@ -49,12 +49,11 @@ class MeanGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto OG = context.Input<Tensor>(framework::GradVarName("Out"));
-    PADDLE_ENFORCE(framework::product(OG->dims()) == 1,
-                   "Mean Gradient should be scalar");
+    PADDLE_ENFORCE(OG->numel() == 1, "Mean Gradient should be scalar");
     auto IG = context.Output<Tensor>(framework::GradVarName("X"));
     IG->mutable_data<T>(context.GetPlace());
 
-    T ig_size = (T)framework::product(IG->dims());
+    T ig_size = static_cast<T>(IG->numel());
     Eigen::DSizes<int, 1> bcast(ig_size);
 
     EigenVector<T>::Flatten(*IG).device(context.GetEigenDevice<Place>()) =
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index 069fb5e1abc657aa02a50fde352ce88d078c36e1..a4876feb2edf77bd422fa2a7687b0fa7d55dae47 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -31,8 +31,7 @@ class MinusOp : public framework::OperatorWithKernel {
     auto *right_tensor = ctx.Input<framework::Tensor>("Y");
 
     PADDLE_ENFORCE_EQ(
-        framework::product(left_tensor->dims()),
-        framework::product(right_tensor->dims()),
+        left_tensor->numel(), right_tensor->numel(),
         "Minus operator must take two tensor with same num of elements");
     ctx.Output<framework::Tensor>("Out")->Resize(left_tensor->dims());
   }
diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc
index dc30644a5e7e33d4289e48cac093aa5fde7e75e7..9f51d3efa8ecba894a1023b9de2df451ca85916c 100644
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
@@ -41,8 +41,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
 
     int rank = framework::arity(x_dims);
     PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2.");
-    PADDLE_ENFORCE_EQ(framework::product(x_dims) / x_dims[0],
-                      framework::product(y_dims) / y_dims[0],
+    PADDLE_ENFORCE_EQ(x->numel() / x_dims[0], y->numel() / y_dims[0],
                       "Product of dimensions expcet the first dimension of "
                       "input and target must be equal.");
     PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0],
@@ -50,8 +49,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
                    "or to 1.");
 
     ctx.Output<Tensor>("sub_result")
-        ->Resize({static_cast<int>(x_dims[0]),
-                  static_cast<int>(framework::product(x_dims) / x_dims[0])});
+        ->Resize({x_dims[0], x->numel() / x_dims[0]});
     ctx.Output<Tensor>("Out")->Resize({x_dims[0], 1});
   }
 };
diff --git a/paddle/operators/squared_l2_distance_op.h b/paddle/operators/squared_l2_distance_op.h
index ad3347a0b35f3385c5adbcd7ceaa94fe134105e3..097ac04fc09a10b3b624f491a847e281e41a802c 100644
--- a/paddle/operators/squared_l2_distance_op.h
+++ b/paddle/operators/squared_l2_distance_op.h
@@ -39,7 +39,7 @@ class SquaredL2DistanceKernel : public framework::OpKernel {
     auto in0_dims = in0->dims();
     auto in1_dims = in1->dims();
 
-    int cols = framework::product(in0_dims) / in0_dims[0];
+    int cols = in0->numel() / in0_dims[0];
     // reduce dimensions except the first
     auto x =
         EigenMatrix<T>::From(*in0, framework::make_ddim({in0_dims[0], cols}));
@@ -82,7 +82,7 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel {
     auto x_dims = x_g->dims();
     auto y_dims = y_g->dims();
 
-    int cols = framework::product(x_dims) / x_dims[0];
+    int cols = x_g->numel() / x_dims[0];
     // calculate gradient
     auto grad_mat = 2 *
                     (out_grad.broadcast(Eigen::array<int, 2>({{1, cols}}))) *
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index f2aeef6c310df8535e67fa3906301a87f8ec4694..b8fbc9b52aecdb5c8d985b5de9bcd7cb85835b60 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -35,7 +35,7 @@ class CPUUniformRandomKernel : public framework::OpKernel {
     std::uniform_real_distribution<T> dist(
         static_cast<T>(context.Attr<float>("min")),
         static_cast<T>(context.Attr<float>("max")));
-    int64_t size = framework::product(tensor->dims());
+    int64_t size = tensor->numel();
     for (int64_t i = 0; i < size; ++i) {
       data[i] = dist(engine);
     }
diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu
index c2c041b144b6ca1f019f972e1301b756ec1c9301..6614b53b3f990d10c82633f3c1f079acea0cd827 100644
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
@@ -53,8 +53,8 @@ class GPUUniformRandomKernel : public framework::OpKernel {
     T min = static_cast<T>(context.Attr<float>("min"));
     T max = static_cast<T>(context.Attr<float>("max"));
     thrust::counting_iterator<unsigned int> index_sequence_begin(0);
-    ssize_t N = framework::product(tensor->dims());
-    thrust::transform(index_sequence_begin, index_sequence_begin + N,
+    int64_t size = tensor->numel();
+    thrust::transform(index_sequence_begin, index_sequence_begin + size,
                       thrust::device_ptr<T>(data),
                       UniformGenerator<T>(min, max, seed));
   }