use 32 bit index to improve expand op (#23899)

* use 32 bit index to improve expand op, test=develop
* remove redundant code, test=develop

use 32 bit index to improve expand op (#23899)
* use 32 bit index to improve expand op, test=develop
* remove redundant code, test=develop
b8866225 · Zhang Ting · GitHub · e21b3c27 · b8866225 · b8866225
Hide whitespace changes
Inline / Side-by-side

Showing 2 changed files with 30 additions and 1 deletion

paddle/fluid/framework/eigen.h paddle/fluid/framework/eigen.h +22 -0

paddle/fluid/operators/expand_op.h paddle/fluid/operators/expand_op.h +8 -1

No files found.
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@@ -115,5 +115,27 @@ struct EigenScalar {
  }
 };

+// Aligned Eigen::TensorMap alias whose index type is int (32-bit index).
+template <typename T, int D, int MajorType = Eigen::RowMajor>
+using Tensor32BitIndex =
+    Eigen::TensorMap<Eigen::Tensor<T, D, MajorType, int>, Eigen::Aligned>;
+
+template <typename DSizes>  // DSizes must provide `count` and operator[].
+Eigen::DSizes<int, DSizes::count> To32BitDims(const DSizes& in) {
+  Eigen::DSizes<int, DSizes::count> out;  // same number of dims, int extents
+  for (int i = 0; i < DSizes::count; ++i) {
+    out[i] = in[i];  // stored as int; this function does no range check
+  }
+  return out;
+}
+
+template <typename EigenTensor>  // needs Scalar, NumIndices, data(), dimensions()
+Tensor32BitIndex<typename EigenTensor::Scalar, EigenTensor::NumIndices>
+To32BitIndex(EigenTensor in) {  // re-wraps in.data() with int-typed dims
+  using RetType =  // NOTE(review): no layout forwarded; default MajorType applies
+      Tensor32BitIndex<typename EigenTensor::Scalar, EigenTensor::NumIndices>;
+  return RetType(in.data(), To32BitDims(in.dimensions()));
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@@ -90,6 +90,7 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename T, size_t D, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using framework::To32BitIndex;  // lets the kernel call To32BitIndex unqualified

 template <typename DeviceContext, typename T>
 class ExpandKernel : public framework::OpKernel<T> {
@@ -131,7 +132,13 @@ class ExpandKernel : public framework::OpKernel<T> {
    auto y = EigenTensor<T, Rank>::From(*out0);
    auto& place =
        *context.template device_context<DeviceContext>().eigen_device();
-    y.device(place) = x.broadcast(bcast_dims);
+    // use 32-bit index to speed up
+    bool use_32bit_index = y.size() < Eigen::NumTraits<int>::highest();
+    if (use_32bit_index) {
+      To32BitIndex(y).device(place) = To32BitIndex(x).broadcast(bcast_dims);
+    } else {
+      y.device(place) = x.broadcast(bcast_dims);
+    }
  }
 };