improve GPU performance of transpose, test=develop (#25862)

6486fe8a · Zhang Ting · GitHub · 6773fcc1 · 6486fe8a
隐藏空白更改
内联并排

Showing with 11 additions and 1 deletion

paddle/fluid/operators/math/math_function_impl.h paddle/fluid/operators/math/math_function_impl.h +11 -1

未找到文件。
--- a/paddle/fluid/operators/math/math_function_impl.h
+++ b/paddle/fluid/operators/math/math_function_impl.h
@@ -21,6 +21,8 @@ namespace paddle {
 namespace operators {
 namespace math {

+using framework::To32BitIndex;
+
 template <typename DeviceContext, typename T>
 void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
                                               framework::Tensor* tensor,
@@ -40,7 +42,15 @@ void Transpose<DeviceContext, T, Rank>::operator()(
  auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
  auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
  auto* dev = context.eigen_device();
-  eigen_out.device(*dev) = eigen_in.shuffle(permute);
+  // use 32bit index to speed up computation
+  bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest();
+  bool is_gpu_place = platform::is_gpu_place(context.GetPlace());
+  if (use_32bit_index && is_gpu_place) {
+    To32BitIndex(eigen_out).device(*dev) =
+        To32BitIndex(eigen_in).shuffle(permute);
+  } else {
+    eigen_out.device(*dev) = eigen_in.shuffle(permute);
+  }
 }

 template <typename DeviceContext, typename T>