From 6486fe8a944866f0f6065201dc75b8e95b55b3e1 Mon Sep 17 00:00:00 2001
From: Zhang Ting <709968123@qq.com>
Date: Mon, 3 Aug 2020 10:21:12 +0800
Subject: [PATCH] improve GPU performance of transpose, test=develop (#25862)

---
 paddle/fluid/operators/math/math_function_impl.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h
index d1127ce4a2..693d562046 100644
--- a/paddle/fluid/operators/math/math_function_impl.h
+++ b/paddle/fluid/operators/math/math_function_impl.h
@@ -21,6 +21,8 @@ namespace paddle {
 namespace operators {
 namespace math {
 
+using framework::To32BitIndex;
+
 template <typename DeviceContext, typename T>
 void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
                                                framework::Tensor* tensor,
@@ -40,7 +42,15 @@ void Transpose<DeviceContext, T, Rank>::operator()(
   auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
   auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
   auto* dev = context.eigen_device();
-  eigen_out.device(*dev) = eigen_in.shuffle(permute);
+  // On GPU, use 32-bit indexing when size fits in int to speed up computation
+  bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest();
+  bool is_gpu_place = platform::is_gpu_place(context.GetPlace());
+  if (use_32bit_index && is_gpu_place) {
+    To32BitIndex(eigen_out).device(*dev) =
+        To32BitIndex(eigen_in).shuffle(permute);
+  } else {
+    eigen_out.device(*dev) = eigen_in.shuffle(permute);
+  }
 }
 
 template <typename DeviceContext, typename T>
-- 
GitLab