From 6486fe8a944866f0f6065201dc75b8e95b55b3e1 Mon Sep 17 00:00:00 2001 From: Zhang Ting <709968123@qq.com> Date: Mon, 3 Aug 2020 10:21:12 +0800 Subject: [PATCH] improve GPU performance of transpose, test=develop (#25862) --- paddle/fluid/operators/math/math_function_impl.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h index d1127ce4a24..693d5620460 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -21,6 +21,8 @@ namespace paddle { namespace operators { namespace math { +using framework::To32BitIndex; + template void SetConstant::operator()(const DeviceContext& context, framework::Tensor* tensor, @@ -40,7 +42,15 @@ void Transpose::operator()( auto eigen_in = framework::EigenTensor::From(in); auto eigen_out = framework::EigenTensor::From(*out); auto* dev = context.eigen_device(); - eigen_out.device(*dev) = eigen_in.shuffle(permute); + // use 32bit index to speed up computation + bool use_32bit_index = eigen_out.size() < Eigen::NumTraits::highest(); + bool is_gpu_place = platform::is_gpu_place(context.GetPlace()); + if (use_32bit_index && is_gpu_place) { + To32BitIndex(eigen_out).device(*dev) = + To32BitIndex(eigen_in).shuffle(permute); + } else { + eigen_out.device(*dev) = eigen_in.shuffle(permute); + } } template -- GitLab