未验证 提交 6486fe8a 编写于 作者: Z Zhang Ting 提交者: GitHub

improve GPU performance of transpose, test=develop (#25862)

上级 6773fcc1
......@@ -21,6 +21,8 @@ namespace paddle {
namespace operators {
namespace math {
+ using framework::To32BitIndex;
template <typename DeviceContext, typename T>
void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
framework::Tensor* tensor,
......@@ -40,7 +42,15 @@ void Transpose<DeviceContext, T, Rank>::operator()(
auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
auto* dev = context.eigen_device();
- eigen_out.device(*dev) = eigen_in.shuffle(permute);
+ // use 32bit index to speed up computation
+ bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest();
+ bool is_gpu_place = platform::is_gpu_place(context.GetPlace());
+ if (use_32bit_index && is_gpu_place) {
+ To32BitIndex(eigen_out).device(*dev) =
+ To32BitIndex(eigen_in).shuffle(permute);
+ } else {
+ eigen_out.device(*dev) = eigen_in.shuffle(permute);
+ }
}
template <typename DeviceContext, typename T>
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册