未验证 提交 a0f43889 编写于 作者: L limingshu 提交者: GitHub

Transpose optimization for AlphaFold2 (#45230)

* first commit

* fix bugs according to ci

* add some changes

* change file name into function.cu.h

* remove const_cast
上级 30f4ef7f
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/functors.h" #include "paddle/phi/kernels/funcs/functors.h"
#include "paddle/phi/kernels/funcs/transpose_functor.cu.h" #include "paddle/phi/kernels/funcs/transpose_function.cu.h"
#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
namespace paddle { namespace paddle {
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/funcs/transpose_functor.cu.h" #include "paddle/phi/kernels/funcs/transpose_function.cu.h"
#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
namespace paddle { namespace paddle {
......
...@@ -243,5 +243,106 @@ struct BroadcastDimsSimplifier { ...@@ -243,5 +243,106 @@ struct BroadcastDimsSimplifier {
} }
}; };
// Simplify the input dims and permute dims if possible: merge runs of
// dimensions that stay consecutive under `perm` and drop size-1
// dimensions, so permute kernels can run on the smallest equivalent
// problem. If the simplified perm turns out to be the identity, the
// whole operation degenerates to a flat copy of `numel` elements.
struct DimsSimplifier {
 public:
  // rank : original tensor rank.
  // numel: total number of elements in the tensor.
  // perm : permutation; perm[i] is the source axis of output axis i.
  // dims : original source dimensions (size == rank).
  explicit DimsSimplifier(const int rank,
                          const int64_t numel,
                          const std::vector<int32_t> &perm,
                          const std::vector<int64_t> &dims)
      : perm_(rank), src_dims_(rank), count_(numel) {
    SimplifyPermAndDims(rank, dims, perm);
    perm_.resize(rank_);
    src_dims_.resize(rank_);
    dst_dims_.resize(rank_);

    if (!is_seq_perm_) {
      for (auto i = 0; i < rank_; ++i) {
        dst_dims_[i] = src_dims_[perm_[i]];
      }
    } else {
      // Identity perm after simplification: treat the data as one flat
      // dimension of `numel` elements.
      dst_dims_[0] = numel;
      src_dims_[0] = numel;
    }
  }

  ~DimsSimplifier() = default;

  const int &GetRank() const { return rank_; }
  const int64_t &GetCount() const { return count_; }
  const std::vector<int> &GetPerm() const { return perm_; }
  const std::vector<int64_t> &GetSrcDims() const { return src_dims_; }
  const std::vector<int64_t> &GetDstDims() const { return dst_dims_; }

 private:
  int rank_{1};
  int64_t count_{0};
  bool is_seq_perm_{true};
  std::vector<int> perm_;
  std::vector<int64_t> src_dims_;
  std::vector<int64_t> dst_dims_;

  // Computes the simplified rank_, perm_ and src_dims_ from the
  // original rank/dims/perm.
  void SimplifyPermAndDims(const int rank,
                           const std::vector<int64_t> &in_dims,
                           const std::vector<int32_t> &perm) {
    int start_perm_idx = 0;
    int valid_dim_idx = 0;
    int valid_map[phi::DDim::kMaxRank];
    int64_t combined_dims[phi::DDim::kMaxRank];

    // Merge consecutive dims into the first one of the run and leave
    // the merged dims as 1. Example:
    //   perm: [2, 3, 0, 1], origin_dims : [4, 8, 2, 5]
    //   new_dims: [4, 8, 2, 5] -> [32, 1, 10, 1]
    while (start_perm_idx < rank) {
      const int start_dim_idx = perm[start_perm_idx];
      combined_dims[start_dim_idx] = in_dims[start_dim_idx];
      int end_perm_idx = start_perm_idx + 1;

      while (end_perm_idx < rank &&
             perm[end_perm_idx] == perm[end_perm_idx - 1] + 1) {
        const int end_dim_idx = perm[end_perm_idx];
        combined_dims[start_dim_idx] *= in_dims[end_dim_idx];
        combined_dims[end_dim_idx] = 1;
        end_perm_idx += 1;
      }
      start_perm_idx = end_perm_idx;
    }

    // Reorder combined dims and mark useless (size-1) dims as -1.
    // For example, if combined dims is [32, 1, 10, 1], valid_map is
    // [0, -1, 1, -1] and the simplified dims are [32, 10].
    for (auto i = 0; i < rank; ++i) {
      // Must stay int64_t: narrowing to int would corrupt merged
      // dimension products larger than INT_MAX.
      const int64_t dim_val = combined_dims[i];
      if (dim_val == 1) {
        valid_map[i] = -1;
      } else {
        valid_map[i] = valid_dim_idx;
        src_dims_[valid_dim_idx] = dim_val;
        valid_dim_idx += 1;
      }
    }

    if (valid_dim_idx == 0) {
      // All dims are 1: degenerate to a rank-1 identity permutation.
      src_dims_[0] = 1;
      perm_[0] = 0;
      return;
    }

    // Acquire the simplified perm with help of combined dims and the
    // original perm; for the example above the final perm is [1, 0].
    // is_seq_perm_ tracks whether the simplified perm is the identity.
    int perm_idx = 0;
    for (auto i = 0; i < rank; ++i) {
      const int mapped = valid_map[perm[i]];
      if (mapped >= 0) {
        perm_[perm_idx] = mapped;
        is_seq_perm_ &= (mapped == perm_idx);
        perm_idx += 1;
      }
    }
    rank_ = is_seq_perm_ ? 1 : valid_dim_idx;
  }
};
} // namespace funcs } // namespace funcs
} // namespace phi } // namespace phi
...@@ -27,161 +27,115 @@ enum { kTransposeMKLDNNFP32 = 1, kTransposeMKLDNNINT8 = 2 }; ...@@ -27,161 +27,115 @@ enum { kTransposeMKLDNNFP32 = 1, kTransposeMKLDNNINT8 = 2 };
enum PermuteType { enum PermuteType {
kCopy = 1, kCopy = 1,
kTranspose = 2, kSwapTranspose = 2,
kVecPermute = 3, kGeneralTranspose = 3,
kGeneralPermute = 4 kVecPermute = 4,
kGeneralPermute = 5
}; };
constexpr int kBlockRows = 16; constexpr int kBlockRows = 16;
constexpr int kTileSize = 32; constexpr int kTileSize = 32;
constexpr int kShareCol = (kTileSize + 1);
#define GETTILESIZE(LEN_, ALIGN_) \
((LEN_ + (ALIGN_ - 1)) & ~(ALIGN_ - 1)) / ALIGN_
// Simplify the input dims and permute dims if possible.
template <typename T> template <typename T>
class TranposeTypeClassifier { struct PermTypeClassifier {
public: public:
TranposeTypeClassifier(const int sm_count, explicit PermTypeClassifier(const int sm_count,
const size_t rank, const int rank,
const int64_t numel, const std::vector<int32_t>& perm,
const std::vector<int32_t>& perm, const std::vector<int64_t>& dims,
const std::vector<int64_t>& dims, const T* src,
const T* src, T* dst) {
T* dst) if (rank == 1) {
: perm_(rank), src_dims(rank) { type_ = PermuteType::kCopy;
SimplifyPermAndDims(rank, dims, perm); } else {
if (rank_ > 1) { constexpr int64_t dim_limitation = 65536;
vec_size_ = GetPermVecSize(sm_count, src, dst); const int dst_vec_size = phi::GetVectorizedSize<T>(dst);
}
perm_.resize(rank_); // While the last dim is fixed, there is chance for vectorized IO.
src_dims.resize(rank_); const int last_idx = rank - 1;
dst_dims.resize(rank_); if (perm[last_idx] == last_idx) {
type_ = PermuteType::kVecPermute;
for (auto i = 0; i < rank_; ++i) { vec_size_ = GetDimVecSize(dst_vec_size, dims[last_idx], src, false);
dst_dims[i] = src_dims[perm_[i]]; return;
}
}
int GetRank() const { return rank_; }
int GetVecSize() const { return vec_size_; }
PermuteType GetPermType() const { return type_; }
std::vector<int> GetPerm() const { return perm_; }
std::vector<int64_t> GetSrcDims() const { return src_dims; }
std::vector<int64_t> GetDstDims() const { return dst_dims; }
private:
int rank_{1};
int vec_size_{1};
std::vector<int> perm_;
std::vector<int64_t> src_dims;
std::vector<int64_t> dst_dims;
PermuteType type_{kCopy};
void SimplifyPermAndDims(const size_t rank,
const std::vector<int64_t>& in_dims,
const std::vector<int32_t>& perm) {
int64_t combined_dims[phi::DDim::kMaxRank];
int valid_map[phi::DDim::kMaxRank];
// Merge consecutive dims to the fist one dim and
// leave original dim to be 1. Example below :
// perm: [2, 3, 0, 1], origin_dims : [4, 8, 2, 5]
// new_dims: [4, 8, 2, 5] -> [32, 1, 10, 1]
int start_perm_idx = 0;
while (start_perm_idx < rank) {
const int start_dim_idx = perm[start_perm_idx];
combined_dims[start_dim_idx] = in_dims[start_dim_idx];
int end_perm_idx = start_perm_idx + 1;
while (end_perm_idx < rank &&
perm[end_perm_idx] == perm[end_perm_idx - 1] + 1) {
const int end_dim_idx = perm[end_perm_idx];
combined_dims[start_dim_idx] *= in_dims[end_dim_idx];
combined_dims[end_dim_idx] = 1;
end_perm_idx += 1;
} }
start_perm_idx = end_perm_idx;
}
// Reorder combined dims and marked useless dim as -1. // Permute at last 2 dims, namely transpose.
// for example, if combined dims is [32, 1, 10, 1], if ((rank == 2 && perm[1] == 0 && perm[0] == 1) ||
// valid_map is [0, -1, 1, -1] and generate simplified (rank == 3 && perm[2] == 1 && perm[1] == 2)) {
// dims as [32, 10] int64_t channel = rank == 2 ? 1 : dims[0];
int valid_dim_idx = 0; // Currently, transpose kernel cannot cover the case that channel
bool sequential_flag = false; // dimension is more than 65536 which is the limitation of dim3 setting.
for (auto i = 0; i < rank; ++i) { // This special case will be covered by extended transpose kernel later.
const int src_dim = combined_dims[i]; if (channel < dim_limitation) {
if (src_dim == 1) { type_ = PermuteType::kGeneralTranspose;
valid_map[i] = -1; num_rows_tile_ = GETTILESIZE(dims[rank - 2], kTileSize);
} else { int dim_vec_size = GetDimVecSize(dst_vec_size, dims[last_idx], src);
sequential_flag = true; int tile_size =
valid_map[i] = valid_dim_idx; channel * num_rows_tile_ * GETTILESIZE(dims[last_idx], kTileSize);
src_dims[valid_dim_idx] = src_dim; vec_size_ = tile_size < sm_count ? 1 : dim_vec_size;
valid_dim_idx += 1; } else {
type_ = PermuteType::kGeneralPermute;
}
return;
} }
}
if (valid_dim_idx == 0) { // Permute at first dim and third dim.
src_dims[0] = 1; if (rank == 3 && perm[2] == 0 && perm[1] == 1) {
perm_[0] = 0; // Currently, transpose kernel cannot cover the case that channel
return; // dimension is more than 65536 which is the limitation of dim3 setting.
} else if (valid_dim_idx == 1) { // This special case will be covered by extended transpose kernel later.
type_ = PermuteType::kCopy; if (dims[1] < dim_limitation) {
} type_ = PermuteType::kSwapTranspose;
num_rows_tile_ = GETTILESIZE(dims[0], kTileSize);
// Acquire simplified perm with help of combined dims
// and original perm, finally simplified perm is [1, 0] int dim_vec_size = GetDimVecSize(dst_vec_size, dims[last_idx], src);
int perm_idx = 0; int tile_size =
for (auto i = 0; i < rank; ++i) { dims[1] * num_rows_tile_ * GETTILESIZE(dims[2], kTileSize);
const int mapped = valid_map[perm[i]]; vec_size_ = tile_size < sm_count ? 1 : dim_vec_size;
if (mapped >= 0) { } else {
perm_[perm_idx] = mapped; type_ = PermuteType::kGeneralPermute;
perm_idx += 1; }
return;
} }
vec_size_ = dst_vec_size;
} }
rank_ = valid_dim_idx;
} }
int GetPermVecSize(const int sm_count, const T* src, T* dst) { ~PermTypeClassifier() = default;
// For gerneal_permute kernel, there is good chance for
// vectorized write.
type_ = PermuteType::kGeneralPermute;
int vec_size = phi::GetVectorizedSize<T>(dst);
// While the last dim is fixed, there is good chance for
// both vectorized read and write.
if (perm_[rank_ - 1] == rank_ - 1) {
int tmp_size = std::min(vec_size, phi::GetVectorizedSize<T>(src));
tmp_size = GetDimVesSize(tmp_size, src_dims[rank_ - 1]);
if (tmp_size > 1) {
type_ = kVecPermute;
vec_size = tmp_size;
}
}
// Once only transpose at the last 2 dims, there is good int GetVecSize() const { return vec_size_; }
// chance for vectorized read. int GetRowsTile() const { return num_rows_tile_; }
if ((rank_ == 2 && perm_[1] == 0 && perm_[0] == 1) || PermuteType GetPermType() const { return type_; }
(rank_ == 3 && perm_[2] == 1 && perm_[1] == 2)) {
type_ = PermuteType::kTranspose; private:
int tmp_vec = std::min(vec_size, phi::GetVectorizedSize<T>(src)); int vec_size_{1};
// With bytes limitation of shared_memory, the VecSize shall be int64_t num_rows_tile_{0};
// restricted for the type whose byte-size is less than 8 (double). PermuteType type_{kGeneralPermute};
vec_size =
sizeof(T) > 8 ? 1 : GetDimVesSize(tmp_vec, src_dims[rank_ - 1]);
}
return vec_size;
}
// To find if highest common divisor and make it as vec_size. // To find if highest common divisor and make it as vec_size.
int GetDimVesSize(const int vec_size, const size_t target_dim) { int GetDimVecSize(const int dst_vec_size,
const int64_t target_dim,
const T* src,
bool use_share_mem = true) {
const int vec_size = std::min(dst_vec_size, phi::GetVectorizedSize<T>(src));
int dim_vec_size = 1; int dim_vec_size = 1;
for (auto size = vec_size; size > 0; size /= 2) { for (int size = vec_size; size > 0; size /= 2) {
if (target_dim % size == 0) { if (target_dim % size == 0) {
dim_vec_size = size; dim_vec_size = size;
break; break;
} }
} }
return dim_vec_size;
if (use_share_mem) {
// By bytes limitation of shared_memory.
return (sizeof(T) > sizeof(float) ? 1 : dim_vec_size);
} else {
return dim_vec_size;
}
} }
}; };
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/transpose_functor.cu.h" #include "paddle/phi/kernels/funcs/transpose_function.cu.h"
#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" #include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h"
namespace phi { namespace phi {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册