Unverified commit 6c9df13d, authored by limingshu and committed by GitHub

Divide elementwise case from BroadcastKernel and refine transpose autotune (#33051)

* First Commit.

* add some codes

* add elementwise loader

* fix code styles

* merge with develop

* add some changes both in elementwise and transpose

* add init operation in broadcast kernel.

* change codes according to pr suggestions about transpose file

* fix error for op-benchmark ci

* fix according to ci
Parent f0dab193
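The core of this change is to send same-shaped inputs down a plain elementwise path instead of routing them through BroadcastKernel; the NeedBroadcast fast path removed from BroadcastDimsSimplifier below reflects that split. A minimal, self-contained sketch of the check (standard-library types only, not Paddle's actual kernel entry point):

#include <cstdint>
#include <vector>

// Broadcasting is only needed when some input's dims differ from the output's.
bool NeedBroadcast(const std::vector<std::vector<int64_t>>& in_dims,
                   const std::vector<int64_t>& out_dims) {
  for (const auto& dims : in_dims) {
    if (dims != out_dims) return true;
  }
  return false;
}
// When it returns false, every input can be treated as a flat 1-D buffer of
// product(out_dims) elements and handled by the elementwise kernel.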
......@@ -16,7 +16,6 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
#include "paddle/phi/kernels/funcs/transpose_functor.h"
namespace paddle {
namespace operators {
......
......@@ -20,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/transpose_functor.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/funcs/transpose_functor.h"
namespace paddle {
namespace operators {
......
......@@ -24,7 +24,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/transpose_functor.h"
namespace paddle {
namespace operators {
......
......@@ -123,7 +123,7 @@ class AutoTuneBase {
float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) {
// Regard the 1st run as warmup, and judge the comparison result by the
// time cost of the remaining cycles.
constexpr int repeats = 4;
constexpr int repeats = 6;
phi::GpuTimer timer;
float time_cost = 0;
const auto& stream = ctx.stream();
......
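For reference, a self-contained sketch of the warmup-then-average timing pattern that RunAndMeasureKernel follows (raw CUDA events here rather than phi::GpuTimer; the launch callable and repeat count are placeholders):

#include <cuda_runtime.h>

// Time `launch` on `stream`: the first run is a warmup, the remaining
// (repeats - 1) runs are averaged.
template <typename Launch>
float MeasureAverageMs(Launch&& launch, cudaStream_t stream, int repeats = 6) {
  float total_ms = 0.f;
  for (int i = 0; i < repeats; ++i) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, stream);
    launch();
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);
    float ms = 0.f;
    cudaEventElapsedTime(&ms, start, stop);
    if (i > 0) total_ms += ms;  // skip the warmup iteration
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
  }
  return total_ms / (repeats - 1);
}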
......@@ -34,18 +34,6 @@ struct BroadcastDimsSimplifier {
BroadcastDimsSimplifier(const std::vector<const DenseTensor *> &ins,
const phi::DDim &dims,
int axis) {
if (!NeedBroadcast(ins, dims)) {
int64_t numel = phi::product(dims);
rank = 1;
N = ins.size();
out_dims = DimVector{numel};
in_dims.resize(N);
for (int64_t i = 0; i < N; ++i) {
in_dims[i] = DimVector{numel};
}
return;
}
N = std::max(static_cast<int>(ins.size()), 2);
in_dims.resize(N);
rank = dims.size();
......@@ -112,18 +100,6 @@ struct BroadcastDimsSimplifier {
}
private:
bool NeedBroadcast(const std::vector<const DenseTensor *> &ins,
const phi::DDim &dims) {
bool no_broadcast_flag = true;
for (auto *in : ins) {
no_broadcast_flag &= ins[0]->dims() == in->dims();
}
if (ins.size() > 0) {
no_broadcast_flag &= dims == ins[0]->dims();
}
return !no_broadcast_flag;
}
// To compensate for the lack of input_tensors' dimensions with axis.
void ExtendInputDimensions(int N, int axis) {
for (auto &in_dim : in_dims) {
......@@ -244,18 +220,18 @@ struct BroadcastDimsSimplifier {
};
// Simplify the input dims and permute dims if possible.
struct DimsSimplifier {
struct PermuteDimsSimplifier {
public:
explicit DimsSimplifier(const int rank,
const int64_t numel,
const std::vector<int32_t> &perm,
const std::vector<int64_t> &dims)
PermuteDimsSimplifier(const int rank,
const int64_t numel,
const std::vector<int32_t> &perm,
const std::vector<int64_t> &dims)
: perm_(rank), src_dims_(rank), count_(numel) {
SimplifyPermAndDims(rank, dims, perm);
perm_.resize(rank_);
src_dims_.resize(rank_);
dst_dims_.resize(rank_);
if (!is_seq_perm_) {
if (!is_sequential_perm_) {
for (auto i = 0; i < rank_; ++i) {
dst_dims_[i] = src_dims_[perm_[i]];
}
......@@ -265,7 +241,7 @@ struct DimsSimplifier {
}
}
~DimsSimplifier() = default;
~PermuteDimsSimplifier() = default;
const int &GetRank() const { return rank_; }
const int64_t &GetCount() const { return count_; }
......@@ -276,8 +252,8 @@ struct DimsSimplifier {
private:
int rank_{1};
int64_t count_{0};
bool is_seq_perm_{true};
std::vector<int> perm_;
bool is_sequential_perm_{true};
std::vector<int64_t> src_dims_;
std::vector<int64_t> dst_dims_;
......@@ -336,11 +312,44 @@ struct DimsSimplifier {
const int mapped = valid_map[perm[i]];
if (mapped >= 0) {
perm_[perm_idx] = mapped;
is_seq_perm_ &= (mapped == perm_idx);
is_sequential_perm_ &= (mapped == perm_idx);
perm_idx += 1;
}
}
rank_ = is_seq_perm_ ? 1 : valid_dim_idx;
rank_ = is_sequential_perm_ ? 1 : valid_dim_idx;
}
};
template <typename T>
struct DimsSimplifiedLogger {
public:
static void Log(const std::vector<const DenseTensor *> &ins,
std::vector<DenseTensor *> *outs,
const BroadcastDimsSimplifier &dims_simplifier,
const std::string &op_name) {
VLOG(6) << op_name << "`s dims after simplification is below :";
for (size_t i = 0; i < ins.size(); ++i) {
VLOG(6) << "input i=" << i << ": origin_dims={" << ins[i]->dims()
<< "}, simplied_dims={"
<< ReversedVectorToString(dims_simplifier.in_dims[i]) << "}";
}
VLOG(6) << "output: origin_dims={" << (*outs)[0]->dims()
<< "}, simplied_dims={"
<< ReversedVectorToString(dims_simplifier.out_dims) << "}";
}
static std::string ReversedVectorToString(const std::vector<T> &reversed_v) {
std::stringstream ss;
bool is_last = true;
for (int i = reversed_v.size() - 1; i >= 0; --i) {
if (is_last) {
ss << reversed_v[i];
is_last = false;
} else {
ss << ", " << reversed_v[i];
}
}
return ss.str();
}
};
......
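The renamed PermuteDimsSimplifier merges source dimensions that stay adjacent and in order under the permutation, and collapses an identity (sequential) permutation to rank 1. A host-side sketch of that idea, assuming size-1 dimensions have already been stripped (illustrative only, not the actual implementation):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

std::pair<std::vector<int64_t>, std::vector<int>> Simplify(
    const std::vector<int64_t>& dims, const std::vector<int>& perm) {
  const int rank = static_cast<int>(dims.size());
  std::vector<int> pos(rank);  // source dim -> its position in perm
  for (int i = 0; i < rank; ++i) pos[perm[i]] = i;

  std::vector<int> group(rank, 0);  // source dim -> merged group id
  std::vector<int64_t> merged{dims[0]};
  for (int i = 1; i < rank; ++i) {
    if (pos[i] == pos[i - 1] + 1) {  // adjacent in both orders: merge
      group[i] = group[i - 1];
      merged[group[i]] *= dims[i];
    } else {
      group[i] = group[i - 1] + 1;
      merged.push_back(dims[i]);
    }
  }
  std::vector<int> new_perm;  // the permutation over merged groups
  for (int i = 0; i < rank; ++i) {
    if (new_perm.empty() || new_perm.back() != group[perm[i]]) {
      new_perm.push_back(group[perm[i]]);
    }
  }
  return {merged, new_perm};
}

int main() {
  auto r = Simplify({2, 3, 4}, {1, 2, 0});  // dims 3 and 4 merge into 12
  assert((r.first == std::vector<int64_t>{2, 12}));
  assert((r.second == std::vector<int>{1, 0}));
  auto id = Simplify({2, 3, 4}, {0, 1, 2});  // identity perm collapses to rank 1
  assert(id.first.size() == 1 && id.first[0] == 24);
  return 0;
}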
......@@ -19,8 +19,9 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_utils.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/autotune/auto_tune_base.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/dims_simplifier.h"
#include "paddle/phi/kernels/funcs/transpose_functor.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/primitive/datamover_primitives.h"
namespace phi {
......@@ -705,24 +706,24 @@ inline void CombineTransposeDim3(const DDim& shape,
template <typename T>
struct TransposeSimple {
static bool Impl(const phi::GPUContext& ctx,
const phi::DenseTensor& in,
const std::vector<int32_t> perm,
phi::DenseTensor* out,
const int64_t numel) {
static bool Run(const phi::GPUContext& ctx,
const phi::DenseTensor& in,
const std::vector<int32_t>& perm,
phi::DenseTensor* out,
const int64_t numel) {
if (numel >= std::numeric_limits<int32_t>::max()) {
return Run<int64_t>(ctx, in, perm, out);
return RunImpl<int64_t>(ctx, in, perm, out);
} else {
return Run<int32_t>(ctx, in, perm, out);
return RunImpl<int32_t>(ctx, in, perm, out);
}
}
private:
template <typename IndexType = int32_t>
static bool Run(const phi::GPUContext& ctx,
const phi::DenseTensor& in,
const std::vector<int32_t> perm,
phi::DenseTensor* out) {
static bool RunImpl(const phi::GPUContext& ctx,
const phi::DenseTensor& in,
const std::vector<int32_t>& perm,
phi::DenseTensor* out) {
// First reduce the dimensions of the input tensor if possible.
auto in_data = in.data<T>();
auto out_data = out->data<T>();
......@@ -752,13 +753,128 @@ struct TransposeSimple {
}
};
template <typename IndexT, int N>
enum PermuteType {
kCopy = 1,
kSwapTranspose = 2,
kGeneralTranspose = 3,
kVecPermute = 4,
kGeneralPermute = 5
};
constexpr int kBlockRows = 16;
constexpr int kTileSize = 32;
constexpr int kShareCol = (kTileSize + 1);
#define GET_TILE_SIZE(LEN_, ALIGN_) \
((LEN_ + (ALIGN_ - 1)) & ~(ALIGN_ - 1)) / ALIGN_
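// Illustrative check (assumes ALIGN_ is a power of two): the macro rounds
// LEN_ up to the next multiple of ALIGN_ and divides, i.e. a ceiling division
// giving the number of tiles that cover LEN_ elements.
static_assert(GET_TILE_SIZE(100, 32) == 4, "100 rounds up to 128; 128 / 32 == 4");
static_assert(GET_TILE_SIZE(64, 32) == 2, "exact multiples are unchanged");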
template <typename T>
struct PermTypeClassifier {
public:
PermTypeClassifier(const int sm_count,
const int rank,
const std::vector<int32_t>& perm,
const std::vector<int64_t>& dims,
const T* src,
T* dst) {
if (rank == 1) {
type_ = PermuteType::kCopy;
} else {
// Limit of one dimension in the CUDA grid (dim3) setting.
constexpr int64_t dim_limitation = 65536;
int dst_vec_size = phi::GetVectorizedSize<T>(dst);
// When the last dim is kept in place, there is a chance for vectorized IO.
const int last_idx = rank - 1;
if (perm[last_idx] == last_idx) {
type_ = PermuteType::kVecPermute;
vec_size_ = GetDimVecSize(dst_vec_size, dims[last_idx], src, false);
return;
}
// Permute at last 2 dims, namely transpose.
if ((rank == 2 && perm[1] == 0) ||
(rank == 3 && perm[2] == 1 && perm[1] == 2)) {
int64_t channel = rank == 2 ? 1 : dims[0];
// Currently, the transpose kernel cannot cover the case where the channel
// dimension exceeds 65536, which is the limit of one dim3 grid dimension.
// This special case will be covered by an extended transpose kernel later.
if (channel < dim_limitation) {
type_ = PermuteType::kGeneralTranspose;
num_rows_tile_ = GET_TILE_SIZE(dims[rank - 2], kTileSize);
int dim_vec_size = GetDimVecSize(dst_vec_size, dims[last_idx], src);
int tile_size = channel * num_rows_tile_ *
GET_TILE_SIZE(dims[last_idx], kTileSize);
vec_size_ = tile_size < sm_count ? 1 : dim_vec_size;
} else {
type_ = PermuteType::kGeneralPermute;
}
return;
}
// Permute at first dim and third dim.
if (rank == 3 && perm[2] == 0 && perm[1] == 1) {
// Currently, the transpose kernel cannot cover the case where the channel
// dimension exceeds 65536, which is the limit of one dim3 grid dimension.
// This special case will be covered by an extended transpose kernel later.
if (dims[1] < dim_limitation) {
type_ = PermuteType::kSwapTranspose;
num_rows_tile_ = GET_TILE_SIZE(dims[0], kTileSize);
int dim_vec_size = GetDimVecSize(dst_vec_size, dims[last_idx], src);
int tile_size =
dims[1] * num_rows_tile_ * GET_TILE_SIZE(dims[2], kTileSize);
vec_size_ = tile_size < sm_count ? 1 : dim_vec_size;
} else {
type_ = PermuteType::kGeneralPermute;
}
return;
}
vec_size_ = dst_vec_size;
}
}
~PermTypeClassifier() = default;
int GetVecSize() const { return vec_size_; }
int GetRowsTile() const { return num_rows_tile_; }
PermuteType GetPermType() const { return type_; }
private:
int vec_size_{1};
int64_t num_rows_tile_{0};
PermuteType type_{kGeneralPermute};
// Find the highest common divisor and use it as vec_size.
int GetDimVecSize(const int dst_vec_size,
const int64_t target_dim,
const T* src,
bool use_share_mem = true) {
int vec_size = std::min(dst_vec_size, phi::GetVectorizedSize<T>(src));
int dim_vec_size = 1;
for (int size = vec_size; size > 0; size /= 2) {
if (target_dim % size == 0) {
dim_vec_size = size;
break;
}
}
if (use_share_mem) {
// Limited by the byte size of shared memory.
return (sizeof(T) > sizeof(float) ? 1 : dim_vec_size);
} else {
return dim_vec_size;
}
}
};
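// For reference, an illustrative standalone sketch of the GetDimVecSize idea
// above (hypothetical helper, not used by the class): start from the vector
// width permitted by pointer alignment and halve it until it evenly divides
// the innermost dimension, so vectorized loads never straddle a row boundary.
inline int LargestDividingVecSize(int alignment_vec_size, int64_t last_dim) {
  for (int size = alignment_vec_size; size > 0; size /= 2) {
    if (last_dim % size == 0) return size;
  }
  return 1;
}
// e.g. alignment allows 4-wide loads but last_dim == 6 -> falls back to 2.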
template <typename IndexT, int Rank>
class IdxHelper {
public:
IdxHelper() {}
explicit IdxHelper(const IndexT* dims) {
for (int i = N - 1; i >= 0; --i) {
stride_[i] = i < (N - 1) ? dims[i + 1] * stride_[i + 1] : 1;
for (int i = Rank - 1; i >= 0; --i) {
stride_[i] = i < (Rank - 1) ? dims[i + 1] * stride_[i + 1] : 1;
}
}
......@@ -770,25 +886,25 @@ class IdxHelper {
IndexT* index) const {
IndexT remaining = offset;
#pragma unroll
for (int i = 0; i < N - 1; ++i) {
for (int i = 0; i < Rank - 1; ++i) {
const IndexT idx = remaining / stride_[i];
remaining -= idx * stride_[i];
index[i] = idx;
}
index[N - 1] = remaining;
index[Rank - 1] = remaining;
}
private:
IndexT stride_[N];
IndexT stride_[Rank];
};
template <int N>
class IdxHelper<uint32_t, N> {
template <int Rank>
class IdxHelper<uint32_t, Rank> {
public:
IdxHelper() {}
explicit IdxHelper(const uint32_t* dims) {
for (int i = N - 1; i >= 0; --i) {
uint32_t value = i < (N - 1) ? dims[i + 1] * stride_[i + 1] : 1;
for (int i = Rank - 1; i >= 0; --i) {
uint32_t value = i < (Rank - 1) ? dims[i + 1] * stride_[i + 1] : 1;
divmoder_[i] = phi::kps::details::FastDivMod(value);
stride_[i] = value;
}
......@@ -802,35 +918,35 @@ class IdxHelper<uint32_t, N> {
uint32_t* index) const {
uint32_t remaining = offset;
#pragma unroll
for (int i = 0; i < N - 1; ++i) {
for (int i = 0; i < Rank - 1; ++i) {
uint32_t idx = divmoder_[i].Div(remaining);
index[i] = idx;
remaining -= idx * stride_[i];
}
index[N - 1] = remaining;
index[Rank - 1] = remaining;
}
private:
uint32_t stride_[N];
phi::kps::details::FastDivMod divmoder_[N];
uint32_t stride_[Rank];
phi::kps::details::FastDivMod divmoder_[Rank];
};
// Transform an index between memory offset and shape coordinate.
template <typename IndexT, int N>
template <typename IndexT, int Rank>
class IdxAndOffsetHelper {
public:
IdxAndOffsetHelper() {}
explicit IdxAndOffsetHelper(const IndexT* dims) {
index_helper = IdxHelper<IndexT, N>(dims);
index_helper = IdxHelper<IndexT, Rank>(dims);
}
__device__ __forceinline__ IndexT IndexToOffset(const IndexT* index) const {
IndexT offset = 0;
#pragma unroll
for (int i = 0; i < N - 1; ++i) {
for (int i = 0; i < Rank - 1; ++i) {
offset += index[i] * index_helper.GetStride(i);
}
offset += index[N - 1];
offset += index[Rank - 1];
return offset;
}
......@@ -840,7 +956,7 @@ class IdxAndOffsetHelper {
}
private:
IdxHelper<IndexT, N> index_helper;
IdxHelper<IndexT, Rank> index_helper;
};
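// For reference, a host-side sketch of the row-major mapping the helpers
// above implement (rank fixed at 3 for brevity; the uint32_t specialization
// additionally uses FastDivMod to avoid hardware division on device):
inline void OffsetToIndex3(int64_t offset, const int64_t dims[3],
                           int64_t index[3]) {
  const int64_t s0 = dims[1] * dims[2];  // stride of dim 0
  const int64_t s1 = dims[2];            // stride of dim 1
  index[0] = offset / s0;
  index[1] = (offset % s0) / s1;
  index[2] = offset % s1;
}
// For dims {2, 3, 4}: offset 17 -> index {1, 1, 1}, since 17 = 1*12 + 1*4 + 1.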
template <typename IndexT, int Rank>
......@@ -1173,7 +1289,7 @@ struct TransposeLauncher {
T* dst) {
constexpr int ReadSize = sizeof(T) > sizeof(float) ? 1 : VecSize;
const IndexT cols = dims[rank - 1] / VecSize;
const IndexT n_cols_tile = GETTILESIZE(cols, kTileSize);
const IndexT n_cols_tile = GET_TILE_SIZE(cols, kTileSize);
if (perm_type == PermuteType::kGeneralTranspose) {
IndexT chs = (rank == 2) ? 1 : dims[0];
......@@ -1229,82 +1345,65 @@ struct TransposeLauncher {
vec_write = is_vec_write ? kVecRow : 1;
}
IndexT n_rows_tile = is_vec_write
? GETTILESIZE(rows, (kTileSize * vec_write))
? GET_TILE_SIZE(rows, (kTileSize * vec_write))
: num_rows_tile;
return n_rows_tile;
}
};
template <typename T, typename IndexT>
struct PermuteDispatch {
public:
PermuteDispatch(const phi::GPUContext& ctx,
PermTypeClassifier<T>* cls_ptr,
const std::vector<int64_t>& dims,
const std::vector<int32_t>& perm,
const IndexT count,
const T* src,
T* dst)
: dims_(dims), cls_(cls_ptr) {
rank_ = dims_.size();
type_ = cls_->GetPermType();
KernelTypeDispatch(ctx, count, perm, src, dst);
}
~PermuteDispatch() {}
private:
int rank_{0};
std::vector<int64_t> dims_;
PermTypeClassifier<T>* cls_;
PermuteType type_{kGeneralPermute};
inline void PermuteDispatch(const phi::GPUContext& ctx,
const IndexT& count,
PermTypeClassifier<T>* cls_ptr,
const std::vector<int64_t>& dims,
const std::vector<int32_t>& perm,
const T* src,
T* dst) {
int rank = dims.size();
PermuteType type = cls_ptr->GetPermType();
void KernelTypeDispatch(const phi::GPUContext& ctx,
const IndexT& count,
const std::vector<int32_t>& perm,
const T* src,
T* dst) {
#define TRANSPOSE_DISPATCH_VEC_SIZE(size) \
case size: { \
TransposeLauncher<T, IndexT, size>()( \
ctx, rank_, type_, dims_, cls_->GetRowsTile(), src, dst); \
ctx, rank, type, dims, cls_ptr->GetRowsTile(), src, dst); \
break; \
}
#define PERMUTE_DISPATCH_VEC_SIZE(size) \
case size: { \
PermuteLauncher<T, IndexT, size>()( \
ctx, rank_, count, type_, dims_, perm, src, dst); \
break; \
#define PERMUTE_DISPATCH_VEC_SIZE(size) \
case size: { \
PermuteLauncher<T, IndexT, size>()( \
ctx, rank, count, type, dims, perm, src, dst); \
break; \
}
switch (type_) {
case kSwapTranspose:
case kGeneralTranspose:
switch (cls_->GetVecSize()) {
TRANSPOSE_DISPATCH_VEC_SIZE(1);
TRANSPOSE_DISPATCH_VEC_SIZE(2);
TRANSPOSE_DISPATCH_VEC_SIZE(4);
}
break;
default:
switch (cls_->GetVecSize()) {
PERMUTE_DISPATCH_VEC_SIZE(1);
PERMUTE_DISPATCH_VEC_SIZE(2);
PERMUTE_DISPATCH_VEC_SIZE(4);
}
break;
}
switch (type) {
case kSwapTranspose:
case kGeneralTranspose:
switch (cls_ptr->GetVecSize()) {
TRANSPOSE_DISPATCH_VEC_SIZE(1);
TRANSPOSE_DISPATCH_VEC_SIZE(2);
TRANSPOSE_DISPATCH_VEC_SIZE(4);
}
break;
default:
switch (cls_ptr->GetVecSize()) {
PERMUTE_DISPATCH_VEC_SIZE(1);
PERMUTE_DISPATCH_VEC_SIZE(2);
PERMUTE_DISPATCH_VEC_SIZE(4);
}
break;
}
#undef TRANSPOSE_DISPATCH_VEC_SIZE
#undef PERMUTE_DISPATCH_VEC_SIZE
}
};
}
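// For reference, a generic sketch of the dispatch pattern above (hypothetical
// helper names): a runtime vec_size chosen by PermTypeClassifier selects a
// compile-time template argument through a switch, so each instantiation can
// issue statically sized vectorized memory accesses.
template <typename T, int VecSize>
void LaunchVectorizedSketch(const T*, T*, int64_t) {
  // Stand-in for a real kernel launch specialized on VecSize.
}

template <typename T>
void DispatchOnVecSizeSketch(int vec_size, const T* src, T* dst, int64_t count) {
  switch (vec_size) {
    case 4: LaunchVectorizedSketch<T, 4>(src, dst, count); break;
    case 2: LaunchVectorizedSketch<T, 2>(src, dst, count); break;
    default: LaunchVectorizedSketch<T, 1>(src, dst, count); break;
  }
}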
template <typename T>
inline void PermuteAndTranspose(const phi::GPUContext& ctx,
const int& rank,
const phi::DenseTensor& in,
phi::DenseTensor* out,
const DimsSimplifier& simplifier) {
inline void PermuteAndTranspose(
const phi::GPUContext& ctx,
const int& rank,
const phi::DenseTensor& in,
phi::DenseTensor* out,
const phi::funcs::PermuteDimsSimplifier& simplifier) {
T* dst_data = out->data<T>();
const T* src_data = in.data<T>();
const auto count = simplifier.GetCount();
......@@ -1324,18 +1423,18 @@ inline void PermuteAndTranspose(const phi::GPUContext& ctx,
} else {
if (count < std::numeric_limits<uint32_t>::max()) {
PermuteDispatch<T, uint32_t>(ctx,
static_cast<uint32_t>(count),
&classifier,
simplifier.GetSrcDims(),
simplifier.GetPerm(),
static_cast<uint32_t>(count),
src_data,
dst_data);
} else {
PermuteDispatch<T, int64_t>(ctx,
static_cast<int64_t>(count),
&classifier,
simplifier.GetSrcDims(),
simplifier.GetPerm(),
static_cast<int64_t>(count),
src_data,
dst_data);
}
......@@ -1343,12 +1442,13 @@ inline void PermuteAndTranspose(const phi::GPUContext& ctx,
}
template <typename T>
inline void PermuteWithEigen(const phi::GPUContext& ctx,
const int& rank,
const phi::DenseTensor& in,
phi::DenseTensor* out,
const DimsSimplifier& simplifier) {
const bool not_same_dims = simplifier.GetRank() != rank;
inline void PermuteWithEigen(
const phi::GPUContext& ctx,
const int& rank,
const phi::DenseTensor& in,
phi::DenseTensor* out,
const phi::funcs::PermuteDimsSimplifier& simplifier) {
bool not_same_dims = simplifier.GetRank() != rank;
if (not_same_dims) {
phi::DDim dst_dims = out->dims();
phi::DenseTensor temp_in;
......@@ -1373,10 +1473,10 @@ void TransposeGPUKernelDriver(const phi::GPUContext& ctx,
phi::DenseTensor* out) {
const int rank = perm.size();
int64_t numel = in.numel();
bool ret = TransposeSimple<T>::Impl(ctx, in, perm, out, numel);
bool ret = TransposeSimple<T>::Run(ctx, in, perm, out, numel);
if (!ret) {
auto simplifier =
DimsSimplifier(rank, numel, perm, phi::vectorize<int64_t>(in.dims()));
auto simplifier = phi::funcs::PermuteDimsSimplifier(
rank, numel, perm, phi::vectorize<int64_t>(in.dims()));
auto* tuner = phi::autotune::MakeTransposeTuner<T>(PermuteWithEigen<T>);
tuner->AddCallBack(PermuteAndTranspose<T>);
......
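TransposeGPUKernelDriver first tries TransposeSimple; if that fails, it builds a PermuteDimsSimplifier and lets the autotuner choose between PermuteWithEigen and PermuteAndTranspose. A simplified, self-contained sketch of that pick-the-fastest-and-cache idea (host-side timing for brevity; not phi's actual autotune API):

#include <chrono>
#include <cstddef>
#include <functional>
#include <map>
#include <vector>

struct TinyTuner {
  using Fn = std::function<void()>;
  std::map<size_t, size_t> cache_;  // shape key -> index of the fastest candidate

  void Run(size_t key, const std::vector<Fn>& candidates) {
    auto it = cache_.find(key);
    if (it == cache_.end()) {
      size_t best = 0;
      double best_ms = 1e30;
      for (size_t i = 0; i < candidates.size(); ++i) {
        auto t0 = std::chrono::steady_clock::now();
        candidates[i]();  // measure each candidate once for this key
        auto t1 = std::chrono::steady_clock::now();
        double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
        if (ms < best_ms) { best_ms = ms; best = i; }
      }
      cache_.emplace(key, best);
    } else {
      candidates[it->second]();  // replay the cached winner
    }
  }
};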
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
namespace funcs {
enum { kTransposeMKLDNNFP32 = 1, kTransposeMKLDNNINT8 = 2 };
enum PermuteType {
kCopy = 1,
kSwapTranspose = 2,
kGeneralTranspose = 3,
kVecPermute = 4,
kGeneralPermute = 5
};
constexpr int kBlockRows = 16;
constexpr int kTileSize = 32;
constexpr int kShareCol = (kTileSize + 1);
#define GETTILESIZE(LEN_, ALIGN_) \
((LEN_ + (ALIGN_ - 1)) & ~(ALIGN_ - 1)) / ALIGN_
template <typename T>
struct PermTypeClassifier {
public:
explicit PermTypeClassifier(const int sm_count,
const int rank,
const std::vector<int32_t>& perm,
const std::vector<int64_t>& dims,
const T* src,
T* dst) {
if (rank == 1) {
type_ = PermuteType::kCopy;
} else {
constexpr int64_t dim_limitation = 65536;
const int dst_vec_size = phi::GetVectorizedSize<T>(dst);
// When the last dim is kept in place, there is a chance for vectorized IO.
const int last_idx = rank - 1;
if (perm[last_idx] == last_idx) {
type_ = PermuteType::kVecPermute;
vec_size_ = GetDimVecSize(dst_vec_size, dims[last_idx], src, false);
return;
}
// Permute at last 2 dims, namely transpose.
if ((rank == 2 && perm[1] == 0 && perm[0] == 1) ||
(rank == 3 && perm[2] == 1 && perm[1] == 2)) {
int64_t channel = rank == 2 ? 1 : dims[0];
// Currently, the transpose kernel cannot cover the case where the channel
// dimension exceeds 65536, which is the limit of one dim3 grid dimension.
// This special case will be covered by an extended transpose kernel later.
if (channel < dim_limitation) {
type_ = PermuteType::kGeneralTranspose;
num_rows_tile_ = GETTILESIZE(dims[rank - 2], kTileSize);
int dim_vec_size = GetDimVecSize(dst_vec_size, dims[last_idx], src);
int tile_size =
channel * num_rows_tile_ * GETTILESIZE(dims[last_idx], kTileSize);
vec_size_ = tile_size < sm_count ? 1 : dim_vec_size;
} else {
type_ = PermuteType::kGeneralPermute;
}
return;
}
// Permute at first dim and third dim.
if (rank == 3 && perm[2] == 0 && perm[1] == 1) {
// Currently, the transpose kernel cannot cover the case where the channel
// dimension exceeds 65536, which is the limit of one dim3 grid dimension.
// This special case will be covered by an extended transpose kernel later.
if (dims[1] < dim_limitation) {
type_ = PermuteType::kSwapTranspose;
num_rows_tile_ = GETTILESIZE(dims[0], kTileSize);
int dim_vec_size = GetDimVecSize(dst_vec_size, dims[last_idx], src);
int tile_size =
dims[1] * num_rows_tile_ * GETTILESIZE(dims[2], kTileSize);
vec_size_ = tile_size < sm_count ? 1 : dim_vec_size;
} else {
type_ = PermuteType::kGeneralPermute;
}
return;
}
vec_size_ = dst_vec_size;
}
}
~PermTypeClassifier() = default;
int GetVecSize() const { return vec_size_; }
int GetRowsTile() const { return num_rows_tile_; }
PermuteType GetPermType() const { return type_; }
private:
int vec_size_{1};
int64_t num_rows_tile_{0};
PermuteType type_{kGeneralPermute};
// Find the highest common divisor and use it as vec_size.
int GetDimVecSize(const int dst_vec_size,
const int64_t target_dim,
const T* src,
bool use_share_mem = true) {
const int vec_size = std::min(dst_vec_size, phi::GetVectorizedSize<T>(src));
int dim_vec_size = 1;
for (int size = vec_size; size > 0; size /= 2) {
if (target_dim % size == 0) {
dim_vec_size = size;
break;
}
}
if (use_share_mem) {
// Limited by the byte size of shared memory.
return (sizeof(T) > sizeof(float) ? 1 : dim_vec_size);
} else {
return dim_vec_size;
}
}
};
} // namespace funcs
} // namespace phi