Unverified commit 1d7b75dd authored by limingshu and committed by GitHub

Support Ternary ops in elementwise and broadcast (#33976)

Parent a68709d8
......@@ -17,7 +17,6 @@ limitations under the License. */
#include <utility>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -163,7 +163,7 @@ struct DimensionsTransform {
struct StridesCalculation {
std::vector<std::vector<uint32_t>> strides;
std::vector<FastDivMod> divmoders;
std::vector<platform::FastDivMod> divmoders;
private:
// To calculate the strides of each input_tensor.
......@@ -190,7 +190,7 @@ struct StridesCalculation {
strides.resize(N, std::vector<uint32_t>(dim_size, 1));
for (int i = 0; i < dim_size; ++i) {
divmoders[i] = FastDivMod(out_dims[i]);
divmoders[i] = platform::FastDivMod(out_dims[i]);
}
CalculateStrides(N, dim_size, in_dims);
}
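The constructor above precomputes one stride table per input tensor and one FastDivMod per output dimension. A hedged illustration of how they are meant to be consumed (plain / and % stand in for the FastDivMod fast path, and the assumption that CalculateStrides zeroes the strides of broadcast dimensions is mine, since its body is not shown in this hunk):
// Map a flat output index to the element offset of input `in_idx`.
uint32_t offset = 0;
uint32_t remaining = out_index;            // flat index into the output tensor
for (int i = 0; i < dim_size; ++i) {
  uint32_t cur = remaining % out_dims[i];  // coordinate along dimension i
  remaining /= out_dims[i];                // the division FastDivMod accelerates
  offset += cur * strides[in_idx][i];      // assumed 0 on broadcast dimensions
}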
......@@ -198,21 +198,21 @@ struct StridesCalculation {
template <typename InT, typename OutT, typename Functor, ElementwiseType ET,
int VecSize, int kDims>
struct BroadcastArgsWarpper {
using InVecType = CudaAlignedVector<InT, VecSize>;
using OutVecType = CudaAlignedVector<OutT, VecSize>;
struct BroadcastArgsWrapper {
using InVecType = platform::CudaAlignedVector<InT, VecSize>;
using OutVecType = platform::CudaAlignedVector<OutT, VecSize>;
OutT *out_data;
OutVecType *vec_out_data;
const InT *__restrict__ in_data[ET];
const InVecType *__restrict__ vec_in_data[ET];
bool no_broadcast[ET];
FastDivMod divmoders[kDims];
platform::FastDivMod divmoders[kDims];
uint32_t strides[ET][framework::DDim::kMaxRank];
uint32_t scalar_cal_offset;
Functor func;
HOSTDEVICE BroadcastArgsWarpper(
HOSTDEVICE BroadcastArgsWrapper(
const std::vector<const framework::Tensor *> &ins, framework::Tensor *out,
int scalar_cal_offset, Functor func,
const StridesCalculation &offset_calculator)
......@@ -227,7 +227,7 @@ struct BroadcastArgsWarpper {
out_data = out->data<OutT>();
vec_out_data = reinterpret_cast<OutVecType *>(out_data);
memcpy(divmoders, offset_calculator.divmoders.data(),
kDims * sizeof(FastDivMod));
kDims * sizeof(platform::FastDivMod));
}
__device__ __forceinline__ uint32_t GetOffsetByDivmod(int idx, int in_idx) {
......@@ -302,30 +302,29 @@ struct BroadcastArgsWarpper {
}
};
template <typename InT, typename OutT, typename BroadcastArgsWarpper,
template <typename InT, typename OutT, typename BroadcastArgsWrapper,
ElementwiseType ET>
__device__ inline void ScalarizedBroadcastKernelImpl(
BroadcastArgsWarpper broadcast_warpper, int tid) {
BroadcastArgsWrapper broadcast_wrapper, int tid) {
InT args[ET];
OutT args_out;
broadcast_warpper.LoadScalarizedData(args, tid);
broadcast_wrapper.LoadScalarizedData(args, tid);
#pragma unroll(ET)
for (int j = 1; j < ET; ++j) {
args_out = broadcast_warpper.func(args);
}
broadcast_warpper.StoreScalarizedData(args_out, tid);
// Calculation of the in_tensor data.
args_out = broadcast_wrapper.func(args);
broadcast_wrapper.StoreScalarizedData(args_out, tid);
}
template <typename InT, typename OutT, typename BroadcastArgsWarpper,
template <typename InT, typename OutT, typename BroadcastArgsWrapper,
ElementwiseType ET, int VecSize>
__device__ inline void VectorizedBroadcastKernelImpl(
BroadcastArgsWarpper broadcast_warpper, int tid) {
using OutVecType = CudaAlignedVector<OutT, VecSize>;
BroadcastArgsWrapper broadcast_wrapper, int tid) {
using OutVecType = platform::CudaAlignedVector<OutT, VecSize>;
OutVecType args_out;
InT ins[ET];
InT args[ET][VecSize];
broadcast_warpper.LoadVectorizedData(args, tid);
broadcast_wrapper.LoadVectorizedData(args, tid);
#pragma unroll(VecSize)
for (int i = 0; i < VecSize; ++i) {
......@@ -333,30 +332,30 @@ __device__ inline void VectorizedBroadcastKernelImpl(
for (int j = 0; j < ET; ++j) {
ins[j] = args[j][i];
}
args_out.val[i] = broadcast_warpper.func(ins);
args_out.val[i] = broadcast_wrapper.func(ins);
}
broadcast_warpper.StoreVectorizedData(args_out, tid);
broadcast_wrapper.StoreVectorizedData(args_out, tid);
}
template <typename InT, typename OutT, typename BroadcastArgsWarpper,
template <typename InT, typename OutT, typename BroadcastArgsWrapper,
ElementwiseType ET, int VecSize>
__global__ void ElementwiseBroadcastKernel(
BroadcastArgsWarpper broadcast_warpper, int main_tid, int tail_tid) {
BroadcastArgsWrapper broadcast_wrapper, int main_tid, int tail_tid) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Vectorized calculation of the major part of the data, whose length is the
// largest multiple of VecSize,
// e.g. processing the first 1024 of 1027 total elements when VecSize is 4.
if (tid < main_tid) {
VectorizedBroadcastKernelImpl<InT, OutT, BroadcastArgsWarpper, ET, VecSize>(
broadcast_warpper, tid);
VectorizedBroadcastKernelImpl<InT, OutT, BroadcastArgsWrapper, ET, VecSize>(
broadcast_wrapper, tid);
}
// Scalarized calculation of the remaining data, whose length is less than
// VecSize,
// e.g. processing the last 3 of 1027 total elements when VecSize is 4.
if (tid < tail_tid) {
ScalarizedBroadcastKernelImpl<InT, OutT, BroadcastArgsWarpper, ET>(
broadcast_warpper, tid);
ScalarizedBroadcastKernelImpl<InT, OutT, BroadcastArgsWrapper, ET>(
broadcast_wrapper, tid);
}
}
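A hedged worked example of the main/tail split described in the comments above: with 1027 elements and VecSize = 4, the vectorized path covers the first 1024 elements and the scalar path handles the last 3.
int numel = 1027, vec_size = 4;
int main_tid = numel / vec_size;     // 256 vectorized threads -> 1024 elements
int tail_tid = numel % vec_size;     // 3 scalar threads for the remainder
int vec_len  = main_tid * vec_size;  // 1024, the offset where the scalar tail begins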
......@@ -367,7 +366,7 @@ void LaunchBroadcastKernelForDifferentDimSize(
const std::vector<const framework::Tensor *> &ins, framework::Tensor *out,
int axis, Functor func) {
int numel = out->numel();
const int threads = 256;
int threads = GetThreadsConfig(ctx, numel, VecSize);
int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads;
int main_tid = numel / VecSize;
int tail_tid = numel % VecSize;
......@@ -380,75 +379,75 @@ void LaunchBroadcastKernelForDifferentDimSize(
switch (merge_dims.dim_size) {
case 1: {
auto broadcast_warpper =
BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 1>(
auto broadcast_wrapper =
BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 1>(
ins, out, vec_len, func, offset_calculator);
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
VecSize><<<blocks, threads, 0, stream>>>(
broadcast_warpper, main_tid, tail_tid);
broadcast_wrapper, main_tid, tail_tid);
break;
}
case 2: {
auto broadcast_warpper =
BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 2>(
auto broadcast_wrapper =
BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 2>(
ins, out, vec_len, func, offset_calculator);
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
VecSize><<<blocks, threads, 0, stream>>>(
broadcast_warpper, main_tid, tail_tid);
broadcast_wrapper, main_tid, tail_tid);
break;
}
case 3: {
auto broadcast_warpper =
BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 3>(
auto broadcast_wrapper =
BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 3>(
ins, out, vec_len, func, offset_calculator);
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
VecSize><<<blocks, threads, 0, stream>>>(
broadcast_warpper, main_tid, tail_tid);
broadcast_wrapper, main_tid, tail_tid);
break;
}
case 4: {
auto broadcast_warpper =
BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 4>(
auto broadcast_wrapper =
BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 4>(
ins, out, vec_len, func, offset_calculator);
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
VecSize><<<blocks, threads, 0, stream>>>(
broadcast_warpper, main_tid, tail_tid);
broadcast_wrapper, main_tid, tail_tid);
break;
}
case 5: {
auto broadcast_warpper =
BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 5>(
auto broadcast_wrapper =
BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 5>(
ins, out, vec_len, func, offset_calculator);
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
VecSize><<<blocks, threads, 0, stream>>>(
broadcast_warpper, main_tid, tail_tid);
broadcast_wrapper, main_tid, tail_tid);
break;
}
case 6: {
auto broadcast_warpper =
BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 6>(
auto broadcast_wrapper =
BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 6>(
ins, out, vec_len, func, offset_calculator);
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
VecSize><<<blocks, threads, 0, stream>>>(
broadcast_warpper, main_tid, tail_tid);
broadcast_wrapper, main_tid, tail_tid);
break;
}
case 7: {
auto broadcast_warpper =
BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 7>(
auto broadcast_wrapper =
BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 7>(
ins, out, vec_len, func, offset_calculator);
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
VecSize><<<blocks, threads, 0, stream>>>(
broadcast_warpper, main_tid, tail_tid);
broadcast_wrapper, main_tid, tail_tid);
break;
}
case 8: {
auto broadcast_warpper =
BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 8>(
auto broadcast_wrapper =
BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 8>(
ins, out, vec_len, func, offset_calculator);
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
VecSize><<<blocks, threads, 0, stream>>>(
broadcast_warpper, main_tid, tail_tid);
broadcast_wrapper, main_tid, tail_tid);
break;
}
default: {
......@@ -473,11 +472,11 @@ void LaunchBroadcastElementwiseCudaKernel(
int in_vec_size = 4;
framework::Tensor *out = (*outs)[0];
for (auto *in : ins) {
auto temp_size = GetVectorizedSizeImpl<InT>(in->data<InT>());
auto temp_size = platform::GetVectorizedSize<InT>(in->data<InT>());
in_vec_size = in->dims() == out->dims() ? std::min(temp_size, in_vec_size)
: in_vec_size;
}
int out_vec_size = GetVectorizedSizeImpl<OutT>(out->data<OutT>());
int out_vec_size = platform::GetVectorizedSize<OutT>(out->data<OutT>());
int vec_size = std::min(out_vec_size, in_vec_size);
switch (vec_size) {
......
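A hedged worked example of the vector-width selection above: suppose the fp32 output is 16-byte aligned (out_vec_size = 4), one input shares the output's shape but is only 8-byte aligned (temp_size = 2), and a second input is broadcast, so its dims differ from out and it does not constrain in_vec_size. Then
int vec_size = std::min(/*out_vec_size=*/4, /*in_vec_size=*/2);  // -> 2
and the elided switch over vec_size dispatches the VecSize = 2 instantiation.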
......@@ -26,7 +26,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
enum ElementwiseType { kUnary = 1, kBinary = 2 };
enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3 };
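A hedged sketch of what a kTernary functor could look like (illustrative, not part of this commit): the kernels in this file call func(args) with an InT args[ET] array, so a three-input op reads args[0..2]; here, a fused multiply-add out = a * b + c.
template <typename T>
struct TernaryFMAFunctor {
  // args points at the ET = 3 operands gathered for one output element.
  inline HOSTDEVICE T operator()(const T* args) const {
    return args[0] * args[1] + args[2];
  }
};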
/*
* According to NVIDIA, if the number of threads per block is 64/128/256/512,
......@@ -52,98 +52,73 @@ inline int GetThreadsConfig(const platform::CUDADeviceContext &ctx,
return std::max(64, threads);
}
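The body of GetThreadsConfig is elided in this hunk. A minimal sketch of a heuristic consistent with the comment above and with the std::max(64, threads) tail (the name, the 256-thread cap, and the power-of-two stepping are assumptions, not the function's real implementation; needs <algorithm> and <cstdint>):
inline int GetThreadsConfigSketch(int64_t numel, int vec_size) {
  int64_t vectorized_elems = (numel + vec_size - 1) / vec_size;
  int threads = 256;  // assumed upper bound for this sketch
  while (threads > vectorized_elems && threads > 64) {
    threads >>= 1;    // step down through 256 / 128 / 64
  }
  return std::max(64, threads);
}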
/*
* Only the address of input data is the multiplier of 1,2,4, vectorized load
* with corresponding multiplier-value is possible. Moreover, the maximum length
* of vectorized load is 128 bits once. Hence, valid length of vectorized load
* shall be determined under both former constraints.
*/
template <typename T>
int GetVectorizedSizeImpl(const T *pointer) {
constexpr int max_load_bits = 128;
int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
uint64_t address = reinterpret_cast<uint64_t>(pointer);
constexpr int vec8 =
std::alignment_of<CudaAlignedVector<T, 8>>::value; // NOLINT
constexpr int vec4 =
std::alignment_of<CudaAlignedVector<T, 4>>::value; // NOLINT
constexpr int vec2 =
std::alignment_of<CudaAlignedVector<T, 2>>::value; // NOLINT
if (address % vec8 == 0) {
/*
* Currently, decide to deal with no more than 4 data once while adopting
* vectorization load/store, if performance test shows that dealing with
* 8 data once in vectorization load/store does get optimized, return code
* below can be changed into " return std::min(8, valid_vec_size); " .
*/
return std::min(4, valid_vec_size);
} else if (address % vec4 == 0) {
return std::min(4, valid_vec_size);
} else if (address % vec2 == 0) {
return std::min(2, valid_vec_size);
} else {
return 1;
}
}
template <typename InT, typename OutT>
int GetVectorizedSize(const std::vector<const framework::Tensor *> &ins,
const std::vector<framework::Tensor *> &outs) {
int GetVectorizedSizeForIO(const std::vector<const framework::Tensor *> &ins,
const std::vector<framework::Tensor *> &outs) {
int vec_size = 4;
for (auto iter = ins.begin(); iter != ins.end(); ++iter) {
vec_size =
std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<InT>()));
vec_size = std::min<int>(vec_size,
platform::GetVectorizedSize((*iter)->data<InT>()));
}
for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
vec_size =
std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<OutT>()));
vec_size = std::min<int>(
vec_size, platform::GetVectorizedSize((*iter)->data<OutT>()));
}
return vec_size;
}
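A hedged worked example of the min-reduction above: with one fp32 input whose pointer is 16-byte aligned (per-tensor size 4) and an fp32 output that is only 8-byte aligned (per-tensor size 2),
int vec_size = GetVectorizedSizeForIO<float, float>(ins, outs);  // -> 2 in this scenario
returns 2, so every tensor in the launch is read and written with one common vector width.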
template <ElementwiseType ET, int VecSize, typename InT, typename OutT>
struct ElementwiseDataWrapper {
OutT *out;
const InT *in0;
const InT *in1;
__device__ ElementwiseDataWrapper(OutT *out, const InT *in0,
const InT *in1 = nullptr)
: out(out), in0(in0), in1(in1) {}
using InVecType = CudaAlignedVector<InT, VecSize>;
using OutVecType = CudaAlignedVector<OutT, VecSize>;
inline __device__ void load_vector(InVecType args[], int idx) {
const InVecType *x_vec = reinterpret_cast<const InVecType *>(in0);
args[0] = x_vec[idx];
if (ET == ElementwiseType::kBinary) {
const InVecType *y_vec = reinterpret_cast<const InVecType *>(in1);
args[1] = y_vec[idx];
using InVecType = platform::CudaAlignedVector<InT, VecSize>;
using OutVecType = platform::CudaAlignedVector<OutT, VecSize>;
const InT *__restrict__ in_data[ET];
OutT *out_data;
uint32_t scalar_cal_offset;
HOSTDEVICE ElementwiseDataWrapper(
const std::vector<const framework::Tensor *> &ins,
std::vector<framework::Tensor *> *outs, uint32_t scalar_cal_offset)
: scalar_cal_offset(scalar_cal_offset) {
#pragma unroll
for (int i = 0; i < ET; ++i) {
in_data[i] = ins[i]->data<InT>();
}
out_data = (*outs)[0]->data<OutT>();
}
inline __device__ void LoadVectorizedData(InVecType vec_args[], int tid) {
#pragma unroll
for (int i = 0; i < ET; ++i) {
const InVecType *in_vec_data =
reinterpret_cast<const InVecType *>(in_data[i]);
vec_args[i] = in_vec_data[tid];
}
}
inline __device__ void load_scalar(InT args[], int idx) {
args[0] = in0[idx];
if (ET == ElementwiseType::kBinary) {
args[1] = in1[idx];
inline __device__ void LoadScalarizedData(InT args[], int tid) {
#pragma unroll
for (int i = 0; i < ET; ++i) {
args[i] = in_data[i][tid + scalar_cal_offset];
}
}
inline __device__ void store_vector(OutVecType res, int idx) {
OutVecType *out_vec = reinterpret_cast<OutVecType *>(out);
out_vec[idx] = res;
inline __device__ void StoreVectorizedData(OutVecType res, int tid) {
OutVecType *out_vec = reinterpret_cast<OutVecType *>(out_data);
out_vec[tid] = res;
}
inline __device__ void store_scalar(OutT res, int idx) { out[idx] = res; }
inline __device__ void StoreScalarizedData(OutT res, int tid) {
out_data[tid + scalar_cal_offset] = res;
}
};
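A hedged construction sketch for the wrapper above (the tensors x, y, z, and out are hypothetical): for a kTernary launch with VecSize = 4, the wrapper captures three input pointers, the output pointer, and the offset at which the scalar tail begins.
std::vector<const framework::Tensor *> ins = {&x, &y, &z};
std::vector<framework::Tensor *> outs = {&out};
uint32_t vec_len = (out.numel() / 4) * 4;  // first element not covered by 4-wide loads
auto wrapper = ElementwiseDataWrapper<ElementwiseType::kTernary, 4, float, float>(
    ins, &outs, /*scalar_cal_offset=*/vec_len);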
template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
typename Functor>
__device__ inline void VectorizedKernelImpl(
ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
int tid) {
using InVecType = CudaAlignedVector<InT, VecSize>;
using OutVecType = CudaAlignedVector<OutT, VecSize>;
template <ElementwiseType ET, int VecSize, typename ElementwiseWrapper,
typename InT, typename OutT, typename Functor>
__device__ inline void VectorizedKernelImpl(ElementwiseWrapper data,
Functor func, int tid) {
using InVecType = platform::CudaAlignedVector<InT, VecSize>;
using OutVecType = platform::CudaAlignedVector<OutT, VecSize>;
InVecType ins_vec[ET];
OutVecType out_vec;
InT *ins_ptr[ET];
......@@ -153,7 +128,7 @@ __device__ inline void VectorizedKernelImpl(
ins_ptr[i] = reinterpret_cast<InT *>(&(ins_vec[i]));
}
// load
data.load_vector(ins_vec, tid);
data.LoadVectorizedData(ins_vec, tid);
// compute
#pragma unroll
......@@ -165,52 +140,48 @@ __device__ inline void VectorizedKernelImpl(
out_vec.val[i] = func(ins);
}
// store
data.store_vector(out_vec, tid);
data.StoreVectorizedData(out_vec, tid);
}
template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
typename Functor>
__device__ inline void ScalarKernelImpl(
ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
int start, int remain) {
template <ElementwiseType ET, typename ElementwiseWrapper, typename InT,
typename OutT, typename Functor>
__device__ inline void ScalarKernelImpl(ElementwiseWrapper data, Functor func,
int tid) {
InT ins[ET];
OutT out;
for (int i = 0; i < remain; ++i) {
int idx = start + i;
// load
data.load_scalar(ins, idx);
// compute
out = func(ins);
// store
data.store_scalar(out, idx);
}
// load
data.LoadScalarizedData(ins, tid);
// compute
out = func(ins);
// store
data.StoreScalarizedData(out, tid);
}
template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
typename Functor>
__global__ void VectorizedKernel(const InT *__restrict__ in0,
const InT *__restrict__ in1, OutT *out,
int size, Functor func) {
template <ElementwiseType ET, typename ElementwiseWrapper, typename InT,
typename OutT, int VecSize, typename Functor>
__global__ void VectorizedKernel(ElementwiseWrapper data, int main_tid,
int tail_tid, Functor func) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int remain = size - VecSize * tid;
remain = remain > 0 ? remain : 0;
auto data = ElementwiseDataWrapper<ET, VecSize, InT, OutT>(out, in0, in1);
if (remain >= VecSize) {
VectorizedKernelImpl(data, func, tid);
} else {
ScalarKernelImpl(data, func, tid * VecSize, remain);
if (tid < main_tid) {
VectorizedKernelImpl<ET, VecSize, ElementwiseWrapper, InT, OutT, Functor>(
data, func, tid);
}
if (tid < tail_tid) {
ScalarKernelImpl<ET, ElementwiseWrapper, InT, OutT, Functor>(data, func,
tid);
}
}
template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
__global__ void ScalarKernel(const InT *__restrict__ in0,
const InT *__restrict__ in1, OutT *out, int size,
Functor func) {
auto data = ElementwiseDataWrapper<ET, 1, InT, OutT>(out, in0, in1);
template <ElementwiseType ET, typename ElementwiseWrapper, typename InT,
typename OutT, typename Functor>
__global__ void ScalarKernel(ElementwiseWrapper data, int numel, Functor func) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int remain = tid < size ? 1 : 0;
ScalarKernelImpl(data, func, tid, remain);
if (tid < numel) {
ScalarKernelImpl<ET, ElementwiseWrapper, InT, OutT, Functor>(data, func,
tid);
}
}
template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
......@@ -219,35 +190,48 @@ void LaunchSameDimsElementwiseCudaKernel(
const std::vector<const framework::Tensor *> &ins,
std::vector<framework::Tensor *> *outs, Functor func) {
// calculate the max vec_size for all ins and outs
auto size = ins[0]->numel();
int vec_size = GetVectorizedSize<InT, OutT>(ins, *outs);
int block_size = GetThreadsConfig(ctx, size, vec_size);
auto numel = ins[0]->numel();
int vec_size = GetVectorizedSizeForIO<InT, OutT>(ins, *outs);
int block_size = GetThreadsConfig(ctx, numel, vec_size);
int grid_size =
((size + vec_size - 1) / vec_size + block_size - 1) / block_size;
const InT *in0 = ins[0]->data<InT>();
const InT *in1 =
(ET == ElementwiseType::kBinary) ? ins[1]->data<InT>() : nullptr;
OutT *out = (*outs)[0]->data<OutT>();
((numel + vec_size - 1) / vec_size + block_size - 1) / block_size;
int main_tid = numel / vec_size;
int tail_tid = numel % vec_size;
uint32_t vec_len = main_tid * vec_size;
// cuda kernel
auto stream = ctx.stream();
switch (vec_size) {
case 4:
VectorizedKernel<ET, 4><<<grid_size, block_size, 0, stream>>>(
in0, in1, out, size, func);
case 4: {
auto data_wrapper =
ElementwiseDataWrapper<ET, 4, InT, OutT>(ins, outs, vec_len);
VectorizedKernel<ET, decltype(data_wrapper), InT, OutT,
4><<<grid_size, block_size, 0, stream>>>(
data_wrapper, main_tid, tail_tid, func);
break;
case 2:
VectorizedKernel<ET, 2><<<grid_size, block_size, 0, stream>>>(
in0, in1, out, size, func);
}
case 2: {
auto data_wrapper =
ElementwiseDataWrapper<ET, 2, InT, OutT>(ins, outs, vec_len);
VectorizedKernel<ET, decltype(data_wrapper), InT, OutT,
2><<<grid_size, block_size, 0, stream>>>(
data_wrapper, main_tid, tail_tid, func);
break;
case 1:
ScalarKernel<ET><<<grid_size, block_size, 0, stream>>>(in0, in1, out,
size, func);
}
case 1: {
auto data_wrapper =
ElementwiseDataWrapper<ET, 1, InT, OutT>(ins, outs, 0);
ScalarKernel<ET, decltype(data_wrapper), InT,
OutT><<<grid_size, block_size, 0, stream>>>(data_wrapper,
numel, func);
break;
default:
}
default: {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported vectorized size: %d !", vec_size));
break;
}
}
}
......
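A hedged end-to-end sketch of the ternary path this commit enables (the tensor setup and the device-context argument are assumed from the surrounding code, and TernaryFMAFunctor is the illustrative functor sketched earlier): with ET = kTernary, `ins` carries three same-shaped tensors and each output element is computed from three operands.
std::vector<const framework::Tensor *> ins = {&x, &y, &z};
std::vector<framework::Tensor *> outs = {&out};
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kTernary, float, float>(
    dev_ctx, ins, &outs, TernaryFMAFunctor<float>());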
......@@ -158,12 +158,13 @@ struct IndexCalculator {
: dim(dim) {
dims = detail::VectorToArray<int, kMaxRank>(cal_dims);
strides = detail::VectorToArray<int, kMaxRank>(full_strides);
std::vector<FastDivMod> cal_divmoders;
std::vector<platform::FastDivMod> cal_divmoders;
// fast divmod
for (auto i : cal_strides) {
cal_divmoders.push_back(FastDivMod(i));
cal_divmoders.push_back(platform::FastDivMod(i));
}
divmoders = detail::VectorToArray<FastDivMod, kMaxRank>(cal_divmoders);
divmoders =
detail::VectorToArray<platform::FastDivMod, kMaxRank>(cal_divmoders);
}
__device__ inline int Get(int offset) const {
......@@ -183,7 +184,7 @@ struct IndexCalculator {
int dim;
framework::Array<int, kMaxRank> dims;
framework::Array<int, kMaxRank> strides;
framework::Array<FastDivMod, kMaxRank> divmoders;
framework::Array<platform::FastDivMod, kMaxRank> divmoders;
};
// reduce config
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#define INT_BITS 32
namespace paddle {
namespace operators {
namespace platform {
template <typename T, int Size>
struct alignas(sizeof(T) * Size) CudaAlignedVector {
......@@ -65,5 +65,39 @@ struct FastDivMod {
uint32_t multiplier;
};
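A hedged illustration of the fast-divmod idea behind FastDivMod (the member and method names below are assumptions, not the struct's real API): dividing by a runtime constant d is replaced by a multiply-high plus a shift with a precomputed magic multiplier, which is far cheaper than integer division inside a CUDA kernel. Needs <cstdint>; assumes d >= 1.
struct FastDivModSketch {
  explicit FastDivModSketch(uint32_t d) : divisor(d) {
    for (shift = 0; shift < 32; ++shift) {
      if ((1u << shift) >= d) break;  // smallest power of two >= d
    }
    uint64_t one = 1;
    multiplier = static_cast<uint32_t>(
        ((one << 32) * ((one << shift) - d)) / d + 1);
  }
  uint32_t Div(uint32_t n) const {
    uint32_t hi = static_cast<uint32_t>(
        (static_cast<uint64_t>(n) * multiplier) >> 32);  // __umulhi on device
    return static_cast<uint32_t>((static_cast<uint64_t>(hi) + n) >> shift);
  }
  uint32_t divisor, shift, multiplier;
};
For example, FastDivModSketch(3).Div(7) computes hi = 2 and (2 + 7) >> 2 = 2.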
} // namespace operators
/*
* A vectorized load of 1, 2, or 4 elements is possible only when the address
* of the input data is aligned to that vector width. Moreover, a single
* vectorized load moves at most 128 bits. Hence, the valid vectorized load
* width is determined under both constraints.
*/
template <typename T>
int GetVectorizedSize(const T *pointer) {
constexpr int max_load_bits = 128;
int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
uint64_t address = reinterpret_cast<uint64_t>(pointer);
constexpr int vec8 =
std::alignment_of<CudaAlignedVector<T, 8>>::value; // NOLINT
constexpr int vec4 =
std::alignment_of<CudaAlignedVector<T, 4>>::value; // NOLINT
constexpr int vec2 =
std::alignment_of<CudaAlignedVector<T, 2>>::value; // NOLINT
if (address % vec8 == 0) {
/*
* Currently at most 4 elements are handled per vectorized load/store. If
* performance tests show that handling 8 elements at once is faster, the
* return statement below can be changed to " return std::min(8, valid_vec_size); ".
*/
return std::min(4, valid_vec_size);
} else if (address % vec4 == 0) {
return std::min(4, valid_vec_size);
} else if (address % vec2 == 0) {
return std::min(2, valid_vec_size);
} else {
return 1;
}
}
} // namespace platform
} // namespace paddle
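A hedged usage sketch for GetVectorizedSize above (assuming this header is included): for fp32, valid_vec_size = 128 / 8 / sizeof(float) = 4, so a 16-byte-aligned pointer vectorizes by 4, while a pointer with only 4-byte alignment falls through to the scalar path.
alignas(16) float buffer[8];
int vec    = paddle::platform::GetVectorizedSize<float>(buffer);      // 4
int scalar = paddle::platform::GetVectorizedSize<float>(buffer + 1);  // 1 (4-byte aligned only)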