Unverified commit 1d7b75dd, authored by limingshu, committed by GitHub

Support Ternary ops in elementwise and broadcast (#33976)

Parent commit: a68709d8
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <utility>
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
...
@@ -163,7 +163,7 @@ struct DimensionsTransform {
 struct StridesCalculation {
   std::vector<std::vector<uint32_t>> strides;
-  std::vector<FastDivMod> divmoders;
+  std::vector<platform::FastDivMod> divmoders;

  private:
  // To calculate the strides of each input_tensor.
@@ -190,7 +190,7 @@ struct StridesCalculation {
    strides.resize(N, std::vector<uint32_t>(dim_size, 1));

    for (int i = 0; i < dim_size; ++i) {
-     divmoders[i] = FastDivMod(out_dims[i]);
+     divmoders[i] = platform::FastDivMod(out_dims[i]);
    }
    CalculateStrides(N, dim_size, in_dims);
  }
@@ -198,21 +198,21 @@ struct StridesCalculation {
 template <typename InT, typename OutT, typename Functor, ElementwiseType ET,
           int VecSize, int kDims>
-struct BroadcastArgsWarpper {
-  using InVecType = CudaAlignedVector<InT, VecSize>;
-  using OutVecType = CudaAlignedVector<OutT, VecSize>;
+struct BroadcastArgsWrapper {
+  using InVecType = platform::CudaAlignedVector<InT, VecSize>;
+  using OutVecType = platform::CudaAlignedVector<OutT, VecSize>;

   OutT *out_data;
   OutVecType *vec_out_data;
   const InT *__restrict__ in_data[ET];
   const InVecType *__restrict__ vec_in_data[ET];
   bool no_broadcast[ET];
-  FastDivMod divmoders[kDims];
+  platform::FastDivMod divmoders[kDims];
   uint32_t strides[ET][framework::DDim::kMaxRank];
   uint32_t scalar_cal_offset;
   Functor func;

-  HOSTDEVICE BroadcastArgsWarpper(
+  HOSTDEVICE BroadcastArgsWrapper(
       const std::vector<const framework::Tensor *> &ins, framework::Tensor *out,
       int scalar_cal_offset, Functor func,
       const StridesCalculation &offset_calculator)
@@ -227,7 +227,7 @@ struct BroadcastArgsWarpper {
     out_data = out->data<OutT>();
     vec_out_data = reinterpret_cast<OutVecType *>(out_data);
     memcpy(divmoders, offset_calculator.divmoders.data(),
-           kDims * sizeof(FastDivMod));
+           kDims * sizeof(platform::FastDivMod));
   }

   __device__ __forceinline__ uint32_t GetOffsetByDivmod(int idx, int in_idx) {
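Note: the snippet below is a minimal host-side sketch, not part of the commit, of the index arithmetic that GetOffsetByDivmod performs on the device with platform::FastDivMod and the per-input strides from StridesCalculation; the helper name and the plain %// arithmetic are illustrative assumptions.

```cpp
// Hypothetical illustration: map an output linear index to one input's offset
// by peeling one output coordinate per dimension and weighting it with that
// input's stride. Broadcast dimensions carry a stride of 0, so they add nothing.
#include <cstdint>
#include <vector>

uint32_t BroadcastOffsetSketch(uint32_t out_idx,
                               const std::vector<uint32_t> &out_dims,
                               const std::vector<uint32_t> &in_strides) {
  uint32_t offset = 0;
  for (size_t d = 0; d < out_dims.size(); ++d) {
    uint32_t coord = out_idx % out_dims[d];  // device code uses FastDivMod here
    out_idx /= out_dims[d];                  // quotient feeds the next dimension
    offset += coord * in_strides[d];         // stride is 0 on broadcast dims
  }
  return offset;
}
```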
@@ -302,30 +302,29 @@ struct BroadcastArgsWarpper {
   }
 };

-template <typename InT, typename OutT, typename BroadcastArgsWarpper,
+template <typename InT, typename OutT, typename BroadcastArgsWrapper,
           ElementwiseType ET>
 __device__ inline void ScalarizedBroadcastKernelImpl(
-    BroadcastArgsWarpper broadcast_warpper, int tid) {
+    BroadcastArgsWrapper broadcast_wrapper, int tid) {
   InT args[ET];
   OutT args_out;
-  broadcast_warpper.LoadScalarizedData(args, tid);
-#pragma unroll(ET)
-  for (int j = 1; j < ET; ++j) {
-    args_out = broadcast_warpper.func(args);
-  }
-  broadcast_warpper.StoreScalarizedData(args_out, tid);
+  broadcast_wrapper.LoadScalarizedData(args, tid);
+  // Calcualtion of the in_tensor data.
+  args_out = broadcast_wrapper.func(args);
+
+  broadcast_wrapper.StoreScalarizedData(args_out, tid);
 }

-template <typename InT, typename OutT, typename BroadcastArgsWarpper,
+template <typename InT, typename OutT, typename BroadcastArgsWrapper,
           ElementwiseType ET, int VecSize>
 __device__ inline void VectorizedBroadcastKernelImpl(
-    BroadcastArgsWarpper broadcast_warpper, int tid) {
-  using OutVecType = CudaAlignedVector<OutT, VecSize>;
+    BroadcastArgsWrapper broadcast_wrapper, int tid) {
+  using OutVecType = platform::CudaAlignedVector<OutT, VecSize>;
   OutVecType args_out;
   InT ins[ET];
   InT args[ET][VecSize];
-  broadcast_warpper.LoadVectorizedData(args, tid);
+  broadcast_wrapper.LoadVectorizedData(args, tid);

 #pragma unroll(VecSize)
   for (int i = 0; i < VecSize; ++i) {
@@ -333,30 +332,30 @@ __device__ inline void VectorizedBroadcastKernelImpl(
     for (int j = 0; j < ET; ++j) {
       ins[j] = args[j][i];
     }
-    args_out.val[i] = broadcast_warpper.func(ins);
+    args_out.val[i] = broadcast_wrapper.func(ins);
   }
-  broadcast_warpper.StoreVectorizedData(args_out, tid);
+  broadcast_wrapper.StoreVectorizedData(args_out, tid);
 }

-template <typename InT, typename OutT, typename BroadcastArgsWarpper,
+template <typename InT, typename OutT, typename BroadcastArgsWrapper,
           ElementwiseType ET, int VecSize>
 __global__ void ElementwiseBroadcastKernel(
-    BroadcastArgsWarpper broadcast_warpper, int main_tid, int tail_tid) {
+    BroadcastArgsWrapper broadcast_wrapper, int main_tid, int tail_tid) {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   // Vectorized calculation of major data whose length is the max multipler of
   // VecSize,
   // eg: Calcualting the front 1024-length data in total 1027 data once VecSize
   // is 4.
   if (tid < main_tid) {
-    VectorizedBroadcastKernelImpl<InT, OutT, BroadcastArgsWarpper, ET, VecSize>(
-        broadcast_warpper, tid);
+    VectorizedBroadcastKernelImpl<InT, OutT, BroadcastArgsWrapper, ET, VecSize>(
+        broadcast_wrapper, tid);
   }
   // Scalarzed calculation of rest data whose lenght cannot fulfill VecSize.
   // eg: Calcualting the rest 3-length data in total 1027 data once VecSize is
   // 4.
   if (tid < tail_tid) {
-    ScalarizedBroadcastKernelImpl<InT, OutT, BroadcastArgsWarpper, ET>(
-        broadcast_warpper, tid);
+    ScalarizedBroadcastKernelImpl<InT, OutT, BroadcastArgsWrapper, ET>(
+        broadcast_wrapper, tid);
   }
 }
@@ -367,7 +366,7 @@ void LaunchBroadcastKernelForDifferentDimSize(
     const std::vector<const framework::Tensor *> &ins, framework::Tensor *out,
     int axis, Functor func) {
   int numel = out->numel();
-  const int threads = 256;
+  int threads = GetThreadsConfig(ctx, numel, VecSize);
   int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads;
   int main_tid = numel / VecSize;
   int tail_tid = numel % VecSize;
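To make the main/tail split concrete, here is a standalone sketch of the launch arithmetic above with assumed values (numel = 1027, VecSize = 4, and a hypothetical thread count of 256 standing in for whatever GetThreadsConfig returns); it is illustrative only and not part of the commit.

```cpp
// Illustrative launch-configuration arithmetic with assumed inputs.
#include <cstdio>

int main() {
  const int numel = 1027;   // total output elements (assumed)
  const int VecSize = 4;    // elements per vectorized load/store
  const int threads = 256;  // assume GetThreadsConfig picked 256

  int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads;
  int main_tid = numel / VecSize;  // threads doing 4-wide work: 1024 elements
  int tail_tid = numel % VecSize;  // leftover elements handled scalar-wise

  // Prints: blocks = 2, main_tid = 256, tail_tid = 3
  std::printf("blocks = %d, main_tid = %d, tail_tid = %d\n", blocks, main_tid,
              tail_tid);
  return 0;
}
```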
@@ -380,75 +379,75 @@ void LaunchBroadcastKernelForDifferentDimSize(
   switch (merge_dims.dim_size) {
     case 1: {
-      auto broadcast_warpper =
-          BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 1>(
+      auto broadcast_wrapper =
+          BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 1>(
               ins, out, vec_len, func, offset_calculator);
-      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
+      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
                                  VecSize><<<blocks, threads, 0, stream>>>(
-          broadcast_warpper, main_tid, tail_tid);
+          broadcast_wrapper, main_tid, tail_tid);
       break;
     }
     case 2: {
-      auto broadcast_warpper =
-          BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 2>(
+      auto broadcast_wrapper =
+          BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 2>(
               ins, out, vec_len, func, offset_calculator);
-      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
+      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
                                  VecSize><<<blocks, threads, 0, stream>>>(
-          broadcast_warpper, main_tid, tail_tid);
+          broadcast_wrapper, main_tid, tail_tid);
       break;
     }
     case 3: {
-      auto broadcast_warpper =
-          BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 3>(
+      auto broadcast_wrapper =
+          BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 3>(
               ins, out, vec_len, func, offset_calculator);
-      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
+      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
                                  VecSize><<<blocks, threads, 0, stream>>>(
-          broadcast_warpper, main_tid, tail_tid);
+          broadcast_wrapper, main_tid, tail_tid);
       break;
     }
     case 4: {
-      auto broadcast_warpper =
-          BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 4>(
+      auto broadcast_wrapper =
+          BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 4>(
              ins, out, vec_len, func, offset_calculator);
-      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
+      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
                                  VecSize><<<blocks, threads, 0, stream>>>(
-          broadcast_warpper, main_tid, tail_tid);
+          broadcast_wrapper, main_tid, tail_tid);
       break;
     }
     case 5: {
-      auto broadcast_warpper =
-          BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 5>(
+      auto broadcast_wrapper =
+          BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 5>(
              ins, out, vec_len, func, offset_calculator);
-      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
+      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
                                  VecSize><<<blocks, threads, 0, stream>>>(
-          broadcast_warpper, main_tid, tail_tid);
+          broadcast_wrapper, main_tid, tail_tid);
       break;
     }
     case 6: {
-      auto broadcast_warpper =
-          BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 6>(
+      auto broadcast_wrapper =
+          BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 6>(
              ins, out, vec_len, func, offset_calculator);
-      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
+      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
                                  VecSize><<<blocks, threads, 0, stream>>>(
-          broadcast_warpper, main_tid, tail_tid);
+          broadcast_wrapper, main_tid, tail_tid);
       break;
     }
     case 7: {
-      auto broadcast_warpper =
-          BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 7>(
+      auto broadcast_wrapper =
+          BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 7>(
              ins, out, vec_len, func, offset_calculator);
-      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
+      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
                                  VecSize><<<blocks, threads, 0, stream>>>(
-          broadcast_warpper, main_tid, tail_tid);
+          broadcast_wrapper, main_tid, tail_tid);
       break;
     }
     case 8: {
-      auto broadcast_warpper =
-          BroadcastArgsWarpper<InT, OutT, Functor, ET, VecSize, 8>(
+      auto broadcast_wrapper =
+          BroadcastArgsWrapper<InT, OutT, Functor, ET, VecSize, 8>(
              ins, out, vec_len, func, offset_calculator);
-      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_warpper), ET,
+      ElementwiseBroadcastKernel<InT, OutT, decltype(broadcast_wrapper), ET,
                                  VecSize><<<blocks, threads, 0, stream>>>(
-          broadcast_warpper, main_tid, tail_tid);
+          broadcast_wrapper, main_tid, tail_tid);
       break;
     }
     default: {
@@ -473,11 +472,11 @@ void LaunchBroadcastElementwiseCudaKernel(
   int in_vec_size = 4;
   framework::Tensor *out = (*outs)[0];
   for (auto *in : ins) {
-    auto temp_size = GetVectorizedSizeImpl<InT>(in->data<InT>());
+    auto temp_size = platform::GetVectorizedSize<InT>(in->data<InT>());
    in_vec_size = in->dims() == out->dims() ? std::min(temp_size, in_vec_size)
                                             : in_vec_size;
   }
-  int out_vec_size = GetVectorizedSizeImpl<OutT>(out->data<OutT>());
+  int out_vec_size = platform::GetVectorizedSize<OutT>(out->data<OutT>());
   int vec_size = std::min(out_vec_size, in_vec_size);

   switch (vec_size) {
...
@@ -26,7 +26,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-enum ElementwiseType { kUnary = 1, kBinary = 2 };
+enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3 };

 /*
 * According to NVIDIA, if number of threads per block is 64/128/256/512,
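The new kTernary value lets a functor consume three inputs through the same args-array path used by the kernels in this file. The functor below is a hypothetical example, not part of this commit; it only assumes the array-argument calling convention (`func(args)` with `InT args[ET]`) that is visible in the kernel code above.

```cpp
// Hypothetical ternary functor (illustration only): fused multiply-add over
// three same-shaped inputs, written against the `OutT operator()(const T* args)`
// convention used by the elementwise kernels here.
template <typename T>
struct FMAFunctor {
  inline HOSTDEVICE T operator()(const T* args) const {
    return args[0] * args[1] + args[2];
  }
};
```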
@@ -52,98 +52,73 @@ inline int GetThreadsConfig(const platform::CUDADeviceContext &ctx,
   return std::max(64, threads);
 }

-/*
-* Only the address of input data is the multiplier of 1,2,4, vectorized load
-* with corresponding multiplier-value is possible. Moreover, the maximum length
-* of vectorized load is 128 bits once. Hence, valid length of vectorized load
-* shall be determined under both former constraints.
-*/
-template <typename T>
-int GetVectorizedSizeImpl(const T *pointer) {
-  constexpr int max_load_bits = 128;
-  int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
-  uint64_t address = reinterpret_cast<uint64_t>(pointer);
-  constexpr int vec8 =
-      std::alignment_of<CudaAlignedVector<T, 8>>::value;  // NOLINT
-  constexpr int vec4 =
-      std::alignment_of<CudaAlignedVector<T, 4>>::value;  // NOLINT
-  constexpr int vec2 =
-      std::alignment_of<CudaAlignedVector<T, 2>>::value;  // NOLINT
-  if (address % vec8 == 0) {
-    /*
-    * Currently, decide to deal with no more than 4 data once while adopting
-    * vectorization load/store, if performance test shows that dealing with
-    * 8 data once in vectorization load/store does get optimized, return code
-    * below can be changed into " return std::min(8, valid_vec_size); " .
-    */
-    return std::min(4, valid_vec_size);
-  } else if (address % vec4 == 0) {
-    return std::min(4, valid_vec_size);
-  } else if (address % vec2 == 0) {
-    return std::min(2, valid_vec_size);
-  } else {
-    return 1;
-  }
-}
-
 template <typename InT, typename OutT>
-int GetVectorizedSize(const std::vector<const framework::Tensor *> &ins,
-                      const std::vector<framework::Tensor *> &outs) {
+int GetVectorizedSizeForIO(const std::vector<const framework::Tensor *> &ins,
+                           const std::vector<framework::Tensor *> &outs) {
   int vec_size = 4;
   for (auto iter = ins.begin(); iter != ins.end(); ++iter) {
-    vec_size =
-        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<InT>()));
+    vec_size = std::min<int>(vec_size,
+                             platform::GetVectorizedSize((*iter)->data<InT>()));
   }
   for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
-    vec_size =
-        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<OutT>()));
+    vec_size = std::min<int>(
+        vec_size, platform::GetVectorizedSize((*iter)->data<OutT>()));
  }
  return vec_size;
 }
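A quick illustration of the helper above, with assumed alignment results: the launcher always adopts the most conservative width so every tensor can be accessed with the same vector size.

```cpp
// Assumed per-tensor results for illustration only; GetVectorizedSizeForIO
// reduces them with std::min so the weakest alignment wins.
#include <algorithm>

int CommonVecSizeSketch() {
  int in0_vec = 4;  // e.g. platform::GetVectorizedSize(in0_ptr) returned 4
  int in1_vec = 4;  // e.g. platform::GetVectorizedSize(in1_ptr) returned 4
  int out_vec = 2;  // e.g. the output buffer is only 8-byte aligned
  return std::min({in0_vec, in1_vec, out_vec});  // == 2
}
```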
 template <ElementwiseType ET, int VecSize, typename InT, typename OutT>
 struct ElementwiseDataWrapper {
-  OutT *out;
-  const InT *in0;
-  const InT *in1;
-  __device__ ElementwiseDataWrapper(OutT *out, const InT *in0,
-                                    const InT *in1 = nullptr)
-      : out(out), in0(in0), in1(in1) {}
-
-  using InVecType = CudaAlignedVector<InT, VecSize>;
-  using OutVecType = CudaAlignedVector<OutT, VecSize>;
-
-  inline __device__ void load_vector(InVecType args[], int idx) {
-    const InVecType *x_vec = reinterpret_cast<const InVecType *>(in0);
-    args[0] = x_vec[idx];
-    if (ET == ElementwiseType::kBinary) {
-      const InVecType *y_vec = reinterpret_cast<const InVecType *>(in1);
-      args[1] = y_vec[idx];
-    }
-  }
-
-  inline __device__ void load_scalar(InT args[], int idx) {
-    args[0] = in0[idx];
-    if (ET == ElementwiseType::kBinary) {
-      args[1] = in1[idx];
-    }
-  }
-
-  inline __device__ void store_vector(OutVecType res, int idx) {
-    OutVecType *out_vec = reinterpret_cast<OutVecType *>(out);
-    out_vec[idx] = res;
-  }
-
-  inline __device__ void store_scalar(OutT res, int idx) { out[idx] = res; }
+  using InVecType = platform::CudaAlignedVector<InT, VecSize>;
+  using OutVecType = platform::CudaAlignedVector<OutT, VecSize>;
+
+  const InT *__restrict__ in_data[ET];
+  OutT *out_data;
+  uint32_t scalar_cal_offset;
+
+  HOSTDEVICE ElementwiseDataWrapper(
+      const std::vector<const framework::Tensor *> &ins,
+      std::vector<framework::Tensor *> *outs, uint32_t scalar_cal_offset)
+      : scalar_cal_offset(scalar_cal_offset) {
+#pragma unroll
+    for (int i = 0; i < ET; ++i) {
+      in_data[i] = ins[i]->data<InT>();
+    }
+    out_data = (*outs)[0]->data<OutT>();
+  }
+
+  inline __device__ void LoadVectorizedData(InVecType vec_args[], int tid) {
+#pragma unroll
+    for (int i = 0; i < ET; ++i) {
+      const InVecType *in_vec_data =
+          reinterpret_cast<const InVecType *>(in_data[i]);
+      vec_args[i] = in_vec_data[tid];
+    }
+  }
+
+  inline __device__ void LoadScalarizedData(InT args[], int tid) {
+#pragma unroll
+    for (int i = 0; i < ET; ++i) {
+      args[i] = in_data[i][tid + scalar_cal_offset];
+    }
+  }
+
+  inline __device__ void StoreVectorizedData(OutVecType res, int tid) {
+    OutVecType *out_vec = reinterpret_cast<OutVecType *>(out_data);
+    out_vec[tid] = res;
+  }
+
+  inline __device__ void StoreScalarizedData(OutT res, int tid) {
+    out_data[tid + scalar_cal_offset] = res;
+  }
 };
-template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
-          typename Functor>
-__device__ inline void VectorizedKernelImpl(
-    ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
-    int tid) {
-  using InVecType = CudaAlignedVector<InT, VecSize>;
-  using OutVecType = CudaAlignedVector<OutT, VecSize>;
+template <ElementwiseType ET, int VecSize, typename ElementwiseWrapper,
+          typename InT, typename OutT, typename Functor>
+__device__ inline void VectorizedKernelImpl(ElementwiseWrapper data,
+                                            Functor func, int tid) {
+  using InVecType = platform::CudaAlignedVector<InT, VecSize>;
+  using OutVecType = platform::CudaAlignedVector<OutT, VecSize>;
   InVecType ins_vec[ET];
   OutVecType out_vec;
   InT *ins_ptr[ET];
@@ -153,7 +128,7 @@ __device__ inline void VectorizedKernelImpl(
     ins_ptr[i] = reinterpret_cast<InT *>(&(ins_vec[i]));
   }
   // load
-  data.load_vector(ins_vec, tid);
+  data.LoadVectorizedData(ins_vec, tid);

 // compute
 #pragma unroll
@@ -165,52 +140,48 @@ __device__ inline void VectorizedKernelImpl(
     out_vec.val[i] = func(ins);
   }
   // store
-  data.store_vector(out_vec, tid);
+  data.StoreVectorizedData(out_vec, tid);
 }

-template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
-          typename Functor>
-__device__ inline void ScalarKernelImpl(
-    ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
-    int start, int remain) {
+template <ElementwiseType ET, typename ElementwiseWrapper, typename InT,
+          typename OutT, typename Functor>
+__device__ inline void ScalarKernelImpl(ElementwiseWrapper data, Functor func,
+                                        int tid) {
   InT ins[ET];
   OutT out;
-  for (int i = 0; i < remain; ++i) {
-    int idx = start + i;
-    // load
-    data.load_scalar(ins, idx);
-    // compute
-    out = func(ins);
-    // store
-    data.store_scalar(out, idx);
-  }
+
+  // load
+  data.LoadScalarizedData(ins, tid);
+  // compute
+  out = func(ins);
+  // store
+  data.StoreScalarizedData(out, tid);
 }

-template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
-          typename Functor>
-__global__ void VectorizedKernel(const InT *__restrict__ in0,
-                                 const InT *__restrict__ in1, OutT *out,
-                                 int size, Functor func) {
+template <ElementwiseType ET, typename ElementwiseWrapper, typename InT,
+          typename OutT, int VecSize, typename Functor>
+__global__ void VectorizedKernel(ElementwiseWrapper data, int main_tid,
+                                 int tail_tid, Functor func) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int remain = size - VecSize * tid;
-  remain = remain > 0 ? remain : 0;
-  auto data = ElementwiseDataWrapper<ET, VecSize, InT, OutT>(out, in0, in1);
-  if (remain >= VecSize) {
-    VectorizedKernelImpl(data, func, tid);
-  } else {
-    ScalarKernelImpl(data, func, tid * VecSize, remain);
+
+  if (tid < main_tid) {
+    VectorizedKernelImpl<ET, VecSize, ElementwiseWrapper, InT, OutT, Functor>(
+        data, func, tid);
+  }
+  if (tid < tail_tid) {
+    ScalarKernelImpl<ET, ElementwiseWrapper, InT, OutT, Functor>(data, func,
+                                                                 tid);
   }
 }

-template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
-__global__ void ScalarKernel(const InT *__restrict__ in0,
-                             const InT *__restrict__ in1, OutT *out, int size,
-                             Functor func) {
-  auto data = ElementwiseDataWrapper<ET, 1, InT, OutT>(out, in0, in1);
+template <ElementwiseType ET, typename ElementwiseWrapper, typename InT,
+          typename OutT, typename Functor>
+__global__ void ScalarKernel(ElementwiseWrapper data, int numel, Functor func) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int remain = tid < size ? 1 : 0;
-  ScalarKernelImpl(data, func, tid, remain);
+  if (tid < numel) {
+    ScalarKernelImpl<ET, ElementwiseWrapper, InT, OutT, Functor>(data, func,
+                                                                 tid);
+  }
 }

 template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
@@ -219,35 +190,48 @@ void LaunchSameDimsElementwiseCudaKernel(
     const std::vector<const framework::Tensor *> &ins,
     std::vector<framework::Tensor *> *outs, Functor func) {
   // calculate the max vec_size for all ins and outs
-  auto size = ins[0]->numel();
-  int vec_size = GetVectorizedSize<InT, OutT>(ins, *outs);
-  int block_size = GetThreadsConfig(ctx, size, vec_size);
+  auto numel = ins[0]->numel();
+  int vec_size = GetVectorizedSizeForIO<InT, OutT>(ins, *outs);
+  int block_size = GetThreadsConfig(ctx, numel, vec_size);
   int grid_size =
-      ((size + vec_size - 1) / vec_size + block_size - 1) / block_size;
-  const InT *in0 = ins[0]->data<InT>();
-  const InT *in1 =
-      (ET == ElementwiseType::kBinary) ? ins[1]->data<InT>() : nullptr;
-  OutT *out = (*outs)[0]->data<OutT>();
+      ((numel + vec_size - 1) / vec_size + block_size - 1) / block_size;
+  int main_tid = numel / vec_size;
+  int tail_tid = numel % vec_size;
+  uint32_t vec_len = main_tid * vec_size;
+
   // cuda kernel
   auto stream = ctx.stream();
+
   switch (vec_size) {
-    case 4:
-      VectorizedKernel<ET, 4><<<grid_size, block_size, 0, stream>>>(
-          in0, in1, out, size, func);
+    case 4: {
+      auto data_wrapper =
+          ElementwiseDataWrapper<ET, 4, InT, OutT>(ins, outs, vec_len);
+      VectorizedKernel<ET, decltype(data_wrapper), InT, OutT,
+                       4><<<grid_size, block_size, 0, stream>>>(
+          data_wrapper, main_tid, tail_tid, func);
       break;
-    case 2:
-      VectorizedKernel<ET, 2><<<grid_size, block_size, 0, stream>>>(
-          in0, in1, out, size, func);
+    }
+    case 2: {
+      auto data_wrapper =
+          ElementwiseDataWrapper<ET, 2, InT, OutT>(ins, outs, vec_len);
+      VectorizedKernel<ET, decltype(data_wrapper), InT, OutT,
+                       2><<<grid_size, block_size, 0, stream>>>(
+          data_wrapper, main_tid, tail_tid, func);
       break;
-    case 1:
-      ScalarKernel<ET><<<grid_size, block_size, 0, stream>>>(in0, in1, out,
-                                                             size, func);
+    }
+    case 1: {
+      auto data_wrapper =
+          ElementwiseDataWrapper<ET, 1, InT, OutT>(ins, outs, 0);
+      ScalarKernel<ET, decltype(data_wrapper), InT,
+                   OutT><<<grid_size, block_size, 0, stream>>>(data_wrapper,
+                                                               numel, func);
       break;
-    default:
+    }
+    default: {
       PADDLE_THROW(platform::errors::Unimplemented(
           "Unsupported vectorized size: %d !", vec_size));
       break;
+    }
   }
 }
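To show how the refreshed launcher is meant to be driven with three inputs, here is a hedged call-site sketch. The function name LaunchFMA, the tensor arguments, and the FMAFunctor from the earlier sketch are assumptions; the first parameter type of LaunchSameDimsElementwiseCudaKernel is taken to be the CUDA device context it already uses internally, and the output tensor is assumed to be allocated beforehand, as existing binary ops do.

```cpp
// Hypothetical call site (illustration only, not part of this commit):
// launch a three-input elementwise computation on the same-dims fast path.
template <typename T>
void LaunchFMA(const platform::CUDADeviceContext &dev_ctx,
               const framework::Tensor &x, const framework::Tensor &y,
               const framework::Tensor &z, framework::Tensor *out) {
  // All four tensors are assumed to share the same shape; *out is already
  // resized and allocated on the GPU by the caller.
  std::vector<const framework::Tensor *> ins = {&x, &y, &z};
  std::vector<framework::Tensor *> outs = {out};
  LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kTernary, T, T>(
      dev_ctx, ins, &outs, FMAFunctor<T>());
}
```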
...
@@ -158,12 +158,13 @@ struct IndexCalculator {
       : dim(dim) {
     dims = detail::VectorToArray<int, kMaxRank>(cal_dims);
     strides = detail::VectorToArray<int, kMaxRank>(full_strides);
-    std::vector<FastDivMod> cal_divmoders;
+    std::vector<platform::FastDivMod> cal_divmoders;
     // fast divmod
     for (auto i : cal_strides) {
-      cal_divmoders.push_back(FastDivMod(i));
+      cal_divmoders.push_back(platform::FastDivMod(i));
     }
-    divmoders = detail::VectorToArray<FastDivMod, kMaxRank>(cal_divmoders);
+    divmoders =
+        detail::VectorToArray<platform::FastDivMod, kMaxRank>(cal_divmoders);
   }

   __device__ inline int Get(int offset) const {
@@ -183,7 +184,7 @@ struct IndexCalculator {
   int dim;
   framework::Array<int, kMaxRank> dims;
   framework::Array<int, kMaxRank> strides;
-  framework::Array<FastDivMod, kMaxRank> divmoders;
+  framework::Array<platform::FastDivMod, kMaxRank> divmoders;
 };

 // reduce config
...
@@ -20,7 +20,7 @@ limitations under the License. */
 #define INT_BITS 32

 namespace paddle {
-namespace operators {
+namespace platform {

 template <typename T, int Size>
 struct alignas(sizeof(T) * Size) CudaAlignedVector {
@@ -65,5 +65,39 @@ struct FastDivMod {
   uint32_t multiplier;
 };

-}  // namespace operators
+/*
+* Only the address of input data is the multiplier of 1,2,4, vectorized load
+* with corresponding multiplier-value is possible. Moreover, the maximum length
+* of vectorized load is 128 bits once. Hence, valid length of vectorized load
+* shall be determined under both former constraints.
+*/
+template <typename T>
+int GetVectorizedSize(const T *pointer) {
+  constexpr int max_load_bits = 128;
+  int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
+  uint64_t address = reinterpret_cast<uint64_t>(pointer);
+  constexpr int vec8 =
+      std::alignment_of<CudaAlignedVector<T, 8>>::value;  // NOLINT
+  constexpr int vec4 =
+      std::alignment_of<CudaAlignedVector<T, 4>>::value;  // NOLINT
+  constexpr int vec2 =
+      std::alignment_of<CudaAlignedVector<T, 2>>::value;  // NOLINT
+  if (address % vec8 == 0) {
+    /*
+    * Currently, decide to deal with no more than 4 data once while adopting
+    * vectorization load/store, if performance test shows that dealing with
+    * 8 data once in vectorization load/store does get optimized, return code
+    * below can be changed into " return std::min(8, valid_vec_size); " .
+    */
+    return std::min(4, valid_vec_size);
+  } else if (address % vec4 == 0) {
+    return std::min(4, valid_vec_size);
+  } else if (address % vec2 == 0) {
+    return std::min(2, valid_vec_size);
+  } else {
+    return 1;
+  }
+}
+
+}  // namespace platform
 }  // namespace paddle
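As a brief illustration of what the relocated GetVectorizedSize computes, the host-only sketch below uses the same alignment reasoning with a simplified stand-in for the CudaAlignedVector alignment constants; the buffer and offsets are assumed values, not taken from the commit.

```cpp
// Illustration only: pointer alignment decides the vectorization width.
#include <algorithm>
#include <climits>
#include <cstdint>
#include <cstdio>

// Simplified stand-in: alignof(CudaAlignedVector<T, N>) == sizeof(T) * N.
template <typename T>
int GetVectorizedSizeSketch(const T *pointer) {
  constexpr int max_load_bits = 128;  // at most one 128-bit load per thread
  int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
  uint64_t address = reinterpret_cast<uint64_t>(pointer);
  if (address % (sizeof(T) * 4) == 0) return std::min(4, valid_vec_size);
  if (address % (sizeof(T) * 2) == 0) return std::min(2, valid_vec_size);
  return 1;
}

int main() {
  alignas(16) float buf[8] = {};
  std::printf("%d\n", GetVectorizedSizeSketch(buf));      // 4: 16-byte aligned
  std::printf("%d\n", GetVectorizedSizeSketch(buf + 1));  // 1: 4-byte offset
  std::printf("%d\n", GetVectorizedSizeSketch(buf + 2));  // 2: 8-byte aligned
  return 0;
}
```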