Unverified commit bf305033, authored by Zhang Ting, committed by GitHub

Support different dtypes of inputs for elementwise ops (#38859)

* improve backward performance

* support different dtypes for elementwise ops
Parent: 7d6096ff
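
Note on the design: before this change, every call site had to spell out the elementwise arity and a single shared input type as template arguments (e.g. `ElementwiseType::kBinary, T, T`). After it, only the output type is given; the number of inputs and each input's element type are deduced from the functor's `operator()` signature via `FunctionTraits`. The following is a minimal host-side sketch of that idea, using illustrative names (`FunctionTraits`, `LaunchSameDimsSketch`, `MaskedScale` are stand-ins, not the exact Paddle API):

```cpp
#include <cstdint>
#include <iostream>
#include <tuple>
#include <type_traits>
#include <vector>

// Illustrative FunctionTraits: recover arity and argument types from operator().
template <typename F>
struct FunctionTraits : FunctionTraits<decltype(&F::operator())> {};

template <typename C, typename R, typename... Args>
struct FunctionTraits<R (C::*)(Args...) const> {
  static constexpr std::size_t arity = sizeof...(Args);
  using ArgsTuple = std::tuple<Args...>;  // one element type per input tensor
};

struct MaskedScale {  // two inputs with different dtypes, like dropout-grad
  float operator()(const float x, const uint8_t m) const {
    return x * static_cast<float>(m) * 2.0f;
  }
};

// A toy "launcher" templated only on the output type, mirroring the new
// LaunchSameDimsElementwiseCudaKernel<OutT>(...) call sites in this diff.
template <typename OutT, typename Functor>
std::vector<OutT> LaunchSameDimsSketch(const std::vector<float>& a,
                                       const std::vector<uint8_t>& b,
                                       Functor func) {
  using Traits = FunctionTraits<Functor>;
  static_assert(Traits::arity == 2, "this sketch handles binary functors");
  using Arg0 = std::tuple_element_t<0, typename Traits::ArgsTuple>;
  using Arg1 = std::tuple_element_t<1, typename Traits::ArgsTuple>;
  static_assert(std::is_same<Arg0, float>::value &&
                    std::is_same<Arg1, uint8_t>::value,
                "input dtypes come from the functor signature");
  std::vector<OutT> out(a.size());
  for (std::size_t i = 0; i < a.size(); ++i) out[i] = func(a[i], b[i]);
  return out;
}

int main() {
  auto out =
      LaunchSameDimsSketch<float>({1.f, 2.f, 3.f}, {1, 0, 1}, MaskedScale{});
  for (float v : out) std::cout << v << " ";  // prints: 2 0 6
  std::cout << "\n";
}
```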
......@@ -50,9 +50,8 @@ class AbsKernel<platform::CUDADeviceContext, T>
std::vector<const framework::Tensor*> ins = {x};
std::vector<framework::Tensor*> outs = {out};
auto functor = CudaAbsFunctor<T>();
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, math::Real<T>>(dev_ctx, ins, &outs,
functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<math::Real<T>>(
dev_ctx, ins, &outs, functor);
}
};
......
......@@ -1367,14 +1367,14 @@ class ELUGradCudaKernel : public framework::OpKernel<T> {
if (alpha > 0) {
CudaELUGradFunctor<T> functor;
functor.alpha = alpha;
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
} else {
CudaELUGradNegativeAlphaFunctor<T> functor;
functor.alpha = alpha;
ins.push_back(x);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
}
};
......@@ -1450,8 +1450,8 @@ class ActivationCudaKernel
for (auto& attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
};
......@@ -1480,17 +1480,17 @@ class ActivationGradCudaKernel
if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
// Only need forward output Out
ins.push_back(out);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
} else if (static_cast<int>(Functor::FwdDeps()) ==
static_cast<int>(kDepX)) {
// Only need forward input X
ins.push_back(x);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
} else {
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
}
};
......
......@@ -31,7 +31,7 @@ struct BCELossFunctor {
neg_100 = static_cast<T>(-100.);
}
HOSTDEVICE inline T operator()(const T& x, const T& label) const {
HOSTDEVICE inline T operator()(const T x, const T label) const {
PADDLE_ENFORCE(
(x >= static_cast<T>(0)) && (x <= one),
"Input is expected to be within the interval [0, 1], but recieved %f.",
......@@ -52,8 +52,7 @@ struct BCELossGradFunctor {
eps = static_cast<T>(1e-12);
}
HOSTDEVICE inline T operator()(const T& x, const T& label,
const T& dout) const {
HOSTDEVICE inline T operator()(const T x, const T label, const T dout) const {
T term1 = max((one - x) * x, eps);
return (dout * (x - label) / term1);
}
......@@ -73,8 +72,8 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
std::vector<framework::Tensor*> outs = {out};
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto functor = BCELossFunctor<T>();
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
};
......@@ -91,8 +90,8 @@ class BCELossGradCUDAKernel : public framework::OpKernel<T> {
std::vector<framework::Tensor*> outs = {dx};
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto functor = BCELossGradFunctor<T>();
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kTernary, T, T>(dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
};
......
......@@ -45,7 +45,7 @@ template <typename T>
class ClipGradFunctor {
public:
explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
HOSTDEVICE T operator()(const T& x, const T& y) const {
HOSTDEVICE T operator()(const T x, const T y) const {
return (y > min_ && y < max_) ? x : static_cast<T>(0);
}
......@@ -103,8 +103,7 @@ class ClipKernel : public framework::OpKernel<T> {
std::vector<const framework::Tensor*> ins = {x};
std::vector<framework::Tensor*> outs = {out};
auto functor = ClipFunctor<T>(min, max);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T>(
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
context.template device_context<platform::CUDADeviceContext>(), ins,
&outs, functor);
#endif
......@@ -177,7 +176,7 @@ class ClipGradKernel : public framework::OpKernel<T> {
std::vector<framework::Tensor*> outs = {d_x};
auto functor = ClipGradFunctor<T>(min, max);
d_x->mutable_data<T>(context.GetPlace());
LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
LaunchSameDimsElementwiseCudaKernel<T>(
context.template device_context<platform::CUDADeviceContext>(), ins,
&outs, functor);
#else
......
......@@ -57,8 +57,8 @@ class UnaryBitwiseOpKernel<platform::CUDADeviceContext, Functor>
std::vector<framework::Tensor*> outs = {out};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T>(cuda_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(cuda_ctx, ins,
&outs, functor);
}
};
......
......@@ -55,8 +55,8 @@ class CompareReduceOpKernel
context.template device_context<platform::CUDADeviceContext>();
std::vector<const framework::Tensor*> ins = {x, y};
std::vector<framework::Tensor*> outs = {&tmp};
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, T, bool>(cuda_ctx, ins, &outs, Functor());
paddle::operators::LaunchSameDimsElementwiseCudaKernel<bool>(
cuda_ctx, ins, &outs, Functor());
// Reduce by 'bitwise and' operator
std::vector<int> reduce_dims;
......
......@@ -32,6 +32,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/dropout_impl_util.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/pten/kernels/funcs/cuda_kernel_config.h"
......@@ -123,6 +124,19 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
}
}
template <typename T, typename MaskType>
struct CudaDropoutGradFunctor {
explicit CudaDropoutGradFunctor(const T factor) : factor_(factor) {}
__device__ __forceinline__ T operator()(const T dout,
const MaskType mask) const {
return dout * static_cast<T>(mask) * factor_;
}
private:
T factor_;
};
template <typename T, typename MaskType, int VecSize>
__global__ void DropoutGradCUDAKernel(const T* dout, const MaskType* mask,
const T factor, const int64_t size,
......@@ -259,21 +273,13 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
if (dropout_prob == 1.0f) {
dX.device(place) = static_cast<T>(0) * dY;
} else {
int vec_size = platform::GetVectorizedSize<T>(grad_y.data<T>());
if (vec_size == 4 && size % 4 == 0) {
auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
auto stream = dev_ctx.stream();
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, size, vec_size);
DropoutGradCUDAKernel<
T, uint8_t,
4><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
grad_y.data<T>(), mask.data<uint8_t>(), factor, size,
grad_x->data<T>());
} else {
dX.device(place) =
dY * M.cast<T>() / static_cast<T>(1.0f - dropout_prob);
}
std::vector<const framework::Tensor*> ins = {&grad_y, &mask};
std::vector<framework::Tensor*> outs = {grad_x};
auto functor = CudaDropoutGradFunctor<T, uint8_t>(factor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
dev_ctx, ins, &outs, functor);
}
} else {
dX.device(place) = dY * M.cast<T>();
......
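
The dropout backward rewrite above is a representative use of the new mixed-dtype support: the gradient tensor has element type T while the mask is uint8_t, so a single shared InT could not describe both inputs. Below is a standalone host-side analogue of CudaDropoutGradFunctor, for illustration only (plain C++ rather than the CUDA device code):

```cpp
#include <cstdint>
#include <iostream>

// Host analogue of CudaDropoutGradFunctor: the two inputs deliberately have
// different element types (float gradient, uint8_t mask).
struct DropoutGradLike {
  explicit DropoutGradLike(float factor) : factor_(factor) {}
  float operator()(const float dout, const uint8_t mask) const {
    return dout * static_cast<float>(mask) * factor_;
  }

 private:
  float factor_;
};

int main() {
  const float dropout_prob = 0.5f;
  DropoutGradLike f(1.0f / (1.0f - dropout_prob));  // upscale kept elements
  std::cout << f(0.3f, 1) << " " << f(0.3f, 0) << "\n";  // prints: 0.6 0
}
```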
......@@ -234,21 +234,22 @@ struct FMinFunctor<int64_t> {
template <typename T>
struct MinGradXFunctor {
inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const {
inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
return dout * static_cast<T>(x < y);
}
};
template <typename T>
struct MinGradYFunctor {
inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const {
inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
return dout * static_cast<T>(x >= y);
}
};
template <typename InT, typename OutT>
struct MinGradXYFunctor {
inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(
const InT& x, const InT& y, const InT& dout) {
inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(const InT x,
const InT y,
const InT dout) {
pten::framework::Array<OutT, 2> outs;
// dx = dout * (x < y)
outs[0] = static_cast<OutT>(dout * static_cast<InT>(x < y));
......@@ -303,21 +304,22 @@ struct MulGradXYFunctor<Complex<InT>, Complex<OutT>> {
// Ternary compare
template <typename T>
struct MaxGradXFunctor {
inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const {
inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
return dout * static_cast<T>(x > y);
}
};
template <typename T>
struct MaxGradYFunctor {
inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const {
inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
return dout * static_cast<T>(x <= y);
}
};
template <typename InT, typename OutT>
struct MaxGradXYFunctor {
inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(
const InT& x, const InT& y, const InT& dout) {
inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(const InT x,
const InT y,
const InT dout) {
pten::framework::Array<OutT, 2> outs;
// dx = dout * (x > y)
outs[0] = static_cast<OutT>(dout * static_cast<InT>(x > y));
......
......@@ -26,8 +26,7 @@ namespace operators {
using ElementwiseType = pten::ElementwiseType;
template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
int NumOuts = 1>
template <typename OutT, typename Functor, int NumOuts = 1>
void LaunchSameDimsElementwiseCudaKernel(
const KPDevice &ctx, const std::vector<const framework::Tensor *> &ins,
std::vector<framework::Tensor *> *outs, Functor func) {
......@@ -54,9 +53,8 @@ void LaunchSameDimsElementwiseCudaKernel(
for (int i = 0; i < pt_outputs_tmp.size(); i++) {
pt_outputs.push_back(pt_outputs_tmp[i].get());
}
pten::funcs::LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT, Functor,
NumOuts>(ctx, pt_inputs,
&pt_outputs, func);
pten::funcs::LaunchSameDimsElementwiseCudaKernel<OutT, Functor, NumOuts>(
ctx, pt_inputs, &pt_outputs, func);
}
} // namespace operators
......
......@@ -87,8 +87,8 @@ class LabelSmoothGPUKernel : public framework::OpKernel<T> {
std::vector<const framework::Tensor*> ins = {in_t};
std::vector<framework::Tensor*> outs = {out_t};
auto functor = LabelSmoothFunctor<T>(epsilon, label_dim);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
}
};
......@@ -107,8 +107,8 @@ class LabelSmoothGradGPUKernel : public framework::OpKernel<T> {
std::vector<const framework::Tensor*> ins = {d_out_t};
std::vector<framework::Tensor*> outs = {d_in_t};
auto functor = LabelSmoothGradFunctor<T>(epsilon);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
};
} // namespace operators
......
......@@ -39,8 +39,8 @@ class LgammaKernel<platform::CUDADeviceContext, T>
std::vector<const framework::Tensor*> ins = {x};
std::vector<framework::Tensor*> outs = {out};
auto functor = CudaLgammaFunctor<T>();
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
};
......
......@@ -126,8 +126,7 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
std::vector<framework::Tensor*> outs = {out_norm};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T, UnsignedPowFunctor<T>>(
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
cuda_ctx, ins, &outs, UnsignedPowFunctor<T>(1. / porder));
}
}
......
......@@ -151,9 +151,8 @@ class CUDARenormKernel : public framework::OpKernel<T> {
const auto& cuda_ctx =
context.template device_context<platform::CUDADeviceContext>();
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
cuda_ctx, ins, &outs, func);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(cuda_ctx, ins,
&outs, func);
std::vector<int> reduce_axis = {0, 2};
TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis,
......
......@@ -80,9 +80,8 @@ struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> {
std::vector<const Tensor*> ins = {&lhs, &rhs};
std::vector<Tensor*> outs = {mask};
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs,
CompareFunctor<int64_t, T>());
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
dev_ctx, ins, &outs, CompareFunctor<int64_t, T>());
}
};
......
......@@ -54,6 +54,7 @@ template <typename ReturnType, typename... Args>
struct FunctionTraits<ReturnType(Args...)> {
static const size_t arity = sizeof...(Args);
static const bool has_pointer_args = IsPointerArgs<arity, Args...>::value;
using ArgsTuple = std::tuple<Args...>;
};
} // namespace platform
......
......@@ -438,14 +438,78 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout,
#if defined(__NVCC__) || defined(__HIPCC__)
template <typename InT, typename OutT>
// static unroller
template <template <int Index, int VecSize> typename Func,
int VecSize,
int End,
int Begin = 0>
struct Unroller {
template <typename... Args>
static HOSTDEVICE inline void step(Args &&... args) {
Func<Begin, VecSize>::Apply(std::forward<Args>(args)...);
Unroller<Func, VecSize, End, Begin + 1>::step(args...);
}
};
template <template <int Index, int VecSize> typename Func, int VecSize, int End>
struct Unroller<Func, VecSize, End, End> {
template <typename... Args>
static HOSTDEVICE inline void step(Args &&... args) {}
};
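
The Unroller added here replaces the runtime `for (i = 0; i < Arity; i++)` loops over inputs with a compile-time expansion, so each iteration can work with a different argument type. A self-contained sketch of the pattern follows; `Print` is a made-up Func used only for the demo:

```cpp
#include <iostream>
#include <utility>

// Sketch of the Unroller pattern: instantiate Func<Begin, VecSize> ..
// Func<End - 1, VecSize> at compile time and call each one's static Apply().
template <template <int Index, int VecSize> class Func, int VecSize, int End,
          int Begin = 0>
struct Unroller {
  template <typename... Args>
  static inline void step(Args&&... args) {
    Func<Begin, VecSize>::Apply(std::forward<Args>(args)...);
    Unroller<Func, VecSize, End, Begin + 1>::step(args...);
  }
};

template <template <int Index, int VecSize> class Func, int VecSize, int End>
struct Unroller<Func, VecSize, End, End> {
  template <typename... Args>
  static inline void step(Args&&...) {}
};

template <int Index, int VecSize>
struct Print {
  static void Apply(const char* tag) {
    std::cout << tag << ": index=" << Index << " vec_size=" << VecSize << "\n";
  }
};

int main() {
  // Expands to Print<0, 4>, Print<1, 4>, Print<2, 4>, one per input tensor.
  Unroller<Print, 4, 3>::step("loader");
}
```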
template <int Index, int VecSize>
struct Loader {
template <typename Array, typename ArgsT>
static __device__ void Apply(const Array &in,
ArgsT *args,
int num,
int data_offset,
bool is_boundary) {
using Type = std::tuple_element_t<Index, ArgsT>;
kps::Init<Type, ArgsT, Index, VecSize>(args, static_cast<Type>(1.0f));
if (is_boundary) {
kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, true>(
args, reinterpret_cast<const Type *>(in[Index]) + data_offset, num);
} else {
kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, false>(
args, reinterpret_cast<const Type *>(in[Index]) + data_offset, num);
}
}
};
template <int Index, int VecSize>
struct InputSetter {
template <typename Array>
static HOSTDEVICE void Apply(
const std::vector<const DenseTensor *> &ins_tensor, Array *ins_data) {
(*ins_data)[Index] =
reinterpret_cast<const _ptr_ char *>(ins_tensor[Index]->data());
}
};
template <int Index, int VecSize>
struct VecSizeGetter {
template <typename ArgsT>
static HOSTDEVICE void Apply(const std::vector<const DenseTensor *> &ins,
const ArgsT &args,
int *vec_size) {
using Type = std::tuple_element_t<Index, ArgsT>;
*vec_size = std::min<int>(
*vec_size,
paddle::platform::GetVectorizedSize(ins[Index]->data<Type>()));
}
};
template <typename OutT, typename Functor>
int GetVectorizedSizeForTensors(const std::vector<const DenseTensor *> &ins,
const std::vector<DenseTensor *> &outs) {
using Traits = paddle::platform::FunctionTraits<Functor>;
using ArgsT = typename Traits::ArgsTuple;
const int Arity = Traits::arity;
int vec_size = 4;
for (auto iter = ins.begin(); iter != ins.end(); ++iter) {
vec_size = std::min<int>(
vec_size, paddle::platform::GetVectorizedSize((*iter)->data<InT>()));
}
ArgsT arg;
// The Arg VecSize=1 is to match the Unroller template.
Unroller<VecSizeGetter, 1, Arity>::step(ins, arg, &vec_size);
for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
vec_size = std::min<int>(
vec_size, paddle::platform::GetVectorizedSize((*iter)->data<OutT>()));
......@@ -514,6 +578,39 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 3, false> {
}
};
namespace detail {
template <class F, class Tuple, std::size_t... Index>
// GCC/Clang need the decltype() return type
HOSTDEVICE constexpr decltype(auto) ApplyImpl(F &&f,
Tuple &&t,
std::index_sequence<Index...>) {
return std::forward<F>(f)(std::get<Index>(std::forward<Tuple>(t))...);
}
} // namespace detail
template <class F, class Tuple>
HOSTDEVICE constexpr decltype(auto) Apply(F &&f, Tuple &&t) {
return detail::ApplyImpl(
std::forward<F>(f),
std::forward<Tuple>(t),
std::make_index_sequence<
std::tuple_size<std::remove_reference_t<Tuple>>::value>{});
}
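
The Apply helper above is the piece that finally invokes the user functor: the per-lane ArgsT tuple is expanded back into ordinary arguments. It behaves like std::apply, which is not available on the C++14 toolchains targeted here. A runnable host-side sketch (the `sketch` namespace and the lambda are illustrative):

```cpp
#include <cstdint>
#include <iostream>
#include <tuple>
#include <type_traits>
#include <utility>

// Host-side sketch: Apply is essentially a C++14-compatible std::apply,
// unpacking one ArgsT record into a plain functor call.
namespace sketch {
template <class F, class Tuple, std::size_t... Index>
decltype(auto) ApplyImpl(F&& f, Tuple&& t, std::index_sequence<Index...>) {
  return std::forward<F>(f)(std::get<Index>(std::forward<Tuple>(t))...);
}

template <class F, class Tuple>
decltype(auto) Apply(F&& f, Tuple&& t) {
  return ApplyImpl(std::forward<F>(f), std::forward<Tuple>(t),
                   std::make_index_sequence<std::tuple_size<
                       std::remove_reference_t<Tuple>>::value>{});
}
}  // namespace sketch

int main() {
  auto grad = [](float dout, uint8_t mask) { return dout * mask; };
  std::tuple<float, uint8_t> args{2.5f, 1};
  std::cout << sketch::Apply(grad, args) << "\n";  // prints: 2.5
}
```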
template <typename OutT,
int VecSize,
typename Functor,
typename ArgsT,
int Arity>
struct SameDimsElementwisePrimitiveCaller {
__device__ inline void operator()(Functor func, ArgsT *args, OutT *result) {
#pragma unroll
for (int idx = 0; idx < VecSize; ++idx) {
result[idx] = static_cast<OutT>(Apply(func, args[idx]));
}
}
};
template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
struct ElementwiseWriteDataCaller {
__device__ __forceinline__ void operator()(
......@@ -549,8 +646,7 @@ struct ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, 1> {
}
};
template <typename InT,
typename OutT,
template <typename OutT,
typename Functor,
int Arity,
int NumOuts,
......@@ -558,42 +654,32 @@ template <typename InT,
bool IsBoundary>
__device__ void VectorizedElementwiseKernelImpl(
const pten::framework::Array<const _ptr_ InT *__restrict__, Arity> &in,
const pten::framework::Array<const _ptr_ char *__restrict__, Arity> &in,
pten::framework::Array<_ptr_ OutT *, NumOuts> outs,
int num,
int data_offset,
Functor func) {
InT args[Arity > 1 ? Arity : 1][VecSize];
using Traits = paddle::platform::FunctionTraits<Functor>;
using ArgsT = typename Traits::ArgsTuple;
ArgsT args[VecSize];
ConditionalT<OutT, NumOuts> result[VecSize];
#pragma unroll
for (int i = 0; i < Arity; i++) {
kps::Init<InT, VecSize>(args[i], static_cast<InT>(1.0f));
kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(
args[i], in[i] + data_offset, num);
}
Unroller<Loader, VecSize, Arity>::step(
in, args, num, data_offset, IsBoundary);
constexpr bool kCallElementwiseAny =
paddle::platform::FunctionTraits<Functor>::has_pointer_args;
ElementwisePrimitiveCaller<InT,
ConditionalT<OutT, NumOuts>,
SameDimsElementwisePrimitiveCaller<ConditionalT<OutT, NumOuts>,
VecSize,
Functor,
Arity,
kCallElementwiseAny>()(func, args, result);
ArgsT,
Arity>()(func, args, result);
ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, NumOuts>()(
outs, result, data_offset, num);
}
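
Inside VectorizedElementwiseKernelImpl the per-thread registers change from `InT args[Arity][VecSize]` to `ArgsT args[VecSize]`: an array of tuples, one record per vector lane, so inputs of different dtypes sit side by side in the same lane. A host-side illustration of that layout (values are arbitrary):

```cpp
#include <cstdint>
#include <iostream>
#include <tuple>

// One ArgsT record per vector lane; each record holds one value per input,
// possibly of different element types.
int main() {
  constexpr int kVecSize = 4;
  using ArgsT = std::tuple<float, uint8_t>;  // e.g. (dout, mask)
  ArgsT args[kVecSize];
  for (int i = 0; i < kVecSize; ++i) {
    std::get<0>(args[i]) = 0.5f * i;                     // lane i, input 0
    std::get<1>(args[i]) = static_cast<uint8_t>(i % 2);  // lane i, input 1
  }
  for (int i = 0; i < kVecSize; ++i) {
    std::cout << std::get<0>(args[i]) * std::get<1>(args[i]) << " ";
  }
  std::cout << "\n";  // prints: 0 0.5 0 1.5
}
```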
template <typename InT,
typename OutT,
typename Functor,
int Arity,
int NumOuts,
int VecSize>
template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
__global__ void VectorizedElementwiseKernel(
pten::framework::Array<const _ptr_ InT *__restrict__, Arity> ins,
pten::framework::Array<const _ptr_ char *__restrict__, Arity> ins,
pten::framework::Array<_ptr_ OutT *, NumOuts> outs,
int size,
int main_offset,
......@@ -601,8 +687,7 @@ __global__ void VectorizedElementwiseKernel(
int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
for (; data_offset < main_offset; data_offset += stride) {
VectorizedElementwiseKernelImpl<InT,
OutT,
VectorizedElementwiseKernelImpl<OutT,
Functor,
Arity,
NumOuts,
......@@ -613,8 +698,7 @@ __global__ void VectorizedElementwiseKernel(
int num = size - data_offset;
if (num > 0) {
VectorizedElementwiseKernelImpl<InT,
OutT,
VectorizedElementwiseKernelImpl<OutT,
Functor,
Arity,
NumOuts,
......@@ -623,24 +707,17 @@ __global__ void VectorizedElementwiseKernel(
}
}
template <typename InT,
typename OutT,
typename Functor,
int Arity,
int NumOuts,
int VecSize>
template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
void ElementwiseCudaKernel(const KPDevice &ctx,
const std::vector<const DenseTensor *> &ins,
std::vector<DenseTensor *> *outs,
Functor func) {
auto numel =
(*outs)[0]->numel(); // To avoid running errors when ins.size()== 0
pten::framework::Array<const _ptr_ InT *__restrict__, Arity> ins_data;
pten::framework::Array<const _ptr_ char *__restrict__, Arity> ins_data;
pten::framework::Array<_ptr_ OutT *, NumOuts> outs_data;
for (int i = 0; i < Arity; ++i) {
ins_data[i] = ins[i]->data<InT>();
}
Unroller<InputSetter, VecSize, Arity>::step(ins, &ins_data);
for (int i = 0; i < NumOuts; ++i) {
outs_data[i] = ctx.Alloc<OutT>((*outs)[i]);
}
......@@ -649,8 +726,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
int grid_size = 8;
auto stream = ctx.x_context()->xpu_stream;
int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
VectorizedElementwiseKernel<InT,
OutT,
VectorizedElementwiseKernel<OutT,
Functor,
Arity,
NumOuts,
......@@ -662,7 +738,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
gpu_config.GetBlockSize();
auto stream = ctx.stream();
VectorizedElementwiseKernel<InT, OutT, Functor, Arity, NumOuts, VecSize><<<
VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize><<<
gpu_config.block_per_grid,
gpu_config.thread_per_block,
0,
......@@ -670,19 +746,14 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
#endif
}
template <ElementwiseType ET,
typename InT,
typename OutT,
typename Functor,
int NumOuts = 1>
template <typename OutT, typename Functor, int NumOuts = 1>
void LaunchSameDimsElementwiseCudaKernel(
const KPDevice &ctx,
const std::vector<const DenseTensor *> &ins,
std::vector<DenseTensor *> *outs,
Functor func) {
using Traits = paddle::platform::FunctionTraits<Functor>;
const int kArity =
Traits::has_pointer_args ? static_cast<int>(ET) : Traits::arity;
const int kArity = Traits::arity;
PADDLE_ENFORCE_EQ(ins.size(),
kArity,
paddle::platform::errors::InvalidArgument(
......@@ -712,18 +783,18 @@ void LaunchSameDimsElementwiseCudaKernel(
}
// calculate the max vec_size for all ins and outs
int vec_size = GetVectorizedSizeForTensors<InT, OutT>(ins, *outs);
int vec_size = GetVectorizedSizeForTensors<OutT, Functor>(ins, *outs);
switch (vec_size) {
case 4:
ElementwiseCudaKernel<InT, OutT, Functor, kArity, NumOuts, 4>(
ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, 4>(
ctx, ins, outs, func);
break;
case 2:
ElementwiseCudaKernel<InT, OutT, Functor, kArity, NumOuts, 2>(
ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, 2>(
ctx, ins, outs, func);
break;
case 1:
ElementwiseCudaKernel<InT, OutT, Functor, kArity, NumOuts, 1>(
ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, 1>(
ctx, ins, outs, func);
break;
default: {
......
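
For dispatch, the launcher still picks the largest common vector width: GetVectorizedSizeForTensors now queries each input with its own element type (via VecSizeGetter) instead of a single InT, and takes the minimum over all inputs and outputs before switching on 4/2/1. The following is an assumed simplification of that selection, not Paddle's actual GetVectorizedSize implementation:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Simplified stand-in: a pointer's usable vector width depends on its
// alignment relative to the element size; the kernel uses the minimum.
template <typename T>
int GetVectorizedSizeSketch(const T* ptr) {
  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  if (addr % (sizeof(T) * 4) == 0) return 4;
  if (addr % (sizeof(T) * 2) == 0) return 2;
  return 1;
}

int main() {
  alignas(16) float dout[8] = {};
  alignas(16) uint8_t mask[8] = {};
  alignas(16) float dx[8] = {};
  int vec = 4;
  vec = std::min(vec, GetVectorizedSizeSketch(dout));  // input 0: float
  vec = std::min(vec, GetVectorizedSizeSketch(mask));  // input 1: uint8_t
  vec = std::min(vec, GetVectorizedSizeSketch(dx));    // output
  std::cout << "vec_size = " << vec << "\n";           // prints: vec_size = 4
}
```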
......@@ -44,9 +44,7 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx,
inputs.emplace_back(&x);
outputs.emplace_back(out);
dev_ctx.Alloc<OutT>(out);
pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
InT,
OutT>(
pten::funcs::LaunchSameDimsElementwiseCudaKernel<OutT>(
dev_ctx, inputs, &outputs, CastFuctor<InT, OutT>());
}
......
......@@ -574,8 +574,7 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx,
dims_size.emplace_back(in->dims().size());
}
if (no_broadcast_flag) {
pten::funcs::
LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
pten::funcs::LaunchSameDimsElementwiseCudaKernel<OutT, Functor, NumOuts>(
ctx, ins, outs, func);
} else {
axis = axis == -1
......
......@@ -48,9 +48,7 @@ void FullKernel(const ContextT& dev_ctx,
// This function has no input, so the inputs.size() == 0. Use kUnary, but
// the data will not be loaded in the kernel because the number of
// parameters in the operator is 0
pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
T,
T>(
pten::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
dev_ctx, inputs, &outputs, FullFuctor<T>(val.to<T>()));
}
}
......@@ -90,9 +88,7 @@ void FullLikeKernel(const ContextT& dev_ctx,
// the operator is 0
int numel = out->numel();
if (numel > 0) {
pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
T,
T>(
pten::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
dev_ctx, inputs, &outputs, FullFuctor<T>(value));
}
}
......
......@@ -1093,7 +1093,7 @@ void TensorReduceImpl(const pten::GPUContext& dev_ctx,
if (config.reduce_num == 1) {
std::vector<const DenseTensor*> inputs = {&x};
std::vector<DenseTensor*> outputs = {y};
funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, Tx, Ty>(
funcs::LaunchSameDimsElementwiseCudaKernel<Ty>(
dev_ctx, inputs, &outputs, transform);
return;
}
......
......@@ -54,9 +54,7 @@ void ScaleKernel(const Context& dev_ctx,
inputs.emplace_back(&x);
outputs.emplace_back(out);
dev_ctx.template Alloc<T>(out);
pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
T,
T>(
pten::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
dev_ctx,
inputs,
&outputs,
......
......@@ -229,6 +229,18 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
}
}
/**
* The difference from the above function is that
* it supports different data types of inputs.
*/
template <typename T, typename ArgsT, int Index, int NX>
__device__ __forceinline__ void Init(ArgsT* dst, T init_data) {
#pragma unroll
for (int i = 0; i < NX; i++) {
std::get<Index>(dst[i]) = init_data;
}
}
/**
* @brief Read 1D data from global memory to register. When IsBoundary = true
* and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to
......@@ -282,6 +294,48 @@ __device__ __forceinline__ void ReadData(T* dst,
}
}
/**
* @brief Read 1D data from global memory to register. The difference
* from the above function is that it supports different data types of inputs.
*/
template <typename T,
int NX,
int NY,
int BlockSize,
typename ArgsT,
int Index,
bool IsBoundary = false>
__device__ __forceinline__ void ReadData(ArgsT* dst,
const T* __restrict__ src,
int num) {
if (IsBoundary) { // blockDim.x * NX > num
int thread_offset = threadIdx.x * NX;
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (idx + thread_offset < num) {
std::get<Index>(dst[idx]) = src[thread_offset + idx];
}
}
} else { // blockDim.x * NX < num
constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1;
constexpr int kVectorsPerThread = NX / kVectorSize;
int thread_offset = threadIdx.x * kVectorsPerThread;
using VecType = details::VectorType<T, kVectorSize>;
const VecType* vec_input = reinterpret_cast<const VecType*>(src);
VecType vec_temp[kVectorsPerThread];
#pragma unroll
for (int i = 0; i < kVectorsPerThread; ++i) {
vec_temp[i] = vec_input[thread_offset + i];
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
std::get<Index>(dst[idx]) = *(reinterpret_cast<T*>(vec_temp) + idx);
}
}
}
}
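
The new Init/ReadData overloads write into slot Index of each ArgsT record rather than into a flat T array, so every input stream, whatever its dtype, lands in its own tuple position of the same per-lane record. A host-side sketch of that indexing (InitSketch/ReadDataSketch are simplified, scalar stand-ins for the vectorized device primitives above):

```cpp
#include <cstdint>
#include <iostream>
#include <tuple>

// Fill slot `Index` of each ArgsT record instead of a flat T array.
template <typename T, typename ArgsT, int Index, int NX>
void InitSketch(ArgsT* dst, T init_data) {
  for (int i = 0; i < NX; i++) std::get<Index>(dst[i]) = init_data;
}

template <typename T, int NX, typename ArgsT, int Index>
void ReadDataSketch(ArgsT* dst, const T* src, int num) {
  for (int idx = 0; idx < NX; ++idx) {
    if (idx < num) std::get<Index>(dst[idx]) = src[idx];
  }
}

int main() {
  using ArgsT = std::tuple<float, uint8_t>;
  ArgsT regs[4];
  const float dout[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  const uint8_t mask[4] = {1, 0, 1, 1};
  InitSketch<float, ArgsT, 0, 4>(regs, 1.0f);
  ReadDataSketch<float, 4, ArgsT, 0>(regs, dout, 4);    // fills tuple slot 0
  ReadDataSketch<uint8_t, 4, ArgsT, 1>(regs, mask, 4);  // fills tuple slot 1
  std::cout << std::get<0>(regs[2]) * std::get<1>(regs[2]) << "\n";  // 0.3
}
```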
/**
* @brief Read 2D data from global memory to registers with broadcast form.
*
......
......@@ -189,6 +189,18 @@ __device__ __inline__ void Init(T* dst, T init_data) {
}
}
/**
* The difference from the above function is that
* it supports different data types of inputs.
*/
template <typename T, typename ArgsT, int Index, int NX>
__device__ __forceinline__ void Init(ArgsT* dst, T init_data) {
#pragma unroll
for (int i = 0; i < NX; i++) {
std::get<Index>(dst[i]) = init_data;
}
}
/**
* @brief Read 1D data from global memory to register. When IsBoundary = true
* and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to
......@@ -229,6 +241,40 @@ __device__ __inline__ void ReadData(T* dst,
}
}
/**
* @brief Read 1D data from global memory to register. The difference
* from the above function is that it supports different data types of inputs.
*/
template <typename T,
int NX,
int NY,
int BlockSize,
typename ArgsT,
int Index,
bool IsBoundary = false>
__device__ __forceinline__ void ReadData(ArgsT* dst,
const T* __restrict__ src,
int num) {
int thread_offset = core_id() * NX;
__local__ T in_temp[1];
__local__ T in_vec[NX];
if (IsBoundary) { // core_num() * NX > num
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (idx + thread_offset < num) {
GM2LM(src + thread_offset + idx, in_temp, sizeof(T));
std::get<Index>(dst[idx]) = in_temp[0];
}
}
} else { // core_num() * NX < num
GM2LM(src + thread_offset, in_vec, NX * sizeof(T));
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
std::get<Index>(dst[idx]) = in_vec[idx];
}
}
}
/**
* @brief Read 2D data from global memory to registers with broadcast form.
*
......