Unverified commit bf305033, authored by Zhang Ting, committed by GitHub

Support different dtypes of inputs for elementwise ops (#38859)

* improve backward performance

* support different dtypes for elementwise ops

Parent 7d6096ff
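In short, the launcher templates below drop the ElementwiseType tag and the explicit input type: arity and per-input dtypes are now deduced from the functor's operator() signature, so call sites only spell out the output type. A minimal before/after sketch of the call-site change (types and names taken from the hunks below):

    // Before: arity and the shared input dtype are passed explicitly.
    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
        ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);

    // After: only the output dtype; arity and input dtypes come from the functor.
    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
                                                              &outs, functor);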
@@ -50,9 +50,8 @@ class AbsKernel<platform::CUDADeviceContext, T>
     std::vector<const framework::Tensor*> ins = {x};
     std::vector<framework::Tensor*> outs = {out};
     auto functor = CudaAbsFunctor<T>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, T, math::Real<T>>(dev_ctx, ins, &outs,
-                                                   functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<math::Real<T>>(
+        dev_ctx, ins, &outs, functor);
   }
 };
...
@@ -1367,14 +1367,14 @@ class ELUGradCudaKernel : public framework::OpKernel<T> {
     if (alpha > 0) {
       CudaELUGradFunctor<T> functor;
       functor.alpha = alpha;
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     } else {
       CudaELUGradNegativeAlphaFunctor<T> functor;
       functor.alpha = alpha;
       ins.push_back(x);
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     }
   }
 };
@@ -1450,8 +1450,8 @@ class ActivationCudaKernel
     for (auto& attr : attrs) {
       *attr.second = ctx.Attr<float>(attr.first);
     }
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   }
 };
@@ -1480,17 +1480,17 @@ class ActivationGradCudaKernel
     if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
       // Only need forward output Out
       ins.push_back(out);
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     } else if (static_cast<int>(Functor::FwdDeps()) ==
                static_cast<int>(kDepX)) {
       // Only need forward input X
       ins.push_back(x);
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     } else {
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     }
   }
 };
...
@@ -31,7 +31,7 @@ struct BCELossFunctor {
     neg_100 = static_cast<T>(-100.);
   }
 
-  HOSTDEVICE inline T operator()(const T& x, const T& label) const {
+  HOSTDEVICE inline T operator()(const T x, const T label) const {
     PADDLE_ENFORCE(
         (x >= static_cast<T>(0)) && (x <= one),
         "Input is expected to be within the interval [0, 1], but recieved %f.",
@@ -52,8 +52,7 @@ struct BCELossGradFunctor {
     eps = static_cast<T>(1e-12);
   }
 
-  HOSTDEVICE inline T operator()(const T& x, const T& label,
-                                 const T& dout) const {
+  HOSTDEVICE inline T operator()(const T x, const T label, const T dout) const {
     T term1 = max((one - x) * x, eps);
     return (dout * (x - label) / term1);
   }
@@ -73,8 +72,8 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
     std::vector<framework::Tensor*> outs = {out};
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto functor = BCELossFunctor<T>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   }
 };
@@ -91,8 +90,8 @@ class BCELossGradCUDAKernel : public framework::OpKernel<T> {
     std::vector<framework::Tensor*> outs = {dx};
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto functor = BCELossGradFunctor<T>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kTernary, T, T>(dev_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   }
 };
...
@@ -45,7 +45,7 @@ template <typename T>
 class ClipGradFunctor {
  public:
   explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
-  HOSTDEVICE T operator()(const T& x, const T& y) const {
+  HOSTDEVICE T operator()(const T x, const T y) const {
     return (y > min_ && y < max_) ? x : static_cast<T>(0);
   }
@@ -103,8 +103,7 @@ class ClipKernel : public framework::OpKernel<T> {
     std::vector<const framework::Tensor*> ins = {x};
     std::vector<framework::Tensor*> outs = {out};
     auto functor = ClipFunctor<T>(min, max);
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, T, T>(
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
         context.template device_context<platform::CUDADeviceContext>(), ins,
         &outs, functor);
 #endif
@@ -177,7 +176,7 @@ class ClipGradKernel : public framework::OpKernel<T> {
     std::vector<framework::Tensor*> outs = {d_x};
     auto functor = ClipGradFunctor<T>(min, max);
     d_x->mutable_data<T>(context.GetPlace());
-    LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+    LaunchSameDimsElementwiseCudaKernel<T>(
         context.template device_context<platform::CUDADeviceContext>(), ins,
         &outs, functor);
 #else
...
@@ -57,8 +57,8 @@ class UnaryBitwiseOpKernel<platform::CUDADeviceContext, Functor>
     std::vector<framework::Tensor*> outs = {out};
     const auto& cuda_ctx =
         ctx.template device_context<platform::CUDADeviceContext>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, T, T>(cuda_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(cuda_ctx, ins,
+                                                              &outs, functor);
   }
 };
...
@@ -55,8 +55,8 @@ class CompareReduceOpKernel
         context.template device_context<platform::CUDADeviceContext>();
     std::vector<const framework::Tensor*> ins = {x, y};
     std::vector<framework::Tensor*> outs = {&tmp};
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kBinary, T, bool>(cuda_ctx, ins, &outs, Functor());
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<bool>(
+        cuda_ctx, ins, &outs, Functor());
     // Reduce by 'bitwise and' operator
     std::vector<int> reduce_dims;
...
@@ -32,6 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/dropout_impl_util.h"
 #include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/pten/kernels/funcs/cuda_kernel_config.h"
@@ -123,6 +124,19 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
   }
 }
 
+template <typename T, typename MaskType>
+struct CudaDropoutGradFunctor {
+  explicit CudaDropoutGradFunctor(const T factor) : factor_(factor) {}
+
+  __device__ __forceinline__ T operator()(const T dout,
+                                          const MaskType mask) const {
+    return dout * static_cast<T>(mask) * factor_;
+  }
+
+ private:
+  T factor_;
+};
+
 template <typename T, typename MaskType, int VecSize>
 __global__ void DropoutGradCUDAKernel(const T* dout, const MaskType* mask,
                                       const T factor, const int64_t size,
@@ -259,21 +273,13 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
     if (dropout_prob == 1.0f) {
       dX.device(place) = static_cast<T>(0) * dY;
     } else {
-      int vec_size = platform::GetVectorizedSize<T>(grad_y.data<T>());
-      if (vec_size == 4 && size % 4 == 0) {
-        auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
-        auto stream = dev_ctx.stream();
-        platform::GpuLaunchConfig config =
-            platform::GetGpuLaunchConfig1D(dev_ctx, size, vec_size);
-        DropoutGradCUDAKernel<
-            T, uint8_t,
-            4><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
-            grad_y.data<T>(), mask.data<uint8_t>(), factor, size,
-            grad_x->data<T>());
-      } else {
-        dX.device(place) =
-            dY * M.cast<T>() / static_cast<T>(1.0f - dropout_prob);
-      }
+      auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
+      auto stream = dev_ctx.stream();
+      std::vector<const framework::Tensor*> ins = {&grad_y, &mask};
+      std::vector<framework::Tensor*> outs = {grad_x};
+      auto functor = CudaDropoutGradFunctor<T, uint8_t>(factor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
+          dev_ctx, ins, &outs, functor);
     }
   } else {
     dX.device(place) = dY * M.cast<T>();
...
@@ -234,21 +234,22 @@ struct FMinFunctor<int64_t> {
 template <typename T>
 struct MinGradXFunctor {
-  inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const {
+  inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
     return dout * static_cast<T>(x < y);
   }
 };
 
 template <typename T>
 struct MinGradYFunctor {
-  inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const {
+  inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
     return dout * static_cast<T>(x >= y);
   }
 };
 
 template <typename InT, typename OutT>
 struct MinGradXYFunctor {
-  inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(
-      const InT& x, const InT& y, const InT& dout) {
+  inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(const InT x,
+                                                               const InT y,
+                                                               const InT dout) {
     pten::framework::Array<OutT, 2> outs;
     // dx = dout * (x < y)
     outs[0] = static_cast<OutT>(dout * static_cast<InT>(x < y));
@@ -303,21 +304,22 @@ struct MulGradXYFunctor<Complex<InT>, Complex<OutT>> {
 // Ternary compare
 template <typename T>
 struct MaxGradXFunctor {
-  inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const {
+  inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
    return dout * static_cast<T>(x > y);
   }
 };
 
 template <typename T>
 struct MaxGradYFunctor {
-  inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const {
+  inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
     return dout * static_cast<T>(x <= y);
   }
 };
 
 template <typename InT, typename OutT>
 struct MaxGradXYFunctor {
-  inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(
-      const InT& x, const InT& y, const InT& dout) {
+  inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(const InT x,
+                                                               const InT y,
+                                                               const InT dout) {
     pten::framework::Array<OutT, 2> outs;
     // dx = dout * (x > y)
     outs[0] = static_cast<OutT>(dout * static_cast<InT>(x > y));
...
@@ -26,8 +26,7 @@ namespace operators {
 
 using ElementwiseType = pten::ElementwiseType;
 
-template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
-          int NumOuts = 1>
+template <typename OutT, typename Functor, int NumOuts = 1>
 void LaunchSameDimsElementwiseCudaKernel(
     const KPDevice &ctx, const std::vector<const framework::Tensor *> &ins,
     std::vector<framework::Tensor *> *outs, Functor func) {
@@ -54,9 +53,8 @@ void LaunchSameDimsElementwiseCudaKernel(
   for (int i = 0; i < pt_outputs_tmp.size(); i++) {
     pt_outputs.push_back(pt_outputs_tmp[i].get());
   }
-  pten::funcs::LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT, Functor,
-                                                   NumOuts>(ctx, pt_inputs,
-                                                            &pt_outputs, func);
+  pten::funcs::LaunchSameDimsElementwiseCudaKernel<OutT, Functor, NumOuts>(
+      ctx, pt_inputs, &pt_outputs, func);
 }
 
 }  // namespace operators
...
@@ -87,8 +87,8 @@ class LabelSmoothGPUKernel : public framework::OpKernel<T> {
       std::vector<const framework::Tensor*> ins = {in_t};
       std::vector<framework::Tensor*> outs = {out_t};
       auto functor = LabelSmoothFunctor<T>(epsilon, label_dim);
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     }
   }
 };
@@ -107,8 +107,8 @@ class LabelSmoothGradGPUKernel : public framework::OpKernel<T> {
     std::vector<const framework::Tensor*> ins = {d_out_t};
     std::vector<framework::Tensor*> outs = {d_in_t};
     auto functor = LabelSmoothGradFunctor<T>(epsilon);
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   }
 };
 
 }  // namespace operators
...
@@ -39,8 +39,8 @@ class LgammaKernel<platform::CUDADeviceContext, T>
     std::vector<const framework::Tensor*> ins = {x};
     std::vector<framework::Tensor*> outs = {out};
     auto functor = CudaLgammaFunctor<T>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   }
 };
...
@@ -126,8 +126,7 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
       std::vector<framework::Tensor*> outs = {out_norm};
       const auto& cuda_ctx =
           ctx.template device_context<platform::CUDADeviceContext>();
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kUnary, T, T, UnsignedPowFunctor<T>>(
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
           cuda_ctx, ins, &outs, UnsignedPowFunctor<T>(1. / porder));
     }
   }
...
@@ -151,9 +151,8 @@ class CUDARenormKernel : public framework::OpKernel<T> {
     const auto& cuda_ctx =
         context.template device_context<platform::CUDADeviceContext>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
-        cuda_ctx, ins, &outs, func);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(cuda_ctx, ins,
+                                                              &outs, func);
     std::vector<int> reduce_axis = {0, 2};
     TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
         cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis,
...
@@ -80,9 +80,8 @@ struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> {
     std::vector<const Tensor*> ins = {&lhs, &rhs};
     std::vector<Tensor*> outs = {mask};
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs,
-                                              CompareFunctor<int64_t, T>());
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
+        dev_ctx, ins, &outs, CompareFunctor<int64_t, T>());
   }
 };
...
@@ -54,6 +54,7 @@ template <typename ReturnType, typename... Args>
 struct FunctionTraits<ReturnType(Args...)> {
   static const size_t arity = sizeof...(Args);
   static const bool has_pointer_args = IsPointerArgs<arity, Args...>::value;
+  using ArgsTuple = std::tuple<Args...>;
 };
 
 }  // namespace platform
...
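Why this one-line addition matters: with ArgsTuple exposed, the elementwise machinery below can recover the declared type of each functor parameter at compile time via std::tuple_element_t, which is what makes per-input dtypes possible. A minimal, self-contained sketch of the pattern, using a simplified stand-in for the traits class (the float/uint8_t signature is only an example):

    #include <cstdint>
    #include <tuple>
    #include <type_traits>

    // Simplified stand-in for paddle::platform::FunctionTraits.
    template <typename Fn>
    struct FunctionTraits;

    template <typename ReturnType, typename... Args>
    struct FunctionTraits<ReturnType(Args...)> {
      static const size_t arity = sizeof...(Args);
      using ArgsTuple = std::tuple<Args...>;
    };

    // A grad-style functor signature with mixed input dtypes.
    using Traits = FunctionTraits<float(float, std::uint8_t)>;
    static_assert(Traits::arity == 2, "two inputs");

    // The declared type of the second argument, as Loader/VecSizeGetter extract it.
    using SecondArg = std::tuple_element_t<1, Traits::ArgsTuple>;
    static_assert(std::is_same<SecondArg, std::uint8_t>::value, "uint8_t");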
@@ -438,14 +438,78 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout,
 
 #if defined(__NVCC__) || defined(__HIPCC__)
 
-template <typename InT, typename OutT>
+// static unroller
+template <template <int Index, int VecSize> typename Func,
+          int VecSize,
+          int End,
+          int Begin = 0>
+struct Unroller {
+  template <typename... Args>
+  static HOSTDEVICE inline void step(Args &&... args) {
+    Func<Begin, VecSize>::Apply(std::forward<Args>(args)...);
+    Unroller<Func, VecSize, End, Begin + 1>::step(args...);
+  }
+};
+
+template <template <int Index, int VecSize> typename Func, int VecSize, int End>
+struct Unroller<Func, VecSize, End, End> {
+  template <typename... Args>
+  static HOSTDEVICE inline void step(Args &&... args) {}
+};
+
+template <int Index, int VecSize>
+struct Loader {
+  template <typename Array, typename ArgsT>
+  static __device__ void Apply(const Array &in,
+                               ArgsT *args,
+                               int num,
+                               int data_offset,
+                               bool is_boundary) {
+    using Type = std::tuple_element_t<Index, ArgsT>;
+    kps::Init<Type, ArgsT, Index, VecSize>(args, static_cast<Type>(1.0f));
+    if (is_boundary) {
+      kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, true>(
+          args, reinterpret_cast<const Type *>(in[Index]) + data_offset, num);
+    } else {
+      kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, false>(
+          args, reinterpret_cast<const Type *>(in[Index]) + data_offset, num);
+    }
+  }
+};
+
+template <int Index, int VecSize>
+struct InputSetter {
+  template <typename Array>
+  static HOSTDEVICE void Apply(
+      const std::vector<const DenseTensor *> &ins_tensor, Array *ins_data) {
+    (*ins_data)[Index] =
+        reinterpret_cast<const _ptr_ char *>(ins_tensor[Index]->data());
+  }
+};
+
+template <int Index, int VecSize>
+struct VecSizeGetter {
+  template <typename ArgsT>
+  static HOSTDEVICE void Apply(const std::vector<const DenseTensor *> &ins,
+                               const ArgsT &args,
+                               int *vec_size) {
+    using Type = std::tuple_element_t<Index, ArgsT>;
+    *vec_size = std::min<int>(
+        *vec_size,
+        paddle::platform::GetVectorizedSize(ins[Index]->data<Type>()));
+  }
+};
+
+template <typename OutT, typename Functor>
 int GetVectorizedSizeForTensors(const std::vector<const DenseTensor *> &ins,
                                 const std::vector<DenseTensor *> &outs) {
+  using Traits = paddle::platform::FunctionTraits<Functor>;
+  using ArgsT = typename Traits::ArgsTuple;
+  const int Arity = Traits::arity;
   int vec_size = 4;
-  for (auto iter = ins.begin(); iter != ins.end(); ++iter) {
-    vec_size = std::min<int>(
-        vec_size, paddle::platform::GetVectorizedSize((*iter)->data<InT>()));
-  }
+  ArgsT arg;
+  // The Arg VecSize=1 is to match the Unroller template.
+  Unroller<VecSizeGetter, 1, Arity>::step(ins, arg, &vec_size);
   for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
     vec_size = std::min<int>(
         vec_size, paddle::platform::GetVectorizedSize((*iter)->data<OutT>()));
@@ -514,6 +578,39 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 3, false> {
   }
 };
 
+namespace detail {
+template <class F, class Tuple, std::size_t... Index>
+// GCC/Clang need the decltype() return type
+HOSTDEVICE constexpr decltype(auto) ApplyImpl(F &&f,
+                                              Tuple &&t,
+                                              std::index_sequence<Index...>) {
+  return std::forward<F>(f)(std::get<Index>(std::forward<Tuple>(t))...);
+}
+}  // namespace detail
+
+template <class F, class Tuple>
+HOSTDEVICE constexpr decltype(auto) Apply(F &&f, Tuple &&t) {
+  return detail::ApplyImpl(
+      std::forward<F>(f),
+      std::forward<Tuple>(t),
+      std::make_index_sequence<
+          std::tuple_size<std::remove_reference_t<Tuple>>::value>{});
+}
+
+template <typename OutT,
+          int VecSize,
+          typename Functor,
+          typename ArgsT,
+          int Arity>
+struct SameDimsElementwisePrimitiveCaller {
+  __device__ inline void operator()(Functor func, ArgsT *args, OutT *result) {
+#pragma unroll
+    for (int idx = 0; idx < VecSize; ++idx) {
+      result[idx] = static_cast<OutT>(Apply(func, args[idx]));
+    }
+  }
+};
+
 template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
 struct ElementwiseWriteDataCaller {
   __device__ __forceinline__ void operator()(
@@ -549,8 +646,7 @@ struct ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, 1> {
   }
 };
 
-template <typename InT,
-          typename OutT,
+template <typename OutT,
           typename Functor,
           int Arity,
           int NumOuts,
@@ -558,42 +654,32 @@ template <typename InT,
           bool IsBoundary>
 __device__ void VectorizedElementwiseKernelImpl(
-    const pten::framework::Array<const _ptr_ InT *__restrict__, Arity> &in,
+    const pten::framework::Array<const _ptr_ char *__restrict__, Arity> &in,
     pten::framework::Array<_ptr_ OutT *, NumOuts> outs,
     int num,
     int data_offset,
     Functor func) {
-  InT args[Arity > 1 ? Arity : 1][VecSize];
+  using Traits = paddle::platform::FunctionTraits<Functor>;
+  using ArgsT = typename Traits::ArgsTuple;
+  ArgsT args[VecSize];
   ConditionalT<OutT, NumOuts> result[VecSize];
-#pragma unroll
-  for (int i = 0; i < Arity; i++) {
-    kps::Init<InT, VecSize>(args[i], static_cast<InT>(1.0f));
-    kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(
-        args[i], in[i] + data_offset, num);
-  }
+  Unroller<Loader, VecSize, Arity>::step(
+      in, args, num, data_offset, IsBoundary);
-  constexpr bool kCallElementwiseAny =
-      paddle::platform::FunctionTraits<Functor>::has_pointer_args;
-  ElementwisePrimitiveCaller<InT,
-                             ConditionalT<OutT, NumOuts>,
-                             VecSize,
-                             Functor,
-                             Arity,
-                             kCallElementwiseAny>()(func, args, result);
+  SameDimsElementwisePrimitiveCaller<ConditionalT<OutT, NumOuts>,
+                                     VecSize,
+                                     Functor,
+                                     ArgsT,
+                                     Arity>()(func, args, result);
   ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, NumOuts>()(
       outs, result, data_offset, num);
 }
 
-template <typename InT,
-          typename OutT,
-          typename Functor,
-          int Arity,
-          int NumOuts,
-          int VecSize>
+template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
 __global__ void VectorizedElementwiseKernel(
-    pten::framework::Array<const _ptr_ InT *__restrict__, Arity> ins,
+    pten::framework::Array<const _ptr_ char *__restrict__, Arity> ins,
     pten::framework::Array<_ptr_ OutT *, NumOuts> outs,
     int size,
     int main_offset,
@@ -601,8 +687,7 @@ __global__ void VectorizedElementwiseKernel(
   int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
   int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
   for (; data_offset < main_offset; data_offset += stride) {
-    VectorizedElementwiseKernelImpl<InT,
-                                    OutT,
+    VectorizedElementwiseKernelImpl<OutT,
                                     Functor,
                                     Arity,
                                     NumOuts,
@@ -613,8 +698,7 @@ __global__ void VectorizedElementwiseKernel(
   int num = size - data_offset;
   if (num > 0) {
-    VectorizedElementwiseKernelImpl<InT,
-                                    OutT,
+    VectorizedElementwiseKernelImpl<OutT,
                                     Functor,
                                     Arity,
                                     NumOuts,
@@ -623,24 +707,17 @@ __global__ void VectorizedElementwiseKernel(
   }
 }
 
-template <typename InT,
-          typename OutT,
-          typename Functor,
-          int Arity,
-          int NumOuts,
-          int VecSize>
+template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
 void ElementwiseCudaKernel(const KPDevice &ctx,
                            const std::vector<const DenseTensor *> &ins,
                            std::vector<DenseTensor *> *outs,
                            Functor func) {
   auto numel =
       (*outs)[0]->numel();  // To avoid running errors when ins.size()== 0
-  pten::framework::Array<const _ptr_ InT *__restrict__, Arity> ins_data;
+  pten::framework::Array<const _ptr_ char *__restrict__, Arity> ins_data;
   pten::framework::Array<_ptr_ OutT *, NumOuts> outs_data;
-  for (int i = 0; i < Arity; ++i) {
-    ins_data[i] = ins[i]->data<InT>();
-  }
+  Unroller<InputSetter, VecSize, Arity>::step(ins, &ins_data);
   for (int i = 0; i < NumOuts; ++i) {
     outs_data[i] = ctx.Alloc<OutT>((*outs)[i]);
   }
@@ -649,8 +726,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
   int grid_size = 8;
   auto stream = ctx.x_context()->xpu_stream;
   int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
-  VectorizedElementwiseKernel<InT,
-                              OutT,
+  VectorizedElementwiseKernel<OutT,
                               Functor,
                               Arity,
                               NumOuts,
@@ -662,7 +738,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
   int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
                     gpu_config.GetBlockSize();
   auto stream = ctx.stream();
-  VectorizedElementwiseKernel<InT, OutT, Functor, Arity, NumOuts, VecSize><<<
+  VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize><<<
       gpu_config.block_per_grid,
       gpu_config.thread_per_block,
       0,
@@ -670,19 +746,14 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
 #endif
 }
 
-template <ElementwiseType ET,
-          typename InT,
-          typename OutT,
-          typename Functor,
-          int NumOuts = 1>
+template <typename OutT, typename Functor, int NumOuts = 1>
 void LaunchSameDimsElementwiseCudaKernel(
     const KPDevice &ctx,
     const std::vector<const DenseTensor *> &ins,
     std::vector<DenseTensor *> *outs,
     Functor func) {
   using Traits = paddle::platform::FunctionTraits<Functor>;
-  const int kArity =
-      Traits::has_pointer_args ? static_cast<int>(ET) : Traits::arity;
+  const int kArity = Traits::arity;
   PADDLE_ENFORCE_EQ(ins.size(),
                     kArity,
                     paddle::platform::errors::InvalidArgument(
@@ -712,18 +783,18 @@ void LaunchSameDimsElementwiseCudaKernel(
   }
   // calculate the max vec_size for all ins and outs
-  int vec_size = GetVectorizedSizeForTensors<InT, OutT>(ins, *outs);
+  int vec_size = GetVectorizedSizeForTensors<OutT, Functor>(ins, *outs);
   switch (vec_size) {
     case 4:
-      ElementwiseCudaKernel<InT, OutT, Functor, kArity, NumOuts, 4>(
+      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, 4>(
           ctx, ins, outs, func);
       break;
     case 2:
-      ElementwiseCudaKernel<InT, OutT, Functor, kArity, NumOuts, 2>(
+      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, 2>(
          ctx, ins, outs, func);
      break;
    case 1:
-      ElementwiseCudaKernel<InT, OutT, Functor, kArity, NumOuts, 1>(
+      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, 1>(
          ctx, ins, outs, func);
      break;
    default: {
...
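A note on the static-unroller idiom introduced above: Unroller<Func, VecSize, End>::step(args...) expands at compile time into Func<0, VecSize>::Apply(args...), Func<1, VecSize>::Apply(args...), ..., Func<End - 1, VecSize>::Apply(args...), which is what lets Loader and VecSizeGetter pick a different std::tuple_element_t type for every input index. A stripped-down, host-only illustration (the Printer functor is hypothetical; C++17 is assumed for the template-template parameter spelled with typename):

    #include <cstdio>
    #include <utility>

    template <template <int Index, int VecSize> typename Func,
              int VecSize,
              int End,
              int Begin = 0>
    struct Unroller {
      template <typename... Args>
      static inline void step(Args &&... args) {
        Func<Begin, VecSize>::Apply(std::forward<Args>(args)...);
        Unroller<Func, VecSize, End, Begin + 1>::step(args...);
      }
    };

    template <template <int Index, int VecSize> typename Func, int VecSize, int End>
    struct Unroller<Func, VecSize, End, End> {
      template <typename... Args>
      static inline void step(Args &&... args) {}
    };

    // Hypothetical per-index functor: prints its compile-time index.
    template <int Index, int VecSize>
    struct Printer {
      static void Apply(const char *tag) { std::printf("%s[%d]\n", tag, Index); }
    };

    int main() {
      Unroller<Printer, 1, 3>::step("in");  // prints in[0], in[1], in[2]
      return 0;
    }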
@@ -44,9 +44,7 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx,
   inputs.emplace_back(&x);
   outputs.emplace_back(out);
   dev_ctx.Alloc<OutT>(out);
-  pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
-                                                   InT,
-                                                   OutT>(
+  pten::funcs::LaunchSameDimsElementwiseCudaKernel<OutT>(
       dev_ctx, inputs, &outputs, CastFuctor<InT, OutT>());
 }
...
@@ -574,9 +574,8 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx,
     dims_size.emplace_back(in->dims().size());
   }
   if (no_broadcast_flag) {
-    pten::funcs::
-        LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
-            ctx, ins, outs, func);
+    pten::funcs::LaunchSameDimsElementwiseCudaKernel<OutT, Functor, NumOuts>(
+        ctx, ins, outs, func);
   } else {
     axis = axis == -1
                ? *std::max_element(dims_size.begin(), dims_size.end()) -
...
@@ -48,9 +48,7 @@ void FullKernel(const ContextT& dev_ctx,
     // This function has no input, so the inputs.size() == 0. Use kUnary, but
     // the data will not be loaded in the kernel because the number of
     // parameters in the operator is 0
-    pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
-                                                     T,
-                                                     T>(
+    pten::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
         dev_ctx, inputs, &outputs, FullFuctor<T>(val.to<T>()));
   }
 }
@@ -90,9 +88,7 @@ void FullLikeKernel(const ContextT& dev_ctx,
   // the operator is 0
   int numel = out->numel();
   if (numel > 0) {
-    pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
-                                                     T,
-                                                     T>(
+    pten::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
         dev_ctx, inputs, &outputs, FullFuctor<T>(value));
   }
 }
...
@@ -1093,7 +1093,7 @@ void TensorReduceImpl(const pten::GPUContext& dev_ctx,
   if (config.reduce_num == 1) {
     std::vector<const DenseTensor*> inputs = {&x};
     std::vector<DenseTensor*> outputs = {y};
-    funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, Tx, Ty>(
+    funcs::LaunchSameDimsElementwiseCudaKernel<Ty>(
         dev_ctx, inputs, &outputs, transform);
     return;
   }
...
@@ -54,9 +54,7 @@ void ScaleKernel(const Context& dev_ctx,
   inputs.emplace_back(&x);
   outputs.emplace_back(out);
   dev_ctx.template Alloc<T>(out);
-  pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
-                                                   T,
-                                                   T>(
+  pten::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
       dev_ctx,
       inputs,
       &outputs,
...
@@ -229,6 +229,18 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
   }
 }
 
+/**
+ * The difference from the above function is that
+ * it supports different data types of inputs.
+ */
+template <typename T, typename ArgsT, int Index, int NX>
+__device__ __forceinline__ void Init(ArgsT* dst, T init_data) {
+#pragma unroll
+  for (int i = 0; i < NX; i++) {
+    std::get<Index>(dst[i]) = init_data;
+  }
+}
+
 /**
  * @brief Read 1D data from global memory to register. When IsBoundary = true
  * and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to
@@ -282,6 +294,48 @@ __device__ __forceinline__ void ReadData(T* dst,
   }
 }
 
+/**
+ * @brief Read 1D data from global memory to register. The difference
+ * from the above function is that it supports different data types of inputs.
+ */
+template <typename T,
+          int NX,
+          int NY,
+          int BlockSize,
+          typename ArgsT,
+          int Index,
+          bool IsBoundary = false>
+__device__ __forceinline__ void ReadData(ArgsT* dst,
+                                         const T* __restrict__ src,
+                                         int num) {
+  if (IsBoundary) {  // blockDim.x * NX > num
+    int thread_offset = threadIdx.x * NX;
+#pragma unroll
+    for (int idx = 0; idx < NX; ++idx) {
+      if (idx + thread_offset < num) {
+        std::get<Index>(dst[idx]) = src[thread_offset + idx];
+      }
+    }
+  } else {  // blockDim,x * NX < num
+    constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1;
+    constexpr int kVectorsPerThread = NX / kVectorSize;
+    int thread_offset = threadIdx.x * kVectorsPerThread;
+
+    using VecType = details::VectorType<T, kVectorSize>;
+    const VecType* vec_input = reinterpret_cast<const VecType*>(src);
+    VecType vec_temp[kVectorsPerThread];
+
+#pragma unroll
+    for (int i = 0; i < kVectorsPerThread; ++i) {
+      vec_temp[i] = vec_input[thread_offset + i];
+#pragma unroll
+      for (int idx = 0; idx < NX; ++idx) {
+        std::get<Index>(dst[idx]) = *(reinterpret_cast<T*>(vec_temp) + idx);
+      }
+    }
+  }
+}
+
 /**
  * @brief Read 2D data from global memory to registers with broadcast form.
  *
...
@@ -189,6 +189,18 @@ __device__ __inline__ void Init(T* dst, T init_data) {
   }
 }
 
+/**
+ * The difference from the above function is that
+ * it supports different data types of inputs.
+ */
+template <typename T, typename ArgsT, int Index, int NX>
+__device__ __forceinline__ void Init(ArgsT* dst, T init_data) {
+#pragma unroll
+  for (int i = 0; i < NX; i++) {
+    std::get<Index>(dst[i]) = init_data;
+  }
+}
+
 /**
  * @brief Read 1D data from global memory to register. When IsBoundary = true
  * and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to
@@ -229,6 +241,40 @@ __device__ __inline__ void ReadData(T* dst,
   }
 }
 
+/**
+ * @brief Read 1D data from global memory to register. The difference
+ * from the above function is that it supports different data types of inputs.
+ */
+template <typename T,
+          int NX,
+          int NY,
+          int BlockSize,
+          typename ArgsT,
+          int Index,
+          bool IsBoundary = false>
+__device__ __forceinline__ void ReadData(ArgsT* dst,
+                                         const T* __restrict__ src,
+                                         int num) {
+  int thread_offset = core_id() * NX;
+  __local__ T in_temp[1];
+  __local__ T in_vec[NX];
+  if (IsBoundary) {  // core_num() * NX > num
+#pragma unroll
+    for (int idx = 0; idx < NX; ++idx) {
+      if (idx + thread_offset < num) {
+        GM2LM(src + thread_offset + idx, in_temp, sizeof(T));
+        std::get<Index>(dst[idx]) = in_temp[0];
+      }
+    }
+  } else {  // core_num() * NX < num
+    GM2LM(src + thread_offset, in_vec, NX * sizeof(T));
+#pragma unroll
+    for (int idx = 0; idx < NX; ++idx) {
+      std::get<Index>(dst[idx]) = in_vec[idx];
+    }
+  }
+}
+
 /**
  * @brief Read 2D data from global memory to registers with broadcast form.
  *
...