From bf30503335c2c8015dd20f991ef4480af9b5898d Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Fri, 11 Feb 2022 10:14:45 +0800 Subject: [PATCH] Support different dtypes of inputs for elementwise ops (#38859) * improve backward performance * support different dtypes for elementwise ops --- paddle/fluid/operators/abs_op.cu | 5 +- paddle/fluid/operators/activation_op.cu | 24 +-- paddle/fluid/operators/bce_loss_op.cu | 13 +- paddle/fluid/operators/clip_op.h | 7 +- .../fluid/operators/controlflow/bitwise_op.cu | 4 +- .../operators/controlflow/compare_all_op.cu | 4 +- paddle/fluid/operators/dropout_impl.cu.h | 36 ++-- .../elementwise/elementwise_functor.h | 18 +- .../elementwise/elementwise_op_impl.cu.h | 8 +- paddle/fluid/operators/label_smooth_op.cu | 8 +- paddle/fluid/operators/lgamma_op.cu | 4 +- paddle/fluid/operators/p_norm_op.cu | 3 +- paddle/fluid/operators/renorm_op.cu | 5 +- paddle/fluid/operators/viterbi_decode_op.cu | 5 +- paddle/fluid/platform/function_traits.h | 1 + paddle/pten/kernels/funcs/elementwise_base.h | 187 ++++++++++++------ paddle/pten/kernels/gpu/cast_kernel.cu | 4 +- paddle/pten/kernels/gpu/elementwise.h | 5 +- paddle/pten/kernels/gpu/full_kernel.cu | 8 +- paddle/pten/kernels/gpu/reduce.h | 2 +- paddle/pten/kernels/gpu/scale_kernel.cu | 4 +- .../kernels/primitive/datamover_primitives.h | 54 +++++ .../primitive/datamover_primitives_xpu2.h | 46 +++++ 23 files changed, 309 insertions(+), 146 deletions(-) diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu index 48e19defd03..882c8547a04 100644 --- a/paddle/fluid/operators/abs_op.cu +++ b/paddle/fluid/operators/abs_op.cu @@ -50,9 +50,8 @@ class AbsKernel std::vector ins = {x}; std::vector outs = {out}; auto functor = CudaAbsFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, math::Real>(dev_ctx, ins, &outs, - functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel>( + dev_ctx, ins, &outs, functor); } }; diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 1ee5f35883d..e578ad899e7 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -1367,14 +1367,14 @@ class ELUGradCudaKernel : public framework::OpKernel { if (alpha > 0) { CudaELUGradFunctor functor; functor.alpha = alpha; - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } else { CudaELUGradNegativeAlphaFunctor functor; functor.alpha = alpha; ins.push_back(x); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } } }; @@ -1450,8 +1450,8 @@ class ActivationCudaKernel for (auto& attr : attrs) { *attr.second = ctx.Attr(attr.first); } - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } }; @@ -1480,17 +1480,17 @@ class ActivationGradCudaKernel if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { // Only need forward output Out ins.push_back(out); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor); + 
paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } else if (static_cast(Functor::FwdDeps()) == static_cast(kDepX)) { // Only need forward input X ins.push_back(x); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } else { - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } } }; diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 6ab2e8a6df2..80bdfc5a21a 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -31,7 +31,7 @@ struct BCELossFunctor { neg_100 = static_cast(-100.); } - HOSTDEVICE inline T operator()(const T& x, const T& label) const { + HOSTDEVICE inline T operator()(const T x, const T label) const { PADDLE_ENFORCE( (x >= static_cast(0)) && (x <= one), "Input is expected to be within the interval [0, 1], but recieved %f.", @@ -52,8 +52,7 @@ struct BCELossGradFunctor { eps = static_cast(1e-12); } - HOSTDEVICE inline T operator()(const T& x, const T& label, - const T& dout) const { + HOSTDEVICE inline T operator()(const T x, const T label, const T dout) const { T term1 = max((one - x) * x, eps); return (dout * (x - label) / term1); } @@ -73,8 +72,8 @@ class BCELossCUDAKernel : public framework::OpKernel { std::vector outs = {out}; auto& dev_ctx = ctx.template device_context(); auto functor = BCELossFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } }; @@ -91,8 +90,8 @@ class BCELossGradCUDAKernel : public framework::OpKernel { std::vector outs = {dx}; auto& dev_ctx = ctx.template device_context(); auto functor = BCELossGradFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kTernary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } }; diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index 47bb61a77fa..dde1214679a 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -45,7 +45,7 @@ template class ClipGradFunctor { public: explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {} - HOSTDEVICE T operator()(const T& x, const T& y) const { + HOSTDEVICE T operator()(const T x, const T y) const { return (y > min_ && y < max_) ? 
x : static_cast(0); } @@ -103,8 +103,7 @@ class ClipKernel : public framework::OpKernel { std::vector ins = {x}; std::vector outs = {out}; auto functor = ClipFunctor(min, max); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>( + paddle::operators::LaunchSameDimsElementwiseCudaKernel( context.template device_context(), ins, &outs, functor); #endif @@ -177,7 +176,7 @@ class ClipGradKernel : public framework::OpKernel { std::vector outs = {d_x}; auto functor = ClipGradFunctor(min, max); d_x->mutable_data(context.GetPlace()); - LaunchSameDimsElementwiseCudaKernel( + LaunchSameDimsElementwiseCudaKernel( context.template device_context(), ins, &outs, functor); #else diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cu b/paddle/fluid/operators/controlflow/bitwise_op.cu index 3a4d5303953..5d98da2c027 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.cu +++ b/paddle/fluid/operators/controlflow/bitwise_op.cu @@ -57,8 +57,8 @@ class UnaryBitwiseOpKernel std::vector outs = {out}; const auto& cuda_ctx = ctx.template device_context(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>(cuda_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(cuda_ctx, ins, + &outs, functor); } }; diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu index 1dfa7f44279..d96dcebe51f 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cu +++ b/paddle/fluid/operators/controlflow/compare_all_op.cu @@ -55,8 +55,8 @@ class CompareReduceOpKernel context.template device_context(); std::vector ins = {x, y}; std::vector outs = {&tmp}; - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, T, bool>(cuda_ctx, ins, &outs, Functor()); + paddle::operators::LaunchSameDimsElementwiseCudaKernel( + cuda_ctx, ins, &outs, Functor()); // Reduce by 'bitwise and' operator std::vector reduce_dims; diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 192902902e5..d7c49466d5a 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -32,6 +32,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/dropout_impl_util.h" #include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/pten/kernels/funcs/cuda_kernel_config.h" @@ -123,6 +124,19 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, } } +template +struct CudaDropoutGradFunctor { + explicit CudaDropoutGradFunctor(const T factor) : factor_(factor) {} + + __device__ __forceinline__ T operator()(const T dout, + const MaskType mask) const { + return dout * static_cast(mask) * factor_; + } + + private: + T factor_; +}; + template __global__ void DropoutGradCUDAKernel(const T* dout, const MaskType* mask, const T factor, const int64_t size, @@ -259,21 +273,13 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, if (dropout_prob == 1.0f) { dX.device(place) = static_cast(0) * dY; } else { - int vec_size = platform::GetVectorizedSize(grad_y.data()); - if (vec_size == 4 && size % 4 == 0) { - auto factor = static_cast(1.0f / (1.0f - dropout_prob)); - auto stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, size, vec_size); - DropoutGradCUDAKernel< - T, uint8_t, - 4><<>>( - grad_y.data(), mask.data(), factor, size, - grad_x->data()); - } else { - dX.device(place) = - dY * M.cast() / static_cast(1.0f - dropout_prob); - } + auto factor = static_cast(1.0f / (1.0f - dropout_prob)); + auto stream = dev_ctx.stream(); + std::vector ins = {&grad_y, &mask}; + std::vector outs = {grad_x}; + auto functor = CudaDropoutGradFunctor(factor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } } else { dX.device(place) = dY * M.cast(); diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 19d3a6c385c..fc6126be058 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -234,21 +234,22 @@ struct FMinFunctor { template struct MinGradXFunctor { - inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const { + inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { return dout * static_cast(x < y); } }; template struct MinGradYFunctor { - inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const { + inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { return dout * static_cast(x >= y); } }; template struct MinGradXYFunctor { - inline HOSTDEVICE pten::framework::Array operator()( - const InT& x, const InT& y, const InT& dout) { + inline HOSTDEVICE pten::framework::Array operator()(const InT x, + const InT y, + const InT dout) { pten::framework::Array outs; // dx = dout * (x < y) outs[0] = static_cast(dout * static_cast(x < y)); @@ -303,21 +304,22 @@ struct MulGradXYFunctor, Complex> { // Ternary compare template struct MaxGradXFunctor { - inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const { + inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { return dout * static_cast(x > y); } }; template struct MaxGradYFunctor { - inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const { + inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { return 
dout * static_cast(x <= y); } }; template struct MaxGradXYFunctor { - inline HOSTDEVICE pten::framework::Array operator()( - const InT& x, const InT& y, const InT& dout) { + inline HOSTDEVICE pten::framework::Array operator()(const InT x, + const InT y, + const InT dout) { pten::framework::Array outs; // dx = dout * (x > y) outs[0] = static_cast(dout * static_cast(x > y)); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index c09d2363bb5..233b0767ed6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -26,8 +26,7 @@ namespace operators { using ElementwiseType = pten::ElementwiseType; -template +template void LaunchSameDimsElementwiseCudaKernel( const KPDevice &ctx, const std::vector &ins, std::vector *outs, Functor func) { @@ -54,9 +53,8 @@ void LaunchSameDimsElementwiseCudaKernel( for (int i = 0; i < pt_outputs_tmp.size(); i++) { pt_outputs.push_back(pt_outputs_tmp[i].get()); } - pten::funcs::LaunchSameDimsElementwiseCudaKernel(ctx, pt_inputs, - &pt_outputs, func); + pten::funcs::LaunchSameDimsElementwiseCudaKernel( + ctx, pt_inputs, &pt_outputs, func); } } // namespace operators diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu index 7979d3a74bb..f149e104eff 100644 --- a/paddle/fluid/operators/label_smooth_op.cu +++ b/paddle/fluid/operators/label_smooth_op.cu @@ -87,8 +87,8 @@ class LabelSmoothGPUKernel : public framework::OpKernel { std::vector ins = {in_t}; std::vector outs = {out_t}; auto functor = LabelSmoothFunctor(epsilon, label_dim); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } } }; @@ -107,8 +107,8 @@ class LabelSmoothGradGPUKernel : public framework::OpKernel { std::vector ins = {d_out_t}; std::vector outs = {d_in_t}; auto functor = LabelSmoothGradFunctor(epsilon); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } }; } // namespace operators diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu index 64d1a479627..b9f273727b0 100644 --- a/paddle/fluid/operators/lgamma_op.cu +++ b/paddle/fluid/operators/lgamma_op.cu @@ -39,8 +39,8 @@ class LgammaKernel std::vector ins = {x}; std::vector outs = {out}; auto functor = CudaLgammaFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } }; diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index abbbffb6331..7c8dfc7f647 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -126,8 +126,7 @@ class PnormCUDAKernel : public framework::OpKernel { std::vector outs = {out_norm}; const auto& cuda_ctx = ctx.template device_context(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T, UnsignedPowFunctor>( + paddle::operators::LaunchSameDimsElementwiseCudaKernel( cuda_ctx, ins, &outs, UnsignedPowFunctor(1. 
/ porder)); } } diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index ec1b4f6d5b2..089ecccac64 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -151,9 +151,8 @@ class CUDARenormKernel : public framework::OpKernel { const auto& cuda_ctx = context.template device_context(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( - cuda_ctx, ins, &outs, func); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(cuda_ctx, ins, + &outs, func); std::vector reduce_axis = {0, 2}; TensorReduceImpl>( cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index b52f46e4a8a..d527cefd724 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -80,9 +80,8 @@ struct GetMask { std::vector ins = {&lhs, &rhs}; std::vector outs = {mask}; auto& dev_ctx = ctx.template device_context(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs, - CompareFunctor()); + paddle::operators::LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, CompareFunctor()); } }; diff --git a/paddle/fluid/platform/function_traits.h b/paddle/fluid/platform/function_traits.h index eca78e03e17..662e3ac58a6 100644 --- a/paddle/fluid/platform/function_traits.h +++ b/paddle/fluid/platform/function_traits.h @@ -54,6 +54,7 @@ template struct FunctionTraits { static const size_t arity = sizeof...(Args); static const bool has_pointer_args = IsPointerArgs::value; + using ArgsTuple = std::tuple; }; } // namespace platform diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 0f26f3d8aa6..110b405bbcb 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -438,14 +438,78 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout, #if defined(__NVCC__) || defined(__HIPCC__) -template +// static unroller +template
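
Illustrative note (not part of the patch): the call sites above drop the explicit ElementwiseType/input-dtype template arguments because, after this change, the launcher reads each input's dtype off the functor's operator() signature via platform::FunctionTraits (the newly added ArgsTuple alias), and only the output dtype remains an explicit template parameter. The standalone sketch below shows that deduction mechanism in plain C++ using a functor whose two inputs have different dtypes (float gradient, uint8 mask, mirroring the new CudaDropoutGradFunctor). The names Traits, FakeLaunch, and ScaleByMaskFunctor are invented for illustration and are not Paddle APIs; this is a minimal sketch of the idea, not the kernel implementation.

    // Sketch only: deduce per-input dtypes from a functor's operator(),
    // the role FunctionTraits::ArgsTuple plays in this patch.
    #include <cstddef>
    #include <cstdio>
    #include <tuple>
    #include <type_traits>

    // Minimal FunctionTraits-style helper: expose the argument list of a
    // functor's const operator() as a std::tuple.
    template <typename T>
    struct Traits : public Traits<decltype(&T::operator())> {};

    template <typename ClassT, typename ReturnT, typename... Args>
    struct Traits<ReturnT (ClassT::*)(Args...) const> {
      static constexpr std::size_t arity = sizeof...(Args);
      using ArgsTuple = std::tuple<Args...>;
    };

    // A functor whose inputs have different dtypes (by-value args, as in the
    // patched functors, so the traits see the element types directly).
    struct ScaleByMaskFunctor {
      float operator()(float dout, unsigned char mask) const {
        return dout * static_cast<float>(mask);
      }
    };

    // Toy "launcher": learns the number of inputs and each input's element
    // type from the functor, as the real kernel does when it loads and casts
    // raw tensor data before invoking the functor.
    template <typename OutT, typename Functor>
    void FakeLaunch(Functor func) {
      using ArgsT = typename Traits<Functor>::ArgsTuple;
      static_assert(Traits<Functor>::arity == 2, "binary functor expected");
      using Arg0 = typename std::tuple_element<0, ArgsT>::type;  // float
      using Arg1 = typename std::tuple_element<1, ArgsT>::type;  // unsigned char
      static_assert(std::is_same<Arg0, float>::value, "first input is float");
      static_assert(std::is_same<Arg1, unsigned char>::value, "second input is uint8");
      OutT out = func(static_cast<Arg0>(2.5f), static_cast<Arg1>(1));
      std::printf("out = %f\n", static_cast<double>(out));
    }

    int main() {
      // Only the output dtype is spelled out; input dtypes come from the functor.
      FakeLaunch<float>(ScaleByMaskFunctor{});
      return 0;
    }

In this scheme, call sites that produce a different output dtype (for example the compare kernels writing bool) keep a single explicit OutT template argument, while mixed input dtypes such as (T, MaskType) need no extra template plumbing at all.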