From 95c3d6135841d74211b1343679ee568588051d7b Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Sun, 9 Apr 2023 10:18:03 +0800
Subject: [PATCH] Cherry pick the support of bfloat16 for several operators.
 (#52608)

* Register exp/expm1/logit bf16 activation op kernels (#48702)

* register more bf16 ops

* update to register corresponding backward ops

* Addition of bf16 type support for Compare OP (#46413)

* first commit

* clarify the quotes

* change code style format

* support bfloat16

* add bfloat16 support for more ops (#48272)

* [Bfloat16]register bfloat16 datatype for squared l2 norm (#50908)

* Sync the pull request #51903.

* Add some header files back.

* modify cmake file for cuda11.8 compile (#49020)

* modify cmake file for cuda11.8 compile

* add op_library(fused_embedding_eltwise_layernorm_op DEPS bert_encoder_functor)

* Fix compiling error.

* Cherry-pick pull request #51396.

---------

Co-authored-by: sneaxiy <32832641+sneaxiy@users.noreply.github.com>
Co-authored-by: limingshu <61349199+JamesLim-sy@users.noreply.github.com>
Co-authored-by: Shaojie WANG
Co-authored-by: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com>
---
 .../fused_softmax_mask_upper_triangle_op.cu   | 12 +++++++
 paddle/fluid/operators/math.h                 |  5 +++
 paddle/fluid/operators/math/cross_entropy.cu  |  7 +++-
 paddle/fluid/operators/math/cross_entropy.h   | 33 ++++++++++++++-----
 paddle/phi/kernels/full_kernel.cc             |  3 +-
 .../phi/kernels/gpu/activation_grad_kernel.cu |  9 +++--
 paddle/phi/kernels/gpu/activation_kernel.cu   |  9 +++--
 paddle/phi/kernels/gpu/arg_min_max_kernel.cu  |  2 ++
 .../kernels/gpu/cross_entropy_grad_kernel.cu  | 21 ++++++++++++
 .../phi/kernels/gpu/cross_entropy_kernel.cu   | 17 ++++++++--
 .../phi/kernels/gpu/gather_nd_grad_kernel.cu  |  4 ++-
 paddle/phi/kernels/gpu/gather_nd_kernel.cu    |  4 ++-
 .../kernels/gpu/index_sample_grad_kernel.cu   |  2 ++
 paddle/phi/kernels/gpu/index_sample_kernel.cu |  2 ++
 .../phi/kernels/gpu/tril_triu_grad_kernel.cu  |  3 +-
 paddle/phi/kernels/gpu/tril_triu_kernel.cu    |  3 +-
 paddle/phi/kernels/kps/compare_kernel.cu      | 18 ++++++----
 paddle/phi/kernels/shape_kernel.cc            |  3 +-
 python/paddle/tensor/manipulation.py          | 13 ++++++--
 19 files changed, 138 insertions(+), 32 deletions(-)

diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
index f92479888f8..94699c9ce69 100644
--- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
+++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
@@ -68,6 +68,11 @@ __device__ __inline__ void load_data_upper_tri(plat::float16* dst,
   *(reinterpret_cast<float2*>(dst)) = *(reinterpret_cast<const float2*>(src));
 }
 
+__device__ __inline__ void load_data_upper_tri(plat::bfloat16* dst,
+                                               const plat::bfloat16* src) {
+  *(reinterpret_cast<float2*>(dst)) = *(reinterpret_cast<const float2*>(src));
+}
+
 __device__ __inline__ void load_data_upper_tri(float* dst, const float* src) {
   *(reinterpret_cast<float4*>(dst)) = *(reinterpret_cast<const float4*>(src));
 }
@@ -76,6 +81,10 @@ __device__ __inline__ void load_zero_vector_upper_tri(plat::float16* dst) {
   *(reinterpret_cast<float2*>(dst)) = make_float2(0.0f, 0.0f);
 }
 
+__device__ __inline__ void load_zero_vector_upper_tri(plat::bfloat16* dst) {
+  *(reinterpret_cast<float2*>(dst)) = make_float2(0.0f, 0.0f);
+}
+
 __device__ __inline__ void load_zero_vector_upper_tri(float* dst) {
   *(reinterpret_cast<float4*>(dst)) = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 }
@@ -595,8 +604,11 @@ namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     fused_softmax_mask_upper_triangle,
     ops::SoftmaxMaskFuseUpperTriangleKernel<phi::GPUContext, plat::float16>,
+    ops::SoftmaxMaskFuseUpperTriangleKernel<phi::GPUContext, plat::bfloat16>,
     ops::SoftmaxMaskFuseUpperTriangleKernel<phi::GPUContext, float>);
 REGISTER_OP_CUDA_KERNEL(
     fused_softmax_mask_upper_triangle_grad,
     ops::SoftmaxMaskFuseUpperTriangleGradKernel<phi::GPUContext, plat::float16>,
+    ops::SoftmaxMaskFuseUpperTriangleGradKernel<phi::GPUContext,
+                                                plat::bfloat16>,
     ops::SoftmaxMaskFuseUpperTriangleGradKernel<phi::GPUContext, float>);
diff --git a/paddle/fluid/operators/math.h b/paddle/fluid/operators/math.h
index 47281fb0280..f376663ecec 100644
--- a/paddle/fluid/operators/math.h
+++ b/paddle/fluid/operators/math.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "math.h"  // NOLINT
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 
@@ -33,6 +34,10 @@ inline HOSTDEVICE platform::float16 real_log(platform::float16 x) {
   return static_cast<platform::float16>(::logf(static_cast<float>(x)));
 }
 
+inline HOSTDEVICE phi::dtype::bfloat16 real_log(phi::dtype::bfloat16 x) {
+  return static_cast<phi::dtype::bfloat16>(::logf(static_cast<float>(x)));
+}
+
 inline HOSTDEVICE float real_log(float x) { return ::logf(x); }
 
 inline HOSTDEVICE double real_log(double x) { return ::log(x); }
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
index c366dd6fcef..f8bd4b60d47 100644
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/cross_entropy.h"
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 
@@ -152,7 +154,10 @@ void CrossEntropyFunctor<phi::GPUContext, T>::operator()(
 
 template class CrossEntropyFunctor<phi::GPUContext, float>;
 template class CrossEntropyFunctor<phi::GPUContext, double>;
-template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
+template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(8, 1, 0)
+template class CrossEntropyFunctor<phi::GPUContext, platform::bfloat16>;
+#endif
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h
index 0de10789ba0..651579005b9 100644
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
@@ -17,7 +17,8 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 
 namespace paddle {
@@ -46,14 +47,30 @@ struct TolerableValue {
 // Also. In standard implementation of cross entropy, other
 // framework not has the ValueClipping.
 template <>
-struct TolerableValue<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& x) const {
-    if (platform::isfinite(x))
+struct TolerableValue<phi::dtype::float16> {
+  HOSTDEVICE phi::dtype::float16 operator()(
+      const phi::dtype::float16& x) const {
+    if (phi::dtype::isfinite(x)) {
       return x;
-    else if (x > static_cast<platform::float16>(0))
-      return std::numeric_limits<platform::float16>::max();
-    else
-      return std::numeric_limits<platform::float16>::min();
+    } else if (x > static_cast<phi::dtype::float16>(0)) {
+      return std::numeric_limits<phi::dtype::float16>::max();
+    } else {
+      return std::numeric_limits<phi::dtype::float16>::min();
+    }
+  }
+};
+
+template <>
+struct TolerableValue<phi::dtype::bfloat16> {
+  HOSTDEVICE phi::dtype::bfloat16 operator()(
+      const phi::dtype::bfloat16& x) const {
+    if (phi::dtype::isfinite(x)) {
+      return x;
+    } else if (x > static_cast<phi::dtype::bfloat16>(0)) {
+      return std::numeric_limits<phi::dtype::bfloat16>::max();
+    } else {
+      return std::numeric_limits<phi::dtype::bfloat16>::min();
+    }
   }
 };
 
diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc
index 9622bff5c25..ce898210633 100644
--- a/paddle/phi/kernels/full_kernel.cc
+++ b/paddle/phi/kernels/full_kernel.cc
@@ -59,7 +59,8 @@ PD_REGISTER_KERNEL(full_batch_size_like,
                    int,
                    int64_t,
                    bool,
-                   phi::dtype::float16) {
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
   kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
 }
 #endif
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index b947c70cb89..d40f3b5013a 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -370,7 +370,8 @@ PD_REGISTER_KERNEL(exp_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
@@ -385,7 +386,8 @@ PD_REGISTER_KERNEL(expm1_grad,
                    phi::Expm1GradKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(logit_grad,
                    GPU,
@@ -393,7 +395,8 @@ PD_REGISTER_KERNEL(logit_grad,
                    phi::LogitGradKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(square_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
index e57332c4075..ab32f420701 100644
--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -212,21 +212,24 @@ PD_REGISTER_KERNEL(exp,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(expm1,
                    GPU,
                    ALL_LAYOUT,
                    phi::Expm1Kernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(logit,
                    GPU,
                    ALL_LAYOUT,
                    phi::LogitKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(square,
                    GPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
index 13db1853495..4c440ed0dd7 100644
--- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
+++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
@@ -255,6 +255,7 @@ PD_REGISTER_KERNEL(arg_min,
                    ALL_LAYOUT,
                    phi::ArgMinKernel,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double,
                    int32_t,
@@ -267,6 +268,7 @@ PD_REGISTER_KERNEL(arg_max,
                    ALL_LAYOUT,
                    phi::ArgMaxKernel,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double,
                    int32_t,
diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
index 5d40304c5e0..93cdf64a8ef 100644
--- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
@@ -282,6 +282,7 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx,
 
 }  // namespace phi
 
+#ifdef PADDLE_WITH_HIP
 PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad,
                    GPU,
                    ALL_LAYOUT,
@@ -289,3 +290,23 @@ PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad,
                    float,
                    double,
                    phi::dtype::float16) {}
+#else
+#if CUDNN_VERSION_MIN(8, 1, 0)
+PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CrossEntropyWithSoftmaxGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
+PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CrossEntropyWithSoftmaxGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#endif
+#endif
diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
index 76201a1077e..087ba293fb8 100644
--- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
@@ -252,7 +252,7 @@ __device__ __forceinline__ AccT ThreadReduce(const T* input,
     input -= offset;
     size += offset;
     if (tid >= offset) {
-      val = reducer(val, input[tid]);
+      val = reducer(val, static_cast<AccT>(input[tid]));
     }
     size -= blockDim.x;
     input += blockDim.x;
@@ -268,14 +268,14 @@ __device__ __forceinline__ AccT ThreadReduce(const T* input,
 
 #pragma unroll
     for (int i = 0; i < VecSize; ++i) {
-      val = reducer(val, ins[i]);
+      val = reducer(val, static_cast<AccT>(ins[i]));
     }
   }
 
   // scalar part
   tid = size - remain + threadIdx.x;
   for (; tid < size; tid += blockDim.x) {
-    val = reducer(val, input[tid]);
+    val = reducer(val, static_cast<AccT>(input[tid]));
   }
   return val;
 }
@@ -1470,6 +1470,16 @@ PD_REGISTER_KERNEL(cross_entropy_with_softmax,
                    float,
                    phi::dtype::float16) {}
 #else
+#if CUDNN_VERSION_MIN(8, 1, 0)
+PD_REGISTER_KERNEL(cross_entropy_with_softmax,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CrossEntropyWithSoftmaxKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
 PD_REGISTER_KERNEL(cross_entropy_with_softmax,
                    GPU,
                    ALL_LAYOUT,
@@ -1478,3 +1488,4 @@ PD_REGISTER_KERNEL(cross_entropy_with_softmax,
                    double,
                    phi::dtype::float16) {}
 #endif
+#endif
diff --git a/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu b/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu
index a78dc717b04..da1045c27c5 100644
--- a/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/gather_nd_grad_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/scatter.cu.h"
@@ -63,4 +64,5 @@ PD_REGISTER_KERNEL(gather_nd_grad,
                    double,
                    int64_t,
                    int,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/gather_nd_kernel.cu b/paddle/phi/kernels/gpu/gather_nd_kernel.cu
index 7b241295890..b8ac4aa263a 100644
--- a/paddle/phi/kernels/gpu/gather_nd_kernel.cu
+++ b/paddle/phi/kernels/gpu/gather_nd_kernel.cu
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/gather_nd_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/scatter.cu.h"
@@ -58,4 +59,5 @@ PD_REGISTER_KERNEL(gather_nd,
                    int,
                    int16_t,
                    bool,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
index d2671dff7b0..db1c3966e91 100644
--- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
@@ -134,6 +134,8 @@ PD_REGISTER_KERNEL(index_sample_grad,
                    GPU,
                    ALL_LAYOUT,
                    phi::IndexSampleGradKernel,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double,
                    int,
diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu
index 9b95d761fcb..053851fa265 100644
--- a/paddle/phi/kernels/gpu/index_sample_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu
@@ -107,6 +107,8 @@ PD_REGISTER_KERNEL(index_sample,
                    GPU,
                    ALL_LAYOUT,
                    phi::IndexSampleKernel,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double,
                    int,
diff --git a/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu
index 3271b38ae87..ba93ed41a49 100644
--- a/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu
@@ -25,4 +25,5 @@ PD_REGISTER_KERNEL(tril_triu_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/tril_triu_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_kernel.cu
index 65dcca70584..db42fa7d425 100644
--- a/paddle/phi/kernels/gpu/tril_triu_kernel.cu
+++ b/paddle/phi/kernels/gpu/tril_triu_kernel.cu
@@ -25,4 +25,5 @@ PD_REGISTER_KERNEL(tril_triu,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/kps/compare_kernel.cu b/paddle/phi/kernels/kps/compare_kernel.cu
index b981d802255..b882fcc2a6c 100644
--- a/paddle/phi/kernels/kps/compare_kernel.cu
+++ b/paddle/phi/kernels/kps/compare_kernel.cu
@@ -114,7 +114,8 @@ PD_REGISTER_KERNEL(less_than,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(less_equal,
                    KPS,
                    ALL_LAYOUT,
@@ -125,7 +126,8 @@ PD_REGISTER_KERNEL(less_equal,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(greater_than,
                    KPS,
                    ALL_LAYOUT,
@@ -136,7 +138,8 @@ PD_REGISTER_KERNEL(greater_than,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(greater_equal,
                    KPS,
                    ALL_LAYOUT,
@@ -147,7 +150,8 @@ PD_REGISTER_KERNEL(greater_equal,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(equal,
                    KPS,
                    ALL_LAYOUT,
@@ -158,7 +162,8 @@ PD_REGISTER_KERNEL(equal,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(not_equal,
                    KPS,
                    ALL_LAYOUT,
@@ -169,7 +174,8 @@ PD_REGISTER_KERNEL(not_equal,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(equal_all,
                    KPS,
diff --git a/paddle/phi/kernels/shape_kernel.cc b/paddle/phi/kernels/shape_kernel.cc
index 2c2b41e3c66..b866719859c 100644
--- a/paddle/phi/kernels/shape_kernel.cc
+++ b/paddle/phi/kernels/shape_kernel.cc
@@ -63,7 +63,8 @@ PD_REGISTER_KERNEL(shape,
                    double,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>,
-                   phi::dtype::float16) {
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
   kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
 }
 #endif
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index f987e8b89cf..422a11c7e88 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -3791,8 +3791,17 @@ def gather_nd(x, index, name=None):
     check_variable_and_dtype(
         x,
         'x',
-        ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'],
-        'gather_np',
+        [
+            'bool',
+            'float16',
+            'uint16',
+            'float32',
+            'float64',
+            'int16',
+            'int32',
+            'int64',
+        ],
+        'gather_nd',
     )
     check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather_np')
    helper = LayerHelper('gather_nd', **locals())
-- 
GitLab
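
A quick way to exercise the kernels registered above from Python is sketched below. It is not part of the patch: it assumes a CUDA build of Paddle that already contains these changes (and cuDNN >= 8.1 for the bfloat16 cross-entropy registrations), and it only uses public paddle APIs. Note that bfloat16 shows up as 'uint16' in legacy dtype-check lists such as the gather_nd one above.

    import paddle

    # Assumes a GPU build of Paddle that includes this patch.
    paddle.set_device("gpu")

    # Create a bfloat16 tensor by casting a float32 one.
    x = paddle.rand([4, 8], dtype="float32").cast("bfloat16")

    # Activation kernels registered for bfloat16 in this patch.
    y = paddle.exp(x)
    z = paddle.expm1(x)

    # Compare kernels now accept bfloat16 inputs and return bool tensors.
    mask = paddle.greater_than(y, z)

    # gather_nd with a bfloat16 source tensor.
    index = paddle.to_tensor([[0, 1], [2, 3]], dtype="int64")
    gathered = paddle.gather_nd(y, index)

    print(mask.dtype, gathered.dtype)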