From 95c3d6135841d74211b1343679ee568588051d7b Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Sun, 9 Apr 2023 10:18:03 +0800
Subject: [PATCH] Cherry pick the support of bfloat16 for several operators.
 (#52608)

* Register exp/expm1/logit bf16 activation op kernels (#48702)

* register more bf16 ops

* update to register corresponding backward ops

* Addition of bf16 type support for Compare OP (#46413)

* first commit

* clarify the quotes

* change code style format

* support bfloat16

* add bfloat16 support for more ops (#48272)

* [Bfloat16]register bfloat16 datatype for squared l2 norm (#50908)

* Sync the pull request #51903.

* Add some header files back.

* modify cmake file for cuda11.8 compile (#49020)

* modify cmake file for cuda11.8 compile

* add op_library(fused_embedding_eltwise_layernorm_op DEPS bert_encoder_functor)

* Fix compiling error.

* Cherry-pick pull request #51396.

---------

Co-authored-by: sneaxiy <32832641+sneaxiy@users.noreply.github.com>
Co-authored-by: limingshu <61349199+JamesLim-sy@users.noreply.github.com>
Co-authored-by: Shaojie WANG
Co-authored-by: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com>
---
 .../fused_softmax_mask_upper_triangle_op.cu   | 12 +++++++
 paddle/fluid/operators/math.h                 |  5 +++
 paddle/fluid/operators/math/cross_entropy.cu  |  7 +++-
 paddle/fluid/operators/math/cross_entropy.h   | 33 ++++++++++++++-----
 paddle/phi/kernels/full_kernel.cc             |  3 +-
 .../phi/kernels/gpu/activation_grad_kernel.cu |  9 +++--
 paddle/phi/kernels/gpu/activation_kernel.cu   |  9 +++--
 paddle/phi/kernels/gpu/arg_min_max_kernel.cu  |  2 ++
 .../kernels/gpu/cross_entropy_grad_kernel.cu  | 21 ++++++++++++
 .../phi/kernels/gpu/cross_entropy_kernel.cu   | 17 ++++++++--
 .../phi/kernels/gpu/gather_nd_grad_kernel.cu  |  4 ++-
 paddle/phi/kernels/gpu/gather_nd_kernel.cu    |  4 ++-
 .../kernels/gpu/index_sample_grad_kernel.cu   |  2 ++
 paddle/phi/kernels/gpu/index_sample_kernel.cu |  2 ++
 .../phi/kernels/gpu/tril_triu_grad_kernel.cu  |  3 +-
 paddle/phi/kernels/gpu/tril_triu_kernel.cu    |  3 +-
 paddle/phi/kernels/kps/compare_kernel.cu      | 18 ++++++----
 paddle/phi/kernels/shape_kernel.cc            |  3 +-
 python/paddle/tensor/manipulation.py          | 13 ++++++--
 19 files changed, 138 insertions(+), 32 deletions(-)

diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
index f92479888f8..94699c9ce69 100644
--- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
+++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
@@ -68,6 +68,11 @@ __device__ __inline__ void load_data_upper_tri(plat::float16* dst,
   *(reinterpret_cast<float2*>(dst)) = *(reinterpret_cast<const float2*>(src));
 }
 
+__device__ __inline__ void load_data_upper_tri(plat::bfloat16* dst,
+                                               const plat::bfloat16* src) {
+  *(reinterpret_cast<float2*>(dst)) = *(reinterpret_cast<const float2*>(src));
+}
+
 __device__ __inline__ void load_data_upper_tri(float* dst, const float* src) {
   *(reinterpret_cast<float4*>(dst)) = *(reinterpret_cast<const float4*>(src));
 }
@@ -76,6 +81,10 @@ __device__ __inline__ void load_zero_vector_upper_tri(plat::float16* dst) {
   *(reinterpret_cast<float2*>(dst)) = make_float2(0.0f, 0.0f);
 }
 
+__device__ __inline__ void load_zero_vector_upper_tri(plat::bfloat16* dst) {
+  *(reinterpret_cast<float2*>(dst)) = make_float2(0.0f, 0.0f);
+}
+
 __device__ __inline__ void load_zero_vector_upper_tri(float* dst) {
   *(reinterpret_cast<float4*>(dst)) = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 }
@@ -595,8 +604,11 @@ namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     fused_softmax_mask_upper_triangle,
     ops::SoftmaxMaskFuseUpperTriangleKernel<phi::GPUContext, plat::float16>,
+    ops::SoftmaxMaskFuseUpperTriangleKernel<phi::GPUContext, plat::bfloat16>,
     ops::SoftmaxMaskFuseUpperTriangleKernel<phi::GPUContext, float>);
 REGISTER_OP_CUDA_KERNEL(
     fused_softmax_mask_upper_triangle_grad,
     ops::SoftmaxMaskFuseUpperTriangleGradKernel<phi::GPUContext, plat::float16>,
+    ops::SoftmaxMaskFuseUpperTriangleGradKernel<phi::GPUContext,
+                                                plat::bfloat16>,
     ops::SoftmaxMaskFuseUpperTriangleGradKernel<phi::GPUContext, float>);
diff --git a/paddle/fluid/operators/math.h b/paddle/fluid/operators/math.h
index 47281fb0280..f376663ecec 100644
--- a/paddle/fluid/operators/math.h
+++ b/paddle/fluid/operators/math.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "math.h"  // NOLINT
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 
@@ -33,6 +34,10 @@ inline HOSTDEVICE platform::float16 real_log(platform::float16 x) {
   return static_cast<platform::float16>(::logf(static_cast<float>(x)));
 }
 
+inline HOSTDEVICE phi::dtype::bfloat16 real_log(phi::dtype::bfloat16 x) {
+  return static_cast<phi::dtype::bfloat16>(::logf(static_cast<float>(x)));
+}
+
 inline HOSTDEVICE float real_log(float x) { return ::logf(x); }
 
 inline HOSTDEVICE double real_log(double x) { return ::log(x); }
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
index c366dd6fcef..f8bd4b60d47 100644
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/cross_entropy.h"
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 
@@ -152,7 +154,10 @@ void CrossEntropyFunctor<phi::GPUContext, T>::operator()(
 
 template class CrossEntropyFunctor<phi::GPUContext, float>;
 template class CrossEntropyFunctor<phi::GPUContext, double>;
-template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
+template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(8, 1, 0)
+template class CrossEntropyFunctor<phi::GPUContext, platform::bfloat16>;
+#endif
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h
index 0de10789ba0..651579005b9 100644
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
@@ -17,7 +17,8 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 
 namespace paddle {
@@ -46,14 +47,30 @@ struct TolerableValue {
 // Also. In standard implementation of cross entropy, other
 // framework not has the ValueClipping.
 template <>
-struct TolerableValue<platform::float16> {
-  HOSTDEVICE platform::float16 operator()(const platform::float16& x) const {
-    if (platform::isfinite(x))
+struct TolerableValue<phi::dtype::float16> {
+  HOSTDEVICE phi::dtype::float16 operator()(
+      const phi::dtype::float16& x) const {
+    if (phi::dtype::isfinite(x)) {
       return x;
-    else if (x > static_cast<platform::float16>(0))
-      return std::numeric_limits<platform::float16>::max();
-    else
-      return std::numeric_limits<platform::float16>::min();
+    } else if (x > static_cast<phi::dtype::float16>(0)) {
+      return std::numeric_limits<phi::dtype::float16>::max();
+    } else {
+      return std::numeric_limits<phi::dtype::float16>::min();
+    }
+  }
+};
+
+template <>
+struct TolerableValue<phi::dtype::bfloat16> {
+  HOSTDEVICE phi::dtype::bfloat16 operator()(
+      const phi::dtype::bfloat16& x) const {
+    if (phi::dtype::isfinite(x)) {
+      return x;
+    } else if (x > static_cast<phi::dtype::bfloat16>(0)) {
+      return std::numeric_limits<phi::dtype::bfloat16>::max();
+    } else {
+      return std::numeric_limits<phi::dtype::bfloat16>::min();
+    }
   }
 };
 
diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc
index 9622bff5c25..ce898210633 100644
--- a/paddle/phi/kernels/full_kernel.cc
+++ b/paddle/phi/kernels/full_kernel.cc
@@ -59,7 +59,8 @@ PD_REGISTER_KERNEL(full_batch_size_like,
                    int,
                    int64_t,
                    bool,
-                   phi::dtype::float16) {
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
   kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
 }
 #endif
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index b947c70cb89..d40f3b5013a 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -370,7 +370,8 @@ PD_REGISTER_KERNEL(exp_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
@@ -385,7 +386,8 @@ PD_REGISTER_KERNEL(expm1_grad,
                    phi::Expm1GradKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(logit_grad,
                    GPU,
@@ -393,7 +395,8 @@ PD_REGISTER_KERNEL(logit_grad,
                    phi::LogitGradKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(square_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
index e57332c4075..ab32f420701 100644
--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -212,21 +212,24 @@ PD_REGISTER_KERNEL(exp,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(expm1,
                    GPU,
                    ALL_LAYOUT,
                    phi::Expm1Kernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(logit,
                    GPU,
                    ALL_LAYOUT,
                    phi::LogitKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(square,
                    GPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
index 13db1853495..4c440ed0dd7 100644
--- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
+++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
@@ -255,6 +255,7 @@ PD_REGISTER_KERNEL(arg_min,
                    ALL_LAYOUT,
                    phi::ArgMinKernel,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double,
                    int32_t,
@@ -267,6 +268,7 @@ PD_REGISTER_KERNEL(arg_max,
                    ALL_LAYOUT,
                    phi::ArgMaxKernel,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double,
                    int32_t,
diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
index 5d40304c5e0..93cdf64a8ef 100644
--- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
@@ -282,6 +282,7 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx,
 
 }  // namespace phi
 
+#ifdef PADDLE_WITH_HIP
 PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad,
                    GPU,
                    ALL_LAYOUT,
@@ -289,3 +290,23 @@ PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad,
                    float,
                    double,
                    phi::dtype::float16) {}
+#else
+#if CUDNN_VERSION_MIN(8, 1, 0)
+PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CrossEntropyWithSoftmaxGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
+PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CrossEntropyWithSoftmaxGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#endif
+#endif
diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
index 76201a1077e..087ba293fb8 100644
--- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
@@ -252,7 +252,7 @@ __device__ __forceinline__ AccT ThreadReduce(const T* input,
     input -= offset;
     size += offset;
     if (tid >= offset) {
-      val = reducer(val, input[tid]);
+      val = reducer(val, static_cast<AccT>(input[tid]));
     }
     size -= blockDim.x;
     input += blockDim.x;
@@ -268,14 +268,14 @@ __device__ __forceinline__ AccT ThreadReduce(const T* input,
 
 #pragma unroll
     for (int i = 0; i < VecSize; ++i) {
-      val = reducer(val, ins[i]);
+      val = reducer(val, static_cast<AccT>(ins[i]));
     }
   }
 
   // scalar part
   tid = size - remain + threadIdx.x;
   for (; tid < size; tid += blockDim.x) {
-    val = reducer(val, input[tid]);
+    val = reducer(val, static_cast<AccT>(input[tid]));
   }
   return val;
 }
@@ -1470,6 +1470,16 @@ PD_REGISTER_KERNEL(cross_entropy_with_softmax,
                    float,
                    phi::dtype::float16) {}
 #else
+#if CUDNN_VERSION_MIN(8, 1, 0)
+PD_REGISTER_KERNEL(cross_entropy_with_softmax,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CrossEntropyWithSoftmaxKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
 PD_REGISTER_KERNEL(cross_entropy_with_softmax,
                    GPU,
                    ALL_LAYOUT,
@@ -1478,3 +1488,4 @@ PD_REGISTER_KERNEL(cross_entropy_with_softmax,
                    double,
                    phi::dtype::float16) {}
 #endif
+#endif
diff --git a/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu b/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu
index a78dc717b04..da1045c27c5 100644
--- a/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/gather_nd_grad_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/scatter.cu.h"
@@ -63,4 +64,5 @@ PD_REGISTER_KERNEL(gather_nd_grad,
                    double,
                    int64_t,
                    int,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/gather_nd_kernel.cu b/paddle/phi/kernels/gpu/gather_nd_kernel.cu
index 7b241295890..b8ac4aa263a 100644
--- a/paddle/phi/kernels/gpu/gather_nd_kernel.cu
+++ b/paddle/phi/kernels/gpu/gather_nd_kernel.cu
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/gather_nd_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/scatter.cu.h"
@@ -58,4 +59,5 @@ PD_REGISTER_KERNEL(gather_nd,
                    int,
                    int16_t,
                    bool,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
index d2671dff7b0..db1c3966e91 100644
--- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
@@ -134,6 +134,8 @@ PD_REGISTER_KERNEL(index_sample_grad,
                    GPU,
                    ALL_LAYOUT,
                    phi::IndexSampleGradKernel,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double,
                    int,
diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu
index 9b95d761fcb..053851fa265 100644
--- a/paddle/phi/kernels/gpu/index_sample_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu
@@ -107,6 +107,8 @@ PD_REGISTER_KERNEL(index_sample,
                    GPU,
                    ALL_LAYOUT,
                    phi::IndexSampleKernel,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double,
                    int,
diff --git a/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu
index 3271b38ae87..ba93ed41a49 100644
--- a/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu
@@ -25,4 +25,5 @@ PD_REGISTER_KERNEL(tril_triu_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/tril_triu_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_kernel.cu
index 65dcca70584..db42fa7d425 100644
--- a/paddle/phi/kernels/gpu/tril_triu_kernel.cu
+++ b/paddle/phi/kernels/gpu/tril_triu_kernel.cu
@@ -25,4 +25,5 @@ PD_REGISTER_KERNEL(tril_triu,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/kps/compare_kernel.cu b/paddle/phi/kernels/kps/compare_kernel.cu
index b981d802255..b882fcc2a6c 100644
--- a/paddle/phi/kernels/kps/compare_kernel.cu
+++ b/paddle/phi/kernels/kps/compare_kernel.cu
@@ -114,7 +114,8 @@ PD_REGISTER_KERNEL(less_than,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(less_equal,
                    KPS,
                    ALL_LAYOUT,
@@ -125,7 +126,8 @@ PD_REGISTER_KERNEL(less_equal,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(greater_than,
                    KPS,
                    ALL_LAYOUT,
@@ -136,7 +138,8 @@ PD_REGISTER_KERNEL(greater_than,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(greater_equal,
                    KPS,
                    ALL_LAYOUT,
@@ -147,7 +150,8 @@ PD_REGISTER_KERNEL(greater_equal,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(equal,
                    KPS,
                    ALL_LAYOUT,
@@ -158,7 +162,8 @@ PD_REGISTER_KERNEL(equal,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(not_equal,
                    KPS,
                    ALL_LAYOUT,
@@ -169,7 +174,8 @@ PD_REGISTER_KERNEL(not_equal,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(equal_all,
                    KPS,
diff --git a/paddle/phi/kernels/shape_kernel.cc b/paddle/phi/kernels/shape_kernel.cc
index 2c2b41e3c66..b866719859c 100644
--- a/paddle/phi/kernels/shape_kernel.cc
+++ b/paddle/phi/kernels/shape_kernel.cc
@@ -63,7 +63,8 @@ PD_REGISTER_KERNEL(shape,
                    double,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>,
-                   phi::dtype::float16) {
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
   kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
 }
 #endif
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index f987e8b89cf..422a11c7e88 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -3791,8 +3791,17 @@ def gather_nd(x, index, name=None):
     check_variable_and_dtype(
         x,
         'x',
-        ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'],
-        'gather_np',
+        [
+            'bool',
+            'float16',
+            'uint16',
+            'float32',
+            'float64',
+            'int16',
+            'int32',
+            'int64',
+        ],
+        'gather_nd',
     )
     check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather_np')
    helper = LayerHelper('gather_nd', **locals())
-- 
GitLab
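
A quick way to exercise the kernels registered above from Python is sketched below. It is not part of the patch: it assumes a CUDA build of Paddle that already contains these changes (and cuDNN >= 8.1 for the bfloat16 cross-entropy registrations), and it only uses public paddle APIs. Note that bfloat16 shows up as 'uint16' in legacy dtype-check lists such as the gather_nd one above.

    import paddle

    # Assumes a GPU build of Paddle that includes this patch.
    paddle.set_device("gpu")

    # Create a bfloat16 tensor by casting a float32 one.
    x = paddle.rand([4, 8], dtype="float32").cast("bfloat16")

    # Activation kernels registered for bfloat16 in this patch.
    y = paddle.exp(x)
    z = paddle.expm1(x)

    # Compare kernels now accept bfloat16 inputs and return bool tensors.
    mask = paddle.greater_than(y, z)

    # gather_nd with a bfloat16 source tensor.
    index = paddle.to_tensor([[0, 1], [2, 3]], dtype="int64")
    gathered = paddle.gather_nd(y, index)

    print(mask.dtype, gathered.dtype)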