diff --git a/paddle/phi/kernels/funcs/eigen/broadcast.cc b/paddle/phi/kernels/funcs/eigen/broadcast.cc index 3459d7acd6baf0c4192719159ff02db721886332..008c51249f2497241af3bd9765aeadc4e4e425ba 100644 --- a/paddle/phi/kernels/funcs/eigen/broadcast.cc +++ b/paddle/phi/kernels/funcs/eigen/broadcast.cc @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -73,6 +74,7 @@ struct EigenBroadcastGrad { template struct FUNCTOR INSTANTIATION(EigenBroadcast, bool); INSTANTIATION(EigenBroadcast, dtype::float16); +INSTANTIATION(EigenBroadcast, dtype::bfloat16); INSTANTIATION(EigenBroadcast, float); INSTANTIATION(EigenBroadcast, double); INSTANTIATION(EigenBroadcast, int); diff --git a/paddle/phi/kernels/funcs/eigen/broadcast.cu b/paddle/phi/kernels/funcs/eigen/broadcast.cu index d9de69ec55e8b5852766cbfc48244aa6347438fc..742081a30c1a0aa249012be3003573e0330e1266 100644 --- a/paddle/phi/kernels/funcs/eigen/broadcast.cu +++ b/paddle/phi/kernels/funcs/eigen/broadcast.cu @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -73,6 +74,7 @@ struct EigenBroadcastGrad { template struct FUNCTOR INSTANTIATION(EigenBroadcast, bool); INSTANTIATION(EigenBroadcast, dtype::float16); +INSTANTIATION(EigenBroadcast, dtype::bfloat16); INSTANTIATION(EigenBroadcast, float); INSTANTIATION(EigenBroadcast, double); INSTANTIATION(EigenBroadcast, int); diff --git a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu index 6ca8dbd9205d801f2c6a08e4b4db83f924964bbb..950f811475c99f508654d36b3fbc5c3131f22e41 100644 --- a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu @@ -24,4 +24,5 @@ PD_REGISTER_KERNEL(einsum_grad, phi::EinsumGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/tile_kernel.cu b/paddle/phi/kernels/gpu/tile_kernel.cu index 0c3c29e82c42aefae33a4a9be9e9a7d9ec0c1e99..990877a8445cbf98d5b487a969a24059dbc24c84 100644 --- a/paddle/phi/kernels/gpu/tile_kernel.cu +++ b/paddle/phi/kernels/gpu/tile_kernel.cu @@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(tile, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index aceb97a49b1c24040a5a2c47cea45dfe07a0c6a4..a72db326807f8eea865b197e6723924413e29a9b 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -197,20 +197,24 @@ void EinsumGradKernel(const Context& dev_ctx, // release the cache tensor dTC to save memory right now. they are useless // now. 
cache.clear(); - *(x_grad[0]) = PerformTileAndReduction(dev_ctx, - labeltype, - labelshape, - broadcast_dims, - ellipsis_dims[0], - ops[0], - dA); - *(x_grad[1]) = PerformTileAndReduction(dev_ctx, - labeltype, - labelshape, - broadcast_dims, - ellipsis_dims[1], - ops[1], - dB); + if (x_grad[0]) { + *(x_grad[0]) = PerformTileAndReduction(dev_ctx, + labeltype, + labelshape, + broadcast_dims, + ellipsis_dims[0], + ops[0], + dA); + } + if (x_grad[1]) { + *(x_grad[1]) = PerformTileAndReduction(dev_ctx, + labeltype, + labelshape, + broadcast_dims, + ellipsis_dims[1], + ops[1], + dB); + } } } } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py index c58d46edde7536d0210a25f829decc04de0fb0fd..b33a943c9f27e20047703fa56d1b6d9a0cea73f7 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -478,5 +478,23 @@ class TestStaticGraphShape(unittest.TestCase): self.assertEqual(C.shape, (-1, 384)) +class TestBF16(unittest.TestCase): + """ + EinsumOp supports the bfloat16 type; add a unittest here for correctness. + """ + + def test_shape(self): + cuda_major = paddle.version.cuda().split('.')[0].strip() + if paddle.is_compiled_with_cuda() and int(cuda_major) >= 11: + """ MatmulKernel supports bfloat16 only if cuda_major >= 11.0. + """ + A = paddle.to_tensor(np.array([1.0, 2.0])).astype(paddle.bfloat16) + A = A.cuda() + B = paddle.to_tensor(np.array([2.0, 3.0])).astype(paddle.bfloat16) + B = B.cuda() + C = paddle.einsum('i,i->', A, B) + self.assertEqual(C.item(), 8.0) + + if __name__ == "__main__": unittest.main()