diff --git a/paddle/phi/kernels/gpu/trace_grad_kernel.cu b/paddle/phi/kernels/gpu/trace_grad_kernel.cu
index 6692c1e19b033c3945387166f9954cc71fa6de32..a97e71a01874eb23cf4ef262500d9beaff00f7e9 100644
--- a/paddle/phi/kernels/gpu/trace_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/trace_grad_kernel.cu
@@ -27,5 +27,6 @@ PD_REGISTER_KERNEL(trace_grad,
                    int,
                    int64_t,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu
index 4a749c5b3347da24c8aba35d33673801b4b7e407..671ca490e136a2e5adb6e61eee7360f0ab9ff835 100644
--- a/paddle/phi/kernels/gpu/trace_kernel.cu
+++ b/paddle/phi/kernels/gpu/trace_kernel.cu
@@ -52,5 +52,6 @@ PD_REGISTER_KERNEL(trace,
                    int,
                    int64_t,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
diff --git a/python/paddle/fluid/tests/unittests/test_trace_op.py b/python/paddle/fluid/tests/unittests/test_trace_op.py
index e910658721628a76bcbb907f98422cbb3f129cb1..f13dd1701affe8d44ec4e7d088228e2750c16e53 100644
--- a/python/paddle/fluid/tests/unittests/test_trace_op.py
+++ b/python/paddle/fluid/tests/unittests/test_trace_op.py
@@ -15,7 +15,7 @@
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16
 
 import paddle
 from paddle import fluid, tensor
@@ -68,6 +68,82 @@ class TestTraceOpCase2(TestTraceOp):
         )
 
 
+class TestTraceFP16Op1(TestTraceOp):
+    def init_config(self):
+        self.dtype = np.float16
+        self.case = np.random.randn(20, 6).astype(self.dtype)
+        self.inputs = {'Input': self.case}
+        self.attrs = {'offset': 0, 'axis1': 0, 'axis2': 1}
+        self.target = np.trace(self.inputs['Input'])
+
+
+class TestTraceFP16Op2(TestTraceOp):
+    def init_config(self):
+        self.dtype = np.float16
+        self.case = np.random.randn(2, 20, 2, 3).astype(self.dtype)
+        self.inputs = {'Input': self.case}
+        self.attrs = {'offset': -5, 'axis1': 1, 'axis2': -1}
+        self.target = np.trace(
+            self.inputs['Input'],
+            offset=self.attrs['offset'],
+            axis1=self.attrs['axis1'],
+            axis2=self.attrs['axis2'],
+        )
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support bfloat16",
+)
+class TestTraceBF16Op1(OpTest):
+    def setUp(self):
+        self.op_type = "trace"
+        self.python_api = paddle.trace
+        self.init_config()
+        self.outputs = {'Out': self.target}
+
+        self.inputs['Input'] = convert_float_to_uint16(self.inputs['Input'])
+        self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out'])
+        self.place = core.CUDAPlace(0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(
+            self.place, ['Input'], 'Out', numeric_grad_delta=0.02
+        )
+
+    def init_config(self):
+        self.dtype = np.uint16
+        self.np_dtype = np.float32
+        self.case = np.random.randn(20, 6).astype(self.np_dtype)
+        self.inputs = {'Input': self.case}
+        self.attrs = {'offset': 0, 'axis1': 0, 'axis2': 1}
+        self.target = np.trace(self.inputs['Input'])
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support bfloat16",
+)
+class TestTraceBF16Op2(TestTraceBF16Op1):
+    def init_config(self):
+        self.dtype = np.uint16
+        self.np_dtype = np.float32
+        self.case = np.random.randn(2, 20, 2, 3).astype(self.np_dtype)
+        self.inputs = {'Input': self.case}
+        self.attrs = {'offset': -5, 'axis1': 1, 'axis2': -1}
+        self.target = np.trace(
+            self.inputs['Input'],
+            offset=self.attrs['offset'],
+            axis1=self.attrs['axis1'],
+            axis2=self.attrs['axis2'],
+        )
+
+
 class TestTraceAPICase(unittest.TestCase):
     def test_case1(self):
         case = np.random.randn(2, 20, 2, 3).astype('float32')