diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu
index 601735c2f148adc14c94e786c7707b920f00fd8b..05a110fe65b839e5d305fea0a14aaeb1c83aeaf6 100644
--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
@@ -94,24 +94,19 @@ class CastCUDAOpKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-
-#ifdef PADDLE_WITH_HIP
-REGISTER_OP_CUDA_KERNEL(
-    cast, ops::CastCUDAOpKernel<float>, ops::CastCUDAOpKernel<double>,
-    ops::CastCUDAOpKernel<int>, ops::CastCUDAOpKernel<int64_t>,
-    ops::CastCUDAOpKernel<int16_t>, ops::CastCUDAOpKernel<bool>,
-    ops::CastCUDAOpKernel<uint8_t>,
-    ops::CastCUDAOpKernel<paddle::platform::float16>,
-    ops::CastCUDAOpKernel<paddle::platform::complex<float>>,
-    ops::CastCUDAOpKernel<paddle::platform::complex<double>>);
+namespace plat = paddle::platform;
+
+#define REGISTER_CAST_CUDA_BASE(op_name, ...)                               \
+  REGISTER_OP_CUDA_KERNEL(                                                  \
+      op_name, ops::CastCUDAOpKernel<float>, ops::CastCUDAOpKernel<double>, \
+      ops::CastCUDAOpKernel<int>, ops::CastCUDAOpKernel<int64_t>,           \
+      ops::CastCUDAOpKernel<int16_t>, ops::CastCUDAOpKernel<bool>,          \
+      ops::CastCUDAOpKernel<uint8_t>, ops::CastCUDAOpKernel<plat::float16>, \
+      ops::CastCUDAOpKernel<plat::complex<float>>,                          \
+      ops::CastCUDAOpKernel<plat::complex<double>>, ##__VA_ARGS__);
+
+#if !defined(PADDLE_WITH_HIP)
+REGISTER_CAST_CUDA_BASE(cast, ops::CastCUDAOpKernel<plat::bfloat16>)
 #else
-REGISTER_OP_CUDA_KERNEL(
-    cast, ops::CastCUDAOpKernel<float>, ops::CastCUDAOpKernel<double>,
-    ops::CastCUDAOpKernel<int>, ops::CastCUDAOpKernel<int64_t>,
-    ops::CastCUDAOpKernel<int16_t>, ops::CastCUDAOpKernel<bool>,
-    ops::CastCUDAOpKernel<uint8_t>,
-    ops::CastCUDAOpKernel<paddle::platform::float16>,
-    ops::CastCUDAOpKernel<paddle::platform::bfloat16>,
-    ops::CastCUDAOpKernel<paddle::platform::complex<float>>,
-    ops::CastCUDAOpKernel<paddle::platform::complex<double>>);
+REGISTER_CAST_CUDA_BASE(cast)
 #endif
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 3621d20fa247218a5fabd67ce3fc41f7c5397afa..41fd0b442fe1c50e5cdf26df814874f164cffae5 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -147,6 +147,9 @@ def get_numeric_gradient(place,
         op.run(scope, place)
         for output_name in output_names:
             output_numpy = np.array(scope.find_var(output_name).get_tensor())
+            # numpy.dtype does not have bfloat16, thus we use numpy.uint16 to
+            # store bfloat16 data, and it needs to be converted to float to
+            # check the floating-point precision.
             if tensor_to_check._dtype() == core.VarDesc.VarType.BF16:
                 output_numpy = convert_uint16_to_float(output_numpy)
             sum.append(output_numpy.astype(tensor_to_check_dtype).mean())
@@ -362,11 +365,26 @@ class OpTest(unittest.TestCase):
         self.dtype = data_type
 
     def is_bfloat16_op(self):
+        # self.dtype is the dtype of inputs, and is set in infer_dtype_from_inputs_outputs.
+        # Make sure this function is called after calling infer_dtype_from_inputs_outputs.
         return self.dtype == np.uint16 or (
-            hasattr(self, 'mkldnn_data_type') and
-            getattr(self, 'mkldnn_data_type') is "bfloat16") or (
-                hasattr(self, 'attrs') and 'mkldnn_data_type' in self.attrs and
-                self.attrs['mkldnn_data_type'] == 'bfloat16')
+            hasattr(self, 'output_dtype') and
+            self.output_dtype == np.uint16) or (
+                hasattr(self, 'mkldnn_data_type') and
+                getattr(self, 'mkldnn_data_type') is "bfloat16") or (
+                    hasattr(self, 'attrs') and
+                    'mkldnn_data_type' in self.attrs and
+                    self.attrs['mkldnn_data_type'] == 'bfloat16')
+
+    def is_mkldnn_op(self):
+        return (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or (
+            hasattr(self, "attrs") and "use_mkldnn" in self.attrs and
+            self.attrs["use_mkldnn"] == True)
+
+    def is_xpu_op(self):
+        return (hasattr(self, "use_xpu") and self.use_xpu == True) or (
+            hasattr(self, "attrs") and "use_xpu" in self.attrs and
+            self.attrs["use_xpu"] == True)
 
     def infer_dtype_from_inputs_outputs(self, inputs, outputs):
         def is_np_data(input):
@@ -398,8 +416,8 @@ class OpTest(unittest.TestCase):
 
         # infer dtype from inputs, and dtype means the precision of the test
         # collect dtype of all inputs
-        dtype_set = set()
-        infer_dtype(inputs, dtype_set)
+        input_dtype_set = set()
+        infer_dtype(inputs, input_dtype_set)
         dtype_list = [
             np.dtype(np.float64), np.dtype(np.float32), np.dtype(np.float16),
             np.dtype(np.int64), np.dtype(np.int32), np.dtype(np.uint16),
@@ -408,12 +426,20 @@ class OpTest(unittest.TestCase):
         ]
         # check the dtype in dtype_list in order, select the first dtype that in dtype_set
        for dtype in dtype_list:
-            if dtype in dtype_set:
+            if dtype in input_dtype_set:
                 self.dtype = dtype
                 break
-        # save dtype in class attr
+        # save input dtype in class attr
         self.__class__.dtype = self.dtype
 
+        # infer dtype of outputs
+        output_dtype_set = set()
+        infer_dtype(outputs, output_dtype_set)
+        for dtype in dtype_list:
+            if dtype in output_dtype_set:
+                self.output_dtype = dtype
+                break
+
     def feed_var(self, input_vars, place):
         feed_map = {}
         for var_name in input_vars:
@@ -439,14 +465,10 @@ class OpTest(unittest.TestCase):
 
     def _append_ops(self, block):
         self.__class__.op_type = self.op_type  # for ci check, please not delete it for now
-        if (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or \
-            (hasattr(self, "attrs") and "use_mkldnn" in self.attrs and \
-                self.attrs["use_mkldnn"] == True):
+        if self.is_mkldnn_op():
             self.__class__.use_mkldnn = True
 
-        if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
-            (hasattr(self, "attrs") and "use_xpu" in self.attrs and \
-                self.attrs["use_xpu"] == True):
+        if self.is_xpu_op():
             self.__class__.use_xpu = True
 
         op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
@@ -1092,12 +1114,15 @@ class OpTest(unittest.TestCase):
         atol = 0
 
         if self.is_bfloat16_op():
-            check_dygraph = False
-            if hasattr(self, 'force_fp32_output') and getattr(
-                    self, 'force_fp32_output'):
-                atol = 1e-2
+            if self.is_mkldnn_op():
+                check_dygraph = False
+                if hasattr(self, 'force_fp32_output') and getattr(
+                        self, 'force_fp32_output'):
+                    atol = 1e-2
+                else:
+                    atol = 2
             else:
-                atol = 2
+                atol = 1e-2
 
         if no_check_set is not None:
             if self.op_type not in no_check_set_white_list.no_check_set_white_list:
@@ -1193,6 +1218,7 @@ class OpTest(unittest.TestCase):
                 expect = self.outputs[out_name]
                 expect_t = expect[0] if isinstance(expect, tuple) else expect
 
+                # np.uint16 represents bfloat16
                 if actual_t.dtype == np.uint16 and expect_t.dtype in [
                         np.float32, np.float64
                 ]:
@@ -1205,6 +1231,7 @@ class OpTest(unittest.TestCase):
                     expect_t = convert_uint16_to_float(expect_t)
                    actual_t = convert_uint16_to_float(actual_t)
                    atol = max(atol, 0.03)
 
+                # NOTE(zhiqiu): np.allclose([], [1.]) returns True
                 # see details: https://stackoverflow.com/questions/38331703/why-does-numpys-broadcasting-sometimes-allow-comparing-arrays-of-different-leng
                 if expect_t.size == 0:
@@ -1214,13 +1241,19 @@ class OpTest(unittest.TestCase):
                         np.allclose(
                             actual_t,
                             expect_t,
-                            rtol=rtol,
                             atol=atol,
+                            rtol=rtol,
                             equal_nan=equal_nan),
                         "Output (" + out_name + ") has diff at " + str(place) +
                         "\nExpect " + str(expect_t) + "\n" + "But Got" +
                         str(actual_t) + " in class " + self.__class__.__name__)
                 if check_dygraph:
+                    if self.is_bfloat16_op():
+                        if imperative_actual_t.dtype == np.uint16:
+                            imperative_actual_t = convert_uint16_to_float(
+                                imperative_actual_t)
+                        if expect_t.dtype == np.uint16:
+                            expect_t = convert_uint16_to_float(expect_t)
                     if six.moves.reduce(
                             lambda x, y: x * y, imperative_actual_t.shape,
                             1) == 0 and six.moves.reduce(
@@ -1232,6 +1265,7 @@ class OpTest(unittest.TestCase):
                                 imperative_actual_t,
                                 expect_t,
                                 atol=atol,
+                                rtol=rtol,
                                 equal_nan=equal_nan),
                             "Output (" + out_name + ") has diff at " +
                             str(place) + "\nExpect " + str(expect_t) + "\n" +
@@ -1340,14 +1374,10 @@ class OpTest(unittest.TestCase):
                      check_dygraph=True,
                      inplace_atol=None):
         self.__class__.op_type = self.op_type
-        if (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or \
-            (hasattr(self, "attrs") and "use_mkldnn" in self.attrs and \
-                self.attrs["use_mkldnn"] == True):
+        if self.is_mkldnn_op():
             self.__class__.use_mkldnn = True
 
-        if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
-            (hasattr(self, "attrs") and "use_xpu" in self.attrs and \
-                self.attrs["use_xpu"] == True):
+        if self.is_xpu_op():
             self.__class__.use_xpu = True
 
         places = self._get_places()
@@ -1452,10 +1482,10 @@ class OpTest(unittest.TestCase):
         op_outputs = self.outputs if hasattr(self, "outputs") else dict()
         op_attrs = self.attrs if hasattr(self, "attrs") else dict()
 
-        if self.is_bfloat16_op():
+        self._check_grad_helper()
+        if self.is_bfloat16_op() and self.is_mkldnn_op():
             check_dygraph = False
 
-        self._check_grad_helper()
         if self.dtype == np.float64 and \
             self.op_type not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST:
             numeric_grad_delta = 1e-5
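
A note on the loose bfloat16 tolerances used in check_output above (atol = 1e-2, atol = 2 for the raw uint16 comparison, max(atol, 0.03) after conversion): bfloat16 carries only 8 significant bits, so a float32 value can shift by roughly one part in 2**7 when routed through bf16. The snippet below is a self-contained numpy illustration added for this write-up, not code from the patch; it bounds the float32 -> bf16 -> float32 round-trip error, assuming simple truncation of the low mantissa bits.

# Illustration only (not part of the patch): bfloat16 keeps the upper 16 bits
# of an IEEE float32, so a round trip float32 -> bf16 -> float32 changes a
# normal value by at most about one bf16 ulp, i.e. ~2**-7 in relative terms.
import numpy as np

x = np.random.random(size=[10, 10]).astype(np.float32)
bits = x.view(np.uint32)
bf16 = (bits >> 16).astype(np.uint16)                     # truncate to the high 16 bits
x_back = (bf16.astype(np.uint32) << 16).view(np.float32)  # decode back to float32
rel_err = np.abs(x_back - x) / np.maximum(np.abs(x), 1e-12)
assert rel_err.max() < 2.0**-7                             # <= one bf16 ulp (~0.8%)
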
diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py
index 0fc3dccab4a64d118b94496d99bdfd0760f3bb6e..948e344e4c158abd1e4bdee2a62e83a29aa0d6ea 100644
--- a/python/paddle/fluid/tests/unittests/test_cast_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cast_op.py
@@ -14,7 +14,6 @@
 
 from __future__ import print_function
 
-import op_test
 import unittest
 import numpy as np
 
@@ -22,9 +21,10 @@
 import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
+from op_test import OpTest, convert_uint16_to_float, convert_float_to_uint16
 
 
-class TestCastOp1(op_test.OpTest):
+class TestCastOpFp32ToFp64(OpTest):
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float32')}
@@ -42,7 +42,7 @@ class TestCastOp1(op_test.OpTest):
         self.check_grad(['X'], ['Out'])
 
 
-class TestCastOp2(op_test.OpTest):
+class TestCastOpFp16ToFp32(OpTest):
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float16')}
@@ -57,7 +57,7 @@ class TestCastOp2(op_test.OpTest):
         self.check_output(atol=1e-3)
 
 
-class TestCastOp3(op_test.OpTest):
+class TestCastOpFp32ToFp16(OpTest):
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float32')}
@@ -72,6 +72,36 @@ class TestCastOp3(op_test.OpTest):
         self.check_output(atol=1e-3)
 
 
+class TestCastOpBf16ToFp32(OpTest):
+    def setUp(self):
+        ipt = np.array(np.random.randint(10, size=[10, 10])).astype('uint16')
+        self.inputs = {'X': ipt}
+        self.outputs = {'Out': convert_uint16_to_float(ipt)}
+        self.attrs = {
+            'in_dtype': int(core.VarDesc.VarType.BF16),
+            'out_dtype': int(core.VarDesc.VarType.FP32)
+        }
+        self.op_type = 'cast'
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestCastOpFp32ToBf16(OpTest):
+    def setUp(self):
+        ipt = np.random.random(size=[10, 10]).astype('float32')
+        self.inputs = {'X': ipt}
+        self.outputs = {'Out': convert_float_to_uint16(ipt)}
+        self.attrs = {
+            'in_dtype': int(core.VarDesc.VarType.FP32),
+            'out_dtype': int(core.VarDesc.VarType.BF16)
+        }
+        self.op_type = 'cast'
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestCastOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
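
The two new cast tests build their expected outputs with convert_uint16_to_float and convert_float_to_uint16 imported from op_test, which carry bfloat16 tensors around as np.uint16 bit patterns because numpy has no bfloat16 dtype. The sketch below is a rough, self-contained stand-in for what such helpers amount to, written for this note rather than taken from op_test; the names are illustrative, and it truncates instead of rounding, which the real helpers may do differently.

# Hypothetical stand-ins for the op_test conversion helpers used by
# TestCastOpBf16ToFp32 / TestCastOpFp32ToBf16 (illustration only).
import numpy as np

def float_to_uint16(x):
    # Keep the high 16 bits of each float32 value (truncation; the real helper
    # may round to nearest even).
    x = np.asarray(x, dtype=np.float32)
    return (x.view(np.uint32) >> 16).astype(np.uint16)

def uint16_to_float(x):
    # Re-expand the bf16 bit pattern into a float32 with a zeroed low mantissa.
    x = np.asarray(x, dtype=np.uint16)
    return (x.astype(np.uint32) << 16).view(np.float32)

ipt = np.random.random(size=[10, 10]).astype('float32')
assert np.allclose(uint16_to_float(float_to_uint16(ipt)), ipt, atol=1e-2)
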