diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
index 097c9799b70f232568b3ac1ebfda9360984e27e2..07065ac908e4e302d7cc39deddd9a97d3cf3c8ef 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
@@ -88,6 +88,9 @@ namespace distributed {
     case experimental::DataType::BOOL:         \
       func(args);                              \
       break;                                   \
+    case experimental::DataType::BFLOAT16:     \
+      func(args);                              \
+      break;                                   \
     default:                                   \
       VLOG(0) << "Error: Unknown DataType.";   \
       exit(-1);                                \
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index b406f596401effec68adaf45c61248a0053f64ed..75f061f693b9ba4aa335eed70124214935788d30 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -996,6 +996,9 @@ void* GetPointerByOffset(void* raw_pointer,
   } else if (type == experimental::DataType::BOOL) {
     return reinterpret_cast<void*>(reinterpret_cast<bool*>(raw_pointer) +
                                    offset);
+  } else if (type == experimental::DataType::BFLOAT16) {
+    return reinterpret_cast<void*>(reinterpret_cast<uint16_t*>(raw_pointer) +
+                                   offset);
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
         "This datatype in nccl is not supported."));
diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h
index a5d89f6001fa18359aa60c661ace2e9526ae2d6f..5d89da86efa6cf5e3d6d7bb27bc5698400abc31a 100644
--- a/paddle/fluid/platform/device/gpu/nccl_helper.h
+++ b/paddle/fluid/platform/device/gpu/nccl_helper.h
@@ -59,7 +59,7 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
     return ncclUint8;
   } else if (type == framework::proto::VarType::BOOL) {
     return ncclUint8;
-#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000
   } else if (type == framework::proto::VarType::BF16) {
     return ncclBfloat16;
 #endif
@@ -86,7 +86,7 @@ inline ncclDataType_t ToNCCLDataType(experimental::DataType type) {
     return ncclInt8;
   } else if (type == experimental::DataType::BOOL) {
     return ncclUint8;
-#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000
   } else if (type == experimental::DataType::BFLOAT16) {
     return ncclBfloat16;
 #endif
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index 41cb3256c8f5d573afcc50d311d879e250edf70b..82f1f70cd2163e1d31f260a47dfdbbab3e6e4d32 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -478,7 +478,8 @@ def is_initialized():
 
     Check whether the distributed environment has been initialized
 
-    Returns (bool): `True` if distributed environment has been initialized, otherwise `False`.
+    Returns:
+        `True` if distributed environment has been initialized, otherwise `False`.
 
     Examples:
         .. code-block:: python
@@ -626,7 +627,7 @@ def broadcast(tensor, src, group=None, sync_op=True):
 
     Args:
         tensor (Tensor): The Tensor to send if current rank is the source, or the Tensor to receive otherwise. Its data type
-            should be float16, float32, float64, int32, int64, int8, uint8 or bool.
+            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
         src (int): The source rank.
         group (Group, optional): The group instance return by new_group or None for global default group.
         sync_op (bool, optional): Whether this op is a sync op. The default value is True.
@@ -709,7 +710,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True): Args: tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. group (Group, optional): The group instance return by new_group or None for global default group. @@ -817,7 +818,7 @@ def all_gather(tensor_list, tensor, group=None, sync_op=True): Args: tensor_list (list): A list of output Tensors. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8, bool, complex64 or complex128. + should be float16, float32, float64, int32, int64, int8, uint8, bool, bfloat16, complex64 or complex128. tensor (Tensor): The Tensor to send. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool, complex64 or complex128. group (Group, optional): The group instance return by new_group or None for global default group. @@ -999,9 +1000,9 @@ def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True): Args: tensor (Tensor): The output Tensor. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. tensor_list (list|tuple): A list/tuple of Tensors to scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. Default value is None. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. Default value is None. src (int): The source rank id. Default value is 0. group (Group, optional): The group instance return by new_group or None for global default group. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1096,7 +1097,7 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True): Args: in_tensor_list (list): A list of input Tensors. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. out_tensor_list (list): A list of output Tensors. The data type of its elements should be the same as the data type of the input Tensors. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1197,7 +1198,7 @@ def alltoall_single(in_tensor, ``alltoall_single`` is only supported in eager mode. Args: - in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor. in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor`` must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None. @@ -1286,7 +1287,7 @@ def send(tensor, dst=0, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to send. 
Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1352,7 +1353,7 @@ def recv(tensor, src=0, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to receive. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1435,7 +1436,7 @@ def isend(tensor, dst, group=None): Args: tensor (Tensor): The Tensor to send. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1485,7 +1486,7 @@ def irecv(tensor, src=None, group=None): Args: tensor (Tensor): The Tensor to receive. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1594,7 +1595,7 @@ def batch_isend_irecv(p2p_op_list): corresponding tasks. NCCL are currently supported. Args: - p2p_op_list: A list of point-to-point operations(type of each operator is + p2p_op_list (List[P2POp]): A list of point-to-point operations(type of each operator is ``paddle.distributed.P2POp``). The order of the isend/irecv in the list matters and it needs to match with corresponding isend/irecv on the remote end. @@ -1668,9 +1669,9 @@ def reduce_scatter(tensor, Reduces, then scatters a list of tensors to all processes in a group Args: - tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. tensor_list (list[Tensor]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1736,9 +1737,9 @@ def _reduce_scatter_base(output, Reduces, then scatters a flattened tensor to all processes in a group. Args: - output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. input (Tensor): Input tensor that is of size output tensor size times world size. 
Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index 19d6f848792a3916b2521fc5b0fa5af55616db74..6631b7f46e0d0804c25b039f5e309a7cb7cddf58 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -71,14 +71,14 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_allreduce_api MODULES test_collective_allreduce_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_allreduce_api - PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( test_collective_alltoall_api MODULES test_collective_alltoall_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) bash_test_modules( @@ -98,7 +98,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_alltoall_single_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_single_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -125,7 +125,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_broadcast_api MODULES test_collective_broadcast_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_broadcast_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -154,7 +154,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_isend_irecv_api MODULES test_collective_isend_irecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_isend_irecv_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -187,7 +187,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_api MODULES test_collective_reduce_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) bash_test_modules( @@ -207,7 +207,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_scatter_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR 
WITH_ROCM) AND (LINUX)) py_test_modules( @@ -221,7 +221,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_scatter_api MODULES test_collective_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_scatter_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -235,7 +235,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_sendrecv_api MODULES test_collective_sendrecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_sendrecv_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py index 11fe3e4c0259a11c672d199017600d00013246b2..4d5f82e2882203012b8bc28761f152fe07525163 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,10 +25,18 @@ class TestCollectiveAllgatherAPI(test_base.TestCollectiveAPIRunnerBase): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) tensor_list = [] - paddle.distributed.all_gather(tensor_list, tindata) - return [tensor.numpy() for tensor in tensor_list] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_gather(tensor_list, tindata) + return [ + tensor.cast("float32").numpy() for tensor in tensor_list + ] + else: + tindata = paddle.to_tensor(indata) + dist.all_gather(tensor_list, tindata) + return [tensor.numpy() for tensor in tensor_list] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py index 44446bd84a1646399230fbb4022ae669e9f198b4..9bdbaa18177e1ce7fca86daed4975be15aa19322 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,9 +25,15 @@ class TestCollectiveAllreduceAPI(test_base.TestCollectiveAPIRunnerBase): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.all_reduce(tindata) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_reduce(tindata) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.all_reduce(tindata) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py index e0589072ab2ad4ccf913b06618093f0faddc00be..eb19cadb11426ea0ac24b2365a87874da6251ba1 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py @@ -13,23 +13,31 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import test_collective_api_base as test_base -class TestCollectiveAllToAllAPI(TestCollectiveAPIRunnerBase): +class TestCollectiveAllToAllAPI(test_base.TestCollectiveAPIRunnerBase): def __init__(self): self.global_ring_id = 0 def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - tindata = paddle.split(tindata, 2, axis=0) toutdata = [] - paddle.distributed.alltoall(tindata, toutdata) - return [data.numpy() for data in toutdata] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.cast("float32").numpy() for data in toutdata] + else: + tindata = paddle.to_tensor(indata) + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.numpy() for data in toutdata] if __name__ == "__main__": - runtime_main(TestCollectiveAllToAllAPI, "alltoall") + test_base.runtime_main(TestCollectiveAllToAllAPI, "alltoall") diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py index 8a1492b779b62fb116bb08a1fa0c4ece096bed1c..f66b3a74bfd2129006a61112b9fc1ee4758029ae 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,10 +25,17 @@ class TestCollectiveAllToAllSingleAPI(test_base.TestCollectiveAPIRunnerBase): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - toutdata = paddle.to_tensor(indata) - paddle.distributed.alltoall_single(tindata, toutdata) - return [toutdata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + toutdata = paddle.to_tensor(tindata, "float32").cast("uint16") + dist.alltoall_single(tindata, toutdata) + return [toutdata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + toutdata = paddle.to_tensor(indata) + dist.alltoall_single(tindata, toutdata) + return [toutdata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py index acb1b4a5866c84dbba42ab2df420747f842e3f28..9004d27d56183c7d4f5bd04f5fca8df75b894134 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,9 +25,15 @@ class TestCollectiveBroadcastAPI(test_base.TestCollectiveAPIRunnerBase): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.broadcast(tindata, src=1) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.broadcast(tindata, src=1) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.broadcast(tindata, src=1) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py index 5434706234535c0ced2188e3c3fa1c19856316d0..37a38b218c5dc13f5b53c7838290131dab62b575 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,13 +25,23 @@ class TestCollectiveIsendIrecvAPI(test_base.TestCollectiveAPIRunnerBase): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - task = paddle.distributed.isend(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.cast("float32").numpy()] else: - task = paddle.distributed.irecv(tindata, src=0) - task.wait() - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py index 5525bd8fa4aab50517d68a238d130b184567b5f3..5e9dfc8265ea1f997dba4399ce108e7d6b772311 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,9 +25,15 @@ class TestCollectiveReduceAPI(test_base.TestCollectiveAPIRunnerBase): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.reduce(tindata, dst=0) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.reduce(tindata, dst=0) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.reduce(tindata, dst=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py index 19777260b6e89019caee2054a5b091a2e651d033..c9df2459a78e0f242beb954f4add467e83e9ec6b 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,10 +25,17 @@ class TestCollectiveReduceScatterAPI(test_base.TestCollectiveAPIRunnerBase): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - paddle.distributed.reduce_scatter(subdata1, [subdata1, subdata2]) - return [subdata1.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py index fa65928967bdf288751fab1a9f829fc86bf77134..8f27f84a32d5201e51877e66630016b3520d790a 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,15 +25,27 @@ class TestCollectiveScatterAPI(test_base.TestCollectiveAPIRunnerBase): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - if rank == 0: - paddle.distributed.scatter(subdata1, src=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.cast("float32").numpy()] else: - paddle.distributed.scatter(subdata1, - tensor_list=[subdata1, subdata2], - src=1) - return [subdata1.numpy()] + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py index ac8ffde7a48b380a29f8a1218b4ed30058377f9a..b4bf24ffbfaa962e3936b933800c7807fbbf2d91 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py @@ -13,24 +13,34 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import test_collective_api_base as test_base -class TestCollectiveSendRecvAPI(TestCollectiveAPIRunnerBase): +class TestCollectiveSendRecvAPI(test_base.TestCollectiveAPIRunnerBase): def __init__(self): self.global_ring_id = 0 def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - paddle.distributed.send(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.cast("float32").numpy()] else: - paddle.distributed.recv(tindata, src=0) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.numpy()] if __name__ == "__main__": - runtime_main(TestCollectiveSendRecvAPI, "sendrecv") + test_base.runtime_main(TestCollectiveSendRecvAPI, "sendrecv") diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py index af4e6c10baaf9fd5c6e5726e34ee5a16fc9ec416..9040564ce1206c2e246c867259056ecff7166484 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py @@ -26,213 +26,55 @@ class TestCollectiveAllgatherAPI(TestDistBase): pass def test_allgather_nccl(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype=dtype) def test_allgather_gloo(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="bool") - 
self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype=dtype) def test_allgatther_nccl_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype=dtype) def test_allgather_gloo_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int32") - 
self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py index c0bd54a6fad7a08cb78360c861a598162b36eb04..a5080f78bcee2dd68d566cbf35fb5d6881cc786f 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py @@ -41,9 +41,11 @@ class TestCollectiveAllreduceAPI(TestDistBase): def test_allreduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", "allreduce", @@ -53,8 +55,8 @@ class TestCollectiveAllreduceAPI(TestDistBase): def test_allreduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", @@ -65,5 +67,5 @@ class TestCollectiveAllreduceAPI(TestDistBase): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py index a042507ede1d4bb3354396638820a50bf1e4fff9..1edb06ae512d69e8f88e678d38cfde7f8cc246d9 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py @@ -30,9 +30,11 @@ class TestCollectiveAllToAllAPI(TestDistBase): def test_alltoall_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_api_dygraph.py", "alltoall", @@ -41,5 +43,5 @@ class TestCollectiveAllToAllAPI(TestDistBase): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py index 2f18903068edb8bc7546adab355cdce72f7053d5..e3ef3f302f33e8e52d10e66352728c2ccbf89789 100644 --- 
a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py @@ -23,9 +23,11 @@ class TestCollectiveAllToAllSingleAPI(test_base.TestDistBase): def test_alltooall_single_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_single_api_dygraph.py", "alltoall", @@ -34,5 +36,5 @@ class TestCollectiveAllToAllSingleAPI(test_base.TestDistBase): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py index f0c768280524792610995ce18998f5aeb0420c39..8f4e747b622eba8fe8857b4c76f76744a91a9990 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py @@ -35,9 +35,11 @@ class TestCollectiveBroadcastAPI(TestDistBase): def test_broadcast_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", "broadcast", @@ -47,8 +49,8 @@ class TestCollectiveBroadcastAPI(TestDistBase): def test_broadcast_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", @@ -59,5 +61,5 @@ class TestCollectiveBroadcastAPI(TestDistBase): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py index 333da7e6807aaecb676e1c1757a6130caba8bcc0..2b0727cae0c8e9c82906de31abf392c170309733 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py @@ -23,9 +23,11 @@ class TestCollectiveIsendIrecvAPI(test_base.TestDistBase): def test_isend_irecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_isend_irecv_api_dygraph.py", "sendrecv", @@ -34,5 +36,5 @@ class TestCollectiveIsendIrecvAPI(test_base.TestDistBase): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py index 
ccaf61472fe8a92490e2e3e46ffee5dd9c85cf6d..35bff97f91619ee463d30dedc116a1ed0db27615 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py @@ -38,9 +38,11 @@ class TestCollectiveReduceAPI(TestDistBase): def test_reduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", "reduce", @@ -50,8 +52,8 @@ class TestCollectiveReduceAPI(TestDistBase): def test_reduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", @@ -62,5 +64,5 @@ class TestCollectiveReduceAPI(TestDistBase): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py index d490a8bbce5df605e63f5ee0c61847247592c765..669478f58a37dd2c7dbc0f457c1ab414fe00ccda 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py @@ -23,9 +23,11 @@ class TestCollectiveReduceScatterAPI(test_base.TestDistBase): def test_reduce_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_scatter_api_dygraph.py", "reduce_scatter", @@ -34,5 +36,5 @@ class TestCollectiveReduceScatterAPI(test_base.TestDistBase): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py index d5e8e7cc62e162a3c5d88cfb4cdafc9d0861fd8c..ab7de7975feed8252322dbead74cb40b6b053cdd 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py @@ -34,9 +34,11 @@ class TestCollectiveScatterAPI(TestDistBase): def test_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", "scatter", @@ -46,8 +48,8 @@ class TestCollectiveScatterAPI(TestDistBase): def test_scatter_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: 
self.check_with_place("collective_scatter_api_dygraph.py", @@ -58,5 +60,5 @@ class TestCollectiveScatterAPI(TestDistBase): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py index ee8ada3d22be65f585bffde1a17db73eba303e44..3db6df5d46e19f9011d885238beb7cecf86cf392 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py @@ -32,9 +32,11 @@ class TestCollectiveSendRecvAPI(TestDistBase): def test_sendrecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_sendrecv_api_dygraph.py", "sendrecv", @@ -43,5 +45,5 @@ class TestCollectiveSendRecvAPI(TestDistBase): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index 08c7c394ab7883f6fb73425ef380e0cc4dcbcf71..883cf7941e3685b22d54d997130a79d60843775c 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -7,27 +7,27 @@ test_c_split,linux,gpu;rocm,120,DIST,test_runner.py,2,,PYTHONPATH=..;http_proxy= test_collective_split_embedding,linux,rocm;gpu,300,DIST,../dist_test.sh,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_collective_allgather_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_allgather_object_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_allreduce_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_alltoall_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_allreduce_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_alltoall_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_alltoall_single,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_alltoall_single_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_alltoall_single_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_barrier_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_batch_isend_irecv,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_broadcast_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_broadcast_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_cpu_barrier_with_gloo,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_global_gather,linux,gpu;rocm,200,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., 
test_collective_global_scatter,linux,gpu;rocm,200,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_isend_irecv_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_isend_irecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_optimizer,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_process_group,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_reduce,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_reduce_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_reduce_scatter,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_reduce_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_scatter_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_scatter,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_scatter_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_sendrecv,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_sendrecv_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_sendrecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_col_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index e08c0f0f99328282a97960a7a0bae77891a8c18d..3a3e5d99ca7095786288d02859de3c27d4dbeaea 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -23,6 +23,7 @@ from contextlib import closing import paddle import paddle.fluid as fluid from paddle.fluid import core +from paddle_bfloat import bfloat16 def create_bool_test_data(shape=None, seed=None): @@ -76,6 +77,9 @@ def create_test_data(shape=None, dtype=None, seed=None): assert shape, "Shape should be specified" if dtype == "float32" or dtype == "float16" or dtype == "float64": return create_float_test_data(shape=shape, dtype=dtype, seed=seed) + elif dtype == "bfloat16": + # since numpy does not support bfloat16 yet, use `paddle_bfloat` to replace + return create_float_test_data(shape=shape, dtype=bfloat16, seed=seed) elif dtype == "bool": return create_bool_test_data(shape=shape, seed=seed) elif dtype == "int32" or dtype == "int64" or dtype == "int8" or dtype == "uint8": @@ -167,6 +171,15 @@ class TestDistBase(unittest.TestCase): self.temp_dir = tempfile.TemporaryDirectory() + # NOTE: this is 
a hack to get int format nccl version, like 2134 + # if current platform is not linux, version number will be 0 + nccl_version_str = subprocess.check_output( + r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'", + stderr=subprocess.DEVNULL, + shell=True).decode('utf-8') + self._nccl_version = int("".join( + nccl_version_str.split("."))) if nccl_version_str else 0 + def tearDown(self): self.temp_dir.cleanup() @@ -305,6 +318,10 @@ class TestDistBase(unittest.TestCase): model_file, required_envs) input1 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid0) input2 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid1) + # cast bfloat16 to float32 for numeric comparison + if dtype == "bfloat16": + input1 = input1.astype("float32") + input2 = input2.astype("float32") if col_type == "allgather": need_result = np.vstack((input1, input2)) tr_out0 = np.vstack((tr0_out[0], tr0_out[1])) @@ -321,7 +338,13 @@ class TestDistBase(unittest.TestCase): np.testing.assert_allclose(tr1_out[0], need_result, rtol=1e-05) elif col_type == "reduce": need_result = input1 + input2 - np.testing.assert_allclose(tr0_out[0], need_result, rtol=1e-05) + # bfloat16 precision loss comes from truncating the last 16 bits of float32, + # which sums (\sum_{i=-23}^{-8}2^{i}) to about 0.0078 + if dtype == "bfloat16": + rtol = 8e-03 + else: + rtol = 1e-05 + np.testing.assert_allclose(tr0_out[0], need_result, rtol=rtol) elif col_type == "scatter": need_result = input2 need_result1 = need_result[0:need_result.shape[0] // 2] @@ -332,18 +355,28 @@ class TestDistBase(unittest.TestCase): need_result = input1 + input2 need_result1 = need_result[0:need_result.shape[0] // 2] need_result2 = need_result[need_result.shape[0] // 2:] - np.testing.assert_allclose(tr0_out[0], need_result1, rtol=1e-05) - np.testing.assert_allclose(tr1_out[0], need_result2, rtol=1e-05) + if dtype == "bfloat16": + rtol = 8e-03 + else: + rtol = 1e-05 + np.testing.assert_allclose(tr0_out[0], need_result1, rtol=rtol) + np.testing.assert_allclose(tr1_out[0], need_result2, rtol=rtol) elif col_type == "allreduce": need_result = input1 + input2 + if dtype == "bfloat16": + rtol = 8e-03 + atol = 8e-03 + else: + rtol = 1e-05 + atol = 1e-05 np.testing.assert_allclose(tr0_out[0], need_result, - rtol=1e-05, - atol=1e-05) + rtol=rtol, + atol=atol) np.testing.assert_allclose(tr1_out[0], need_result, - rtol=1e-05, - atol=1e-05) + rtol=rtol, + atol=atol) elif col_type == "parallel_embedding": result_data = tr0_out[0] np.random.seed(2020)
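A note on the uint16 workaround shared by all of the *_dygraph.py runners above: below is a minimal sketch of the round trip, assuming `paddle_bfloat.bfloat16` is importable (as in the updated test base) and that a process group has already been initialized; the shape and the all_reduce call are illustrative only, not part of the patch.

import numpy as np
import paddle
import paddle.distributed as dist
from paddle_bfloat import bfloat16  # numpy-compatible bfloat16 dtype used by the test base

# Host-side bfloat16 data, built the same way create_test_data() now does for "bfloat16".
indata = np.random.random((10, 1000)).astype(bfloat16)

# paddle.to_tensor() has no public bfloat16 entry point, so the runners detour through
# float32 and cast to uint16, relying on uint16 being Paddle's storage dtype for bfloat16
# (the undocumented behavior the NOTE comments warn about).
tindata = paddle.to_tensor(indata, "float32").cast("uint16")

dist.all_reduce(tindata)  # any collective works; the tensor travels as uint16/bfloat16

# Cast back to float32 before handing the result to numpy for comparison.
out = tindata.cast("float32").numpy()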
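The bfloat16 cases in the NCCL dygraph tests are gated on the integer version computed in TestDistBase.setUp(); a small sketch of that string-to-int conversion and of why the threshold is 2100 — NCCL 2.10.0 appears to be the first release shipping ncclBfloat16, matching the C++ guard NCCL_VERSION_CODE >= 21000. The helper name here is hypothetical; it just mirrors the expression added to setUp().

# e.g. ldconfig reports "2.13.4" -> 2134; an empty string (non-Linux) maps to 0.
def to_int_version(nccl_version_str):
    return int("".join(nccl_version_str.split("."))) if nccl_version_str else 0

assert to_int_version("2.13.4") == 2134
assert to_int_version("2.10.0") == 2100  # oldest version for which bfloat16 cases run
assert to_int_version("") == 0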
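The relaxed rtol/atol of 8e-03 used in the bfloat16 branches follows from the truncation bound quoted in the comment above; a quick check of that arithmetic:

# Truncating the low 16 of float32's 23 explicit mantissa bits costs at most
# sum_{i=8}^{23} 2**-i in relative precision -- the comment's \sum_{i=-23}^{-8} 2^{i}.
max_rel_truncation_error = sum(2.0**-i for i in range(8, 24))
print(max_rel_truncation_error)  # ~0.00784, rounded up to the 8e-03 tolerance above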