diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index fdb7a3b2cb447904face736d52be665e3c6c91cc..50e4f7285b1698b3524666c812325f2b12ea1fe7 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -25,6 +25,7 @@ from .parallel_with_gloo import gloo_release

 from paddle.distributed.fleet.dataset import InMemoryDataset  # noqa: F401
 from paddle.distributed.fleet.dataset import QueueDataset  # noqa: F401
+from paddle.distributed.fleet.base.topology import ParallelMode  # noqa: F401

 from .collective import broadcast  # noqa: F401
 from .collective import all_reduce  # noqa: F401
@@ -86,4 +87,5 @@ __all__ = [  # noqa
     "wait",
     "get_rank",
     "ProbabilityEntry",
+    "ParallelMode",
 ]
diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py
index 5b8d185212c23c0cfd5db8831b9c0667be8741f7..ef34fd144a703b2996b34ca940bfa403b11f257b 100644
--- a/python/paddle/distributed/fleet/base/topology.py
+++ b/python/paddle/distributed/fleet/base/topology.py
@@ -27,6 +27,22 @@ _HYBRID_PARALLEL_GROUP = None


 class ParallelMode(object):
+    """
+    These are all the parallel modes currently supported:
+    - DATA_PARALLEL: Distribute input data to different devices.
+    - TENSOR_PARALLEL: Shard tensors in the network to different devices.
+    - PIPELINE_PARALLEL: Place different layers of the network on different devices.
+    - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states
+      corresponding to the parameters to each device.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            parallel_mode = paddle.distributed.ParallelMode
+            print(parallel_mode.DATA_PARALLEL)  # 0
+
+    """
     DATA_PARALLEL = 0
     TENSOR_PARALLEL = 1
     PIPELINE_PARALLEL = 2
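
Note (not part of the patch): a minimal sketch of what this change exposes. With the re-export added to python/paddle/distributed/__init__.py, ParallelMode is reachable directly from paddle.distributed, and, as the new docstring's example suggests, it is a plain namespace of integer constants. SHARDING_PARALLEL's numeric value is not visible in this hunk, so only the attribute name is used below.

    import paddle

    # ParallelMode is re-exported at the package level by this patch.
    parallel_mode = paddle.distributed.ParallelMode

    # A simple namespace of integer constants (values shown in the hunk).
    print(parallel_mode.DATA_PARALLEL)      # 0
    print(parallel_mode.TENSOR_PARALLEL)    # 1
    print(parallel_mode.PIPELINE_PARALLEL)  # 2

    # SHARDING_PARALLEL is documented in the new docstring; its value is not
    # shown in this hunk, so it is only referenced by name here.
    print(parallel_mode.SHARDING_PARALLEL)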