From 9dd1f4bff1d7f87b5e79332c6a944eb874bb3d2a Mon Sep 17 00:00:00 2001 From: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com> Date: Thu, 9 Feb 2023 16:07:05 +0800 Subject: [PATCH] remove paddle.fluid.dygraph.parallel.ParallelEnv (#50157) * remove dygraph.parallel.ParallelEnv * fix logger.py error: AttributeError: module 'paddle' has no attribute 'distributed' * move the implementation to the root folder * logger.py: import ParallelEnv from paddle.parallel to avoid a circular import * add a comment explaining why ParallelEnv is imported from paddle.parallel in logger.py and remove the API interface in paddle/parallel.py * remove the outdated Env alias and its note * decouple logger.py from ParallelEnv * remove another reference to parallel in init.py --- python/paddle/device/__init__.py | 3 +- python/paddle/distributed/__init__.py | 6 +- .../distributed/auto_parallel/engine.py | 5 +- python/paddle/distributed/parallel.py | 216 +++++++++++++++- .../passes/auto_parallel_quantization.py | 5 +- .../paddle/fluid/dataloader/batch_sampler.py | 2 +- python/paddle/fluid/dygraph/parallel.py | 239 +----------------- .../custom_runtime/process_group_xccl.py | 7 +- .../auto_parallel/amp_pass_unittest.py | 3 +- .../auto_parallel/clip_grad_by_global_norm.py | 3 +- .../gradient_merge_pass_unittest.py | 3 +- .../auto_parallel/recompute_pass_unittest.py | 3 +- .../auto_parallel/sharding_newexe.py | 3 +- .../auto_parallel/sharding_pass_unittest.py | 3 +- .../auto_parallel/test_fused_linear_pass.py | 3 +- .../unittests/auto_parallel/test_pass_bf16.py | 3 +- .../auto_parallel/test_selective_recompute.py | 3 +- .../collective/process_group_gloo.py | 5 +- .../collective/process_group_nccl.py | 5 +- .../tests/unittests/xpu/process_group_bkcl.py | 5 +- python/paddle/hapi/callbacks.py | 18 +- python/paddle/hapi/logger.py | 4 +- python/paddle/hapi/model.py | 48 ++-- python/paddle/utils/download.py | 2 +- 24 files changed, 297 insertions(+), 300 deletions(-) diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index d3bcd56db7..fac5d76b2b 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -19,7 +19,6 @@ import ctypes import paddle from paddle.fluid import core from paddle.fluid import framework -from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.fluid.framework import is_compiled_with_cinn # noqa: F401 from paddle.fluid.framework import is_compiled_with_cuda # noqa: F401 from paddle.fluid.framework import is_compiled_with_rocm # noqa: F401 @@ -238,7 +237,7 @@ def _convert_to_place(device): "The device should not be 'gpu', " "since PaddlePaddle is not compiled with CUDA" ) - place = core.CUDAPlace(ParallelEnv().dev_id) + place = core.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) elif lower_device == 'xpu': if not core.is_compiled_with_xpu(): raise ValueError( diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 900cdacb11..9be44effbc 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -19,6 +19,7 @@ from .launch.main import launch # noqa: F401 from .parallel import init_parallel_env # noqa: F401 from .parallel import get_rank # noqa: F401 from .parallel import get_world_size # noqa: F401 +from .parallel import ParallelEnv # noqa: F401 from .parallel_with_gloo import gloo_init_parallel_env from .parallel_with_gloo import gloo_barrier @@ -69,11 +70,6 @@ from .entry_attr import ProbabilityEntry # noqa: F401 from .entry_attr import CountFilterEntry # noqa: F401 from 
.entry_attr import ShowClickEntry # noqa: F401 -# (TODO: GhostScreaming) It needs migration of ParallelEnv. However, -# it's hard to migrate APIs in paddle.fluid.dygraph.parallel completely. -# It will be replaced later. -from paddle.fluid.dygraph.parallel import ParallelEnv # noqa: F401 - from . import cloud_utils # noqa: F401 from .sharding import group_sharded_parallel # noqa: F401 diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index a270d75459..f098564a30 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -26,7 +26,6 @@ import paddle.distributed.auto_parallel.utils as auto_utils import paddle.utils as utils from paddle import static from paddle.distributed import fleet -from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.fluid.executor import _to_name_str from paddle.fluid.layers.utils import flatten from paddle.framework import IrGraph @@ -771,7 +770,9 @@ class Engine: self._place = _get_device() if isinstance(self._place, paddle.framework.CUDAPlace): - self._place = paddle.framework.CUDAPlace(ParallelEnv().dev_id) + self._place = paddle.framework.CUDAPlace( + paddle.distributed.ParallelEnv().dev_id + ) if self._strategy.seed: paddle.seed(self._strategy.seed + self._dp_ranks[0]) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 7dad9831e7..6cc7bfa581 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -41,9 +41,6 @@ from paddle.distributed.fleet.launch_utils import check_backend # (TODO: GhostScreaming) It will be removed later. from paddle.fluid import core -# (TODO: GhostScreaming) It will be removed later. -from paddle.fluid.dygraph.parallel import ParallelEnv - # (TODO: GhostScreaming) It will be removed later. from paddle.framework import ( _set_expected_place, @@ -60,6 +57,219 @@ ParallelStrategy = core.ParallelStrategy _global_parallel_env = None +class ParallelEnv: + """ + .. note:: + This API is not recommended, if you need to get rank and world_size, + it is recommended to use ``paddle.distributed.get_rank()`` and + ``paddle.distributed.get_world_size()`` . + + This class is used to obtain the environment variables required for + the parallel execution of ``paddle.nn.Layer`` in dynamic mode. + + The parallel execution in dynamic mode needs to be started using ``paddle.distributed.launch`` + or ``paddle.distributed.spawn`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed as dist + + def train(): + # 1. initialize parallel environment + dist.init_parallel_env() + + # 2. get current ParallelEnv + parallel_env = dist.ParallelEnv() + print("rank: ", parallel_env.rank) + print("world_size: ", parallel_env.world_size) + + # print result in process 1: + # rank: 1 + # world_size: 2 + # print result in process 2: + # rank: 2 + # world_size: 2 + + if __name__ == '__main__': + # 1. start by ``paddle.distributed.spawn`` (default) + dist.spawn(train, nprocs=2) + # 2. 
start by ``paddle.distributed.launch`` + # train() + """ + + def __init__(self): + self._rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self._world_size = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) + self._device_type = str(os.getenv("PADDLE_XCCL_BACKEND", "")) + + # imperative only support one gpu or xpu + if self._device_type != "": + FLAGS_selected_custom_devices = 'FLAGS_selected_{}s'.format( + self._device_type + ) + selected_custom_devices = os.getenv( + FLAGS_selected_custom_devices, "0" + ).split(",") + self._device_id = int(selected_custom_devices[0]) + else: + if core.is_compiled_with_cuda(): + selected_gpus = os.getenv("FLAGS_selected_gpus", "0").split(",") + self._device_id = int(selected_gpus[0]) + elif core.is_compiled_with_xpu(): + selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",") + self._device_id = int(selected_xpus[0]) + elif core.is_compiled_with_npu(): + selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",") + self._device_id = int(selected_npus[0]) + elif core.is_compiled_with_mlu(): + selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",") + self._device_id = int(selected_mlus[0]) + + self._trainer_endpoints = os.getenv( + "PADDLE_TRAINER_ENDPOINTS", "" + ).split(",") + self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") + self._nrings = int(os.getenv("FLAGS_nccl_nrings", "1")) + assert ( + self._nrings > 0 + ), "nccl_nrings must be an integer greater than 0." + assert ( + self._nrings < 9 + ), "nccl_nrings should be less than 9, which is enough in most scenarios." + + @property + def rank(self): + """ + Rank of current trainer. + + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . The default value is 0. + + Examples: + .. code-block:: python + + # execute this command in terminal: export PADDLE_TRAINER_ID=0 + import paddle.distributed as dist + + env = dist.ParallelEnv() + print("The rank is %d" % env.rank) + # The rank is 0 + """ + return self._rank + + @property + def world_size(self): + """ + The number of trainers (number of processes participating in current job). + + Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . The default value is 1. + + Examples: + .. code-block:: python + + # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 + import paddle.distributed as dist + + env = dist.ParallelEnv() + print("The world_size is %d" % env.world_size) + # The world_size is 4 + """ + return self._world_size + + @property + def device_id(self): + """ + The ID of selected GPU card for parallel training. + + Its value is equal to the value of the environment variable ``FLAGS_selected_gpus`` . The default value is 0. + + Examples: + .. code-block:: python + + # execute this command in terminal: export FLAGS_selected_gpus=1 + import paddle.distributed as dist + + env = dist.ParallelEnv() + print("The device id are %d" % env.device_id) + # The device id are 1 + """ + return self._device_id + + @property + def device_type(self): + """ + The type of custom device for parallel training. + + Its value is equal to the value of the environment variable ``PADDLE_XCCL_BACKEND`` . The default value is None. + + """ + return self._device_type + + @property + def current_endpoint(self): + """ + The endpoint of current trainer, it is in the form of (node IP + port). + + Its value is equal to the value of the environment variable ``PADDLE_CURRENT_ENDPOINT`` . The default value is "". + + Examples: + .. 
code-block:: python + + # execute this command in terminal: export PADDLE_CURRENT_ENDPOINT=127.0.0.1:6170 + import paddle.distributed as dist + + env = dist.ParallelEnv() + print("The current endpoint are %s" % env.current_endpoint) + # The current endpoint are 127.0.0.1:6170 + """ + return self._current_endpoint + + @property + def trainer_endpoints(self): + """ + The endpoints of all trainer nodes in the task, + which are used to broadcast the NCCL ID when NCCL2 is initialized. + + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ENDPOINTS`` . The default value is "". + + Examples: + .. code-block:: python + + # execute this command in terminal: export PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171 + import paddle.distributed as dist + + env = dist.ParallelEnv() + print("The trainer endpoints are %s" % env.trainer_endpoints) + # The trainer endpoints are ['127.0.0.1:6170', '127.0.0.1:6171'] + """ + return self._trainer_endpoints + + @property + def nrings(self): + """ + Nrings of current trainer. + + Its value is equal to the value of the environment variable ``FLAGS_nccl_nrings`` . The default value is 1. + + Examples: + .. code-block:: python + + # execute this command in terminal: export FLAGS_nccl_nrings=1 + import paddle.distributed as dist + + env = dist.ParallelEnv() + print("The nrings is %d" % env.nrings) + # the number of ring is 1 + """ + return self._nrings + + # [aliases] Compatible with old method names + local_rank = rank + nranks = world_size + dev_id = device_id + + def _get_global_parallel_env(): global _global_parallel_env if _global_parallel_env is None: diff --git a/python/paddle/distributed/passes/auto_parallel_quantization.py b/python/paddle/distributed/passes/auto_parallel_quantization.py index ea4357fdcc..3638b8c4cb 100644 --- a/python/paddle/distributed/passes/auto_parallel_quantization.py +++ b/python/paddle/distributed/passes/auto_parallel_quantization.py @@ -17,7 +17,6 @@ import logging import numpy as np import paddle -from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.framework import IrGraph, core from paddle.static.quantization import ( AddQuantDequantForInferencePass, @@ -72,7 +71,9 @@ class QuantizationPass(PassBase): # TODO: scope and place will be removed, # cause params should be initialized by engine module. scope = paddle.static.global_scope() - place = paddle.framework.CUDAPlace(ParallelEnv().dev_id) + place = paddle.framework.CUDAPlace( + paddle.distributed.ParallelEnv().dev_id + ) # 0. record the relation among blocks parent_idx_dict = dict() diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/fluid/dataloader/batch_sampler.py index ff749271e5..3e0449719c 100644 --- a/python/paddle/fluid/dataloader/batch_sampler.py +++ b/python/paddle/fluid/dataloader/batch_sampler.py @@ -250,7 +250,7 @@ class DistributedBatchSampler(BatchSampler): drop_last, bool ), "drop_last should be a boolean number" - from paddle.fluid.dygraph.parallel import ParallelEnv + from paddle.distributed import ParallelEnv if num_replicas is not None: assert ( diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 90c71abbaa..525e90f7a0 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -35,236 +35,21 @@ from paddle.fluid.framework import ( in_dygraph_mode, ) -__all__ = ["ParallelEnv", "DataParallel"] +__all__ = ["DataParallel"] ParallelStrategy = core.ParallelStrategy -class ParallelEnv: - """ - .. 
note:: - This API is not recommended, if you need to get rank and world_size, - it is recommended to use ``paddle.distributed.get_rank()`` and - ``paddle.distributed.get_world_size()`` . - - This class is used to obtain the environment variables required for - the parallel execution of ``paddle.nn.Layer`` in dynamic mode. - - The parallel execution in dynamic mode needs to be started using ``paddle.distributed.launch`` - or ``paddle.distributed.spawn`` . - - Examples: - .. code-block:: python - - import paddle - import paddle.distributed as dist - - def train(): - # 1. initialize parallel environment - dist.init_parallel_env() - - # 2. get current ParallelEnv - parallel_env = dist.ParallelEnv() - print("rank: ", parallel_env.rank) - print("world_size: ", parallel_env.world_size) - - # print result in process 1: - # rank: 1 - # world_size: 2 - # print result in process 2: - # rank: 2 - # world_size: 2 - - if __name__ == '__main__': - # 1. start by ``paddle.distributed.spawn`` (default) - dist.spawn(train, nprocs=2) - # 2. start by ``paddle.distributed.launch`` - # train() - """ - - def __init__(self): - self._rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) - self._world_size = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) - self._device_type = str(os.getenv("PADDLE_XCCL_BACKEND", "")) - - # imperative only support one gpu or xpu - if self._device_type != "": - FLAGS_selected_custom_devices = 'FLAGS_selected_{}s'.format( - self._device_type - ) - selected_custom_devices = os.getenv( - FLAGS_selected_custom_devices, "0" - ).split(",") - self._device_id = int(selected_custom_devices[0]) - else: - if core.is_compiled_with_cuda(): - selected_gpus = os.getenv("FLAGS_selected_gpus", "0").split(",") - self._device_id = int(selected_gpus[0]) - elif core.is_compiled_with_xpu(): - selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",") - self._device_id = int(selected_xpus[0]) - elif core.is_compiled_with_npu(): - selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",") - self._device_id = int(selected_npus[0]) - elif core.is_compiled_with_mlu(): - selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",") - self._device_id = int(selected_mlus[0]) - - self._trainer_endpoints = os.getenv( - "PADDLE_TRAINER_ENDPOINTS", "" - ).split(",") - self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") - self._nrings = int(os.getenv("FLAGS_nccl_nrings", "1")) - assert ( - self._nrings > 0 - ), "nccl_nrings must be an integer greater than 0." - assert ( - self._nrings < 9 - ), "nccl_nrings should be less than 9, which is enough in most scenarios." - - @property - def rank(self): - """ - Rank of current trainer. - - Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . The default value is 0. - - Examples: - .. code-block:: python - - # execute this command in terminal: export PADDLE_TRAINER_ID=0 - import paddle.distributed as dist - - env = dist.ParallelEnv() - print("The rank is %d" % env.rank) - # The rank is 0 - """ - return self._rank - - @property - def world_size(self): - """ - The number of trainers (number of processes participating in current job). - - Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . The default value is 1. - - Examples: - .. 
code-block:: python - - # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 - import paddle.distributed as dist - - env = dist.ParallelEnv() - print("The world_size is %d" % env.world_size) - # The world_size is 4 - """ - return self._world_size - - @property - def device_id(self): - """ - The ID of selected GPU card for parallel training. - - Its value is equal to the value of the environment variable ``FLAGS_selected_gpus`` . The default value is 0. - - Examples: - .. code-block:: python - - # execute this command in terminal: export FLAGS_selected_gpus=1 - import paddle.distributed as dist - - env = dist.ParallelEnv() - print("The device id are %d" % env.device_id) - # The device id are 1 - """ - return self._device_id - - @property - def device_type(self): - """ - The type of custom device for parallel training. - - Its value is equal to the value of the environment variable ``PADDLE_XCCL_BACKEND`` . The default value is None. - - """ - return self._device_type - - @property - def current_endpoint(self): - """ - The endpoint of current trainer, it is in the form of (node IP + port). - - Its value is equal to the value of the environment variable ``PADDLE_CURRENT_ENDPOINT`` . The default value is "". - - Examples: - .. code-block:: python - - # execute this command in terminal: export PADDLE_CURRENT_ENDPOINT=127.0.0.1:6170 - import paddle.distributed as dist - - env = dist.ParallelEnv() - print("The current endpoint are %s" % env.current_endpoint) - # The current endpoint are 127.0.0.1:6170 - """ - return self._current_endpoint - - @property - def trainer_endpoints(self): - """ - The endpoints of all trainer nodes in the task, - which are used to broadcast the NCCL ID when NCCL2 is initialized. - - Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ENDPOINTS`` . The default value is "". - - Examples: - .. code-block:: python - - # execute this command in terminal: export PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171 - import paddle.distributed as dist - - env = dist.ParallelEnv() - print("The trainer endpoints are %s" % env.trainer_endpoints) - # The trainer endpoints are ['127.0.0.1:6170', '127.0.0.1:6171'] - """ - return self._trainer_endpoints - - @property - def nrings(self): - """ - Nrings of current trainer. - - Its value is equal to the value of the environment variable ``FLAGS_nccl_nrings`` . The default value is 1. - - Examples: - .. code-block:: python - - # execute this command in terminal: export FLAGS_nccl_nrings=1 - import paddle.distributed as dist - - env = dist.ParallelEnv() - print("The nrings is %d" % env.nrings) - # the number of ring is 1 - """ - return self._nrings - - # [aliases] Compatible with old method names - local_rank = rank - nranks = world_size - dev_id = device_id - - -# NOTE: [ Compatible ] Originally this class name is `Env`. The semantics of the old class names -# are inaccurate and may confuse users, so replace it with `ParallelEnv`, but to be compatible -# with the old examples, here still need to keep this name. 
-Env = ParallelEnv - - def _build_default_parallel_strategy(): strategy = ParallelStrategy() - strategy.nranks = ParallelEnv().nranks - strategy.local_rank = ParallelEnv().local_rank - strategy.trainer_endpoints = ParallelEnv().trainer_endpoints - strategy.current_endpoint = ParallelEnv().current_endpoint + strategy.nranks = paddle.distributed.ParallelEnv().nranks + strategy.local_rank = paddle.distributed.ParallelEnv().local_rank + strategy.trainer_endpoints = ( + paddle.distributed.ParallelEnv().trainer_endpoints + ) + strategy.current_endpoint = ( + paddle.distributed.ParallelEnv().current_endpoint + ) return strategy @@ -318,11 +103,13 @@ def _split_tensors(coalesced_grads_and_grad_vars): def scale_loss(loss): # TODO(liuyuhui) Currently only for xpu. Will be removed in the future. - if not ParallelEnv().world_size > 1: + if not paddle.distributed.ParallelEnv().world_size > 1: return loss loss_scale = to_variable( - np.array([ParallelEnv().world_size]).astype("float32") + np.array([paddle.distributed.ParallelEnv().world_size]).astype( + "float32" + ) ) loss_scale.stop_gradient = True scaled_loss = loss / loss_scale diff --git a/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py b/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py index 383713e0f5..0e4181ba04 100644 --- a/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py +++ b/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py @@ -19,17 +19,16 @@ import numpy as np import paddle from paddle.fluid import core -from paddle.fluid.dygraph.parallel import ParallelEnv def init_process_group(strategy=None): - nranks = ParallelEnv().nranks - rank = ParallelEnv().local_rank + nranks = paddle.distributed.ParallelEnv().nranks + rank = paddle.distributed.ParallelEnv().local_rank is_master = True if rank == 0 else False store = paddle.fluid.core.TCPStore("127.0.0.1", 6173, is_master, nranks) pg_group = core.ProcessGroupCustom.create( store, - ParallelEnv().device_type, + paddle.distributed.ParallelEnv().device_type, rank, nranks, ) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py index 1f90f90b2f..388ab592e9 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py @@ -20,7 +20,6 @@ from get_gpt_model import FakeDataset, generate_model import paddle from paddle.distributed.fleet import auto -from paddle.fluid.dygraph.parallel import ParallelEnv def apply_pass(use_amp=False, level=None): @@ -62,7 +61,7 @@ class TestAMPPass(unittest.TestCase): paddle.seed(2021) np.random.seed(2021) random.seed(2021) - place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + place = paddle.fluid.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) engine._executor = paddle.static.Executor(place) def get_engine(self, use_amp=False, level=None): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/clip_grad_by_global_norm.py b/python/paddle/fluid/tests/unittests/auto_parallel/clip_grad_by_global_norm.py index baae57b84a..11fe954b7b 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/clip_grad_by_global_norm.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/clip_grad_by_global_norm.py @@ -20,7 +20,6 @@ from get_gpt_model import FakeDataset, generate_model import paddle from paddle.distributed.fleet import auto -from paddle.fluid.dygraph.parallel import ParallelEnv paddle.enable_static() @@ 
-73,7 +72,7 @@ class TestGradientClipByGlobalNorm(unittest.TestCase): paddle.seed(2022) np.random.seed(2022) random.seed(2022) - place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + place = paddle.fluid.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) engine._executor = paddle.static.Executor(place) def get_engine(self, use_sharding=False): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py index 2a6d61d961..adf40a236a 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py @@ -20,7 +20,6 @@ from get_gpt_model import FakeDataset, generate_model import paddle from paddle.distributed.fleet import auto -from paddle.fluid.dygraph.parallel import ParallelEnv paddle.enable_static() @@ -56,7 +55,7 @@ class TestGradientMergePass(unittest.TestCase): paddle.seed(2021) np.random.seed(2021) random.seed(2021) - place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + place = paddle.fluid.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) engine._executor = paddle.static.Executor(place) def get_engine(self, use_gradient_merge=False): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py index ae15ec02d6..c698ea3d70 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py @@ -20,7 +20,6 @@ from get_gpt_model import FakeDataset, generate_model import paddle from paddle.distributed.fleet import auto -from paddle.fluid.dygraph.parallel import ParallelEnv def apply_pass(use_recompute=False, no_recompute_segments=[]): @@ -52,7 +51,7 @@ class TestRecomputePass(unittest.TestCase): paddle.seed(2022) np.random.seed(2022) random.seed(2022) - place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + place = paddle.fluid.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) engine._executor = paddle.static.Executor(place) def get_engine(self, use_recompute=False, no_recompute_segments=[]): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/sharding_newexe.py b/python/paddle/fluid/tests/unittests/auto_parallel/sharding_newexe.py index ca76daada5..48690f585c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/sharding_newexe.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/sharding_newexe.py @@ -20,7 +20,6 @@ from get_gpt_model import FakeDataset, generate_model import paddle from paddle.distributed.fleet import auto -from paddle.fluid.dygraph.parallel import ParallelEnv paddle.enable_static() @@ -83,7 +82,7 @@ class TestShardingStage2WithNewEXE(unittest.TestCase): paddle.seed(2022) np.random.seed(2022) random.seed(2022) - place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + place = paddle.fluid.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) engine._executor = paddle.static.Executor(place) def get_engine( diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py index a77837ec20..4ecc551124 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py @@ -20,7 +20,6 @@ from get_gpt_model 
import FakeDataset, generate_model import paddle from paddle.distributed.fleet import auto -from paddle.fluid.dygraph.parallel import ParallelEnv paddle.enable_static() @@ -56,7 +55,7 @@ class TestShardingPass(unittest.TestCase): paddle.seed(2022) np.random.seed(2022) random.seed(2022) - place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + place = paddle.fluid.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) engine._executor = paddle.static.Executor(place) def get_engine(self, use_sharding=False, stage=None): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_fused_linear_pass.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_fused_linear_pass.py index aad5922a76..d2da582ef1 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_fused_linear_pass.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_fused_linear_pass.py @@ -21,7 +21,6 @@ from get_gpt_model import FakeDataset, generate_model import paddle from paddle.distributed.fleet import auto -from paddle.fluid.dygraph.parallel import ParallelEnv sys.path.append("..") from test_sparse_addmm_op import get_cuda_version @@ -55,7 +54,7 @@ class TestFusedLinearPass(unittest.TestCase): paddle.seed(2021) np.random.seed(2021) random.seed(2021) - place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + place = paddle.fluid.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) engine._executor = paddle.static.Executor(place) def get_engine(self, use_fused_passes=False, fused_passes_list=[]): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py index 2c72ccd938..b9d744ff02 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py @@ -21,7 +21,6 @@ import paddle import paddle.fluid.core as core import paddle.nn as nn from paddle.distributed.fleet import auto -from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.static import InputSpec from paddle.static.amp.bf16.amp_utils import _valid_types from paddle.static.amp.fp16_utils import find_true_prev_op @@ -90,7 +89,7 @@ class TestBF16Pass(unittest.TestCase): paddle.seed(2021) np.random.seed(2021) random.seed(2021) - place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + place = paddle.fluid.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) engine._executor = paddle.static.Executor(place) def get_engine(self, use_bf16=False): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_selective_recompute.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_selective_recompute.py index 64563314ac..73d58c7a52 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_selective_recompute.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_selective_recompute.py @@ -21,7 +21,6 @@ from get_gpt_model import FakeDataset import paddle from paddle.distributed.fleet import auto -from paddle.fluid.dygraph.parallel import ParallelEnv sys.path.append("..") import auto_parallel_gpt_model as modeling @@ -92,7 +91,7 @@ class TestRecomputePassWithRecomputeAPI(unittest.TestCase): paddle.seed(2022) np.random.seed(2022) random.seed(2022) - place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + place = paddle.fluid.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) engine._executor = paddle.static.Executor(place) def get_engine( diff --git a/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py 
b/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py index c54101baec..c657088f33 100644 --- a/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py +++ b/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py @@ -19,7 +19,6 @@ import numpy as np import paddle from paddle.fluid import core -from paddle.fluid.dygraph.parallel import ParallelEnv class TestProcessGroupFp32(unittest.TestCase): @@ -34,8 +33,8 @@ class TestProcessGroupFp32(unittest.TestCase): self.shape = (2, 10, 5) def test_create_process_group_gloo(self): - nranks = ParallelEnv().nranks - rank = ParallelEnv().local_rank + nranks = paddle.distributed.ParallelEnv().nranks + rank = paddle.distributed.ParallelEnv().local_rank is_master = True if rank == 0 else False store = paddle.fluid.core.TCPStore( "127.0.0.1", 6272, is_master, nranks, 30 diff --git a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py index 3be3cfecf1..713e0a01b4 100644 --- a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py @@ -19,12 +19,11 @@ import numpy as np import paddle import paddle.distributed as dist -from paddle.fluid.dygraph.parallel import ParallelEnv def init_process_group(strategy=None): - nranks = ParallelEnv().nranks - rank = ParallelEnv().local_rank + nranks = paddle.distributed.ParallelEnv().nranks + rank = dist.ParallelEnv().local_rank is_master = True if rank == 0 else False pg_group = dist.init_parallel_env() diff --git a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py index f783319918..49fe7c97d0 100644 --- a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py +++ b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py @@ -20,12 +20,11 @@ import numpy as np import paddle import paddle.distributed as dist -from paddle.fluid.dygraph.parallel import ParallelEnv def init_process_group(strategy=None): - nranks = ParallelEnv().nranks - rank = ParallelEnv().local_rank + nranks = paddle.distributed.ParallelEnv().nranks + rank = dist.ParallelEnv().local_rank is_master = True if rank == 0 else False pg_group = dist.init_parallel_env() diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py index 2d069a39e5..1cd1d224f4 100644 --- a/python/paddle/hapi/callbacks.py +++ b/python/paddle/hapi/callbacks.py @@ -20,7 +20,6 @@ import warnings import numpy as np import paddle -from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.utils import try_import from .progressbar import ProgressBar @@ -350,7 +349,7 @@ class ProgBarLogger(Callback): self.log_freq = log_freq def _is_print(self): - return self.verbose and ParallelEnv().local_rank == 0 + return self.verbose and paddle.distributed.ParallelEnv().local_rank == 0 def on_train_begin(self, logs=None): self.epochs = self.params['epochs'] @@ -598,7 +597,11 @@ class ModelCheckpoint(Callback): self.epoch = epoch def _is_save(self): - return self.model and self.save_dir and ParallelEnv().local_rank == 0 + return ( + self.model + and self.save_dir + and paddle.distributed.ParallelEnv().local_rank == 0 + ) def on_epoch_end(self, epoch, logs=None): if self._is_save() and self.epoch % self.save_freq == 0: @@ -922,7 +925,7 @@ class VisualDL(Callback): self.epoch = 0 def _is_write(self): - return ParallelEnv().local_rank == 0 + return 
paddle.distributed.ParallelEnv().local_rank == 0 def on_train_begin(self, logs=None): self.epochs = self.params['epochs'] @@ -1074,7 +1077,7 @@ class WandbCallback(Callback): _ = self.run def _is_write(self): - return ParallelEnv().local_rank == 0 + return paddle.distributed.ParallelEnv().local_rank == 0 @property def run(self): @@ -1333,7 +1336,10 @@ class ReduceLROnPlateau(Callback): new_lr = old_lr * self.factor new_lr = max(new_lr, self.min_lr) self.model._optimizer._learning_rate = new_lr - if self.verbose > 0 and ParallelEnv().local_rank == 0: + if ( + self.verbose > 0 + and paddle.distributed.ParallelEnv().local_rank == 0 + ): print( '\nEpoch %d: ReduceLROnPlateau reducing learning ' 'rate to %s.' % (self.epoch + 1, new_lr) diff --git a/python/paddle/hapi/logger.py b/python/paddle/hapi/logger.py index ac6f29b338..25a6bbcbf1 100644 --- a/python/paddle/hapi/logger.py +++ b/python/paddle/hapi/logger.py @@ -16,8 +16,6 @@ import logging import os import sys -from paddle.fluid.dygraph.parallel import ParallelEnv - __all__ = [] @@ -40,7 +38,7 @@ def setup_logger(output=None, name="hapi", log_level=logging.INFO): format_str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' # stdout logging: only local rank==0 - local_rank = ParallelEnv().local_rank + local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) if local_rank == 0 and len(logger.handlers) == 0: ch = logging.StreamHandler(stream=sys.stdout) ch.setLevel(log_level) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 9222090d1a..d5bad6a977 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -30,7 +30,6 @@ from paddle.autograd import no_grad from paddle.distributed.fleet.base import role_maker from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.fluid.executor import global_scope from paddle.fluid.framework import Variable from paddle.fluid.framework import _current_expected_place as _get_device @@ -190,17 +189,21 @@ def init_communicator( def prepare_distributed_context(place=None): if place is None: place = ( - fluid.CUDAPlace(ParallelEnv().dev_id) - if ParallelEnv().nranks > 1 + fluid.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) + if paddle.distributed.ParallelEnv().nranks > 1 else fluid.CUDAPlace(0) ) place = _get_paddle_place(place) strategy = fluid.dygraph.parallel.ParallelStrategy() - strategy.nranks = ParallelEnv().nranks - strategy.local_rank = ParallelEnv().local_rank - strategy.trainer_endpoints = ParallelEnv().trainer_endpoints - strategy.current_endpoint = ParallelEnv().current_endpoint + strategy.nranks = paddle.distributed.ParallelEnv().nranks + strategy.local_rank = paddle.distributed.ParallelEnv().local_rank + strategy.trainer_endpoints = ( + paddle.distributed.ParallelEnv().trainer_endpoints + ) + strategy.current_endpoint = ( + paddle.distributed.ParallelEnv().current_endpoint + ) if strategy.nranks < 2: return @@ -282,8 +285,8 @@ class StaticGraphAdapter: 'test_batch': 0, } - self._nranks = ParallelEnv().nranks - self._local_rank = ParallelEnv().local_rank + self._nranks = paddle.distributed.ParallelEnv().nranks + self._local_rank = paddle.distributed.ParallelEnv().local_rank self._amp_level = "O0" self._amp_configs = {} @@ -733,8 +736,8 @@ class DynamicGraphAdapter: def __init__(self, model): super().__init__() self.model = model - self._nranks = ParallelEnv().nranks - self._local_rank = ParallelEnv().local_rank + self._nranks = 
paddle.distributed.ParallelEnv().nranks + self._local_rank = paddle.distributed.ParallelEnv().local_rank self._merge_count = { 'eval_total': 0, 'test_total': 0, @@ -751,10 +754,14 @@ class DynamicGraphAdapter: if self._nranks > 1: dist.init_parallel_env() stradegy = fluid.dygraph.parallel.ParallelStrategy() - stradegy.nranks = ParallelEnv().nranks - stradegy.local_rank = ParallelEnv().local_rank - stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints - stradegy.current_endpoint = ParallelEnv().current_endpoint + stradegy.nranks = paddle.distributed.ParallelEnv().nranks + stradegy.local_rank = paddle.distributed.ParallelEnv().local_rank + stradegy.trainer_endpoints = ( + paddle.distributed.ParallelEnv().trainer_endpoints + ) + stradegy.current_endpoint = ( + paddle.distributed.ParallelEnv().current_endpoint + ) self.ddp_model = fluid.dygraph.parallel.DataParallel( self.model.network, stradegy ) @@ -1373,7 +1380,7 @@ class Model: """ - if ParallelEnv().local_rank == 0: + if paddle.distributed.ParallelEnv().local_rank == 0: if not training: self._save_inference_model(path) else: @@ -1657,7 +1664,10 @@ class Model: self._place = _get_device() if isinstance(self._place, fluid.CUDAPlace): global _parallel_context_initialized - if ParallelEnv().nranks > 1 and not _parallel_context_initialized: + if ( + paddle.distributed.ParallelEnv().nranks > 1 + and not _parallel_context_initialized + ): if fluid._non_static_mode(): main_prog_seed = fluid.default_main_program().random_seed startup_prog_seed = ( @@ -2307,7 +2317,9 @@ class Model: mode == 'train' or self._adapter._merge_count.get(mode + '_batch', 0) <= 0 ): - logs['batch_size'] = batch_size * ParallelEnv().nranks + logs['batch_size'] = ( + batch_size * paddle.distributed.ParallelEnv().nranks + ) else: logs['batch_size'] = self._adapter._merge_count[mode + '_batch'] diff --git a/python/paddle/utils/download.py b/python/paddle/utils/download.py index a7f6883c97..9c82531565 100644 --- a/python/paddle/utils/download.py +++ b/python/paddle/utils/download.py @@ -136,7 +136,7 @@ def get_path_from_url( str: a local path to save downloaded models & weights & datasets. """ - from paddle.fluid.dygraph.parallel import ParallelEnv + from paddle.distributed import ParallelEnv assert is_url(url), "downloading from {} not a url".format(url) # parse path after download to decompress under root_dir -- GitLab