Unverified · Commit 98ae15c0, authored by qizhaoaoe, committed by GitHub

Fluid clean parallel (#50626)

* fluid clean: remove parallel and parallel_helper api

* fix: fix the import path.

* fix DataParallel imports issue
Parent commit: bbca66f2
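For downstream code, the practical effect of this change is that DataParallel and the parallel helper module are imported from paddle / paddle.distributed rather than from paddle.fluid.dygraph. A minimal migration sketch based only on the import moves visible in this diff (the Linear layer and the single-process setup are illustrative, not part of the PR):

import paddle
import paddle.distributed as dist
from paddle.distributed import parallel_helper  # new home of the helper module

# Old spellings removed by this PR:
#     from paddle.fluid.dygraph.parallel import DataParallel
#     from paddle.fluid.dygraph import parallel_helper

dist.init_parallel_env()
layer = paddle.nn.Linear(10, 10)      # illustrative model
model = paddle.DataParallel(layer)    # also exposed as paddle.distributed.DataParallel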
@@ -345,7 +345,7 @@ from .autograd import set_grad_enabled  # noqa: F401
 from .autograd import is_grad_enabled  # noqa: F401
 from .framework import save  # noqa: F401
 from .framework import load  # noqa: F401
-from .framework import DataParallel  # noqa: F401
+from .distributed import DataParallel  # noqa: F401
 from .framework import set_default_dtype  # noqa: F401
 from .framework import get_default_dtype  # noqa: F401
...
@@ -20,7 +20,7 @@ from .parallel import init_parallel_env  # noqa: F401
 from .parallel import get_rank  # noqa: F401
 from .parallel import get_world_size  # noqa: F401
 from .parallel import ParallelEnv  # noqa: F401
+from .parallel import DataParallel
 from .parallel_with_gloo import gloo_init_parallel_env
 from .parallel_with_gloo import gloo_barrier
 from .parallel_with_gloo import gloo_release
...
@@ -17,7 +17,6 @@ import os
 import paddle
 from paddle.fluid import compiler
-from paddle.fluid.dygraph import parallel_helper
 from paddle.fluid.framework import in_dygraph_mode
 from paddle.fluid.ir import apply_build_strategy
 from paddle.fluid.wrapped_decorator import wrap_decorator
@@ -236,6 +235,7 @@ class Fleet:
                 fleet.init(log_level = "DEBUG")

         """
+        from paddle.distributed import parallel_helper

         set_log_level(log_level)
...
@@ -14,16 +14,16 @@
 import paddle
 from paddle import framework
-# (TODO: GhostScreaming) It will be removed later.
-from paddle.fluid import core
-from paddle.framework import (
+from paddle.distributed.parallel import (
     _split_tensors,
     build_groups,
     in_dygraph_mode,
     sync_params_buffers,
 )
+# (TODO: GhostScreaming) It will be removed later.
+from paddle.fluid import core

 from .log_util import logger

 __all__ = []
...
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-from ..layers import collective
 from ..framework import Parameter

 __parallel_ctx__clz__ = None
@@ -48,11 +48,12 @@ def _init_parallel_ctx():
 def _broadcast_parameters(parameters):
+    from ..distributed import broadcast
     for param in parameters:
         # In model parallel, some parameters are split into multiple devices,
         # so we could not broadcast these parameters.
         if param.is_distributed:
             continue

         if isinstance(param, Parameter) and param.trainable:
-            collective._broadcast(param, 0, sync_mode=True)
+            broadcast(param, 0, sync_op=True)
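The hunk above also swaps the legacy fluid collective helper for the public collective API. Roughly, the equivalent user-facing call looks like the following sketch (it assumes an initialized parallel environment; the tensor is only a stand-in for a trainable parameter):

import paddle
import paddle.distributed as dist

dist.init_parallel_env()
param = paddle.ones([2, 3])                 # stand-in for a trainable Parameter
# Blocking broadcast from rank 0; replaces collective._broadcast(param, 0, sync_mode=True).
dist.broadcast(param, src=0, sync_op=True)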
@@ -21,10 +21,6 @@ from .layers import *
 from . import tracer
 from .tracer import *

-from . import parallel
-from .parallel import *
-
 from . import learning_rate_scheduler
 from .learning_rate_scheduler import *
@@ -33,5 +29,4 @@ from .math_op_patch import monkey_patch_math_varbase
 __all__ = []
 __all__ += layers.__all__
 __all__ += base.__all__
-__all__ += parallel.__all__
 __all__ += learning_rate_scheduler.__all__
@@ -13,21 +13,17 @@
 # limitations under the License.

 import collections
-import contextlib
-import sys
 import numpy as np
 import re
 import copy
 import weakref
 import warnings
-from copy import deepcopy
 import inspect

 import paddle
 import paddle.profiler as profiler
 from paddle.profiler.utils import in_profiler_mode

-from . import parallel_helper
 from .. import unique_name
 from paddle.fluid import core
 from .layer_object_helper import LayerObjectHelper
@@ -38,18 +34,16 @@ from .layer_hooks import (
 )
 from .base import (
     program_desc_tracing_guard,
-    param_guard,
     in_declarative_mode,
     _convert_into_variable,
 )
 from paddle.fluid import framework
-from ..param_attr import ParamAttr
 from paddle.fluid.executor import Executor, global_scope
 from paddle.fluid.framework import (
     convert_np_dtype_to_dtype_,
     in_dygraph_mode,
 )
-from paddle.fluid.framework import Program, program_guard
+from paddle.fluid.framework import Program
 from paddle.fluid.framework import _current_expected_place as _get_device
 from paddle.fluid.core import VarDesc
 from paddle.fluid.dygraph import no_grad
@@ -968,6 +962,8 @@ class Layer:
         pass

     def _dygraph_call_func(self, *inputs, **kwargs):
+        from paddle.distributed import parallel_helper
+
         for forward_pre_hook in self._forward_pre_hooks.values():
             hook_result = forward_pre_hook(self, inputs)
             if hook_result is not None:
...
This file's diff has been collapsed.
@@ -34,7 +34,6 @@ from ..framework import (
 )
 from .base import switch_to_static_graph
 from .math_op_patch import monkey_patch_math_varbase
-from .parallel import scale_loss
 from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE
 import paddle.utils.deprecated as deprecated
 import paddle.profiler as profiler
@@ -281,6 +280,8 @@ def monkey_patch_varbase():
                 # 4: [5000.]
         """
+        from paddle.distributed.parallel import scale_loss
+
         if framework._non_static_mode():
             if in_profiler_mode():
                 record_event = profiler.RecordEvent(
...
@@ -27,7 +27,6 @@ import numpy as np

 import paddle
 import paddle.fluid as fluid
-import paddle.fluid.dygraph as dygraph
 import paddle.incubate.distributed.fleet.role_maker as role_maker
 from paddle.fluid import compiler
 from paddle.incubate.distributed.fleet.collective import (
@@ -671,7 +670,7 @@ class TestParallelDyGraphRunnerBase:
             or args.update_method == "hccl"
             or args.update_method == "cncl"
         ):
-            strategy = dygraph.parallel.ParallelStrategy()
+            strategy = paddle.distributed.parallel.ParallelStrategy()
             strategy.nranks = nranks
             strategy.local_rank = args.trainer_id
             strategy.trainer_endpoints = args.endpoints.split(",")
@@ -682,11 +681,11 @@ class TestParallelDyGraphRunnerBase:
                 "begin to prepare context in dygraph with nccl2",
             )
             if not args.find_unused_parameters:
-                model = dygraph.parallel.DataParallel(
+                model = paddle.DataParallel(
                     model, strategy, find_unused_parameters=False
                 )
             else:
-                model = dygraph.parallel.DataParallel(
+                model = paddle.DataParallel(
                     model, strategy, find_unused_parameters=True
                 )
             print_to_err(type(self).__name__, "model built in dygraph")
@@ -694,11 +693,11 @@ class TestParallelDyGraphRunnerBase:
         elif args.update_method == "gloo":
             paddle.distributed.init_parallel_env()
             if not args.find_unused_parameters:
-                model = dygraph.parallel.DataParallel(
+                model = paddle.DataParallel(
                     model, find_unused_parameters=False
                 )
             else:
-                model = dygraph.parallel.DataParallel(
+                model = paddle.DataParallel(
                     model, find_unused_parameters=True
                 )
...
@@ -16,10 +16,9 @@ import unittest

 import numpy as np

+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-import paddle.fluid.dygraph as dygraph
-from paddle.distributed import init_parallel_env
 from paddle.nn import Linear
@@ -39,9 +38,9 @@ class MLP(fluid.Layer):
 class TestDataParallelStateDict(unittest.TestCase):
     def test_data_parallel_state_dict(self):
         with fluid.dygraph.guard():
-            init_parallel_env()
+            paddle.distributed.init_parallel_env()
             mlp = MLP()
-            parallel_mlp = dygraph.parallel.DataParallel(mlp)
+            parallel_mlp = paddle.DataParallel(mlp)

             single_state = mlp.state_dict()
             parallel_state = parallel_mlp.state_dict()
...
@@ -22,12 +22,6 @@ import paddle.fluid as fluid
 import paddle.nn.functional as F
 from paddle.fluid import core
 from paddle.fluid.dygraph.base import to_variable
-from paddle.fluid.dygraph.parallel import (
-    DataParallel,
-    _coalesce_tensors,
-    _reshape_inplace,
-    _split_tensors,
-)


 class MyLayer(fluid.Layer):
@@ -43,10 +37,15 @@ class MyLayer(fluid.Layer):
 class TestImperativeParallelCoalesceSplit(unittest.TestCase):
     def test_coalesce_split(self):
+        from paddle.distributed.parallel import (
+            _coalesce_tensors,
+            _split_tensors,
+        )
+
         with fluid.dygraph.guard():
             test_layer = MyLayer("test_layer")
             strategy = core.ParallelStrategy()
-            test_layer = DataParallel(test_layer, strategy)
+            test_layer = paddle.DataParallel(test_layer, strategy)

             # test variables prepare
             vars = []
@@ -72,10 +71,12 @@ class TestImperativeParallelCoalesceSplit(unittest.TestCase):
                 self.assertEqual(orig_var_shape, var.shape)

     def test_reshape_inplace(self):
+        from paddle.distributed.parallel import _reshape_inplace
+
         with fluid.dygraph.guard():
             test_layer = MyLayer("test_layer")
             strategy = core.ParallelStrategy()
-            test_layer = DataParallel(test_layer, strategy)
+            test_layer = paddle.DataParallel(test_layer, strategy)

             ori_shape = [2, 25]
             new_shape = [5, 10]
...
@@ -24,7 +24,6 @@ from paddle.distributed.spawn import (
     _options_valid_check,
 )
 from paddle.fluid import core
-from paddle.fluid.dygraph import parallel_helper

 # NOTE(chenweihang): Coverage CI is currently not able to count python3
 # unittest, so the unittests here covers some cases that will only be
@@ -44,6 +43,8 @@ class TestInitParallelEnv(unittest.TestCase):
         dist.init_parallel_env()

     def test_init_parallel_env_break(self):
+        from paddle.distributed import parallel_helper
+
         os.environ['FLAGS_selected_gpus'] = '0'
         os.environ['PADDLE_TRAINER_ID'] = '0'
         os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170'
...
@@ -30,17 +30,18 @@ from ..fluid.core import CustomPlace  # noqa: F401
 from ..fluid.core import VarBase  # noqa: F401

 from ..fluid import core  # noqa: F401
+from ..fluid.dygraph import base, layers, to_variable
 from ..fluid.dygraph.base import no_grad_ as no_grad  # noqa: F401
 from ..fluid.dygraph.base import grad  # noqa: F401
 from .io import save  # noqa: F401
 from .io import load  # noqa: F401
-from ..fluid.dygraph.parallel import DataParallel  # noqa: F401
 from ..fluid import monkey_patch_variable
 from ..fluid.dygraph import monkey_patch_math_varbase
 from ..fluid.framework import disable_signal_handler  # noqa: F401
 from ..fluid.framework import get_flags  # noqa: F401
 from ..fluid.framework import set_flags  # noqa: F401
+from ..fluid.framework import Parameter, ParamBase
 from ..fluid.dygraph.base import enable_dygraph as disable_static  # noqa: F401
 from ..fluid.dygraph.base import disable_dygraph as enable_static  # noqa: F401
 from ..fluid.framework import _non_static_mode as in_dynamic_mode  # noqa: F401
@@ -70,11 +71,6 @@ from ..fluid.framework import switch_startup_program
 from ..fluid.framework import _set_expected_place  # noqa: F401
 from ..fluid.framework import Block, Program  # noqa: F401
 from ..fluid.framework import IrGraph  # noqa: F401
-from ..fluid.dygraph import parallel_helper  # noqa: F401
-from ..fluid.dygraph.parallel import (
-    _split_tensors,
-    build_groups,
-    sync_params_buffers,
-)
+from ..fluid.framework import deprecate_stat_dict

 __all__ = []
...
@@ -223,7 +223,7 @@ def prepare_distributed_context(place=None):
     )

     place = _get_paddle_place(place)
-    strategy = fluid.dygraph.parallel.ParallelStrategy()
+    strategy = paddle.distributed.parallel.ParallelStrategy()
     strategy.nranks = paddle.distributed.ParallelEnv().nranks
     strategy.local_rank = paddle.distributed.ParallelEnv().local_rank
     strategy.trainer_endpoints = (
@@ -781,7 +781,7 @@ class DynamicGraphAdapter:
         if self._nranks > 1:
             dist.init_parallel_env()
-            stradegy = fluid.dygraph.parallel.ParallelStrategy()
+            stradegy = paddle.distributed.parallel.ParallelStrategy()
             stradegy.nranks = paddle.distributed.ParallelEnv().nranks
             stradegy.local_rank = paddle.distributed.ParallelEnv().local_rank
             stradegy.trainer_endpoints = (
@@ -790,9 +790,7 @@ class DynamicGraphAdapter:
             stradegy.current_endpoint = (
                 paddle.distributed.ParallelEnv().current_endpoint
             )
-            self.ddp_model = fluid.dygraph.parallel.DataParallel(
-                self.model.network, stradegy
-            )
+            self.ddp_model = paddle.DataParallel(self.model.network, stradegy)

     @property
     def mode(self):
...
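Putting the hapi changes together, the dynamic-graph adapter now builds its distributed wrapper roughly as below (a sketch assembled from the replaced lines above; the Linear network stands in for self.model.network):

import paddle
import paddle.distributed as dist

dist.init_parallel_env()
env = dist.ParallelEnv()

strategy = paddle.distributed.parallel.ParallelStrategy()
strategy.nranks = env.nranks
strategy.local_rank = env.local_rank
strategy.trainer_endpoints = env.trainer_endpoints
strategy.current_endpoint = env.current_endpoint

network = paddle.nn.Linear(8, 8)                    # stand-in for self.model.network
ddp_model = paddle.DataParallel(network, strategy)  # replaces fluid.dygraph.parallel.DataParallel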