From 911d6bb1fa8fe5f0b5e3f21229bdbe20dbfc033e Mon Sep 17 00:00:00 2001
From: Ghost Screaming
Date: Thu, 8 Dec 2022 11:21:21 +0800
Subject: [PATCH] Clean fluid APIs in distributed and fleet files (#48851)

* Fix a bug of the reduce_sum op: when input.numel() > INT32_MAX, its result
  is wrong.

* Remove climits.

* Clean fluid APIs in the paddle/distributed and paddle/fleet folders.
  The following files are included:

  python/paddle/distributed/__init__.py
  python/paddle/distributed/collective.py
  python/paddle/distributed/fleet/utils/fs.py
  python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
  python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
  python/paddle/distributed/fleet/utils/internal_storage.py
  python/paddle/distributed/launch/context/device.py
  python/paddle/distributed/parallel.py
  python/paddle/distributed/parallel_with_gloo.py
  python/paddle/distributed/spawn.py
  python/paddle/framework/__init__.py

  Note that 'paddle.fluid.dygraph.parallel.ParallelEnv' and
  'fluid.framework.core' remain unchanged in those files. ParallelEnv is used
  by paddle.fluid.dygraph.parallel.DataParallel. However, APIs in
  paddle.fluid.dygraph.parallel can't be migrated to paddle.distributed, as
  there exist cyclic import dependencies with modules like paddle.static and
  paddle.tensor. 'fluid.framework.core' will be changed to import
  framework.core after fluid.core is migrated.

* Change TODO authors.
---
 python/paddle/distributed/__init__.py         |  3 +++
 python/paddle/distributed/collective.py       |  4 +++-
 python/paddle/distributed/fleet/utils/fs.py   |  1 +
 .../fleet/utils/hybrid_parallel_inference.py  |  4 +++-
 .../fleet/utils/hybrid_parallel_util.py       |  7 ++++--
 .../fleet/utils/internal_storage.py           | 22 +++++++++--------
 .../distributed/launch/context/device.py      | 24 ++++++++++---------
 python/paddle/distributed/parallel.py         | 12 ++++++++--
 .../paddle/distributed/parallel_with_gloo.py  |  1 +
 python/paddle/distributed/spawn.py            |  3 ++-
 python/paddle/framework/__init__.py           |  8 +++++++
 11 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index e7832758a8..4e81ce52ef 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -64,6 +64,9 @@ from .entry_attr import ProbabilityEntry  # noqa: F401
 from .entry_attr import CountFilterEntry  # noqa: F401
 from .entry_attr import ShowClickEntry  # noqa: F401
 
+# (TODO: GhostScreaming) It needs migration of ParallelEnv. However,
+# it's hard to migrate APIs in paddle.fluid.dygraph.parallel completely.
+# It will be replaced later.
 from paddle.fluid.dygraph.parallel import ParallelEnv  # noqa: F401
 
 from . import cloud_utils  # noqa: F401
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index 6d8cd60c6b..6b9075de20 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -15,9 +15,11 @@
 import datetime
 
 import paddle
+
+# (TODO: GhostScreaming) It will be removed later.
 import paddle.fluid.core as core
+from paddle.framework import _non_static_mode, in_dygraph_mode
 
-from ..fluid.framework import _non_static_mode, in_dygraph_mode
 from .communication.group import Group, _add_new_group, is_initialized
 from .fleet.layers.mpu.mp_ops import _c_concat  # noqa: F401
 from .fleet.layers.mpu.mp_ops import _c_identity  # noqa: F401
diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
index ea1fbc5c94..b61abbbaa5 100644
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -20,6 +20,7 @@ import re
 import shutil
 import time
 
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid import core
 
 from .log_util import logger
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
index 49aed0862f..c2c9a31769 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
@@ -17,8 +17,10 @@ from collections import defaultdict
 import numpy as np
 
 import paddle.distributed.fleet as fleet
+
+# (TODO: GhostScreaming) It will be removed later.
 import paddle.fluid.core as core
-from paddle.fluid.framework import Block, Program, _non_static_mode
+from paddle.framework import Block, Program, _non_static_mode
 
 
 class HybridParallelInferenceHelper:
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
index 5f7709f0fe..2b0653ea35 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
@@ -14,13 +14,16 @@
 
 import paddle
 from paddle import framework
+
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid import core
-from paddle.fluid.dygraph.parallel import (
+from paddle.framework import (
+    _in_legacy_dygraph,
     _split_tensors,
     build_groups,
+    in_dygraph_mode,
     sync_params_buffers,
 )
-from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode
 
 from .log_util import logger
 
diff --git a/python/paddle/distributed/fleet/utils/internal_storage.py b/python/paddle/distributed/fleet/utils/internal_storage.py
index ce3a404098..2b27d6a0dc 100644
--- a/python/paddle/distributed/fleet/utils/internal_storage.py
+++ b/python/paddle/distributed/fleet/utils/internal_storage.py
@@ -25,7 +25,9 @@
 import numpy as np
 
 import paddle
-import paddle.fluid as fluid
+from paddle import framework
+
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid import core
 
 from ..meta_parallel.sharding.sharding_utils import Type, device_guard
@@ -111,7 +113,7 @@ class ParamStorage(InternalStorage):
         if keep_alignment:
             self._array_params()
 
-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def add_rank_params(self, trainable_params, param2align, convert_gpu=True):
         """
         Add new parameters to the InternalStorage. Params becomes a view of this InternalStorage buffer.
@@ -145,7 +147,7 @@ class ParamStorage(InternalStorage):
         self._params.append(param)
         self._param_ids.append(id(param))
 
-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def _add_param_as_view(self, param, align, convert_gpu=True):
 
         assert (
@@ -185,7 +187,7 @@ class ParamStorage(InternalStorage):
         self._fill = offset
 
         return p_shape
 
-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def _convert_buffer(self, param, p_shape, align):
         var_end = self._fill + np.prod(p_shape)
@@ -199,7 +201,7 @@ class ParamStorage(InternalStorage):
 
         self._fill = offset
 
-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def _array_params(self):
         """
         Given the parameters which have been registered previously, rebuild the whole InternalStorage.
@@ -261,7 +263,7 @@ class GradStorage(InternalStorage):
         if keep_alignment:
             self._array_grads()
 
-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def add_grad(self, param, align):
         """
         Add a new parameter gradient to the InternalStorage. Param.grad becomes a view of this InternalStorage buffer.
@@ -275,7 +277,7 @@ class GradStorage(InternalStorage):
         self._params.append(param)
         self._param_ids.append(id(param))
 
-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def manumal_relase(self):
         """
         Release the buffer from InternalStorage. The InternalStorage will need to be rebuilt before use.
@@ -291,7 +293,7 @@ class GradStorage(InternalStorage):
         self.params_checked_in = 0
         self._release = True
 
-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def rebuild(self):
         """
         Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage.
@@ -305,7 +307,7 @@ class GradStorage(InternalStorage):
 
             self._release = False
 
-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def _array_grads(self):
         """
         Given the parameters gradients which have been registered previously, rebuild the whole InternalStorage.
@@ -315,7 +317,7 @@ class GradStorage(InternalStorage):
         for p in self._params:
             self._add_grad_as_view(p, self._parm2align[p.name])
 
-    @fluid.dygraph.no_grad
+    @framework.no_grad()
     def _add_grad_as_view(self, param, align):
         assert (
             np.prod(self.buffer.shape) > 0
diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py
index f5aaf83d13..48dba9af56 100644
--- a/python/paddle/distributed/launch/context/device.py
+++ b/python/paddle/distributed/launch/context/device.py
@@ -14,9 +14,11 @@
 
 import os
 
-import paddle.fluid as fluid
 from paddle.device import get_available_custom_device
 
+# (TODO: GhostScreaming) It will be removed later.
+from paddle.fluid import core
+
 
 class DeviceType:
     CPU = 'cpu'
@@ -148,25 +150,25 @@ class Device:
             )
             if visible_devices_str in os.environ:
                 visible_devices = os.getenv(visible_devices_str)
-        elif fluid.core.is_compiled_with_cuda():
+        elif core.is_compiled_with_cuda():
             dev._dtype = DeviceType.GPU
-            num = fluid.core.get_cuda_device_count()
+            num = core.get_cuda_device_count()
             visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
-        elif fluid.core.is_compiled_with_xpu():
+        elif core.is_compiled_with_xpu():
             dev._dtype = DeviceType.XPU
-            num = fluid.core.get_xpu_device_count()
+            num = core.get_xpu_device_count()
             visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
-        elif fluid.core.is_compiled_with_npu():
+        elif core.is_compiled_with_npu():
             dev._dtype = DeviceType.NPU
-            num = fluid.core.get_npu_device_count()
+            num = core.get_npu_device_count()
             visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
-        elif fluid.core.is_compiled_with_mlu():
+        elif core.is_compiled_with_mlu():
             dev._dtype = DeviceType.MLU
-            num = fluid.core.get_mlu_device_count()
+            num = core.get_mlu_device_count()
             visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
-        elif fluid.core.is_compiled_with_ipu():
+        elif core.is_compiled_with_ipu():
             dev._dtype = DeviceType.IPU
-            num = fluid.core.get_ipu_device_count()
+            num = core.get_ipu_device_count()
             # For IPUs, 'labels' is a list which contains the available numbers of IPU devices.
             dev._labels = [str(x) for x in range(0, num + 1)]
             return dev
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
index bd449acabf..99a7114610 100644
--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -38,10 +38,18 @@ from paddle.distributed.fleet.base.private_helper_function import (  # noqa: F401
 from paddle.distributed.fleet.launch_utils import check_backend
 
 # deprecated module import
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid import core
-from paddle.fluid.dygraph import parallel_helper
+
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid.dygraph.parallel import ParallelEnv
-from paddle.fluid.framework import _set_expected_place, in_dygraph_mode
+
+# (TODO: GhostScreaming) It will be removed later.
+from paddle.framework import (
+    _set_expected_place,
+    in_dygraph_mode,
+    parallel_helper,
+)
 
 __all__ = []
 
diff --git a/python/paddle/distributed/parallel_with_gloo.py b/python/paddle/distributed/parallel_with_gloo.py
index d0c1b3eac9..3204b6460d 100755
--- a/python/paddle/distributed/parallel_with_gloo.py
+++ b/python/paddle/distributed/parallel_with_gloo.py
@@ -20,6 +20,7 @@ from paddle.distributed.fleet.base.private_helper_function import (
 )
 
 # deprecated module import
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid import core
 
 __all__ = []
diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py
index 21ea1d4bdc..cb9804f452 100644
--- a/python/paddle/distributed/spawn.py
+++ b/python/paddle/distributed/spawn.py
@@ -37,8 +37,9 @@ from paddle.distributed.utils.launch_utils import (
 )
 
 # deprecated module import
+# (TODO: GhostScreaming) It will be removed later.
 from paddle.fluid import core
-from paddle.fluid.framework import set_flags
+from paddle.framework import set_flags
 
 __all__ = []
 
diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py
index 30d6379368..99d9cffed1 100755
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
@@ -68,5 +68,13 @@ from ..fluid.framework import _in_legacy_dygraph  # noqa: F401
 from ..fluid.framework import _global_flags  # noqa: F401
 from ..fluid.framework import _apply_pass  # noqa: F401
 from ..fluid.framework import switch_main_program
+from ..fluid.framework import _set_expected_place  # noqa: F401
+from ..fluid.framework import Block, Program  # noqa: F401
+from ..fluid.dygraph import parallel_helper  # noqa: F401
+from ..fluid.dygraph.parallel import (
+    _split_tensors,
+    build_groups,
+    sync_params_buffers,
+)
 
 __all__ = []
-- 
GitLab
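
Usage note (a hedged sketch, not part of the patch itself): after this cleanup,
code that previously imported private helpers from fluid internals can obtain
the same symbols through the paddle.framework facade, relying on the re-exports
added to python/paddle/framework/__init__.py above. A minimal illustration,
assuming a Paddle build that contains this patch:

    # Before: private imports reaching into fluid internals.
    #   from paddle.fluid.framework import in_dygraph_mode
    #   from paddle.fluid.dygraph import parallel_helper
    # After: the same names are re-exported by paddle.framework.
    from paddle.framework import in_dygraph_mode, parallel_helper  # noqa: F401

    # Paddle 2.x runs in dygraph (eager) mode by default, so this prints True.
    print(in_dygraph_mode())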