Unverified commit fe716a0b authored by Nyakku Shigure, committed by GitHub

[CodeStyle][F401] remove unused imports in python/paddle/distributed (#46758)

* [CodeStyle][F401] remove unused import in python/paddle/distributed

* remove pass

* empty commit

* Fix ValueError: list.remove(x): x not in list for meta_optimizer_names (see the guard sketch after the commit metadata below)

* Fix split import.

* add noqa after meta_optimizers in factory

* restore collective ops

* expand `import *`

* add noqa after required imports

* try to fix APIs without core.ops

* Revert "try to fix APIs without core.ops"

This reverts commit 6172beaf601e84bf61f2490c12c4739f0edaa5eb.

* fix an increment

* empty commit

* add noqa after required imports

* expand `import *`, fix ci error
Co-authored-by: Shuangchi He <34329208+Yulv-git@users.noreply.github.com>
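
For reference, F401 is the flake8 check that flags imports never used in a module; deleting them is the point of this PR, while imports kept on purpose as re-exports are marked with `# noqa: F401`, as several hunks below do. A minimal, self-contained illustration (hypothetical module, not code from this PR):

```python
# Hypothetical module showing the two outcomes of an F401 finding.
import os                          # unused below -> flake8 reports "F401 'os' imported but unused" -> delete it
from json import dumps as to_json  # noqa: F401  kept deliberately as a re-export, so the warning is silenced


def default_port() -> int:
    # Neither import is referenced here; only the noqa-marked one should survive the cleanup.
    return 6170
```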
Parent: ef144953
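
The ValueError bullet in the message above refers to `list.remove(x)` raising when `x` is not present. A minimal sketch of the guard pattern that fix implies (the list name mirrors the commit message; the optimizer names and helper are assumptions, not the PR's actual code):

```python
# Hypothetical sketch of guarding list.remove(), as the meta_optimizer_names fix implies.
meta_optimizer_names = ["AMPOptimizer", "RecomputeOptimizer"]


def drop_optimizer(name: str) -> None:
    # list.remove raises ValueError when the element is absent,
    # so only remove names that are actually in the list.
    if name in meta_optimizer_names:
        meta_optimizer_names.remove(name)


drop_optimizer("DGCOptimizer")   # absent: skipped instead of raising ValueError
drop_optimizer("AMPOptimizer")   # present: removed
```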
......@@ -12,15 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import numpy as np
from enum import IntEnum
from enum import unique
import paddle
from paddle.fluid import core
from paddle.fluid.core import Device
from paddle.fluid.core import Link
from paddle.fluid.core import Device # noqa: F401
from paddle.fluid.core import Link # noqa: F401
@unique
......
......@@ -13,17 +13,13 @@
# limitations under the License.
import copy
from copy import deepcopy
import time
from paddle.fluid import core
from paddle.fluid import framework
from .utils import print_program_with_dist_attr, is_gradient_clip_op
from .utils import is_gradient_clip_op
from .operators import find_compatible_distributed_operator_impls
from .dist_context import get_default_distributed_context, _node_id
from .dist_tensor import DistributedTensor
from .dist_op import DistributedOperator
from .dist_context import _node_id
from .dist_attribute import TensorDistributedAttribute
from .dist_attribute import OperatorDistributedAttribute
from .process_mesh import ProcessMesh
......
......@@ -17,7 +17,7 @@ from functools import reduce
import paddle
from ..utils import _get_comm_group, _get_corresponding_rank
from ..utils import _get_comm_group
from ..process_group import get_process_group
from ..cluster import LinkType
from ..dist_tensor import DistributedTensor
......
......@@ -14,7 +14,7 @@
import math
from .base_cost import register_op_cost, CommOpCost, _g_op_cost_factory
from .base_cost import CommOpCost, register_op_cost
@register_op_cost
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License
from .base_cost import Cost, register_op_cost, CompOpCost, _g_op_cost_factory
from .base_cost import CompOpCost, register_op_cost
@register_op_cost
......
......@@ -16,7 +16,6 @@ from collections import OrderedDict
from functools import reduce
import paddle
import paddle.fluid.core as core
from paddle.distributed.fleet.meta_optimizers.common import OpRole
from .base_cost import Cost
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import queue
import copy
from enum import Enum
......
......@@ -13,7 +13,6 @@
# limitations under the License
import copy
from collections import defaultdict
from paddle.fluid.framework import Variable
from .process_mesh import ProcessMesh
......
......@@ -14,17 +14,14 @@
import copy
from collections import defaultdict
import paddle.fluid
from paddle.fluid import framework
from paddle.fluid.framework import get_flags, set_flags
from paddle.fluid.framework import set_flags
from paddle.fluid import core
from paddle.distributed.passes import PassContext
from .dist_attribute import TensorDistributedAttribute
from .dist_attribute import OperatorDistributedAttribute
from .dist_tensor import DistributedTensor
from .dist_op import DistributedOperator
from .process_mesh import ProcessMesh
from .utils import is_loss_grad_op, is_loss_op
from .utils import is_loss_grad_op
# There always exists a default context for user. And user can set it to another one.
_g_default_distributed_context = None
......
......@@ -14,12 +14,9 @@
import abc
import numpy as np
from functools import wraps
import paddle
from .utils import to_list
from paddle.fluid.layers.utils import flatten
from paddle.io import DataLoader, BatchSampler, IterableDataset
from paddle.io import BatchSampler, IterableDataset
from paddle.fluid.dataloader.batch_sampler import _InfiniteIterableSampler
from paddle.fluid.dataloader.dataloader_iter import _DatasetKind, default_collate_fn, default_convert_fn
......
......@@ -13,16 +13,12 @@
# limitations under the License
import copy
from collections import defaultdict
import paddle
from paddle.fluid import core
from paddle.fluid.framework import Variable
from .dist_attribute import TensorDistributedAttribute
from .dist_attribute import OperatorDistributedAttribute
from .dist_attribute import append_op_input_suffix
from .dist_attribute import append_op_output_suffix
from .dist_attribute import get_tensor_dist_attr_field_keys
from .dist_attribute import get_op_dist_attr_field_keys
from .utils import convert_to_shard_spec, verify_shard_spec
......
......@@ -16,16 +16,13 @@ import re
import os
import errno
import pickle
import warnings
import logging
import numpy as np
import paddle
from paddle import fluid
from paddle.fluid import core
from paddle.fluid.framework import static_only
from .utils import get_dist_attr
from .converter import Converter
from .process_group import _g_process_group_map
from ..utils.log_utils import get_logger
......
......@@ -19,7 +19,6 @@ import paddle
from paddle.fluid import core
from paddle.fluid.framework import Parameter, Block, Variable
from .dist_attribute import TensorDistributedAttribute
from .dist_attribute import get_tensor_dist_attr_field_keys
from .utils import _linear_idx2coordinate
......
......@@ -13,8 +13,6 @@
# limitations under the License.
import os
import time
import copy
import logging
import random
import numpy as np
......@@ -24,14 +22,13 @@ import paddle
import paddle.utils as utils
from paddle import fluid, profiler, static
from paddle.jit import to_static
from paddle.metric import Metric
from paddle.static import InputSpec
from paddle.fluid import core
from paddle.fluid import Variable
from paddle.fluid.layers.utils import flatten
from paddle.fluid.executor import global_scope, _to_name_str
from paddle.fluid.framework import Operator, Parameter, _non_static_mode
from paddle.fluid.framework import Operator, _non_static_mode
from paddle.fluid.framework import _current_expected_place as _get_device
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.distributed import fleet
......@@ -44,7 +41,7 @@ from .parallelizer_v2 import Parallelizer
from .dist_op import DistributedOperator
from .dist_saver import DistributedSaver
from .dist_loader import NonIterableGeneratorLoader
from .utils import print_program_with_dist_attr, to_list
from .utils import to_list
from .utils import get_logger, get_dist_attr
from .process_group import new_process_group, get_all_process_groups
from .dist_context import DistributedContext, get_default_distributed_context
......
......@@ -15,11 +15,9 @@
import logging
from collections import defaultdict
import paddle
from paddle.nn import Layer
from paddle.jit import to_static, not_to_static
from paddle.fluid.framework import Operator, Parameter, _non_static_mode
from paddle.fluid.framework import Parameter
from paddle.fluid.framework import program_guard
from paddle.fluid.executor import global_scope
from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction
......
......@@ -12,14 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
import paddle
from paddle.fluid import core
from .process_mesh import ProcessMesh
from .process_mesh import get_current_process_mesh
from .process_mesh import set_current_process_mesh
from .process_mesh import reset_current_process_mesh
from .dist_context import get_default_distributed_context
from .dist_tensor import DistributedTensor
from .dist_op import DistributedOperatorHelper
......
......@@ -15,11 +15,8 @@
import os
import operator
import functools
import json
import paddle
from collections import deque
from .graph import Node
from .graph import Edge
from .graph import Graph
from .cluster import DeviceType
from .process_group import get_process_group
......
......@@ -13,8 +13,7 @@
# limitations under the License
import abc
import paddle
from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole
from ..dist_attribute import OperatorDistributedAttribute
from ..utils import _get_comm_group, _get_corresponding_rank, is_optimize_op
from ..process_group import new_process_group
......
......@@ -16,10 +16,8 @@ from .common import DistributedOperatorImplContainer
from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl
from paddle.fluid import core, unique_name
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
from paddle.fluid import core
from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole
from ..utils import set_var_dist_attr
from ..utils import set_dist_op_desc_original_id
from ..process_group import new_process_group
......
......@@ -17,19 +17,11 @@ from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import gradient_synchronization
from .common import register_distributed_operator_impl, is_parameter_related
from ..utils import is_dim_shard
from ..utils import is_dim_replicate
from ..utils import is_valid_list_index, is_prim_op
from ..utils import is_prim_op
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_dims_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from ..utils import set_dist_op_desc_original_id
from ..dist_attribute import OperatorDistributedAttribute
from paddle.fluid import core, unique_name
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.framework import Program, Parameter, Variable, program_guard
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole
from ..process_group import new_process_group
from ..utils import _get_comm_group, _get_corresponding_rank
from ..cost import _g_op_cost_factory
......
......@@ -17,20 +17,9 @@ from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl, is_parameter_related
from .common import is_elementwise_op
from ..utils import is_dim_shard
from ..utils import is_dim_replicate
from ..utils import is_valid_list_index
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_dims_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from ..dist_attribute import OperatorDistributedAttribute
from paddle.fluid import core, unique_name
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.framework import Program, Parameter, Variable, program_guard
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
from ..process_group import new_process_group
from ..utils import _get_comm_group, _get_corresponding_rank
from paddle.distributed.fleet.meta_optimizers.common import OpRole
from .dist_default import DistributedDefaultImpl0
from ..cost import _g_op_cost_factory
from ..cost import build_comp_desc_from_dist_op, build_dp_costs
......
......@@ -17,19 +17,14 @@ from .common import DistributedOperatorImplContainer
from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import gradient_synchronization
from .common import register_distributed_operator_impl, set_comm_op_dist_attr_for_program, naive_copy_op_dist_attr_for_program, is_parameter_related
from .common import naive_copy_op_dist_attr_for_program, register_distributed_operator_impl, set_comm_op_dist_attr_for_program
from ..utils import is_dim_shard
from ..utils import is_dim_replicate
from ..utils import is_valid_list_index
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_dims_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from ..dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
from ..dist_attribute import OperatorDistributedAttribute
from paddle.fluid import core, unique_name
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.framework import Program, Parameter, Variable
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole
from ..process_group import new_process_group
from ..utils import _get_comm_group, _get_idx_in_axis, _get_corresponding_rank, set_var_dist_attr
from ..cost import build_comp_desc_from_dist_op, build_comm_desc_from_dist_op
......
......@@ -16,23 +16,12 @@ from .common import DistributedOperatorImplContainer
from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl
from ..utils import is_dim_shard
from ..utils import is_dim_replicate
from ..utils import is_valid_list_index
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_dims_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from ..utils import set_dist_op_desc_original_id
from paddle.fluid import core, unique_name
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.framework import Program, Parameter, Variable, program_guard
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
from paddle.distributed.fleet.meta_optimizers.common import OpRole
from .dist_default import DistributedDefaultImpl0
from ..cost import FillConstantBatchSizeLikeOpCost
from ..cost import build_comp_desc_from_dist_op, build_dp_costs
from ..cost import build_comp_desc_from_dist_op
from ..cost import build_comp_costs_from_descs
from paddle.distributed.auto_parallel.cost.comm_op_cost import AllreduceSumOpCost
class DistributedFillConstantBatchSizeLike(DistributedOperatorImplContainer):
......
......@@ -17,9 +17,6 @@ from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl
from ..utils import is_dim_shard, is_dim_replicate
from ..utils import is_valid_list_index
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_dims_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from .dist_default import DistributedDefaultImpl0
from ..utils import _get_comm_group, _get_corresponding_rank
......
......@@ -17,9 +17,6 @@ from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl
from ..utils import is_dim_shard, is_dim_replicate
from ..utils import is_valid_list_index
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_dims_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from .dist_default import DistributedDefaultImpl0
from ..utils import _get_comm_group, _get_corresponding_rank
......
......@@ -20,20 +20,17 @@ from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl
from .common import gradient_synchronization
from .common import set_comm_op_dist_attr_for_program, naive_copy_op_dist_attr_for_program, is_parameter_related
from .common import is_parameter_related, set_comm_op_dist_attr_for_program
from ..utils import is_dim_shard
from ..utils import is_dim_replicate
from ..utils import is_valid_list_index
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_dims_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from ..utils import set_dist_op_desc_original_id
from ..dist_attribute import OperatorDistributedAttribute
from paddle.fluid import core, unique_name
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.framework import Program, Parameter, Variable, program_guard
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole
from ..process_group import new_process_group
from ..utils import _get_comm_group, _get_corresponding_rank
from .dist_default import DistributedDefaultImpl0
......
......@@ -13,23 +13,18 @@
# limitations under the License.
import copy
import paddle
import paddle.fluid.layers.utils as utils
from .common import DistributedOperatorImplContainer
from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl
from .common import set_comm_op_dist_attr_for_program
from .dist_default import DistributedDefaultImpl0
from ..process_group import new_process_group
from ..utils import is_dim_shard, is_dim_replicate, _get_corresponding_rank
from ..utils import compute_compatible_dim_mapping, set_dist_op_desc_original_id, _get_comm_group
from ..dist_attribute import TensorDistributedAttribute, OperatorDistributedAttribute
from paddle.fluid import core, unique_name
from paddle.fluid import core
from paddle.fluid.framework import Operator
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
......
......@@ -15,22 +15,11 @@
from .common import DistributedOperatorImplContainer
from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl, is_parameter_related
from ..utils import is_dim_shard
from ..utils import is_dim_replicate
from ..utils import is_valid_list_index
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_dims_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from .common import register_distributed_operator_impl
from ..utils import set_dist_op_desc_original_id
from ..dist_attribute import OperatorDistributedAttribute
from paddle.fluid import core, unique_name
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.framework import Program, Parameter, Variable, program_guard
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole
from ..process_group import new_process_group
from ..utils import _get_comm_group, _get_corresponding_rank
class DistributedReduceSumPrimtive(DistributedOperatorImplContainer):
......
......@@ -17,19 +17,10 @@ from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl, is_parameter_related
from ..utils import is_dim_shard
from ..utils import is_dim_replicate
from ..utils import is_valid_list_index
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_dims_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from ..utils import set_dist_op_desc_original_id
from paddle.fluid import core, unique_name
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.framework import Program, Parameter, Variable, program_guard
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
from .dist_default import DistributedDefaultImpl0
from ..cost import build_comp_desc_from_dist_op, build_comp_costs_from_descs
from ..cost import build_comm_costs_from_descs
from ..cost import Reshape2OpCost
from ..cost import Reshape2GradOpCost
from paddle.distributed.fleet.meta_optimizers.common import OpRole
......
......@@ -18,7 +18,6 @@ from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl
from ..utils import is_dim_shard
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from .dist_default import DistributedDefaultImpl0
......
......@@ -18,18 +18,12 @@ from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl
from .common import is_parameter_related
from ..utils import is_dim_shard
from ..utils import is_dim_replicate
from ..utils import is_valid_list_index
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_dims_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from .dist_default import DistributedDefaultImpl0
from ..cost import _g_op_cost_factory
from ..cost import build_comp_desc_from_dist_op, build_dp_costs
from ..cost import build_comp_costs_from_descs
from ..cost import SoftmaxOpCost, SoftmaxGradOpCost
from paddle.distributed.fleet.meta_optimizers.common import OpRole
from paddle.distributed.auto_parallel.cost.comm_op_cost import AllreduceSumOpCost
class DistributedSoftmax(DistributedOperatorImplContainer):
......
......@@ -17,9 +17,6 @@ from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl
from ..utils import is_dim_shard
from ..utils import is_valid_list_index
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_dims_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from .dist_default import DistributedDefaultImpl0
......
......@@ -17,18 +17,12 @@ from .common import DistributedOperatorImpl
from .common import register_distributed_operator_impl_container
from .common import register_distributed_operator_impl
from .common import is_parameter_related
from ..utils import is_dim_shard
from ..utils import is_dim_replicate
from ..utils import is_valid_list_index
from ..utils import compute_compatible_dim_mapping
from ..utils import compute_compatible_dims_mapping
from ..utils import compute_compatible_and_update_dim_mapping
from .dist_default import DistributedDefaultImpl0
from ..cost import Transpose2OpCost, Transpose2GradOpCost
from ..cost import build_comp_desc_from_dist_op, build_comm_desc_from_dist_op, build_dp_costs
from ..cost import build_comp_desc_from_dist_op, build_dp_costs
from ..cost import build_comp_costs_from_descs
from paddle.distributed.fleet.meta_optimizers.common import OpRole
from paddle.distributed.auto_parallel.cost.comm_op_cost import AllreduceSumOpCost
class DistributedTranspose2(DistributedOperatorImplContainer):
......
......@@ -25,12 +25,10 @@ import time
import paddle
from paddle.fluid.backward import append_backward
from paddle.distributed.utils.log_utils import get_logger
from paddle.distributed.fleet import cloud_utils
import paddle.fluid.core as core
from paddle.fluid import program_guard
from paddle.distributed.passes import new_pass, PassContext
from .dist_context import DistributedContext
from .dist_context import get_default_distributed_context
from .dist_context import set_default_distributed_context
from .completion import Completer
from .partitioner import Partitioner
......@@ -40,7 +38,6 @@ from .process_group import get_world_process_group
from .process_group import _g_process_group_map, ProcessGroup
from .utils import make_data_unshard
from .utils import set_grad_var_shape
from .utils import print_program_with_dist_attr
from .utils import SerialProgramInfo
from .utils import get_logger
from .reshard import Resharder
......
......@@ -15,24 +15,17 @@
import copy
import time
import logging
from collections import defaultdict
import paddle
from paddle.fluid import program_guard
from paddle.fluid.backward import append_backward
from paddle.fluid.framework import _non_static_mode, unique_name
from paddle.fluid.framework import unique_name
from paddle.distributed.passes import new_pass
from .reshard import Resharder
from .partitioner import Partitioner
from .dist_op import DistributedOperator
from .dist_saver import DistributedSaver
from .dist_loader import NonIterableGeneratorLoader
from .utils import make_data_unshard, set_grad_var_shape
from .utils import print_program_with_dist_attr, to_list
from .utils import set_grad_var_shape
from .utils import get_logger
from .process_group import get_all_process_groups, get_world_process_group
from .dist_context import DistributedContext, get_default_distributed_context
from .process_group import get_world_process_group
class Parallelizer:
......
......@@ -13,19 +13,14 @@
# limitations under the License
import copy
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid import framework as framework
from paddle.fluid import core, unique_name
from paddle.fluid.framework import Program, Parameter, Variable, program_guard
from paddle.fluid import core
from paddle.fluid.framework import Parameter, Program
from paddle.distributed.auto_parallel.operators.common import get_distributed_operator_impl_container
from paddle.distributed.auto_parallel.dist_context import DistributedContext, DistributedOperatorContext
from paddle.distributed.auto_parallel.dist_context import DistributedContext
from .dist_attribute import OperatorDistributedAttribute
from .process_group import new_process_group
from .utils import set_dist_op_desc_original_id
from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op, is_loss_op, is_optimize_op
from .utils import is_backward_op, is_forward_op, is_loss_op, is_optimize_op
from .operators.common import BACKWARD_ONLY_DIST_OPS
__varname_not_in_block__ = ["lod_tensor_blocking_queue_0"]
......
......@@ -25,8 +25,7 @@ import paddle
from paddle.distributed.fleet import auto
from .cost_model import estimate_cost
from .dist_op import DistributedOperator
from .process_group import _g_process_group_map
from .process_group import ProcessGroup, get_process_group
from .process_group import get_process_group
from .operators.common import is_elementwise_op
from .operators.common import get_distributed_operator_impl_container
from .utils import update_op_dims_mapping_by_default_dist_impl
......
......@@ -14,7 +14,6 @@
from .completion import Completer
from .dist_context import get_default_distributed_context
from .utils import print_program_with_dist_attr
# from .tuner.parallel_tuner import ParallelTuner
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import numpy as np
from paddle.fluid import core
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License
import copy
from functools import reduce
import paddle
......@@ -22,15 +21,13 @@ from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.framework import Program, OpProtoHolder
from paddle.distributed.fleet.meta_optimizers.common import OpRole
import paddle.fluid.layers.utils as utils
from ..collective import _get_global_env
from .dist_context import DistributedContext
from .dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
from .process_group import new_process_group, ProcessGroup, _g_process_group_map
from .dist_attribute import TensorDistributedAttribute
from .process_group import new_process_group
from .cost import build_comm_desc, CommContext
from .cost import AllgatherOpCost, SendOpCost
from .cost import SliceOpCost, SplitOpCost, ConcatOpCost
from .cluster import Cluster
from .utils import print_program_with_dist_attr, is_gradient_clip_op
from .utils import is_gradient_clip_op
# NOTE: If op in _g_special_ops or _g_gradient_clip_ops, it will not be resharded.
_g_special_ops = ['check_finite_and_unscale', 'update_loss_scaling']
......
......@@ -12,9 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License
import os
import copy
import argparse
from . import constants
......
......@@ -14,9 +14,7 @@
import os
import copy
import pathlib
import paddle
from ..strategy import Strategy
_tuning_supported_passes = ["sharding", "recompute"]
......
......@@ -24,20 +24,19 @@ import pickle
import json
import logging
import subprocess
import traceback
import paddle
from paddle.fluid import program_guard
from paddle.fluid.backward import append_backward
from paddle.distributed.passes import new_pass, PassContext
from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context
from paddle.distributed.auto_parallel.dist_context import DistributedContext
from paddle.distributed.auto_parallel.completion import Completer
from paddle.distributed.auto_parallel.reshard import Resharder
from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.process_group import clear_all_process_groups, get_all_process_groups
from paddle.distributed.auto_parallel.utils import debug_program
from paddle.distributed.auto_parallel.utils import make_data_unshard, set_grad_var_shape
from paddle.distributed.auto_parallel.utils import set_grad_var_shape
from ..utils import get_logger
from .config import TuningConfig
......
......@@ -13,19 +13,16 @@
# limitations under the License.
import os
import sys
import argparse
import traceback
import pickle
import json
import time
import numpy as np
from functools import partial
import paddle
from paddle.fluid.framework import Program, _current_expected_place
from paddle.fluid.framework import Operator, Parameter
from paddle.distributed.auto_parallel.process_group import clear_all_process_groups, get_all_process_groups, new_process_group
from paddle.fluid.framework import Operator
from paddle.distributed.auto_parallel.process_group import get_all_process_groups, new_process_group
from paddle.distributed.auto_parallel.dist_loader import NonIterableGeneratorLoader
from paddle.distributed.collective import _get_global_env
......
......@@ -18,7 +18,6 @@
import hashlib
import random
import time
from enum import Enum
from .storable import Storable
from .recorder import MetricsRecorder
......
......@@ -15,13 +15,6 @@
# Notice that the following codes are modified from KerasTuner to implement our own tuner.
# Please refer to https://github.com/keras-team/keras-tuner/blob/master/keras_tuner/engine/hyperparameters.py.
import collections
import contextlib
import copy
import math
import random
import numpy as np
from .tunable_variable import Boolean
from .tunable_variable import Fixed
from .tunable_variable import Choice
......
......@@ -1386,7 +1386,7 @@ def update_op_dims_mapping_by_elementwise_like_dist_impl(dist_op):
def get_all_distributed_main_program(serial_program_info, dist_context,
parallelizer):
"Get all distributed main programs by dist_context."
from .dist_context import DistributedOperatorContext, DistributedContext
from .dist_context import DistributedOperatorContext
cluster = serial_program_info.cluster
copied_parallelizer = copy.deepcopy(parallelizer)
all_dist_main_program = []
......
......@@ -13,7 +13,6 @@
# limitations under the License.
import os
import paddle
from paddle.distributed.utils.launch_utils import get_cluster, get_gpus, get_cluster_from_args
from paddle.distributed.utils.launch_utils import logger
......@@ -70,7 +69,6 @@ paddlecloud environment.".format(args_node_ips, node_ips))
except Exception as e:
print(e)
pass
if started_port is None:
started_port = 6170
......
......@@ -19,41 +19,28 @@ import io
import datetime
import time
from ..fluid.layer_helper import LayerHelper
from ..fluid.framework import Variable
from ..fluid.framework import in_dygraph_mode
from ..fluid.framework import OpProtoHolder
from ..fluid.framework import _non_static_mode
from ..fluid.framework import _in_legacy_dygraph
from ..fluid.framework import convert_np_dtype_to_dtype_
from ..fluid.framework import _varbase_creator
from ..fluid.data_feeder import convert_dtype
from ..fluid.data_feeder import check_variable_and_dtype
from ..fluid.data_feeder import check_type
from ..fluid.data_feeder import check_dtype
from ..fluid.layers.tensor import fill_constant
from ..fluid.layers import utils
from ..fluid.dygraph import layers
from ..fluid.dygraph.parallel import prepare_context
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle import _C_ops, _legacy_C_ops
import paddle.fluid.dygraph_utils as dygraph_utils
from paddle import _legacy_C_ops
import contextlib
from .fleet.layers.mpu.mp_ops import split
from .fleet.layers.mpu.mp_ops import _c_identity
from .fleet.layers.mpu.mp_ops import _c_concat
from .fleet.layers.mpu.mp_ops import _c_split
from .fleet.layers.mpu.mp_ops import _mp_allreduce
from .fleet.layers.mpu.mp_ops import _c_lookup_table
from .fleet.layers.mpu.mp_ops import _Linear
from .fleet.layers.mpu.mp_ops import _set_var_distributed
from .fleet.layers.mpu.mp_ops import _c_softmax_with_cross_entropy
from .fleet.layers.mpu.mp_ops import _linear
from .fleet.layers.mpu.mp_ops import _parallel_linear
from .fleet.layers.mpu.mp_ops import _parallel_embedding
from .fleet.layers.mpu.mp_ops import split # noqa: F401
from .fleet.layers.mpu.mp_ops import _c_identity # noqa: F401
from .fleet.layers.mpu.mp_ops import _c_concat # noqa: F401
from .fleet.layers.mpu.mp_ops import _c_split # noqa: F401
from .fleet.layers.mpu.mp_ops import _mp_allreduce # noqa: F401
from .fleet.layers.mpu.mp_ops import _c_lookup_table # noqa: F401
from .fleet.layers.mpu.mp_ops import _Linear # noqa: F401
from .fleet.layers.mpu.mp_ops import _set_var_distributed # noqa: F401
from .fleet.layers.mpu.mp_ops import _c_softmax_with_cross_entropy # noqa: F401
from .fleet.layers.mpu.mp_ops import _linear # noqa: F401
from .fleet.layers.mpu.mp_ops import _parallel_linear # noqa: F401
from .fleet.layers.mpu.mp_ops import _parallel_embedding # noqa: F401
from .communication.group import Group, _add_new_group
from .communication.all_reduce import all_reduce
from .communication.all_reduce import all_reduce # noqa: F401
from .communication.reduce import _get_reduce_op, ReduceOp
__all__ = []
......
......@@ -14,8 +14,7 @@
import os
import json
import paddle
from paddle.distributed.fleet.launch_utils import get_cluster, logger, get_host_name_ip, DeviceMode
from paddle.distributed.fleet.launch_utils import DeviceMode, get_cluster, get_host_name_ip
__all__ = []
......
......@@ -15,7 +15,7 @@
import paddle
from paddle.distributed.fleet.proto import distributed_strategy_pb2
from paddle.fluid.framework import Variable, set_flags, core, _global_flags
from paddle.fluid.framework import _global_flags
from paddle.fluid.wrapped_decorator import wrap_decorator
import google.protobuf.text_format
import google.protobuf
......@@ -537,7 +537,6 @@ class DistributedStrategy(object):
'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor',
'DownpourDoubleUnitAccessor', 'DownpourCtrDymfAccessor'
]
from google.protobuf.descriptor import FieldDescriptor
table_param = self.strategy.downpour_table_param
def add_graph_config(graph, strategy):
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from ..meta_optimizers import *
from ..meta_optimizers import * # noqa: F401
__all__ = []
......
......@@ -15,7 +15,6 @@ import sys
import time
import socket
from contextlib import closing
from six import string_types
__all__ = []
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from ..runtime.collective_runtime import CollectiveRuntime
from ..runtime.parameter_server_runtime import ParameterServerRuntime
from ...ps.the_one_ps import TheOnePSRuntime
__all__ = []
......
......@@ -12,10 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import paddle
import collections
import numpy as np
from itertools import product
from functools import reduce
from ..utils.log_util import logger
......
......@@ -16,7 +16,7 @@
"""basic collective operations in python"""
"""remote file system"""
from ..utils.fs import FS, LocalFS, HDFSClient
from ..utils.fs import FS
from paddle.fluid.proto import framework_pb2
from paddle.fluid.framework import Program
from paddle.fluid import debugger
......
......@@ -13,7 +13,6 @@
# limitations under the License.
import os
import paddle
from paddle.distributed.fleet.launch_utils import get_cluster, logger
__all__ = []
......@@ -67,7 +66,6 @@ paddlecloud environment.".format(args_node_ips, node_ips))
except Exception as e:
print(e)
pass
if started_port is None:
started_port = 6170
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
__all__ = []
......
......@@ -13,7 +13,6 @@
# limitations under the License.
"""This is definition of dataset class, which is high performance IO."""
import paddle
from paddle.fluid.proto import data_feed_pb2
from google.protobuf import text_format
import paddle.fluid.core as core
......
......@@ -13,9 +13,6 @@
# limitations under the License.
import tempfile
from paddle.distributed.fleet import launch_utils
from paddle.distributed.fleet import cloud_utils
from paddle.distributed.fleet import ascend_utils
from paddle.distributed.fleet.launch_utils import *
......
......@@ -17,7 +17,6 @@ import socket
import os
import six
import copy
import logging
import signal
import random
import threading
......
......@@ -15,11 +15,9 @@
import copy
import paddle
import os
from types import MethodType
import numpy as np
from paddle.fluid.framework import _global_flags
from paddle.fluid import compiler
from .base.role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker, RoleMakerBase
from .base.role_maker import PaddleCloudRoleMaker, RoleMakerBase
from .base.strategy_compiler import StrategyCompiler
from .base.distributed_strategy import DistributedStrategy
from .base.meta_optimizer_factory import MetaOptimizerFactory
......@@ -29,10 +27,7 @@ from paddle.fluid.dygraph import parallel_helper
from paddle.fluid.ir import apply_build_strategy
from .base import topology as tp
from .meta_parallel import model_parallel_random_seed
from paddle import _C_ops, _legacy_C_ops
from paddle.fluid import core
from .utils.log_util import logger, set_log_level
import logging
__all__ = []
......
......@@ -57,16 +57,12 @@ launch a process on each of the given gpu card or cpu machine.
import shutil
import sys
import tempfile
from sys import version
import subprocess
import os
import time
import six
import copy
import pathlib
import argparse
from argparse import ArgumentParser, REMAINDER
import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet import launch_utils
......
......@@ -24,12 +24,10 @@ import shutil
from contextlib import closing
import multiprocessing
import socket
import warnings
import six
import struct
import json
import paddle
import paddle.fluid as fluid
from distutils.util import strtobool
import paddle.utils.cpp_extension.extension_utils as utils
......
......@@ -18,8 +18,6 @@ from paddle.fluid import core
from paddle.fluid.dygraph.layers import Layer
from .random import get_rng_state_tracker
from paddle.nn import functional as F
from paddle import framework
from paddle.autograd import PyLayer
from ...base import topology as tp
__all__ = []
......
......@@ -13,7 +13,7 @@
# limitations under the License.
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
from paddle.fluid import core
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.framework import _in_legacy_dygraph
......
......@@ -15,10 +15,10 @@
import paddle
import numpy as np
import contextlib
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
from paddle.fluid import core
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import _non_static_mode, default_main_program, Variable
from paddle.fluid.framework import Variable, _non_static_mode
from paddle.fluid.layer_helper import LayerHelper
__all__ = []
......
......@@ -12,11 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle.fluid.framework as framework
from paddle.fluid.optimizer import Optimizer
import paddle.fluid.core as core
import numpy as np
from . import ascend_parser
from paddle.distributed import fleet
import hccl.manage.api as hccl
......
......@@ -11,11 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.framework as framework
from paddle.fluid.optimizer import Optimizer
import paddle.fluid.core as core
import numpy as np
from paddle.distributed import fleet
from functools import reduce
__all__ = []
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid import framework
......
......@@ -12,16 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from paddle.optimizer import Optimizer
from ...base.topology import ParallelMode
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid import framework
from paddle.fluid.framework import Variable
import types
from paddle.fluid import core
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
__all__ = []
......
......@@ -12,15 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import paddle
from paddle.optimizer import Optimizer
from paddle.fluid.clip import ClipGradByGlobalNorm
from ...utils.hybrid_parallel_util import fused_allreduce_gradients, sharding_reduce_gradients
from ...base.topology import ParallelMode
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid import framework
from paddle.fluid.framework import Variable
from ...utils.log_util import logger
from paddle.fluid import core
from paddle.fluid import layers
......
......@@ -22,15 +22,11 @@
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import copy
import logging
import numpy as np
from itertools import chain
from functools import reduce
from collections import OrderedDict
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.optimizer import Optimizer
from paddle.fluid.clip import ClipGradByGlobalNorm
......
......@@ -16,7 +16,7 @@ import paddle
from paddle.fluid import program_guard, layers, default_main_program
from paddle.fluid import default_startup_program
from .meta_optimizer_base import MetaOptimizerBase
from .common import OpRole, OP_ROLE_KEY, CollectiveHelper, is_update_op
from .common import CollectiveHelper, OP_ROLE_KEY, OpRole
__all__ = []
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from paddle import fluid
from paddle.fluid import compiler
from .parameter_server_optimizer import ParameterServerOptimizer
......
......@@ -11,14 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
import os
import paddle.fluid as fluid
from paddle.fluid import core, unique_name
from ..base.private_helper_function import wait_server_ready
from paddle.fluid.optimizer import PipelineOptimizer as PO
from .meta_optimizer_base import MetaOptimizerBase
from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_loss_grad_op, is_backward_op, is_optimizer_op
from .common import CollectiveHelper, OP_ROLE_KEY, OP_ROLE_VAR_KEY, OpRole, is_backward_op, is_loss_grad_op
__all__ = []
......
......@@ -21,7 +21,6 @@ import os
import platform
from paddle.distributed.ps.utils.public import *
from paddle.distributed.passes import PassContext
from ..base.private_helper_function import wait_server_ready
from paddle.distributed.ps.utils.ps_factory import PsProgramBuilderFactory
......
......@@ -11,14 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
import os
import collections
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import core, unique_name
from paddle.fluid.dygraph import Layer, LayerList
from ..base.private_helper_function import wait_server_ready
from .meta_optimizer_base import MetaOptimizerBase
from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_loss_grad_op, is_backward_op, is_optimizer_op
......
......@@ -13,7 +13,6 @@
# limitations under the License.
from paddle.distributed.fleet.meta_optimizers.common import is_optimizer_op, OP_ROLE_KEY, OpRole
from paddle.distributed.fleet.meta_optimizers.sharding.utils import *
from paddle.fluid import core
......
......@@ -12,10 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from ..common import is_optimizer_op, OP_ROLE_KEY, OpRole, is_update_op
from paddle.fluid import core, unique_name
from .shard import Shard
__all__ = []
......
......@@ -12,8 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from paddle.distributed.fleet.meta_optimizers.common import is_optimizer_op
from paddle.distributed.fleet.meta_optimizers.sharding.utils import *
from paddle.distributed.fleet.meta_optimizers.sharding.utils import get_var_size
from paddle.distributed.fleet.meta_optimizers.sharding.fp16_helper import FP16Utils
__all__ = []
......
......@@ -15,7 +15,7 @@ import paddle
from paddle.fluid import core, unique_name
from functools import reduce
from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op, is_backward_op, is_optimizer_op
from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole
import re
import os
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import os
from paddle.fluid import unique_name, core
import paddle.fluid as fluid
from paddle.static import default_startup_program, device_guard
......@@ -28,9 +28,19 @@ from .sharding.gradient_clip_helper import GradientClipHelper
from .sharding.offload_helper import OffloadHelper
from .sharding.prune import ProgramDeps
from .sharding import utils
# FIXME: import *
from .sharding.utils import *
import logging
from .sharding.utils import (
insert_sync_calc_op,
insert_sync_comm_ops,
insert_fill_constant_ops,
insert_cast_ops,
insert_allreduce_ops,
insert_reduce_ops,
get_grad_device,
get_first_optimize_op_idx,
insert_broadcast_ops,
get_var_size,
insert_scale_loss_grad_ops,
)
from ..utils.log_util import logger
__all__ = []
......
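
The hunk above replaces `from .sharding.utils import *` with an explicit import list. A short, runnable illustration of why expanding the wildcard helps the linter (standard-library names only, not Paddle code):

```python
# Wildcard form (commented out): flake8 flags it with F403 and cannot apply F401
# to the individual names it pulls in.
# from math import *

# Explicit form: every imported name is visible, so any that later becomes unused is reported as F401.
from math import pi, sqrt


def circle_diameter(area: float) -> float:
    # Both imported names are used here, so neither triggers F401.
    return 2 * sqrt(area / pi)
```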
......@@ -12,9 +12,8 @@
# See the License for the specific language governing permissions and
import paddle.fluid as fluid
from paddle.fluid import core, unique_name
from .meta_optimizer_base import MetaOptimizerBase
from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op
from .common import CollectiveHelper, OP_ROLE_KEY, OP_ROLE_VAR_KEY, OpRole, is_backward_op, is_loss_grad_op, is_optimizer_op
__all__ = []
......
......@@ -42,14 +42,11 @@ import math
import re
import glob
import os
import numpy as np
import random
from functools import partial
import paddle
from paddle.fluid.dygraph.layers import Layer
from ...utils.log_util import logger, layer_to_str
from paddle.distributed import fleet
from paddle.fluid.framework import in_dygraph_mode
from paddle.incubate.distributed.fleet import recompute_hybrid
......
......@@ -20,7 +20,7 @@ from ..utils.hybrid_parallel_util import broadcast_mp_parameters
from ..utils.hybrid_parallel_util import broadcast_dp_parameters
from ..utils.hybrid_parallel_util import broadcast_sharding_parameters
from ..utils.log_util import logger
from ..meta_optimizers.dygraph_optimizer import HybridParallelOptimizer, HybridParallelGradScaler
from ..meta_optimizers.dygraph_optimizer import HybridParallelOptimizer
import paddle.fluid.framework as framework
from .pp_utils import p2p_communication as p2p
import paddle.fluid.core as core
......
......@@ -15,9 +15,9 @@
import paddle
from ...utils.log_util import logger
import numpy as np
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
import paddle.fluid.core as core
from paddle.fluid.framework import _in_legacy_dygraph, _non_static_mode, in_dygraph_mode
from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode
from .utils import paddle_2_number, paddle_2_number, number_2_dtype
_hcg = None
......
......@@ -13,8 +13,7 @@
# limitations under the License.
import paddle
from paddle.fluid import core
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
__all__ = []
......
......@@ -22,19 +22,16 @@
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import copy
import logging
import warnings
import numpy as np
from collections import OrderedDict
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.optimizer import Optimizer
from paddle.fluid.clip import ClipGradByGlobalNorm
from paddle.distributed.collective import _get_global_group, new_group, broadcast, wait
from paddle.distributed.collective import _get_global_group, broadcast, new_group
from .group_sharded_storage import ParamStorage, GradStorage
from .group_sharded_utils import Type, device_guard, GroupShardedClipGrad
......
......@@ -23,11 +23,7 @@
# LICENSE file in the root directory of this source tree.
import logging
import time
import functools
import numpy as np
from functools import reduce
from collections import deque
from types import MethodType
import paddle
......@@ -37,7 +33,7 @@ from paddle.distributed.utils.log_utils import get_logger
from .group_sharded_storage import GradStorage
from .group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2
from .group_sharded_utils import Taskflow, Type, device_guard
from .group_sharded_utils import Type, device_guard
logger_ = get_logger(logging.WARNING)
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import logging
import numpy as np
from types import MethodType
......
......@@ -22,8 +22,6 @@
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import os
import time
import numpy as np
import paddle
......
......@@ -12,14 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import contextlib
from enum import Enum
import numpy as np
from types import MethodType
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
from paddle.fluid import core
from paddle.fluid import layers
from paddle.fluid.dygraph import to_variable
......
......@@ -22,11 +22,7 @@
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import os
import contextlib
import logging
import time
import functools
import numpy as np
from itertools import chain
from functools import reduce
......
......@@ -12,16 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import copy
import time
import contextlib
import logging
import functools
import numpy as np
from itertools import chain
from types import MethodType
from collections import deque, OrderedDict
from collections import OrderedDict
import paddle
from paddle import nn
......
......@@ -12,22 +12,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import contextlib
from collections import abc
from enum import Enum
from math import inf
import numpy as np
from types import MethodType
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
from paddle.fluid import core
from paddle.fluid import layers
from paddle.fluid.dygraph import to_variable
from paddle.fluid.framework import dygraph_only
from paddle.fluid.dygraph import base as imperative_base
from paddle.distributed.collective import _get_global_group
class Taskflow:
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.dygraph.layers import Layer
from .meta_parallel_base import MetaParallelBase
from ..utils.hybrid_parallel_util import broadcast_sharding_parameters
from ..utils.log_util import logger
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.dygraph.layers import Layer
from .meta_parallel_base import MetaParallelBase
from ..utils.hybrid_parallel_util import broadcast_dp_parameters
from ..utils.hybrid_parallel_util import broadcast_input_data
......
......@@ -13,13 +13,9 @@
# limitations under the License.
import paddle
import os
import numpy as np
from .base import topology as tp
from .base.topology import ParallelMode
from .meta_parallel import TensorParallel, model_parallel_random_seed
from .meta_parallel import TensorParallel
from .meta_parallel import PipelineParallel, ShardingParallel, PipelineParallelWithInterleave, PipelineLayer
from paddle.fluid import core
from paddle.fluid.dygraph.varbase_patch_methods import _grad_scalar
from paddle.distributed import fleet
......@@ -131,7 +127,7 @@ def distributed_model(model):
# NOTE (JZ-LIANG) init parameters broadcast within sharding group
# normally it should be done inside DataParallel
if fleet_env.sharding_degree > 1:
from paddle.distributed.fleet.utils.hybrid_parallel_util import broadcast_mp_parameters, broadcast_sharding_parameters
from paddle.distributed.fleet.utils.hybrid_parallel_util import broadcast_sharding_parameters
assert fleet_env.sharding_degree == fleet_env._hcg.get_sharding_parallel_world_size(
)
broadcast_sharding_parameters(model, fleet_env._hcg)
......
......@@ -14,12 +14,7 @@
import copy
import paddle
import os
import numpy as np
from paddle.fluid.framework import dygraph_only, _global_flags
from .base.distributed_strategy import DistributedStrategy
from .meta_optimizers import HybridParallelOptimizer, HeterParallelOptimizer
from paddle.fluid import core
from paddle.distributed import fleet
from .utils.log_util import logger
......
......@@ -21,7 +21,6 @@ from paddle.fluid import framework
import contextlib
from paddle.fluid.framework import in_dygraph_mode
import logging
from ..utils.log_util import logger
__all__ = []
......@@ -129,7 +128,6 @@ class LegacyRecomputeFunction(LegacyPyLayer):
@staticmethod
def backward(ctx, *args):
from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker
with paddle.fluid.dygraph.guard():
# TODO need to check the recompute calling is vaild or not
......@@ -265,7 +263,6 @@ class RecomputeFunction(PyLayer):
@staticmethod
def backward(ctx, *args):
from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker
with paddle.fluid.dygraph.guard():
# TODO need to check the recompute calling is valid or not
......
......@@ -12,16 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle.fluid import core
from paddle.autograd import PyLayer
from paddle.fluid import framework
from ..meta_parallel.parallel_layers.random import get_rng_state_tracker
from paddle.fluid.framework import in_dygraph_mode
from paddle.distributed import fleet
from .recompute import check_recompute_necessary, detach_variable, swith_rng_state_tracker
from ..meta_parallel.pp_utils import utils
......
......@@ -26,26 +26,21 @@ class CollectiveRuntime(RuntimeBase):
def _init_worker(self):
logging.warn(
"You should not call 'init_worker' method for collective mode.")
pass
def _run_worker(self):
logging.warn(
"You should not call 'run_worker' method for collective mode.")
pass
def _init_server(self, *args, **kwargs):
logging.warn(
"You should not call 'init_server' method for collective mode.")
pass
def _run_server(self):
logging.warn(
"You should not call 'run_server' method for collective mode.")
pass
def _stop_worker(self):
logging.warn(
"You should not call 'stop_worker' method for collective mode.")
pass
# save inference model should be added here
......@@ -21,7 +21,7 @@ from paddle.fluid.framework import Program
from paddle.fluid.compiler import CompiledProgram
from paddle.fluid.executor import Executor
from paddle.fluid.parallel_executor import ParallelExecutor
from paddle.fluid.framework import Variable, Parameter
from paddle.fluid.framework import Variable
from .runtime_base import RuntimeBase
from ..base.private_helper_function import wait_server_ready
......
......@@ -21,7 +21,6 @@ from paddle.fluid.framework import Program
from paddle.fluid.compiler import CompiledProgram
from paddle.fluid.executor import Executor
from paddle.fluid.parallel_executor import ParallelExecutor
from paddle.fluid.framework import Variable, Parameter
from .runtime_base import RuntimeBase
from ..base.private_helper_function import wait_server_ready
......@@ -670,7 +669,7 @@ class TheOnePSRuntime(RuntimeBase):
def _init_worker(self):
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
SyncStrategy, GeoStrategy
SyncStrategy
is_sync = self.compiled_strategy.is_sync_mode()
worker = self._get_fleet_proto(is_server=False, is_sync=is_sync)
......
......@@ -13,14 +13,13 @@
# limitations under the License.
import paddle
from paddle.fluid.framework import dygraph_only
from .base.topology import ParallelMode
from paddle.distributed import fleet
from types import MethodType
from paddle.fluid import core
from paddle.fluid.dygraph import to_variable
import numpy as np
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
def distributed_scaler(scaler):
......
......@@ -13,19 +13,12 @@
# limitations under the License.
import os
import sys
import subprocess
import multiprocessing
from datetime import datetime
import re
import copy
import errno
import time
import logging
import six
import abc
import paddle.fluid as fluid
from paddle.fluid import core
import functools
......
......@@ -15,14 +15,11 @@
import logging
import six
# NOTE: HTTPServer has a different name in python2 and python3
from http.server import HTTPServer
import http.server as SimpleHTTPServer
import time
import threading
import socket
__all__ = []
......
......@@ -13,7 +13,7 @@
# limitations under the License.
from collections import defaultdict
from paddle.fluid.framework import Program, Block, Operator
from paddle.fluid.framework import Block, Program
from paddle.fluid.framework import _non_static_mode
import paddle.fluid.core as core
import paddle.distributed.fleet as fleet
......
......@@ -11,16 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import six
import numpy as np
from paddle import framework
import paddle
from paddle.fluid import core
from paddle.fluid.dygraph.parallel import _split_tensors, sync_params_buffers, build_groups
from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph
from collections import OrderedDict
from .log_util import logger
__all__ = []
......
......@@ -22,8 +22,6 @@
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import os
import time
import numpy as np
import paddle
......
......@@ -13,7 +13,6 @@
# limitations under the License.
import logging
import sys
from paddle.distributed.utils.log_utils import get_logger
......
......@@ -13,7 +13,6 @@
# limitations under the License.
"""Parameter Server utils"""
import numpy as np
import os
import paddle
import warnings
......@@ -85,8 +84,6 @@ class DistributedInfer:
return self.sparse_table_maps
def _init_dense_params(self, exe=None, dirname=None):
import paddle.distributed.fleet as fleet
sparse_table_maps = self._get_sparse_table_map()
if dirname is not None and exe is not None:
......
......@@ -16,9 +16,6 @@ from .controller import Controller, ControleMode
from ..context.device import DeviceType
import json
import os
import six
import time
class CollectiveController(Controller):
......
......@@ -23,8 +23,6 @@ from paddle.distributed.launch.job.container import Container
from .master import Master
from .watcher import Watcher
import time
class ControleMode:
COLLECTIVE = "collective"
......
......@@ -12,12 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
from paddle.distributed.launch.utils.process_context import ProcessContext
from .status import Status
import os, copy, sys
import os
import sys
class Container(object):
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
from .container import Container
from .status import Status
......
......@@ -17,7 +17,7 @@ import paddle
from paddle.distributed import fleet
from paddle.vision.models import ResNet
from paddle.vision.models.resnet import BottleneckBlock
from paddle.io import Dataset, BatchSampler, DataLoader
from paddle.io import DataLoader, Dataset
base_lr = 0.1
momentum_rate = 0.9
......
......@@ -13,7 +13,6 @@
# limitations under the License.
import subprocess
import shlex
import os
import json
import shutil
......
......@@ -14,7 +14,6 @@
import sys
import yaml
import paddle.fluid as fluid
import logging
from paddle.distributed.utils.log_utils import get_logger
......
......@@ -14,9 +14,9 @@
from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode
from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
def _number_count(numbers, upper_range):
......
......@@ -13,16 +13,12 @@
# limitations under the License.
import os
import six
import warnings
from multiprocessing import Process # noqa: F401
from multiprocessing import Manager # noqa: F401
import time
import sys
import paddle
from paddle import compat as cpt
# deprecated module import
from paddle.fluid import core
from paddle.fluid.framework import in_dygraph_mode
......@@ -31,11 +27,9 @@ from paddle.fluid.dygraph import parallel_helper
from paddle.distributed.fleet.launch_utils import check_backend
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.distributed.fleet.base.private_helper_function import wait_server_ready # noqa: F401
from paddle.distributed import collective
from paddle.distributed.collective import _set_group_map
from paddle.distributed.collective import _set_group_map_by_name
from paddle.distributed.collective import _get_group_map_by_name
from paddle.distributed.collective import _group_map_by_name
from paddle.distributed.collective import _default_group_name
from paddle.distributed.collective import _valid_backend_list
from paddle.distributed.collective import _set_default_backend
......
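Several hunks in this file keep imports that exist only so other modules can import them from here, and tag them with `# noqa: F401` so the unused-import check is silenced for exactly that line. A minimal sketch of the convention, using a hypothetical package and names:

# hypothetical pkg/__init__.py: re-export names without using them locally
from ._impl import Process  # noqa: F401
from ._impl import Manager  # noqa: F401

# an alternative is to declare the public surface explicitly;
# pyflakes treats names listed in __all__ as used, so no noqa is needed
__all__ = ["Process", "Manager"]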
......@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
import warnings
from multiprocessing import Process, Manager
# deprecated module import
......
......@@ -16,11 +16,11 @@ from collections import OrderedDict
import numpy as np
import paddle
from paddle.fluid import core, unique_name
from paddle.fluid import unique_name
from paddle.fluid.framework import default_main_program
from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole
from paddle.distributed.auto_parallel.operators.common import is_data_parallel_scale_op, is_data_parallel_reduce_op
from paddle.distributed.auto_parallel.utils import is_loss_grad_op, is_optimize_op, is_backward_op, ring_id_to_process_group, find_higher_order_backward_op
from paddle.distributed.auto_parallel.utils import find_higher_order_backward_op, is_loss_grad_op, is_optimize_op, ring_id_to_process_group
from .pass_base import PassBase, PassType, register_pass
# add new optimizers supporting rescale_grad here
......
......@@ -17,7 +17,6 @@ from functools import reduce
import paddle
from paddle.fluid import core
from .pass_base import PassBase, register_pass
from ..auto_parallel.reshard import Resharder
from ..auto_parallel.process_group import get_world_process_group
......
......@@ -12,14 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from collections import OrderedDict
from typing import List, Tuple, Dict, Any
import paddle
from paddle.framework import core
from paddle.fluid import layers
from paddle.fluid.framework import program_guard, device_guard
from paddle.fluid.framework import device_guard
from .pass_base import PassBase, PassType, register_pass
from paddle.distributed.auto_parallel.utils import set_var_dist_attr, is_optimize_op, OpRole, OP_ROLE_KEY
from paddle.distributed.auto_parallel.utils import naive_set_dist_op_attr_for_program_by_mesh_and_mapping
......
......@@ -12,16 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import logging
from .pass_base import PassBase, register_pass
from paddle.fluid import core, unique_name
from paddle.fluid import framework as framework
from paddle.fluid.framework import Variable, Operator
from paddle.fluid.framework import Variable
from paddle.fluid.backward import _append_grad_suffix_, _get_no_grad_set_name
from paddle.fluid.backward import ProgramStats, _rename_arg_, _find_op_path_
from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute
from paddle.distributed.auto_parallel.utils import get_loss_op, set_var_dist_attr, set_dist_op_desc_original_id
from paddle.distributed.auto_parallel.utils import naive_set_dist_op_attr_for_program_by_mesh_and_mapping
......
......@@ -13,10 +13,7 @@
# limitations under the License.
from functools import reduce
from collections import OrderedDict
import numpy as np
import paddle
from paddle.framework import core
from paddle.fluid import unique_name
from .pass_base import PassBase, register_pass
......
......@@ -15,7 +15,6 @@
from paddle.framework import core
from paddle.fluid import unique_name
from .pass_base import PassBase, PassType, register_pass
from collections import OrderedDict
import numpy as np
......
......@@ -12,10 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import sys
from abc import ABC, abstractmethod
from paddle.fluid.framework import program_guard, _apply_pass as _apply_cpp_pass
from paddle.fluid.framework import _apply_pass as _apply_cpp_pass
class PassContext:
......
......@@ -12,13 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from ..ps.utils.public import *
from paddle.framework import core
from .pass_base import PassBase, register_pass
from paddle.optimizer.lr import LRScheduler
from paddle.optimizer.lr import ExponentialDecay, NoamDecay, PiecewiseDecay, NaturalExpDecay, InverseTimeDecay
from paddle.fluid.layers.learning_rate_scheduler import exponential_decay, noam_decay, piecewise_decay, natural_exp_decay, inverse_time_decay
from paddle.optimizer.lr import ExponentialDecay, InverseTimeDecay, NaturalExpDecay, NoamDecay
from paddle.fluid.layers.learning_rate_scheduler import exponential_decay, inverse_time_decay, natural_exp_decay, noam_decay
@register_pass("add_lr_decay_table_pass")
......
......@@ -20,7 +20,7 @@ from paddle.framework import core
from paddle.distributed.passes.pass_base import PassBase, register_pass
from paddle.fluid.transpiler.details.program_utils import delete_ops
from paddle.fluid.transpiler.collective import SingleProcessMultiThread
from _collections import deque, defaultdict
from _collections import defaultdict
from paddle.fluid.framework import Program, Parameter
......
......@@ -23,7 +23,6 @@ from paddle.fluid.framework import Program
from paddle.fluid.compiler import CompiledProgram
from paddle.fluid.executor import Executor
from paddle.fluid.parallel_executor import ParallelExecutor
from paddle.fluid.framework import Variable, Parameter
from paddle.distributed.fleet.runtime.runtime_base import RuntimeBase
from paddle.distributed.fleet.base.private_helper_function import wait_server_ready
from paddle.distributed.fleet.proto import the_one_ps_pb2
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from .ps_program_builder import *
from .public import *
......
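The hunk above drops one wildcard import while keeping another. F401 cannot fire on `import *`, because the linter cannot see which of the star-imported names are actually used (flake8 reports F403/F405 for that instead); expanding a star import into explicit names is what makes unused ones detectable. A small sketch with hypothetical names:

# before: opaque to the unused-import check
# from .public import *

# after: explicit imports; anything unused here would now be reported as F401
from .public import get_ps_endpoint, get_role_id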
......@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from .public import *
from paddle.distributed.fleet.base.private_helper_function import wait_server_ready
from paddle.distributed.passes import new_pass, PassContext
from paddle.distributed.passes import new_pass
class PsProgramBuilder(object):
......
......@@ -15,7 +15,6 @@
from functools import reduce
import collections
import math
import os
import warnings
import logging
......
......@@ -14,7 +14,6 @@
import os
import logging
from enum import Enum
import paddle
......
......@@ -28,7 +28,7 @@ from paddle.device import get_device
# deprecated module import
from paddle.fluid import core
from paddle.fluid.framework import _cpu_num, set_flags
from paddle.fluid.framework import set_flags
__all__ = []
......
......@@ -20,7 +20,6 @@ import sys
import subprocess
from contextlib import closing
import socket
from paddle.fluid import core
from distutils.util import strtobool
import six
......