From 7f6968049492c90666d5126579bb1ca5f394030c Mon Sep 17 00:00:00 2001
From: Yulong Ao
Date: Tue, 30 May 2023 14:07:49 +0800
Subject: [PATCH] [Auto Parallel] Reorganize the fold structure (#54059)

* [Auto Parallel] Reorganize the fold structure

* [Auto Parallel] Fix some import errors
---
 .../distributed/auto_parallel/__init__.py     |  2 +-
 .../auto_parallel/dygraph/__init__.py         | 13 ++++++++++
 .../distributed/auto_parallel/interface.py    |  8 +++----
 .../distributed/auto_parallel/process_mesh.py | 10 ++++----
 .../distributed/auto_parallel/random.py       |  2 +-
 .../auto_parallel/static/__init__.py          | 13 ++++++++++
 .../{ => static}/auto_align_tool.py           |  6 ++---
 .../auto_parallel/{ => static}/callbacks.py   |  2 +-
 .../auto_parallel/{ => static}/cluster.py     |  2 +-
 .../auto_parallel/{ => static}/cluster_v2.py  |  0
 .../auto_parallel/{ => static}/completion.py  |  6 ++---
 .../auto_parallel/{ => static}/converter.py   |  2 +-
 .../{ => static}/cost/__init__.py             |  0
 .../{ => static}/cost/base_cost.py            |  0
 .../{ => static}/cost/comm_op_cost.py         |  0
 .../{ => static}/cost/comp_op_cost.py         |  0
 .../{ => static}/cost/estimate_cost.py        |  0
 .../{ => static}/cost/tensor_cost.py          |  4 +++-
 .../auto_parallel/{ => static}/cost_model.py  |  0
 .../{ => static}/dist_attribute.py            |  0
 .../{ => static}/dist_context.py              |  2 +-
 .../auto_parallel/{ => static}/dist_loader.py |  0
 .../auto_parallel/{ => static}/dist_op.py     |  0
 .../auto_parallel/{ => static}/dist_saver.py  |  2 +-
 .../auto_parallel/{ => static}/dist_tensor.py |  0
 .../auto_parallel/{ => static}/engine.py      |  8 +++---
 .../auto_parallel/{ => static}/graph.py       |  0
 .../auto_parallel/{ => static}/helper.py      |  0
 .../auto_parallel/{ => static}/mapper.py      |  0
 .../{ => static}/operators/__init__.py        |  0
 .../{ => static}/operators/common.py          |  0
 .../{ => static}/operators/dist_assign.py     |  0
 .../dist_check_finite_and_unscale.py          |  2 +-
 .../{ => static}/operators/dist_default.py    |  0
 .../{ => static}/operators/dist_dropout.py    |  4 ++--
 .../{ => static}/operators/dist_eltwise.py    |  0
 .../{ => static}/operators/dist_embedding.py  |  2 +-
 .../dist_fill_constant_batch_size_like.py     |  0
 .../{ => static}/operators/dist_flash_attn.py |  4 ++--
 .../operators/dist_fused_attention.py         |  0
 .../operators/dist_fused_dropout_add.py       |  4 ++--
 .../operators/dist_fused_feedforward.py       |  0
 .../{ => static}/operators/dist_matmul.py     |  2 +-
 .../{ => static}/operators/dist_pnorm.py      |  0
 .../operators/dist_reduce_sum_p.py            |  0
 .../{ => static}/operators/dist_reshape.py    |  0
 .../{ => static}/operators/dist_scale.py      |  0
 .../{ => static}/operators/dist_shape.py      |  0
 .../{ => static}/operators/dist_slice.py      |  0
 .../{ => static}/operators/dist_softmax.py    |  0
 .../{ => static}/operators/dist_split.py      |  0
 .../{ => static}/operators/dist_transpose.py  |  0
 .../operators/dist_update_loss_scaling.py     |  0
 .../{ => static}/parallelizer.py              |  0
 .../{ => static}/parallelizer_v2.py           |  4 ++--
 .../auto_parallel/{ => static}/partitioner.py |  6 +++--
 .../auto_parallel/{ => static}/planner.py     |  0
 .../auto_parallel/{ => static}/planner_v2.py  | 12 ++++++----
 .../{ => static}/process_group.py             |  4 ++--
 .../{ => static}/process_mesh_v2.py           |  0
 .../auto_parallel/{ => static}/reshard.py     |  0
 .../auto_parallel/{ => static}/topology.py    |  0
 .../{ => static}/tuner/__init__.py            |  0
 .../{ => static}/tuner/algorithms.py          |  0
 .../{ => static}/tuner/config.py              |  2 +-
 .../{ => static}/tuner/optimization_tuner.py  | 16 +++++++------
 .../{ => static}/tuner/parallel_tuner.py      |  2 +-
 .../{ => static}/tuner/profiler.py            |  4 ++--
 .../{ => static}/tuner/recorder.py            |  0
 .../{ => static}/tuner/rule_based_tuner.py    | 24 +++++++++++--------
 .../{ => static}/tuner/storable.py            |  0
 .../auto_parallel/{ => static}/tuner/trial.py |  0
 .../{ => static}/tuner/tunable_space.py       |  0
 .../{ => static}/tuner/tunable_variable.py    |  0
 .../auto_parallel/{ => static}/utils.py       | 12 +++++-----
 python/paddle/distributed/fleet/fleet.py      |  2 +-
 .../distributed/passes/auto_parallel_amp.py   | 10 ++++----
 ...uto_parallel_data_parallel_optimization.py |  8 +++----
 .../distributed/passes/auto_parallel_fp16.py  |  8 ++++---
 .../passes/auto_parallel_grad_clip.py         | 15 +++++++-----
 .../passes/auto_parallel_gradient_merge.py    |  6 ++---
 .../passes/auto_parallel_quantization.py      |  7 ++++--
 .../passes/auto_parallel_recompute.py         |  4 ++--
 .../passes/auto_parallel_sharding.py          |  8 ++++---
 ...rallel_supplement_explicit_dependencies.py |  4 ++--
 python/paddle/fluid/backward.py               |  2 +-
 python/paddle/fluid/framework.py              |  4 ++--
 .../unittests/auto_parallel_autoconvert.py    |  4 ++--
 .../unittests/auto_parallel_save_load.py      |  2 +-
 .../fleet/dygraph_save_for_auto_infer.py      |  2 +-
 .../unittests/test_auto_parallel_cluster.py   |  2 +-
 .../test_auto_parallel_completion.py          |  8 ++++---
 .../test_auto_parallel_completion_gpt.py      |  6 +++--
 .../test_auto_parallel_cost_model.py          | 16 ++++++++-----
 .../test_auto_parallel_dist_tensor.py         | 20 +++++++++++-----
 .../unittests/test_auto_parallel_graph.py     |  2 +-
 .../unittests/test_auto_parallel_mapper.py    | 18 ++++++++------
 .../test_auto_parallel_partitioner.py         | 14 +++++++----
 .../test_auto_parallel_partitioner_gpt.py     | 14 +++++++----
 .../unittests/test_auto_parallel_reshard.py   | 16 ++++++++-----
 .../test_auto_parallel_reshard_dpmppp.py      | 14 +++++++----
 .../test_auto_parallel_reshard_mppp.py        | 18 ++++++++------
 .../test_auto_parallel_reshard_serial.py      |  9 ++++++-
 .../unittests/test_auto_parallel_searcher.py  | 16 ++++++++-----
 .../test_auto_search_dist_matmul_op.py        |  8 ++++---
 .../unittests/test_auto_search_dist_op.py     |  8 ++++---
 python/setup.py.in                            |  8 ++++---
 setup.py                                      |  8 ++++---
 test/auto_parallel/amp_o2_pass.py             |  5 +++-
 ...auto_parallel_relaunch_with_gpt_planner.py |  4 ++--
 .../auto_parallel_relaunch_with_planner.py    |  6 ++---
 test/auto_parallel/converter.py               |  2 +-
 test/auto_parallel/test_align_tool.py         |  4 +++-
 test/auto_parallel/test_base_cost.py          | 16 ++++++++-----
 test/auto_parallel/test_cluster.py            |  2 +-
 test/auto_parallel/test_cluster_partition.py  |  2 +-
 test/auto_parallel/test_cluster_v2.py         |  2 +-
 test/auto_parallel/test_comm_cost.py          |  4 ++--
 test/auto_parallel/test_comp_cost.py          |  4 ++--
 .../test_convert_to_process_meshes.py         |  2 +-
 test/auto_parallel/test_converter.py          |  2 +-
 test/auto_parallel/test_dist_assign.py        |  8 ++++---
 test/auto_parallel/test_dist_attr_v2.py       |  6 ++---
 test/auto_parallel/test_dist_context.py       |  4 +++-
 test/auto_parallel/test_dist_matmul.py        |  8 ++++---
 test/auto_parallel/test_dist_op_cost.py       | 10 ++++----
 test/auto_parallel/test_dist_pnorm.py         |  8 ++++---
 test/auto_parallel/test_dist_reshape.py       |  8 ++++---
 test/auto_parallel/test_dist_scale.py         |  8 ++++---
 test/auto_parallel/test_dist_shape.py         |  8 ++++---
 test/auto_parallel/test_dist_slice.py         |  8 ++++---
 test/auto_parallel/test_dist_split.py         |  8 ++++---
 test/auto_parallel/test_engine_callbacks.py   |  2 +-
 test/auto_parallel/test_fp16_assign.py        |  8 ++++---
 test/auto_parallel/test_group_operators.py    |  4 ++--
 test/auto_parallel/test_interface.py          |  4 ++--
 test/auto_parallel/test_new_cost_model.py     |  8 +++----
 test/auto_parallel/test_parallel_tuner.py     | 10 ++++----
 .../auto_parallel/test_parallel_tuner_full.py | 12 ++++++----
 .../test_parallel_tuner_predict.py            | 10 ++++----
 test/auto_parallel/test_pattern.py            |  2 +-
 test/auto_parallel/test_pattern_match.py      |  4 ++--
 test/auto_parallel/test_prim_dist_op.py       |  8 +++----
 test/auto_parallel/test_process_mesh.py       |  6 ++---
 test/auto_parallel/test_process_mesh_v2.py    |  2 +-
 test/auto_parallel/test_recorder.py           |  2 +-
 test/auto_parallel/test_rule_based_tuner.py   |  6 ++---
 .../auto_parallel/test_rule_based_tuner_o2.py |  6 ++---
 test/auto_parallel/test_serialization.py      |  4 ++--
 test/auto_parallel/test_to_static.py          |  5 +++-
 test/auto_parallel/test_topology.py           |  2 +-
 test/auto_parallel/test_trial.py              |  4 ++--
 test/auto_parallel/test_tunable_space.py      |  2 +-
 test/auto_parallel/test_tunable_variable.py   |  2 +-
 .../auto_parallel/test_while_op_completion.py |  6 +++--
 test/auto_parallel/test_while_op_partition.py |  8 +++----
 ...arallel_data_parallel_optimization_pass.py |  4 ++--
 157 files changed, 449 insertions(+), 298 deletions(-)
 create mode 100644 python/paddle/distributed/auto_parallel/dygraph/__init__.py
 create mode 100644 python/paddle/distributed/auto_parallel/static/__init__.py
 rename python/paddle/distributed/auto_parallel/{ => static}/auto_align_tool.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/callbacks.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/cluster.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/cluster_v2.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/completion.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/converter.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/cost/__init__.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/cost/base_cost.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/cost/comm_op_cost.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/cost/comp_op_cost.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/cost/estimate_cost.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/cost/tensor_cost.py (97%)
 rename python/paddle/distributed/auto_parallel/{ => static}/cost_model.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/dist_attribute.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/dist_context.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/dist_loader.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/dist_op.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/dist_saver.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/dist_tensor.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/engine.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/graph.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/helper.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/mapper.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/__init__.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/common.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_assign.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_check_finite_and_unscale.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_default.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_dropout.py (98%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_eltwise.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_embedding.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_fill_constant_batch_size_like.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_flash_attn.py (97%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_fused_attention.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_fused_dropout_add.py (98%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_fused_feedforward.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_matmul.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_pnorm.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_reduce_sum_p.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_reshape.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_scale.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_shape.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_slice.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_softmax.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_split.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_transpose.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/operators/dist_update_loss_scaling.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/parallelizer.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/parallelizer_v2.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/partitioner.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/planner.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/planner_v2.py (96%)
 rename python/paddle/distributed/auto_parallel/{ => static}/process_group.py (98%)
 rename python/paddle/distributed/auto_parallel/{ => static}/process_mesh_v2.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/reshard.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/topology.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/tuner/__init__.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/tuner/algorithms.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/tuner/config.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/tuner/optimization_tuner.py (97%)
 rename python/paddle/distributed/auto_parallel/{ => static}/tuner/parallel_tuner.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/tuner/profiler.py (98%)
 rename python/paddle/distributed/auto_parallel/{ => static}/tuner/recorder.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/tuner/rule_based_tuner.py (99%)
 rename python/paddle/distributed/auto_parallel/{ => static}/tuner/storable.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/tuner/trial.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/tuner/tunable_space.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/tuner/tunable_variable.py (100%)
 rename python/paddle/distributed/auto_parallel/{ => static}/utils.py (99%)

diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py
index 835ca68df2d..4486b3220fa 100644
--- a/python/paddle/distributed/auto_parallel/__init__.py
+++ b/python/paddle/distributed/auto_parallel/__init__.py
@@ -14,7 +14,7 @@
 from .strategy import Strategy
 from .process_mesh import ProcessMesh
-from .engine import Engine
+from .static.engine import Engine
 from .interface import shard_tensor
 from .interface import shard_op
 from .interface import recompute
diff --git a/python/paddle/distributed/auto_parallel/dygraph/__init__.py b/python/paddle/distributed/auto_parallel/dygraph/__init__.py
new file mode 100644
index 00000000000..1ee2fa6eb06
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/dygraph/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/distributed/auto_parallel/interface.py b/python/paddle/distributed/auto_parallel/interface.py
index 76207bc5889..06a24b0c543 100644
--- a/python/paddle/distributed/auto_parallel/interface.py
+++ b/python/paddle/distributed/auto_parallel/interface.py
@@ -14,11 +14,11 @@
 
 import paddle
 
-from .dist_context import get_default_distributed_context
-from .dist_op import DistributedOperatorHelper
-from .dist_tensor import DistributedTensor
 from .process_mesh import ProcessMesh, get_current_process_mesh
-from .utils import (
+from .static.dist_context import get_default_distributed_context
+from .static.dist_op import DistributedOperatorHelper
+from .static.dist_tensor import DistributedTensor
+from .static.utils import (
     __no_shape_var_type__,
     convert_to_dims_mapping,
     verify_shard_spec,
diff --git a/python/paddle/distributed/auto_parallel/process_mesh.py b/python/paddle/distributed/auto_parallel/process_mesh.py
index e2ccd16aaaa..1c2f292e5f8 100644
--- a/python/paddle/distributed/auto_parallel/process_mesh.py
+++ b/python/paddle/distributed/auto_parallel/process_mesh.py
@@ -140,12 +140,12 @@ class ProcessMesh(core.ProcessMesh):
         )
 
         # Store all process meshes
-        from .dist_context import get_default_distributed_context
+        from .static.dist_context import get_default_distributed_context
 
         default_dist_cxt = get_default_distributed_context()
         default_dist_cxt.add_process_mesh(self)
         # Add new processes to process group 0
-        from .process_group import get_process_group
+        from .static.process_group import get_process_group
 
         pg0 = get_process_group(0)
         pg0.add_ranks(self.process_ids)
@@ -204,14 +204,14 @@ class ProcessMesh(core.ProcessMesh):
         self._old_op_size = len(cur_block.ops)
 
     def __exit__(self, exc_type, exc_value, exc_traceback):
-        from .dist_op import DistributedOperator
-        from .dist_tensor import DistributedTensor
+        from .static.dist_op import DistributedOperator
+        from .static.dist_tensor import DistributedTensor
 
         default_prog = paddle.static.default_main_program()
         cur_block = default_prog.current_block()
         new_var_names = list(cur_block.vars.keys())
         new_op_size = len(cur_block.ops)
-        from .dist_context import get_default_distributed_context
+        from .static.dist_context import get_default_distributed_context
 
         default_dist_ctx = get_default_distributed_context()
         for name in new_var_names:
diff --git a/python/paddle/distributed/auto_parallel/random.py b/python/paddle/distributed/auto_parallel/random.py
index 5ca6d9e9ea0..d238fd60232 100644
--- a/python/paddle/distributed/auto_parallel/random.py
+++ b/python/paddle/distributed/auto_parallel/random.py
@@ -17,7 +17,7 @@ import paddle
 
 from ..utils.log_utils import get_logger
 from .process_mesh import retrive_unique_id_for_process_mesh
-from .utils import _get_idx_in_axis
+from .static.utils import _get_idx_in_axis
 
 _logger = get_logger(logging.INFO)
diff --git a/python/paddle/distributed/auto_parallel/static/__init__.py b/python/paddle/distributed/auto_parallel/static/__init__.py
new file mode 100644
index 00000000000..6f0ea85344b
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/static/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/distributed/auto_parallel/auto_align_tool.py b/python/paddle/distributed/auto_parallel/static/auto_align_tool.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/auto_align_tool.py
rename to python/paddle/distributed/auto_parallel/static/auto_align_tool.py
index 76a8db09fdc..2cd9e4a05d9 100644
--- a/python/paddle/distributed/auto_parallel/auto_align_tool.py
+++ b/python/paddle/distributed/auto_parallel/static/auto_align_tool.py
@@ -21,11 +21,11 @@ import numpy as np
 
 import paddle
 import paddle.distributed as dist
-from paddle.distributed.auto_parallel.converter import Converter
-from paddle.distributed.auto_parallel.dist_context import (
+from paddle.distributed.auto_parallel.static.converter import Converter
+from paddle.distributed.auto_parallel.static.dist_context import (
     get_default_distributed_context,
 )
-from paddle.distributed.auto_parallel.utils import (
+from paddle.distributed.auto_parallel.static.utils import (
     is_backward_op,
     is_forward_op,
     is_loss_op,
diff --git a/python/paddle/distributed/auto_parallel/callbacks.py b/python/paddle/distributed/auto_parallel/static/callbacks.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/callbacks.py
rename to python/paddle/distributed/auto_parallel/static/callbacks.py
index db7f460b0f0..6cbfaceee34 100644
--- a/python/paddle/distributed/auto_parallel/callbacks.py
+++ b/python/paddle/distributed/auto_parallel/static/callbacks.py
@@ -24,7 +24,7 @@ from paddle.hapi.callbacks import (
     ProgBarLogger,
 )
 
-from .interface import CollectionNames, get_collection
+from ..interface import CollectionNames, get_collection
 
 
 def config_callbacks(
diff --git a/python/paddle/distributed/auto_parallel/cluster.py b/python/paddle/distributed/auto_parallel/static/cluster.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/cluster.py
rename to python/paddle/distributed/auto_parallel/static/cluster.py
index 93740436970..c5df57be2bf 100644
--- a/python/paddle/distributed/auto_parallel/cluster.py
+++ b/python/paddle/distributed/auto_parallel/static/cluster.py
@@ -20,7 +20,7 @@ from enum import IntEnum, unique
 
 import paddle
 
-from ..utils.log_utils import get_logger
+from ...utils.log_utils import get_logger
 
 
 @unique
diff --git a/python/paddle/distributed/auto_parallel/cluster_v2.py b/python/paddle/distributed/auto_parallel/static/cluster_v2.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/cluster_v2.py
rename to python/paddle/distributed/auto_parallel/static/cluster_v2.py
diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/static/completion.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/completion.py
rename to python/paddle/distributed/auto_parallel/static/completion.py
index 5f2ab7e102b..cd505be0289 100644
--- a/python/paddle/distributed/auto_parallel/completion.py
+++ b/python/paddle/distributed/auto_parallel/static/completion.py
@@ -18,11 +18,11 @@ import logging
 
 from paddle.distributed.fleet.meta_optimizers.common import OpRole
 from paddle.framework import core
 
+from ..process_mesh import ProcessMesh, compute_compatible_process_mesh
 from .dist_attribute import OperatorDistAttr, TensorDistAttr
 from .dist_context import _node_id
 from .operators import find_compatible_distributed_operator_impls
 from .process_group import get_world_process_group
-from .process_mesh import ProcessMesh, compute_compatible_process_mesh
 from .utils import (
     __no_shape_var_type__,
     get_logger,
@@ -1641,7 +1641,7 @@ class Completer:
         """Complete the annotation of vars and ops in the update phase for parallel program."""
         # Copy the dist tensors and dist ops annotated by users from the default context
         # global mesh
-        from paddle.distributed.auto_parallel.process_group import (
+        from paddle.distributed.auto_parallel.static.process_group import (
             get_world_process_group,
         )
 
@@ -1895,7 +1895,7 @@ class Completer:
     def _init_global_mesh_for_program(self):
         # Copy the dist tensors and dist ops annotated by users from the default context
         # global mesh
-        from paddle.distributed.auto_parallel.process_group import (
+        from paddle.distributed.auto_parallel.static.process_group import (
            get_world_process_group,
        )
diff --git a/python/paddle/distributed/auto_parallel/converter.py b/python/paddle/distributed/auto_parallel/static/converter.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/converter.py
rename to python/paddle/distributed/auto_parallel/static/converter.py
index 65df19ad69c..68f571857d7 100644
--- a/python/paddle/distributed/auto_parallel/converter.py
+++ b/python/paddle/distributed/auto_parallel/static/converter.py
@@ -19,7 +19,7 @@ import numpy as np
 
 import paddle
 
-from ..utils.log_utils import get_logger
+from ...utils.log_utils import get_logger
 
 
 class Converter:
diff --git a/python/paddle/distributed/auto_parallel/cost/__init__.py b/python/paddle/distributed/auto_parallel/static/cost/__init__.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/cost/__init__.py
rename to python/paddle/distributed/auto_parallel/static/cost/__init__.py
diff --git a/python/paddle/distributed/auto_parallel/cost/base_cost.py b/python/paddle/distributed/auto_parallel/static/cost/base_cost.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/cost/base_cost.py
rename to python/paddle/distributed/auto_parallel/static/cost/base_cost.py
diff --git a/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py b/python/paddle/distributed/auto_parallel/static/cost/comm_op_cost.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/cost/comm_op_cost.py
rename to python/paddle/distributed/auto_parallel/static/cost/comm_op_cost.py
diff --git a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/static/cost/comp_op_cost.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/cost/comp_op_cost.py
rename to python/paddle/distributed/auto_parallel/static/cost/comp_op_cost.py
diff --git a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py b/python/paddle/distributed/auto_parallel/static/cost/estimate_cost.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/cost/estimate_cost.py
rename to python/paddle/distributed/auto_parallel/static/cost/estimate_cost.py
diff --git a/python/paddle/distributed/auto_parallel/cost/tensor_cost.py b/python/paddle/distributed/auto_parallel/static/cost/tensor_cost.py
similarity index 97%
rename from python/paddle/distributed/auto_parallel/cost/tensor_cost.py
rename to python/paddle/distributed/auto_parallel/static/cost/tensor_cost.py
index 6567088cae9..17d3b047608 100644
--- a/python/paddle/distributed/auto_parallel/cost/tensor_cost.py
+++ b/python/paddle/distributed/auto_parallel/static/cost/tensor_cost.py
@@ -15,7 +15,9 @@
 from functools import reduce
 
 import paddle
-from paddle.distributed.auto_parallel.dist_tensor import DistributedTensor
+from paddle.distributed.auto_parallel.static.dist_tensor import (
+    DistributedTensor,
+)
 from paddle.static import Variable
 
 from .base_cost import Cost
diff --git a/python/paddle/distributed/auto_parallel/cost_model.py b/python/paddle/distributed/auto_parallel/static/cost_model.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/cost_model.py
rename to python/paddle/distributed/auto_parallel/static/cost_model.py
diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/static/dist_attribute.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/dist_attribute.py
rename to python/paddle/distributed/auto_parallel/static/dist_attribute.py
diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/static/dist_context.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/dist_context.py
rename to python/paddle/distributed/auto_parallel/static/dist_context.py
index f3418f27182..df774d79774 100644
--- a/python/paddle/distributed/auto_parallel/dist_context.py
+++ b/python/paddle/distributed/auto_parallel/static/dist_context.py
@@ -18,9 +18,9 @@ from collections import defaultdict
 from paddle.distributed.passes import PassContext
 from paddle.framework import IrGraph, core, set_flags
 
+from ..process_mesh import ProcessMesh
 from .dist_op import DistributedOperator
 from .dist_tensor import DistributedTensor
-from .process_mesh import ProcessMesh
 from .utils import (
     __no_shape_var_type__,
     _copy_dist_attr_to_cpp,
diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/static/dist_loader.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/dist_loader.py
rename to python/paddle/distributed/auto_parallel/static/dist_loader.py
diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/static/dist_op.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/dist_op.py
rename to python/paddle/distributed/auto_parallel/static/dist_op.py
diff --git a/python/paddle/distributed/auto_parallel/dist_saver.py b/python/paddle/distributed/auto_parallel/static/dist_saver.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/dist_saver.py
rename to python/paddle/distributed/auto_parallel/static/dist_saver.py
index 9e99c58d848..26b9c32c92c 100644
--- a/python/paddle/distributed/auto_parallel/dist_saver.py
+++ b/python/paddle/distributed/auto_parallel/static/dist_saver.py
@@ -23,7 +23,7 @@ import numpy as np
 import paddle
 from paddle.framework import core
 
-from ..utils.log_utils import get_logger
+from ...utils.log_utils import get_logger
 from .process_group import _g_process_group_map
 from .utils import get_dist_attr
diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/static/dist_tensor.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/dist_tensor.py
rename to python/paddle/distributed/auto_parallel/static/dist_tensor.py
diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/engine.py
rename to python/paddle/distributed/auto_parallel/static/engine.py
index 7a979a86420..4ab2d4a7c9a 100644
--- a/python/paddle/distributed/auto_parallel/engine.py
+++ b/python/paddle/distributed/auto_parallel/static/engine.py
@@ -22,7 +22,7 @@ import random
 import numpy as np
 
 import paddle
-import paddle.distributed.auto_parallel.utils as auto_utils
+import paddle.distributed.auto_parallel.static.utils as auto_utils
 from paddle import static, utils
 from paddle.distributed import fleet
 from paddle.fluid.executor import _to_name_str
@@ -32,7 +32,9 @@ from paddle.framework import core, in_dynamic_mode
 from paddle.metric import Metric
 from paddle.static import InputSpec, Operator, Variable, global_scope
 
-from ..utils.log_utils import get_logger
+from ...utils.log_utils import get_logger
+from ..interface import CollectionNames, fetch, get_collection
+from ..strategy import Strategy
 from .callbacks import config_callbacks
 from .cluster import Cluster, get_default_cluster
 from .converter import Converter
@@ -45,11 +47,9 @@ from .dist_loader import (
 )
 from .dist_op import DistributedOperator
 from .dist_saver import DistributedSaver
 from .helper import ProgramHelper
-from .interface import CollectionNames, fetch, get_collection
 from .parallelizer_v2 import Parallelizer
 from .planner_v2 import Planner
 from .process_group import get_all_process_groups, new_process_group
-from .strategy import Strategy
 
 
 class Engine:
diff --git a/python/paddle/distributed/auto_parallel/graph.py b/python/paddle/distributed/auto_parallel/static/graph.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/graph.py
rename to python/paddle/distributed/auto_parallel/static/graph.py
diff --git a/python/paddle/distributed/auto_parallel/helper.py b/python/paddle/distributed/auto_parallel/static/helper.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/helper.py
rename to python/paddle/distributed/auto_parallel/static/helper.py
diff --git a/python/paddle/distributed/auto_parallel/mapper.py b/python/paddle/distributed/auto_parallel/static/mapper.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/mapper.py
rename to python/paddle/distributed/auto_parallel/static/mapper.py
diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/static/operators/__init__.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/__init__.py
rename to python/paddle/distributed/auto_parallel/static/operators/__init__.py
diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/static/operators/common.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/common.py
rename to python/paddle/distributed/auto_parallel/static/operators/common.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_assign.py b/python/paddle/distributed/auto_parallel/static/operators/dist_assign.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_assign.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_assign.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py b/python/paddle/distributed/auto_parallel/static/operators/dist_check_finite_and_unscale.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_check_finite_and_unscale.py
index 2327793e459..b397903ee78 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_check_finite_and_unscale.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-from paddle.distributed.auto_parallel.process_group import (
+from paddle.distributed.auto_parallel.static.process_group import (
     get_world_process_group,
 )
 from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/static/operators/dist_default.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_default.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_default.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_dropout.py b/python/paddle/distributed/auto_parallel/static/operators/dist_dropout.py
similarity index 98%
rename from python/paddle/distributed/auto_parallel/operators/dist_dropout.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_dropout.py
index dde852e613e..a5af154f385 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_dropout.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_dropout.py
@@ -18,10 +18,10 @@ import paddle
 from paddle.framework import core
 from paddle.utils import unique_name
 
-from ...utils.log_utils import get_logger
+from ....utils.log_utils import get_logger
 
 _logger = get_logger(logging.INFO)
-from ..random import determinate_rng, is_enable_auto_rand_ctrl
+from ...random import determinate_rng, is_enable_auto_rand_ctrl
 from ..utils import (
     naive_set_dist_op_attr_for_program_by_mesh_and_mapping,
     set_var_dist_attr,
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/static/operators/dist_eltwise.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_eltwise.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_eltwise.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/static/operators/dist_embedding.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/operators/dist_embedding.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_embedding.py
index 4f13c89bb14..8e6bbae74df 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_embedding.py
@@ -13,7 +13,7 @@
 # limitations under the License
 
 from paddle.common_ops_import import check_dtype, check_variable_and_dtype
-from paddle.distributed.auto_parallel.cost.comm_op_cost import (
+from paddle.distributed.auto_parallel.static.cost.comm_op_cost import (
     AllreduceSumOpCost,
     IdentityOpCost,
 )
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py b/python/paddle/distributed/auto_parallel/static/operators/dist_fill_constant_batch_size_like.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_fill_constant_batch_size_like.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_flash_attn.py b/python/paddle/distributed/auto_parallel/static/operators/dist_flash_attn.py
similarity index 97%
rename from python/paddle/distributed/auto_parallel/operators/dist_flash_attn.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_flash_attn.py
index 331bdfd25ae..2812554eb0a 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_flash_attn.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_flash_attn.py
@@ -14,10 +14,10 @@
 
 import logging
 
-from ...utils.log_utils import get_logger
+from ....utils.log_utils import get_logger
 
 _logger = get_logger(logging.INFO)
-from ..random import determinate_rng, is_enable_auto_rand_ctrl
+from ...random import determinate_rng, is_enable_auto_rand_ctrl
 from .common import (
     DistributedOperatorImplContainer,
     register_distributed_operator_impl,
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py b/python/paddle/distributed/auto_parallel/static/operators/dist_fused_attention.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_fused_attention.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fused_dropout_add.py b/python/paddle/distributed/auto_parallel/static/operators/dist_fused_dropout_add.py
similarity index 98%
rename from python/paddle/distributed/auto_parallel/operators/dist_fused_dropout_add.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_fused_dropout_add.py
index 12612540a9a..a97309a587d 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_fused_dropout_add.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_fused_dropout_add.py
@@ -18,10 +18,10 @@ import paddle
 from paddle.framework import core
 from paddle.utils import unique_name
 
-from ...utils.log_utils import get_logger
+from ....utils.log_utils import get_logger
 
 _logger = get_logger(logging.INFO)
-from ..random import determinate_rng, is_enable_auto_rand_ctrl
+from ...random import determinate_rng, is_enable_auto_rand_ctrl
 from ..utils import (
     naive_set_dist_op_attr_for_program_by_mesh_and_mapping,
     set_var_dist_attr,
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py b/python/paddle/distributed/auto_parallel/static/operators/dist_fused_feedforward.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_fused_feedforward.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/static/operators/dist_matmul.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/operators/dist_matmul.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_matmul.py
index 8825e14d9ab..28eed81c6bc 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_matmul.py
@@ -15,7 +15,7 @@
 import copy
 
 from paddle.common_ops_import import check_dtype, check_variable_and_dtype
-from paddle.distributed.auto_parallel.cost.comm_op_cost import (
+from paddle.distributed.auto_parallel.static.cost.comm_op_cost import (
     AllreduceSumOpCost,
     IdentityOpCost,
 )
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py b/python/paddle/distributed/auto_parallel/static/operators/dist_pnorm.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_pnorm.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_pnorm.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py b/python/paddle/distributed/auto_parallel/static/operators/dist_reduce_sum_p.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_reduce_sum_p.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/static/operators/dist_reshape.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_reshape.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_reshape.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_scale.py b/python/paddle/distributed/auto_parallel/static/operators/dist_scale.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_scale.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_scale.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_shape.py b/python/paddle/distributed/auto_parallel/static/operators/dist_shape.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_shape.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_shape.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_slice.py b/python/paddle/distributed/auto_parallel/static/operators/dist_slice.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_slice.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_slice.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py b/python/paddle/distributed/auto_parallel/static/operators/dist_softmax.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_softmax.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_softmax.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_split.py b/python/paddle/distributed/auto_parallel/static/operators/dist_split.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_split.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_split.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py b/python/paddle/distributed/auto_parallel/static/operators/dist_transpose.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_transpose.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_transpose.py
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py b/python/paddle/distributed/auto_parallel/static/operators/dist_update_loss_scaling.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py
rename to python/paddle/distributed/auto_parallel/static/operators/dist_update_loss_scaling.py
diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/static/parallelizer.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/parallelizer.py
rename to python/paddle/distributed/auto_parallel/static/parallelizer.py
diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/parallelizer_v2.py
rename to python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
index 6807016c34f..8a5def0ec9d 100644
--- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py
+++ b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
@@ -20,10 +20,10 @@ from paddle.distributed.passes import PassManager, new_pass
 from paddle.static import append_backward, program_guard
 from paddle.utils import unique_name
 
-from ..utils.log_utils import get_logger
+from ...utils.log_utils import get_logger
+from ..random import init_auto_parallel_rng
 from .partitioner import Partitioner
 from .process_group import get_world_process_group
-from .random import init_auto_parallel_rng
 from .reshard import Resharder
 from .utils import set_grad_var_shape
diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/static/partitioner.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/partitioner.py
rename to python/paddle/distributed/auto_parallel/static/partitioner.py
index f542b49fdec..a0190c3d3c4 100644
--- a/python/paddle/distributed/auto_parallel/partitioner.py
+++ b/python/paddle/distributed/auto_parallel/static/partitioner.py
@@ -15,8 +15,10 @@
 import copy
 
 import paddle
-from paddle.distributed.auto_parallel.dist_context import DistributedContext
-from paddle.distributed.auto_parallel.operators.common import (
+from paddle.distributed.auto_parallel.static.dist_context import (
+    DistributedContext,
+)
+from paddle.distributed.auto_parallel.static.operators.common import (
     get_distributed_operator_impl_container,
 )
 from paddle.framework import Program, core
diff --git a/python/paddle/distributed/auto_parallel/planner.py b/python/paddle/distributed/auto_parallel/static/planner.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/planner.py
rename to python/paddle/distributed/auto_parallel/static/planner.py
diff --git a/python/paddle/distributed/auto_parallel/planner_v2.py b/python/paddle/distributed/auto_parallel/static/planner_v2.py
similarity index 96%
rename from python/paddle/distributed/auto_parallel/planner_v2.py
rename to python/paddle/distributed/auto_parallel/static/planner_v2.py
index efe154b1900..f0ac9253710 100755
--- a/python/paddle/distributed/auto_parallel/planner_v2.py
+++ b/python/paddle/distributed/auto_parallel/static/planner_v2.py
@@ -18,15 +18,17 @@ import pickle
 
 import numpy as np
 
-from paddle.distributed.auto_parallel.dist_attribute import (
+from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
+from paddle.distributed.auto_parallel.static.dist_attribute import (
     OperatorDistAttr,
     TensorDistAttr,
 )
-from paddle.distributed.auto_parallel.dist_op import DistributedOperator
-from paddle.distributed.auto_parallel.dist_tensor import DistributedTensor
-from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
+from paddle.distributed.auto_parallel.static.dist_op import DistributedOperator
+from paddle.distributed.auto_parallel.static.dist_tensor import (
+    DistributedTensor,
+)
 
-from ..utils.log_utils import get_logger
+from ...utils.log_utils import get_logger
 from .completion import Completer
 from .dist_context import get_default_distributed_context
 from .tuner.parallel_tuner import ParallelTuner
diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/static/process_group.py
similarity index 98%
rename from python/paddle/distributed/auto_parallel/process_group.py
rename to python/paddle/distributed/auto_parallel/static/process_group.py
index e7d8a758161..578ec21e808 100644
--- a/python/paddle/distributed/auto_parallel/process_group.py
+++ b/python/paddle/distributed/auto_parallel/static/process_group.py
@@ -17,8 +17,8 @@ from collections import OrderedDict
 import paddle
 from paddle.framework import core
 
-from ..collective import _get_global_env, _new_ring_id
-from ..utils.log_utils import get_logger
+from ...collective import _get_global_env, _new_ring_id
+from ...utils.log_utils import get_logger
 from .utils import dygraph_guard
 
 logger = get_logger("INFO", __name__)
diff --git a/python/paddle/distributed/auto_parallel/process_mesh_v2.py b/python/paddle/distributed/auto_parallel/static/process_mesh_v2.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/process_mesh_v2.py
rename to python/paddle/distributed/auto_parallel/static/process_mesh_v2.py
diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/static/reshard.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/reshard.py
rename to python/paddle/distributed/auto_parallel/static/reshard.py
diff --git a/python/paddle/distributed/auto_parallel/topology.py b/python/paddle/distributed/auto_parallel/static/topology.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/topology.py
rename to python/paddle/distributed/auto_parallel/static/topology.py
diff --git a/python/paddle/distributed/auto_parallel/tuner/__init__.py b/python/paddle/distributed/auto_parallel/static/tuner/__init__.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/tuner/__init__.py
rename to python/paddle/distributed/auto_parallel/static/tuner/__init__.py
diff --git a/python/paddle/distributed/auto_parallel/tuner/algorithms.py b/python/paddle/distributed/auto_parallel/static/tuner/algorithms.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/tuner/algorithms.py
rename to python/paddle/distributed/auto_parallel/static/tuner/algorithms.py
diff --git a/python/paddle/distributed/auto_parallel/tuner/config.py b/python/paddle/distributed/auto_parallel/static/tuner/config.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/tuner/config.py
rename to python/paddle/distributed/auto_parallel/static/tuner/config.py
index 78f94b87b36..28ab9536b9b 100644
--- a/python/paddle/distributed/auto_parallel/tuner/config.py
+++ b/python/paddle/distributed/auto_parallel/static/tuner/config.py
@@ -15,7 +15,7 @@
 import copy
 import os
 
-from ..strategy import Strategy
+from ...strategy import Strategy
 
 _tuning_supported_passes = ["sharding", "recompute"]
diff --git a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py b/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py
similarity index 97%
rename from python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py
rename to python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py
index b3a925070b3..8b3d23c68cb 100644
--- a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py
+++ b/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py
@@ -27,16 +27,18 @@ import sys
 import time
 
 import paddle
-from paddle.distributed.auto_parallel.completion import Completer
-from paddle.distributed.auto_parallel.dist_context import DistributedContext
-from paddle.distributed.auto_parallel.partitioner import Partitioner
-from paddle.distributed.auto_parallel.process_group import (
+from paddle.distributed.auto_parallel.static.completion import Completer
+from paddle.distributed.auto_parallel.static.dist_context import (
+    DistributedContext,
+)
+from paddle.distributed.auto_parallel.static.partitioner import Partitioner
+from paddle.distributed.auto_parallel.static.process_group import (
     clear_all_process_groups,
     get_all_process_groups,
     new_process_group,
 )
-from paddle.distributed.auto_parallel.reshard import Resharder
-from paddle.distributed.auto_parallel.utils import (
+from paddle.distributed.auto_parallel.static.reshard import Resharder
+from paddle.distributed.auto_parallel.static.utils import (
     debug_program,
     set_grad_var_shape,
 )
@@ -465,7 +467,7 @@ class OptimizationTuner:
             ]
         )
         cmd_args = (
-            "-m paddle.distributed.auto_parallel.tuner.profiler"
+            "-m paddle.distributed.auto_parallel.static.tuner.profiler"
             + " "
             + profile_args
         )
diff --git a/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py b/python/paddle/distributed/auto_parallel/static/tuner/parallel_tuner.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py
rename to python/paddle/distributed/auto_parallel/static/tuner/parallel_tuner.py
index 4a3f85d6b21..c2c1055663c 100644
--- a/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py
+++ b/python/paddle/distributed/auto_parallel/static/tuner/parallel_tuner.py
@@ -21,13 +21,13 @@ from collections import defaultdict
 
 import numpy as np
 
+from ...process_mesh import ProcessMesh
 from ..completion import Completer
 from ..cost import CostEstimator
 from ..dist_context import _node_id
 from ..dist_op import DistributedOperator
 from ..operators.common import find_compatible_distributed_operator_impls
 from ..parallelizer_v2 import Parallelizer
-from ..process_mesh import ProcessMesh
 from .trial import Trial, TrialStatus
 from .tunable_space import TunableSpace
 from .tunable_variable import Boolean, IntRange
diff --git a/python/paddle/distributed/auto_parallel/tuner/profiler.py b/python/paddle/distributed/auto_parallel/static/tuner/profiler.py
similarity index 98%
rename from python/paddle/distributed/auto_parallel/tuner/profiler.py
rename to python/paddle/distributed/auto_parallel/static/tuner/profiler.py
index 486db968ee3..55f83b48647 100644
--- a/python/paddle/distributed/auto_parallel/tuner/profiler.py
+++ b/python/paddle/distributed/auto_parallel/static/tuner/profiler.py
@@ -21,10 +21,10 @@ import time
 import traceback
 
 import paddle
-from paddle.distributed.auto_parallel.dist_loader import (
+from paddle.distributed.auto_parallel.static.dist_loader import (
     DistributedDataLoaderFromGenerator,
 )
-from paddle.distributed.auto_parallel.process_group import (
+from paddle.distributed.auto_parallel.static.process_group import (
     get_all_process_groups,
     new_process_group,
 )
diff --git a/python/paddle/distributed/auto_parallel/tuner/recorder.py b/python/paddle/distributed/auto_parallel/static/tuner/recorder.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/tuner/recorder.py
rename to python/paddle/distributed/auto_parallel/static/tuner/recorder.py
diff --git a/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py b/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py
rename to python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py
index 5ef0e872933..bef30c7ce3a 100644
--- a/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py
+++ b/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py
@@ -26,20 +26,24 @@ from functools import reduce
 
 import numpy as np
 
 import paddle
-from paddle.distributed.auto_parallel.cluster_v2 import DeviceMesh
-from paddle.distributed.auto_parallel.completion import Completer
-from paddle.distributed.auto_parallel.cost import CostEstimator
-from paddle.distributed.auto_parallel.dist_attribute import (
+from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
+from paddle.distributed.auto_parallel.static.cluster_v2 import DeviceMesh
+from paddle.distributed.auto_parallel.static.completion import Completer
+from paddle.distributed.auto_parallel.static.cost import CostEstimator
+from paddle.distributed.auto_parallel.static.dist_attribute import (
     OperatorDistAttr,
     TensorDistAttr,
 )
-from paddle.distributed.auto_parallel.dist_context import DistributedContext
-from paddle.distributed.auto_parallel.dist_tensor import DistributedTensor
-from paddle.distributed.auto_parallel.process_group import (
+from paddle.distributed.auto_parallel.static.dist_context import (
+    DistributedContext,
+)
+from paddle.distributed.auto_parallel.static.dist_tensor import (
+    DistributedTensor,
+)
+from paddle.distributed.auto_parallel.static.process_group import (
     get_world_process_group,
 )
-from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
-from paddle.distributed.auto_parallel.utils import (
+from paddle.distributed.auto_parallel.static.utils import (
     is_gradient_clip_op,
     print_program_with_dist_attr,
 )
@@ -48,7 +52,7 @@ from paddle.fluid import program_guard
 from paddle.fluid.backward import append_backward
 from paddle.fluid.framework import Parameter, unique_name
 
-from ...utils.log_utils import get_logger
+from ....utils.log_utils import get_logger
 from ..graph import Graph
 
 _PATTERNS = {}
diff --git a/python/paddle/distributed/auto_parallel/tuner/storable.py b/python/paddle/distributed/auto_parallel/static/tuner/storable.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/tuner/storable.py
rename to python/paddle/distributed/auto_parallel/static/tuner/storable.py
diff --git a/python/paddle/distributed/auto_parallel/tuner/trial.py b/python/paddle/distributed/auto_parallel/static/tuner/trial.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/tuner/trial.py
rename to python/paddle/distributed/auto_parallel/static/tuner/trial.py
diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py b/python/paddle/distributed/auto_parallel/static/tuner/tunable_space.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/tuner/tunable_space.py
rename to python/paddle/distributed/auto_parallel/static/tuner/tunable_space.py
diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py b/python/paddle/distributed/auto_parallel/static/tuner/tunable_variable.py
similarity index 100%
rename from python/paddle/distributed/auto_parallel/tuner/tunable_variable.py
rename to python/paddle/distributed/auto_parallel/static/tuner/tunable_variable.py
diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/static/utils.py
similarity index 99%
rename from python/paddle/distributed/auto_parallel/utils.py
rename to python/paddle/distributed/auto_parallel/static/utils.py
index d5a196a080d..fa9aeacd001 100644
--- a/python/paddle/distributed/auto_parallel/utils.py
+++ b/python/paddle/distributed/auto_parallel/static/utils.py
@@ -27,8 +27,8 @@ from paddle.framework import core
 from paddle.framework.io_utils import is_belong_to_optimizer, is_parameter
 from paddle.static import Variable
 
+from ..process_mesh import ProcessMesh
 from .dist_attribute import OperatorDistAttr, TensorDistAttr
-from .process_mesh import ProcessMesh
 
 OpRole = core.op_proto_and_checker_maker.OpRole
 OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName()
@@ -1868,7 +1868,7 @@ def get_lr(optimizer):
 def initialize_pg_in_full_mode(all_process_groups, cur_rank):
     import socket
 
-    from ..collective import _get_global_env
+    from ...collective import _get_global_env
 
     has_recv_by_socket = []
     # This is a magic number
@@ -1946,7 +1946,7 @@ def is_recompute_op(op):
 
 
 def set_recompute_segments(model, losses, strategy, program):
-    from ..passes.auto_parallel_recompute import RecomputeState
+    from ...passes.auto_parallel_recompute import RecomputeState
 
     if not losses:
         return
@@ -2054,7 +2054,7 @@ def validate_opt(optimizer):
 
 
 def set_data_parallel(x):
-    from .interface import ProcessMesh, shard_tensor
+    from ..interface import ProcessMesh, shard_tensor
     from .process_group import get_world_process_group
 
     world_ranks = get_world_process_group().ranks
@@ -2095,7 +2095,7 @@ def _copy_tensor_dist_attr_to_cpp(cpp_dist_attr, py_dist_attr):
 
 
 def _copy_tensor_dist_attr_from_cpp(cpp_dist_attr, py_dist_attr):
-    from .process_mesh import ProcessMesh
+    from ..process_mesh import ProcessMesh
 
     cpp_process_mesh = cpp_dist_attr.process_mesh
     if cpp_process_mesh is not None:
@@ -2128,7 +2128,7 @@ def _copy_op_dist_attr_to_cpp(cpp_dist_attr, py_dist_attr):
 
 
 def _copy_op_dist_attr_from_cpp(cpp_dist_attr, py_dist_attr):
-    from .process_mesh import ProcessMesh
+    from ..process_mesh import ProcessMesh
 
     cpp_process_mesh = cpp_dist_attr.process_mesh
     if cpp_process_mesh is not None:
diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py
index 39948ab28e6..de003916b7d 100755
--- a/python/paddle/distributed/fleet/fleet.py
+++ b/python/paddle/distributed/fleet/fleet.py
@@ -1335,7 +1335,7 @@ class Fleet:
             self._user_defined_strategy.semi_auto
             or self._user_defined_strategy.auto_search
         ):
-            from ..auto_parallel.parallelizer import AutoParallelizer
+            from ..auto_parallel.static.parallelizer import AutoParallelizer
 
             auto_parallelizer = AutoParallelizer(self)
             (
diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py
index def5156f811..a6f12af17fa 100644
--- a/python/paddle/distributed/passes/auto_parallel_amp.py
+++ b/python/paddle/distributed/passes/auto_parallel_amp.py
@@ -13,11 +13,13 @@
 # limitations under the License.
 
 import paddle
-from paddle.distributed.auto_parallel.dist_attribute import OperatorDistAttr
-from paddle.distributed.auto_parallel.process_group import (
+from paddle.distributed.auto_parallel.static.dist_attribute import (
+    OperatorDistAttr,
+)
+from paddle.distributed.auto_parallel.static.process_group import (
     get_world_process_group,
 )
-from paddle.distributed.auto_parallel.utils import (
+from paddle.distributed.auto_parallel.static.utils import (
     naive_set_dist_op_attr_for_program_by_mesh_and_mapping,
     set_var_dist_attr,
 )
@@ -42,7 +44,7 @@ from paddle.static.amp.fp16_utils import (
 from paddle.utils import unique_name
 
 from ..auto_parallel.process_mesh import ProcessMesh
-from ..auto_parallel.utils import (
+from ..auto_parallel.static.utils import (
     is_backward_op,
     is_forward_op,
     is_loss_grad_op,
diff --git a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
index 5d519bcc94e..a371792c519 100644
--- a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
+++ b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
@@ -15,16 +15,16 @@
 from collections import OrderedDict
 
 import paddle
-from paddle.distributed.auto_parallel.dist_attribute import (
+from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
+from paddle.distributed.auto_parallel.static.dist_attribute import (
     OperatorDistAttr,
     TensorDistAttr,
 )
-from paddle.distributed.auto_parallel.operators.common import (
+from paddle.distributed.auto_parallel.static.operators.common import (
     is_data_parallel_reduce_op,
     is_data_parallel_scale_op,
 )
-from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
-from paddle.distributed.auto_parallel.utils import (
+from paddle.distributed.auto_parallel.static.utils import (
     find_higher_order_backward_op,
     get_var_numel,
     insert_dependencies_for_vars,
diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py
index 6a763ce1503..8da9edb3425 100644
--- a/python/paddle/distributed/passes/auto_parallel_fp16.py
+++ b/python/paddle/distributed/passes/auto_parallel_fp16.py
@@ -16,11 +16,13 @@ from collections import defaultdict
 
 import paddle
 from paddle.common_ops_import import check_type, check_variable_and_dtype
-from paddle.distributed.auto_parallel.dist_attribute import OperatorDistAttr
-from paddle.distributed.auto_parallel.process_group import (
+from paddle.distributed.auto_parallel.static.dist_attribute import (
+    OperatorDistAttr,
+)
+from paddle.distributed.auto_parallel.static.process_group import (
     get_world_process_group,
 )
-from paddle.distributed.auto_parallel.utils import (
+from paddle.distributed.auto_parallel.static.utils import (
     is_backward_op,
     is_forward_op,
     naive_set_dist_op_attr_for_program_by_mesh_and_mapping,
diff --git a/python/paddle/distributed/passes/auto_parallel_grad_clip.py b/python/paddle/distributed/passes/auto_parallel_grad_clip.py
index 481ba3b6c31..bda2b557fc5 100644
--- a/python/paddle/distributed/passes/auto_parallel_grad_clip.py
+++ b/python/paddle/distributed/passes/auto_parallel_grad_clip.py
@@ -19,18 +19,21 @@ import numpy as np
 
 import paddle
 from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole
 
-from ..auto_parallel.dist_attribute import OperatorDistAttr, TensorDistAttr
-from ..auto_parallel.operators.common import (
+from ..auto_parallel.process_mesh import ProcessMesh
+from ..auto_parallel.static.dist_attribute import (
+    OperatorDistAttr,
+    TensorDistAttr,
+)
+from ..auto_parallel.static.operators.common import (
     SyncMode,
     is_data_parallel_reduce_op,
 )
-from ..auto_parallel.process_group import (
+from ..auto_parallel.static.process_group import (
     get_all_process_groups,
     get_world_process_group,
 )
-from ..auto_parallel.process_mesh import ProcessMesh
-from ..auto_parallel.reshard import Resharder
-from ..auto_parallel.utils import (
+from ..auto_parallel.static.reshard import Resharder
+from ..auto_parallel.static.utils import (
     _get_comm_group,
     insert_dependencies_for_vars,
     is_gradient_clip_op,
diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py
index 4bf460d1b42..8a87ac7f599 100644
--- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py
+++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py
@@ -15,11 +15,11 @@
 from typing import Any, Dict, List, Tuple
 
 import paddle
-from paddle.distributed.auto_parallel.process_group import (
+from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
+from paddle.distributed.auto_parallel.static.process_group import (
     get_world_process_group,
 )
-from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
-from paddle.distributed.auto_parallel.utils import (
+from paddle.distributed.auto_parallel.static.utils import (
     is_optimize_op,
     naive_set_dist_op_attr_for_program_by_mesh_and_mapping,
     set_var_dist_attr,
diff --git a/python/paddle/distributed/passes/auto_parallel_quantization.py b/python/paddle/distributed/passes/auto_parallel_quantization.py
index f2f35b33728..759e79680fc 100644
--- a/python/paddle/distributed/passes/auto_parallel_quantization.py
+++ b/python/paddle/distributed/passes/auto_parallel_quantization.py
@@ -26,8 +26,11 @@ from paddle.static.quantization import (
     quant_config,
 )
 
-from ..auto_parallel.converter import Converter
-from ..auto_parallel.dist_attribute import OperatorDistAttr, TensorDistAttr
+from ..auto_parallel.static.converter import Converter
+from ..auto_parallel.static.dist_attribute import (
+    OperatorDistAttr,
+    TensorDistAttr,
+)
 from .pass_base import PassBase, register_pass
 
 TRANSFORM_PASS_OP_TYPES = list(
diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py
index 5de90af8e2e..d64e8df305f 100644
---
a/python/paddle/distributed/passes/auto_parallel_recompute.py +++ b/python/paddle/distributed/passes/auto_parallel_recompute.py @@ -26,8 +26,8 @@ from paddle.fluid.backward import ( from paddle.framework import core from paddle.utils import unique_name -from ..auto_parallel.dist_attribute import OperatorDistAttr -from ..auto_parallel.utils import ( +from ..auto_parallel.static.dist_attribute import OperatorDistAttr +from ..auto_parallel.static.utils import ( get_loss_op, insert_dependencies_for_two_ops, is_backward_op, diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py index 44045155cb7..ac1d7fd8f07 100644 --- a/python/paddle/distributed/passes/auto_parallel_sharding.py +++ b/python/paddle/distributed/passes/auto_parallel_sharding.py @@ -16,13 +16,15 @@ import logging from functools import reduce import paddle -from paddle.distributed.auto_parallel.operators.common import ( +from paddle.distributed.auto_parallel.static.operators.common import ( ParallelMode, is_data_parallel_reduce_op, is_parameter_related, ) -from paddle.distributed.auto_parallel.process_group import new_process_group -from paddle.distributed.auto_parallel.utils import ( +from paddle.distributed.auto_parallel.static.process_group import ( + new_process_group, +) +from paddle.distributed.auto_parallel.static.utils import ( _get_comm_group, get_logger, get_var_numel, diff --git a/python/paddle/distributed/passes/auto_parallel_supplement_explicit_dependencies.py b/python/paddle/distributed/passes/auto_parallel_supplement_explicit_dependencies.py index c164b6e8ddb..7bd4024fa70 100644 --- a/python/paddle/distributed/passes/auto_parallel_supplement_explicit_dependencies.py +++ b/python/paddle/distributed/passes/auto_parallel_supplement_explicit_dependencies.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from paddle.distributed.auto_parallel.operators.common import ( +from paddle.distributed.auto_parallel.static.operators.common import ( is_amp_flag_sync_op, is_data_parallel_reduce_op, is_global_norm_sync_op, ) -from paddle.distributed.auto_parallel.utils import ( +from paddle.distributed.auto_parallel.static.utils import ( OpRole, insert_dependencies_for_vars, ) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 1635c7d5d21..a0864992c4e 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1439,7 +1439,7 @@ def _append_backward_ops_( ) else: default_ctx = getattr( - paddle.distributed.auto_parallel.dist_context, + paddle.distributed.auto_parallel.static.dist_context, '_g_default_distributed_context', None, ) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 904a30f64fa..38b62736e58 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1681,7 +1681,7 @@ class Variable(metaclass=VariableMetaClass): if self.persistable: var_str = "persist " + var_str - from paddle.distributed.auto_parallel.dist_context import ( + from paddle.distributed.auto_parallel.static.dist_context import ( get_default_distributed_context, ) @@ -3137,7 +3137,7 @@ class Operator: if i != len(attr_names) - 1: attrs_str += ", " - from paddle.distributed.auto_parallel.dist_context import ( + from paddle.distributed.auto_parallel.static.dist_context import ( get_default_distributed_context, ) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py index 554c578f850..2a947adc030 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py @@ -22,10 +22,10 @@ import paddle import paddle.nn.functional as F from paddle import nn, static, utils from paddle.distributed import fleet -from paddle.distributed.auto_parallel.dist_context import ( +from paddle.distributed.auto_parallel.static.dist_context import ( set_default_distributed_context, ) -from paddle.distributed.auto_parallel.utils import ( +from paddle.distributed.auto_parallel.static.utils import ( get_dist_attr, load_checkpoint_into_program, load_distributed_checkpoint, diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py index 1ef9634f8db..3f862705fed 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py @@ -23,7 +23,7 @@ import paddle import paddle.nn.functional as F from paddle import nn, static, utils from paddle.distributed import fleet -from paddle.distributed.auto_parallel.utils import ( +from paddle.distributed.auto_parallel.static.utils import ( load_checkpoint_into_program, save_distributed_checkpoint, ) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py index a2a9c911327..16ede226d20 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py @@ -25,7 +25,7 @@ import numpy as np import paddle from paddle import distributed as dist from paddle.distributed import fleet -from paddle.distributed.auto_parallel import engine 
+from paddle.distributed.auto_parallel.static import engine from paddle.distributed.fleet.layers.mpu.mp_layers import ( ColumnParallelLinear, RowParallelLinear, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py index d1104c2ce59..84606eb1216 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py @@ -17,7 +17,7 @@ import os import tempfile import unittest -from paddle.distributed.auto_parallel.cluster import ( +from paddle.distributed.auto_parallel.static.cluster import ( Cluster, DeviceType, LinkType, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py index 810f99e0dd7..103651728f8 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py @@ -18,8 +18,10 @@ import unittest.mock import paddle import paddle.nn.functional as F from paddle import nn, static, tensor, utils -from paddle.distributed.auto_parallel.completion import Completer -from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.distributed.auto_parallel.static.completion import Completer +from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, +) from paddle.distributed.fleet import auto paddle.enable_static() @@ -188,7 +190,7 @@ class TestMLPAutoCompletion(unittest.TestCase): # # dist_context) # dist_context.finalize_distributed_attr_for_program( # complete_train_program) - # from paddle.distributed.auto_parallel.interface import _g_process_mesh_map + # from paddle.distributed.auto_parallel.static.interface import _g_process_mesh_map # for block in complete_train_program.blocks: # for tensor in block.vars.values(): # desc = tensor.desc diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py index d136aa6adb5..cc09ac989e1 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py @@ -18,8 +18,10 @@ import unittest import paddle import paddle.nn.functional as F from paddle import nn, static, tensor, utils -from paddle.distributed.auto_parallel.completion import Completer -from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.distributed.auto_parallel.static.completion import Completer +from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, +) from paddle.distributed.fleet import auto from paddle.fluid import layers from paddle.nn.layer.transformer import _convert_param_attr_to_list diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py index 5746df433fe..7cf8b2d399f 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py @@ -18,12 +18,16 @@ import paddle import paddle.nn.functional as F from paddle import nn, static, utils from paddle.distributed import fleet -from paddle.distributed.auto_parallel.completion import Completer -from paddle.distributed.auto_parallel.cost_model import estimate_cost -from 
paddle.distributed.auto_parallel.dist_context import DistributedContext -from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer -from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.reshard import Resharder +from paddle.distributed.auto_parallel.static.completion import Completer +from paddle.distributed.auto_parallel.static.cost_model import estimate_cost +from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, +) +from paddle.distributed.auto_parallel.static.parallelizer import ( + AutoParallelizer, +) +from paddle.distributed.auto_parallel.static.partitioner import Partitioner +from paddle.distributed.auto_parallel.static.reshard import Resharder from paddle.distributed.fleet import auto from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py index 95b7f95c98c..420e8b7f526 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py @@ -20,12 +20,20 @@ from test_auto_parallel_reshard import mlp_forward import paddle from paddle.distributed import fleet -from paddle.distributed.auto_parallel.completion import Completer -from paddle.distributed.auto_parallel.dist_attribute import TensorDistAttr -from paddle.distributed.auto_parallel.dist_context import DistributedContext -from paddle.distributed.auto_parallel.dist_tensor import DistributedTensor -from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer -from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.static.completion import Completer +from paddle.distributed.auto_parallel.static.dist_attribute import ( + TensorDistAttr, +) +from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, +) +from paddle.distributed.auto_parallel.static.dist_tensor import ( + DistributedTensor, +) +from paddle.distributed.auto_parallel.static.parallelizer import ( + AutoParallelizer, +) +from paddle.distributed.auto_parallel.static.partitioner import Partitioner from paddle.distributed.fleet import auto diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py index b8628f671c0..a9b1fa973f7 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py @@ -14,7 +14,7 @@ import unittest -from paddle.distributed.auto_parallel.graph import Graph +from paddle.distributed.auto_parallel.static.graph import Graph class TestAutoParallelGraph(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 11f20b68939..cae7c24a161 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -23,17 +23,21 @@ import paddle import paddle.nn.functional as F from paddle import fluid, nn, static, utils from paddle.distributed import fleet -from paddle.distributed.auto_parallel.cluster import Cluster -from paddle.distributed.auto_parallel.completion import Completer -from paddle.distributed.auto_parallel.dist_context import DistributedContext -from 
paddle.distributed.auto_parallel.mapper import ( +from paddle.distributed.auto_parallel.static.cluster import Cluster +from paddle.distributed.auto_parallel.static.completion import Completer +from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, +) +from paddle.distributed.auto_parallel.static.mapper import ( get_comm_volume, get_dtype_bytes, mapping, ) -from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer -from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.reshard import Resharder +from paddle.distributed.auto_parallel.static.parallelizer import ( + AutoParallelizer, +) +from paddle.distributed.auto_parallel.static.partitioner import Partitioner +from paddle.distributed.auto_parallel.static.reshard import Resharder from paddle.distributed.fleet import auto from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py index 33db190dfc6..71b6a7b7a2d 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py @@ -19,11 +19,15 @@ import paddle import paddle.nn.functional as F from paddle import nn, static, tensor, utils from paddle.distributed import fleet -from paddle.distributed.auto_parallel.completion import Completer -from paddle.distributed.auto_parallel.dist_context import DistributedContext -from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.process_group import new_process_group -from paddle.distributed.auto_parallel.utils import _get_comm_group +from paddle.distributed.auto_parallel.static.completion import Completer +from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, +) +from paddle.distributed.auto_parallel.static.partitioner import Partitioner +from paddle.distributed.auto_parallel.static.process_group import ( + new_process_group, +) +from paddle.distributed.auto_parallel.static.utils import _get_comm_group from paddle.distributed.fleet import auto paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py index 3e058bfb18e..038f1b4854b 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -18,11 +18,15 @@ import unittest import paddle import paddle.nn.functional as F from paddle import nn, static, tensor, utils -from paddle.distributed.auto_parallel.completion import Completer -from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer -from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.process_group import new_process_group -from paddle.distributed.auto_parallel.utils import _get_comm_group +from paddle.distributed.auto_parallel.static.completion import Completer +from paddle.distributed.auto_parallel.static.parallelizer import ( + AutoParallelizer, +) +from paddle.distributed.auto_parallel.static.partitioner import Partitioner +from paddle.distributed.auto_parallel.static.process_group import ( + new_process_group, +) +from paddle.distributed.auto_parallel.static.utils import _get_comm_group from paddle.distributed.fleet import auto from 
paddle.fluid import layers from paddle.nn.layer.transformer import _convert_param_attr_to_list diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index 4698667b985..4af3fc831ab 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -18,15 +18,19 @@ import paddle import paddle.nn.functional as F from paddle import nn, static, utils from paddle.distributed import fleet -from paddle.distributed.auto_parallel.completion import Completer -from paddle.distributed.auto_parallel.dist_context import DistributedContext -from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer -from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.process_group import ( +from paddle.distributed.auto_parallel.static.completion import Completer +from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, +) +from paddle.distributed.auto_parallel.static.parallelizer import ( + AutoParallelizer, +) +from paddle.distributed.auto_parallel.static.partitioner import Partitioner +from paddle.distributed.auto_parallel.static.process_group import ( ProcessGroup, _g_process_group_map, ) -from paddle.distributed.auto_parallel.reshard import Resharder +from paddle.distributed.auto_parallel.static.reshard import Resharder from paddle.distributed.fleet import auto paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py index e59cfa1a1f1..b8afece8001 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -18,11 +18,15 @@ import paddle import paddle.nn.functional as F from paddle import nn, static, utils from paddle.distributed import fleet -from paddle.distributed.auto_parallel.completion import Completer -from paddle.distributed.auto_parallel.dist_context import DistributedContext -from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer -from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.reshard import Resharder +from paddle.distributed.auto_parallel.static.completion import Completer +from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, +) +from paddle.distributed.auto_parallel.static.parallelizer import ( + AutoParallelizer, +) +from paddle.distributed.auto_parallel.static.partitioner import Partitioner +from paddle.distributed.auto_parallel.static.reshard import Resharder from paddle.distributed.fleet import auto paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py index 33acd017292..ebc7b95290e 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -18,13 +18,17 @@ import paddle import paddle.nn.functional as F from paddle import nn, static, utils from paddle.distributed import fleet -from paddle.distributed.auto_parallel.cluster import Cluster -from paddle.distributed.auto_parallel.completion import Completer -from paddle.distributed.auto_parallel.cost import 
CostEstimator
-from paddle.distributed.auto_parallel.dist_context import DistributedContext
-from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer
-from paddle.distributed.auto_parallel.partitioner import Partitioner
-from paddle.distributed.auto_parallel.reshard import Resharder
+from paddle.distributed.auto_parallel.static.cluster import Cluster
+from paddle.distributed.auto_parallel.static.completion import Completer
+from paddle.distributed.auto_parallel.static.cost import CostEstimator
+from paddle.distributed.auto_parallel.static.dist_context import (
+    DistributedContext,
+)
+from paddle.distributed.auto_parallel.static.parallelizer import (
+    AutoParallelizer,
+)
+from paddle.distributed.auto_parallel.static.partitioner import Partitioner
+from paddle.distributed.auto_parallel.static.reshard import Resharder
 from paddle.distributed.fleet import auto

 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py
index 11c817b9bae..2ff75315725 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py
@@ -22,7 +22,7 @@ import paddle
 import paddle.nn.functional as F
 from paddle import nn, static, utils
 from paddle.distributed import fleet
-from paddle.distributed.auto_parallel.dist_context import (
+from paddle.distributed.auto_parallel.static.dist_context import (
     get_default_distributed_context,
 )
 from paddle.distributed.fleet import auto
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py
index 277072a24e2..d5bfd588942 100755
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py
@@ -17,13 +17,15 @@ import unittest
 import paddle
 import paddle.nn.functional as F
 from paddle import nn, static, utils
-from paddle.distributed.auto_parallel.dist_attribute import (
+from paddle.distributed.auto_parallel.static.dist_attribute import (
     OperatorDistAttr,
     TensorDistAttr,
 )
-from paddle.distributed.auto_parallel.dist_context import DistributedContext
-from paddle.distributed.auto_parallel.planner import 
PlanSpace -from paddle.distributed.auto_parallel.utils import ( +from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, +) +from paddle.distributed.auto_parallel.static.planner import PlanSpace +from paddle.distributed.auto_parallel.static.utils import ( update_op_dims_mapping_by_default_dist_impl, update_op_dims_mapping_by_elementwise_like_dist_impl, ) @@ -177,8 +179,10 @@ class TestMLPSearcher(unittest.TestCase): set_default_dist_attr(train_program, dist_context, global_process_mesh) ops = train_program.global_block().ops vars = train_program.global_block().vars - from paddle.distributed.auto_parallel.dist_op import DistributedOperator - from paddle.distributed.auto_parallel.operators.common import ( + from paddle.distributed.auto_parallel.static.dist_op import ( + DistributedOperator, + ) + from paddle.distributed.auto_parallel.static.operators.common import ( get_distributed_operator_impl_container, is_elementwise_op, ) diff --git a/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py b/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py index c9d7d6346ca..a1c1f86bb1f 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py +++ b/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py @@ -16,9 +16,11 @@ import unittest import paddle import paddle.nn.functional as F from paddle import nn, static, utils -from paddle.distributed.auto_parallel.dist_attribute import OperatorDistAttr -from paddle.distributed.auto_parallel.dist_op import DistributedOperator -from paddle.distributed.auto_parallel.operators.common import ( +from paddle.distributed.auto_parallel.static.dist_attribute import ( + OperatorDistAttr, +) +from paddle.distributed.auto_parallel.static.dist_op import DistributedOperator +from paddle.distributed.auto_parallel.static.operators.common import ( get_distributed_operator_impl_container, ) from paddle.framework import core diff --git a/python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py b/python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py index 19da767fcf9..369fdec36e5 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py +++ b/python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py @@ -16,9 +16,11 @@ import unittest import paddle import paddle.nn.functional as F from paddle import nn, static, utils -from paddle.distributed.auto_parallel.dist_attribute import OperatorDistAttr -from paddle.distributed.auto_parallel.dist_op import DistributedOperator -from paddle.distributed.auto_parallel.operators.common import ( +from paddle.distributed.auto_parallel.static.dist_attribute import ( + OperatorDistAttr, +) +from paddle.distributed.auto_parallel.static.dist_op import DistributedOperator +from paddle.distributed.auto_parallel.static.operators.common import ( get_distributed_operator_impl_container, ) from paddle.fluid import core diff --git a/python/setup.py.in b/python/setup.py.in index 9a6517a7d55..3e6fdb00679 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -426,9 +426,11 @@ packages=['paddle', 'paddle.distributed.fleet.meta_parallel.sharding', 'paddle.distributed.fleet.meta_parallel.parallel_layers', 'paddle.distributed.auto_parallel', - 'paddle.distributed.auto_parallel.operators', - 'paddle.distributed.auto_parallel.tuner', - 'paddle.distributed.auto_parallel.cost', + 'paddle.distributed.auto_parallel.dygraph', + 'paddle.distributed.auto_parallel.static', + 
'paddle.distributed.auto_parallel.static.operators',
+          'paddle.distributed.auto_parallel.static.tuner',
+          'paddle.distributed.auto_parallel.static.cost',
           'paddle.distributed.passes',
           'paddle.distributed.models',
           'paddle.distributed.models.moe',
diff --git a/setup.py b/setup.py
index f8858321ae6..ae8cf524baf 100644
--- a/setup.py
+++ b/setup.py
@@ -1430,9 +1430,11 @@ def get_setup_parameters():
         'paddle.distributed.fleet.meta_parallel.sharding',
         'paddle.distributed.fleet.meta_parallel.parallel_layers',
         'paddle.distributed.auto_parallel',
-        'paddle.distributed.auto_parallel.operators',
-        'paddle.distributed.auto_parallel.tuner',
-        'paddle.distributed.auto_parallel.cost',
+        'paddle.distributed.auto_parallel.dygraph',
+        'paddle.distributed.auto_parallel.static',
+        'paddle.distributed.auto_parallel.static.operators',
+        'paddle.distributed.auto_parallel.static.tuner',
+        'paddle.distributed.auto_parallel.static.cost',
         'paddle.distributed.passes',
         'paddle.distributed.models',
         'paddle.distributed.models.moe',
diff --git a/test/auto_parallel/amp_o2_pass.py b/test/auto_parallel/amp_o2_pass.py
index 767b95c8083..04af0112e31 100644
--- a/test/auto_parallel/amp_o2_pass.py
+++ b/test/auto_parallel/amp_o2_pass.py
@@ -120,7 +120,10 @@ class TestShardingStage2WithNewEXE(unittest.TestCase):

         # bf16
         mp_bf16_engine = self.get_engine(use_amp=True)
-        if not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000:
+        if not (
+            paddle.amp.is_bfloat16_supported()
+            and paddle.device.cuda.get_device_capability()[0] >= 8
+        ):
             return

         mp_bf16_history = mp_bf16_engine.fit(
diff --git a/test/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py b/test/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py
index cd11f2fabf7..6f61cafbcd8 100644
--- a/test/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py
+++ b/test/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py
@@ -20,7 +20,7 @@ import paddle
 from paddle import static
 from paddle.distributed import fleet

-sys.path.append("..")
+sys.path.append("../legacy_test")
 import auto_parallel_gpt_model as modeling
 from auto_parallel_gpt_model import (
     GPTForPretraining,
@@ -151,7 +151,7 @@ def train():
                     },
                     fetch_list=[loss],
                 )
-                print(f"step: {step}, loss: {loss_print:f}")
+                print(f"step: {step}, loss: {loss_print[0]:f}")
             else:
                 exe.run(
                     distributed_main_program,
diff --git a/test/auto_parallel/auto_parallel_relaunch_with_planner.py b/test/auto_parallel/auto_parallel_relaunch_with_planner.py
index 00b769d8c7d..4ad1dfb1965 100644
--- a/test/auto_parallel/auto_parallel_relaunch_with_planner.py
+++ b/test/auto_parallel/auto_parallel_relaunch_with_planner.py
@@ -15,9 +15,9 @@
 import paddle
 from paddle import static
 from paddle.distributed import fleet
-from paddle.distributed.auto_parallel.cluster import Cluster
-from paddle.distributed.auto_parallel.cost import CostEstimator
-from paddle.distributed.auto_parallel.dist_context import (
+from paddle.distributed.auto_parallel.static.cluster import Cluster
+from paddle.distributed.auto_parallel.static.cost import CostEstimator
+from paddle.distributed.auto_parallel.static.dist_context import (
     get_default_distributed_context,
 )

diff --git a/test/auto_parallel/converter.py b/test/auto_parallel/converter.py
index 5e0506c3785..411900eaa42 100644
--- a/test/auto_parallel/converter.py
+++ b/test/auto_parallel/converter.py
@@ -15,7 +15,7 @@
 import numpy as np

 import paddle
-from paddle.distributed.auto_parallel.converter import Converter
+from paddle.distributed.auto_parallel.static.converter import Converter


 def 
test_convert(): diff --git a/test/auto_parallel/test_align_tool.py b/test/auto_parallel/test_align_tool.py index c0c331b0d7f..500b11c7891 100644 --- a/test/auto_parallel/test_align_tool.py +++ b/test/auto_parallel/test_align_tool.py @@ -20,7 +20,9 @@ import numpy as np import paddle from paddle import fluid, nn, optimizer, static -from paddle.distributed.auto_parallel.auto_align_tool import AutoAlignTool +from paddle.distributed.auto_parallel.static.auto_align_tool import ( + AutoAlignTool, +) from paddle.vision.datasets import MNIST warnings.filterwarnings("ignore") diff --git a/test/auto_parallel/test_base_cost.py b/test/auto_parallel/test_base_cost.py index 01a488e2db3..c9e3e64c6a8 100644 --- a/test/auto_parallel/test_base_cost.py +++ b/test/auto_parallel/test_base_cost.py @@ -23,21 +23,25 @@ import paddle import paddle.nn.functional as F from paddle import nn, static, utils from paddle.distributed import fleet -from paddle.distributed.auto_parallel.cluster import Cluster -from paddle.distributed.auto_parallel.completion import Completer -from paddle.distributed.auto_parallel.cost import ( +from paddle.distributed.auto_parallel.static.cluster import Cluster +from paddle.distributed.auto_parallel.static.completion import Completer +from paddle.distributed.auto_parallel.static.cost import ( AllreduceSumOpCost, _g_op_cost_factory, ) -from paddle.distributed.auto_parallel.cost.base_cost import ( +from paddle.distributed.auto_parallel.static.cost.base_cost import ( build_comm_costs_from_descs, build_comm_desc_from_dist_op, build_comp_costs_from_descs, build_comp_desc_from_dist_op, build_dp_costs, ) -from paddle.distributed.auto_parallel.dist_context import DistributedContext -from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer +from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, +) +from paddle.distributed.auto_parallel.static.parallelizer import ( + AutoParallelizer, +) from paddle.distributed.fleet import auto paddle.enable_static() diff --git a/test/auto_parallel/test_cluster.py b/test/auto_parallel/test_cluster.py index c25b6013fa1..679b3f8a3cd 100644 --- a/test/auto_parallel/test_cluster.py +++ b/test/auto_parallel/test_cluster.py @@ -17,7 +17,7 @@ import os import tempfile import unittest -from paddle.distributed.auto_parallel.cluster import ( +from paddle.distributed.auto_parallel.static.cluster import ( Cluster, get_default_cluster, ) diff --git a/test/auto_parallel/test_cluster_partition.py b/test/auto_parallel/test_cluster_partition.py index 9071b481eb5..25087ff1627 100644 --- a/test/auto_parallel/test_cluster_partition.py +++ b/test/auto_parallel/test_cluster_partition.py @@ -18,7 +18,7 @@ import unittest class TestClusterPartition(unittest.TestCase): def test_cluster_partition(self): clusters = [(5, 8), (1, 8), (4, 8), (16, 8), (2, 8), (3, 8)] - from paddle.distributed.auto_parallel.tuner.rule_based_tuner import ( + from paddle.distributed.auto_parallel.static.tuner.rule_based_tuner import ( ClusterPartitionUtil, ) diff --git a/test/auto_parallel/test_cluster_v2.py b/test/auto_parallel/test_cluster_v2.py index 3f10fb95b84..671db9708e6 100644 --- a/test/auto_parallel/test_cluster_v2.py +++ b/test/auto_parallel/test_cluster_v2.py @@ -14,7 +14,7 @@ import unittest -from paddle.distributed.auto_parallel.cluster_v2 import DeviceMesh +from paddle.distributed.auto_parallel.static.cluster_v2 import DeviceMesh from paddle.framework import core diff --git a/test/auto_parallel/test_comm_cost.py 
b/test/auto_parallel/test_comm_cost.py index 0f664947f27..734cbf8ff6a 100644 --- a/test/auto_parallel/test_comm_cost.py +++ b/test/auto_parallel/test_comm_cost.py @@ -20,8 +20,8 @@ import unittest from test_cluster import cluster_json, multi_cluster_json import paddle -from paddle.distributed.auto_parallel.cluster import Cluster -from paddle.distributed.auto_parallel.cost import ( +from paddle.distributed.auto_parallel.static.cluster import Cluster +from paddle.distributed.auto_parallel.static.cost import ( AllgatherOpCost, AllreduceSumOpCost, BroadcastOpCost, diff --git a/test/auto_parallel/test_comp_cost.py b/test/auto_parallel/test_comp_cost.py index c4e4502e502..7afb077b7e1 100644 --- a/test/auto_parallel/test_comp_cost.py +++ b/test/auto_parallel/test_comp_cost.py @@ -18,8 +18,8 @@ import unittest from test_cluster import cluster_json -from paddle.distributed.auto_parallel.cluster import Cluster -from paddle.distributed.auto_parallel.cost.comp_op_cost import ( +from paddle.distributed.auto_parallel.static.cluster import Cluster +from paddle.distributed.auto_parallel.static.cost.comp_op_cost import ( AssignOpCost, AssignValueOpCost, BeamSearchDecodeOpCost, diff --git a/test/auto_parallel/test_convert_to_process_meshes.py b/test/auto_parallel/test_convert_to_process_meshes.py index 120a7ba438a..472719aef56 100644 --- a/test/auto_parallel/test_convert_to_process_meshes.py +++ b/test/auto_parallel/test_convert_to_process_meshes.py @@ -18,7 +18,7 @@ import unittest class TestConvertToProcessMeshes(unittest.TestCase): def test_convert_to_process_meshes(self): device_meshes = [[1, 8], [4, 8], [15, 8]] - from paddle.distributed.auto_parallel.tuner.rule_based_tuner import ( + from paddle.distributed.auto_parallel.static.tuner.rule_based_tuner import ( convert_to_process_meshes, ) diff --git a/test/auto_parallel/test_converter.py b/test/auto_parallel/test_converter.py index edd888acf69..f6b95011fc9 100644 --- a/test/auto_parallel/test_converter.py +++ b/test/auto_parallel/test_converter.py @@ -18,7 +18,7 @@ import sys import tempfile import unittest -from paddle.distributed.auto_parallel.converter import Converter +from paddle.distributed.auto_parallel.static.converter import Converter class TestConverter(unittest.TestCase): diff --git a/test/auto_parallel/test_dist_assign.py b/test/auto_parallel/test_dist_assign.py index 87064a45a49..b7cdb0d6b7f 100644 --- a/test/auto_parallel/test_dist_assign.py +++ b/test/auto_parallel/test_dist_assign.py @@ -38,9 +38,11 @@ def make_program(): def parallelizer(program_func, rank): - from paddle.distributed.auto_parallel.completion import Completer - from paddle.distributed.auto_parallel.dist_context import DistributedContext - from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.static.completion import Completer + from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, + ) + from paddle.distributed.auto_parallel.static.partitioner import Partitioner main_program, start_program = program_func() diff --git a/test/auto_parallel/test_dist_attr_v2.py b/test/auto_parallel/test_dist_attr_v2.py index 1d15c34221f..37f13f5af9d 100644 --- a/test/auto_parallel/test_dist_attr_v2.py +++ b/test/auto_parallel/test_dist_attr_v2.py @@ -21,12 +21,12 @@ import paddle import paddle.nn.functional as F from paddle import nn, static from paddle.distributed import fleet -from paddle.distributed.auto_parallel.dist_context import ( +from paddle.distributed.auto_parallel.process_mesh import 
ProcessMesh +from paddle.distributed.auto_parallel.static.dist_context import ( DistributedContext, set_default_distributed_context, ) -from paddle.distributed.auto_parallel.process_mesh import ProcessMesh -from paddle.distributed.auto_parallel.utils import ( +from paddle.distributed.auto_parallel.static.utils import ( _copy_dist_attr_from_cpp, _copy_dist_attr_from_cpp_for_graph, _copy_dist_attr_to_cpp, diff --git a/test/auto_parallel/test_dist_context.py b/test/auto_parallel/test_dist_context.py index 2944b2db2a3..695949fd698 100644 --- a/test/auto_parallel/test_dist_context.py +++ b/test/auto_parallel/test_dist_context.py @@ -21,7 +21,9 @@ import paddle import paddle.nn.functional as F from paddle import nn, static from paddle.distributed import fleet -from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, +) from paddle.distributed.fleet import auto paddle.enable_static() diff --git a/test/auto_parallel/test_dist_matmul.py b/test/auto_parallel/test_dist_matmul.py index 0a07b98de70..77c15942709 100644 --- a/test/auto_parallel/test_dist_matmul.py +++ b/test/auto_parallel/test_dist_matmul.py @@ -103,9 +103,11 @@ def matmulv2_dp2mp2(init_x, init_y, trans_x, trans_y): def parallelizer(program_func, *args, **kwargs): - from paddle.distributed.auto_parallel.completion import Completer - from paddle.distributed.auto_parallel.dist_context import DistributedContext - from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.static.completion import Completer + from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, + ) + from paddle.distributed.auto_parallel.static.partitioner import Partitioner main_program, start_program, loss = program_func(*args, **kwargs) diff --git a/test/auto_parallel/test_dist_op_cost.py b/test/auto_parallel/test_dist_op_cost.py index ecff2bbf893..4d7cca7e5b3 100644 --- a/test/auto_parallel/test_dist_op_cost.py +++ b/test/auto_parallel/test_dist_op_cost.py @@ -16,8 +16,8 @@ import copy import unittest import paddle -from paddle.distributed.auto_parallel.cluster import Cluster -from paddle.distributed.auto_parallel.operators.common import ( +from paddle.distributed.auto_parallel.static.cluster import Cluster +from paddle.distributed.auto_parallel.static.operators.common import ( get_distributed_operator_impl_container, is_elementwise_op, ) @@ -29,8 +29,10 @@ paddle.enable_static() def parallelizer(program_func, rank): - from paddle.distributed.auto_parallel.completion import Completer - from paddle.distributed.auto_parallel.dist_context import DistributedContext + from paddle.distributed.auto_parallel.static.completion import Completer + from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, + ) main_program, startup_program, loss = program_func() diff --git a/test/auto_parallel/test_dist_pnorm.py b/test/auto_parallel/test_dist_pnorm.py index 5ff30d27b6d..62311420815 100644 --- a/test/auto_parallel/test_dist_pnorm.py +++ b/test/auto_parallel/test_dist_pnorm.py @@ -75,9 +75,11 @@ def make_program_serial(): def parallelizer(program_func, rank): - from paddle.distributed.auto_parallel.completion import Completer - from paddle.distributed.auto_parallel.dist_context import DistributedContext - from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.static.completion import Completer + from 
paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, + ) + from paddle.distributed.auto_parallel.static.partitioner import Partitioner main_program, start_program, loss = program_func() diff --git a/test/auto_parallel/test_dist_reshape.py b/test/auto_parallel/test_dist_reshape.py index 8dd84da9175..743cda599e4 100644 --- a/test/auto_parallel/test_dist_reshape.py +++ b/test/auto_parallel/test_dist_reshape.py @@ -37,9 +37,11 @@ def make_program_dp2(): def parallelizer(program_func, rank): - from paddle.distributed.auto_parallel.completion import Completer - from paddle.distributed.auto_parallel.dist_context import DistributedContext - from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.static.completion import Completer + from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, + ) + from paddle.distributed.auto_parallel.static.partitioner import Partitioner main_program, start_program = program_func() diff --git a/test/auto_parallel/test_dist_scale.py b/test/auto_parallel/test_dist_scale.py index b68131e361e..270f6951ece 100644 --- a/test/auto_parallel/test_dist_scale.py +++ b/test/auto_parallel/test_dist_scale.py @@ -34,9 +34,11 @@ def make_program(): def parallelizer(program_func, rank): - from paddle.distributed.auto_parallel.completion import Completer - from paddle.distributed.auto_parallel.dist_context import DistributedContext - from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.static.completion import Completer + from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, + ) + from paddle.distributed.auto_parallel.static.partitioner import Partitioner main_program, start_program = program_func() diff --git a/test/auto_parallel/test_dist_shape.py b/test/auto_parallel/test_dist_shape.py index 0322a817934..6bc33e82dac 100644 --- a/test/auto_parallel/test_dist_shape.py +++ b/test/auto_parallel/test_dist_shape.py @@ -34,9 +34,11 @@ def make_program(): def parallelizer(program_func, rank): - from paddle.distributed.auto_parallel.completion import Completer - from paddle.distributed.auto_parallel.dist_context import DistributedContext - from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.static.completion import Completer + from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, + ) + from paddle.distributed.auto_parallel.static.partitioner import Partitioner main_program, start_program = program_func() diff --git a/test/auto_parallel/test_dist_slice.py b/test/auto_parallel/test_dist_slice.py index cdca9904d62..e94dcf32f7b 100644 --- a/test/auto_parallel/test_dist_slice.py +++ b/test/auto_parallel/test_dist_slice.py @@ -56,9 +56,11 @@ def make_program_serial(): def parallelizer(program_func, rank): - from paddle.distributed.auto_parallel.completion import Completer - from paddle.distributed.auto_parallel.dist_context import DistributedContext - from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.static.completion import Completer + from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, + ) + from paddle.distributed.auto_parallel.static.partitioner import Partitioner main_program, start_program = program_func() diff --git a/test/auto_parallel/test_dist_split.py b/test/auto_parallel/test_dist_split.py index 
edc711ea4c8..b44d180685e 100644 --- a/test/auto_parallel/test_dist_split.py +++ b/test/auto_parallel/test_dist_split.py @@ -34,9 +34,11 @@ def make_program_dp2(): def parallelizer(program_func, rank): - from paddle.distributed.auto_parallel.completion import Completer - from paddle.distributed.auto_parallel.dist_context import DistributedContext - from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.static.completion import Completer + from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, + ) + from paddle.distributed.auto_parallel.static.partitioner import Partitioner main_program, start_program = program_func() diff --git a/test/auto_parallel/test_engine_callbacks.py b/test/auto_parallel/test_engine_callbacks.py index d62cff86245..f00d62cc035 100644 --- a/test/auto_parallel/test_engine_callbacks.py +++ b/test/auto_parallel/test_engine_callbacks.py @@ -20,7 +20,7 @@ import unittest import paddle import paddle.vision.transforms as T -from paddle.distributed.auto_parallel.callbacks import config_callbacks +from paddle.distributed.auto_parallel.static.callbacks import config_callbacks from paddle.distributed.fleet import auto from paddle.static import InputSpec from paddle.vision.datasets import MNIST diff --git a/test/auto_parallel/test_fp16_assign.py b/test/auto_parallel/test_fp16_assign.py index eb34226ac89..b1a13d81148 100644 --- a/test/auto_parallel/test_fp16_assign.py +++ b/test/auto_parallel/test_fp16_assign.py @@ -64,9 +64,11 @@ def make_program(): def parallelizer(program_func, rank): - from paddle.distributed.auto_parallel.completion import Completer - from paddle.distributed.auto_parallel.dist_context import DistributedContext - from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.static.completion import Completer + from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, + ) + from paddle.distributed.auto_parallel.static.partitioner import Partitioner main_program, start_program = program_func() diff --git a/test/auto_parallel/test_group_operators.py b/test/auto_parallel/test_group_operators.py index 6dea719a111..aec75934e5e 100644 --- a/test/auto_parallel/test_group_operators.py +++ b/test/auto_parallel/test_group_operators.py @@ -112,10 +112,10 @@ class TestGroupOperators(unittest.TestCase): sequence_len, vocab_size, ) - from paddle.distributed.auto_parallel.dist_context import ( + from paddle.distributed.auto_parallel.static.dist_context import ( DistributedContext, ) - from paddle.distributed.auto_parallel.tuner.rule_based_tuner import ( + from paddle.distributed.auto_parallel.static.tuner.rule_based_tuner import ( RuleBasedTuner, ) diff --git a/test/auto_parallel/test_interface.py b/test/auto_parallel/test_interface.py index 3d57049410a..5ea4209a625 100644 --- a/test/auto_parallel/test_interface.py +++ b/test/auto_parallel/test_interface.py @@ -17,10 +17,10 @@ import unittest import paddle import paddle.nn.functional as F from paddle import nn, static -from paddle.distributed.auto_parallel.dist_context import ( +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +from paddle.distributed.auto_parallel.static.dist_context import ( get_default_distributed_context, ) -from paddle.distributed.auto_parallel.process_mesh import ProcessMesh from paddle.distributed.fleet import auto paddle.enable_static() diff --git a/test/auto_parallel/test_new_cost_model.py 
b/test/auto_parallel/test_new_cost_model.py index 8439df7ae88..b3e9016e4d2 100644 --- a/test/auto_parallel/test_new_cost_model.py +++ b/test/auto_parallel/test_new_cost_model.py @@ -20,10 +20,10 @@ import unittest from test_cluster import cluster_json import paddle -import paddle.distributed.auto_parallel.cost as cost_model -from paddle.distributed.auto_parallel.cluster import Cluster -from paddle.distributed.auto_parallel.cost import CommContext -from paddle.distributed.auto_parallel.cost.base_cost import ( +import paddle.distributed.auto_parallel.static.cost as cost_model +from paddle.distributed.auto_parallel.static.cluster import Cluster +from paddle.distributed.auto_parallel.static.cost import CommContext +from paddle.distributed.auto_parallel.static.cost.base_cost import ( build_comp_desc_from_op, build_comp_desc_str_for_predict, calc_time_by_modeling, diff --git a/test/auto_parallel/test_parallel_tuner.py b/test/auto_parallel/test_parallel_tuner.py index 258bf0c398b..76203cbfc9a 100644 --- a/test/auto_parallel/test_parallel_tuner.py +++ b/test/auto_parallel/test_parallel_tuner.py @@ -18,13 +18,15 @@ import unittest import paddle from paddle import static from paddle.distributed import fleet -from paddle.distributed.auto_parallel.cluster import Cluster -from paddle.distributed.auto_parallel.dist_context import ( +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +from paddle.distributed.auto_parallel.static.cluster import Cluster +from paddle.distributed.auto_parallel.static.dist_context import ( DistributedContext, set_default_distributed_context, ) -from paddle.distributed.auto_parallel.process_mesh import ProcessMesh -from paddle.distributed.auto_parallel.tuner.parallel_tuner import ParallelTuner +from paddle.distributed.auto_parallel.static.tuner.parallel_tuner import ( + ParallelTuner, +) sys.path.append("../legacy_test") import auto_parallel_gpt_model as modeling diff --git a/test/auto_parallel/test_parallel_tuner_full.py b/test/auto_parallel/test_parallel_tuner_full.py index 7df76ef097e..181f77b0eb9 100644 --- a/test/auto_parallel/test_parallel_tuner_full.py +++ b/test/auto_parallel/test_parallel_tuner_full.py @@ -18,15 +18,17 @@ import unittest import paddle from paddle import static from paddle.distributed import fleet -from paddle.distributed.auto_parallel.cluster import Cluster -from paddle.distributed.auto_parallel.dist_context import ( +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +from paddle.distributed.auto_parallel.static.cluster import Cluster +from paddle.distributed.auto_parallel.static.dist_context import ( DistributedContext, set_default_distributed_context, ) -from paddle.distributed.auto_parallel.planner_v2 import Planner -from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +from paddle.distributed.auto_parallel.static.planner_v2 import Planner +from paddle.distributed.auto_parallel.static.tuner.parallel_tuner import ( + ParallelTuner, +) from paddle.distributed.auto_parallel.strategy import Strategy -from paddle.distributed.auto_parallel.tuner.parallel_tuner import ParallelTuner sys.path.append("../legacy_test") import auto_parallel_gpt_model as modeling diff --git a/test/auto_parallel/test_parallel_tuner_predict.py b/test/auto_parallel/test_parallel_tuner_predict.py index 1e3c6ea87e8..63b9186c0c8 100644 --- a/test/auto_parallel/test_parallel_tuner_predict.py +++ b/test/auto_parallel/test_parallel_tuner_predict.py @@ -18,13 +18,15 @@ import unittest import paddle from paddle import static from 
 from paddle.distributed import fleet
-from paddle.distributed.auto_parallel.cluster import Cluster
-from paddle.distributed.auto_parallel.dist_context import (
+from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
+from paddle.distributed.auto_parallel.static.cluster import Cluster
+from paddle.distributed.auto_parallel.static.dist_context import (
     DistributedContext,
     set_default_distributed_context,
 )
-from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
-from paddle.distributed.auto_parallel.tuner.parallel_tuner import ParallelTuner
+from paddle.distributed.auto_parallel.static.tuner.parallel_tuner import (
+    ParallelTuner,
+)
 
 sys.path.append("../legacy_test")
 import auto_parallel_gpt_model as modeling
diff --git a/test/auto_parallel/test_pattern.py b/test/auto_parallel/test_pattern.py
index bdccc68d984..1f7e89c08c5 100644
--- a/test/auto_parallel/test_pattern.py
+++ b/test/auto_parallel/test_pattern.py
@@ -112,7 +112,7 @@ class TestGroupOperatorsAndPatterns(unittest.TestCase):
             sequence_len,
             vocab_size,
         )
-        from paddle.distributed.auto_parallel.tuner.rule_based_tuner import (
+        from paddle.distributed.auto_parallel.static.tuner.rule_based_tuner import (
             _PATTERNS,
             GraphUtil,
         )
diff --git a/test/auto_parallel/test_pattern_match.py b/test/auto_parallel/test_pattern_match.py
index c240969ef9d..0bbf7af68a0 100644
--- a/test/auto_parallel/test_pattern_match.py
+++ b/test/auto_parallel/test_pattern_match.py
@@ -112,10 +112,10 @@ class TestPatternMatch(unittest.TestCase):
             sequence_len,
             vocab_size,
         )
-        from paddle.distributed.auto_parallel.dist_context import (
+        from paddle.distributed.auto_parallel.static.dist_context import (
             DistributedContext,
         )
-        from paddle.distributed.auto_parallel.tuner.rule_based_tuner import (
+        from paddle.distributed.auto_parallel.static.tuner.rule_based_tuner import (
             GraphUtil,
             RuleBasedTuner,
         )
diff --git a/test/auto_parallel/test_prim_dist_op.py b/test/auto_parallel/test_prim_dist_op.py
index 5a4a1b5a512..b92f550d41f 100644
--- a/test/auto_parallel/test_prim_dist_op.py
+++ b/test/auto_parallel/test_prim_dist_op.py
@@ -15,13 +15,13 @@ import unittest
 
 import paddle
-from paddle.distributed.auto_parallel.completion import Completer
-from paddle.distributed.auto_parallel.dist_context import (
+from paddle.distributed.auto_parallel.static.completion import Completer
+from paddle.distributed.auto_parallel.static.dist_context import (
     DistributedContext,
     get_default_distributed_context,
 )
-from paddle.distributed.auto_parallel.partitioner import Partitioner
-from paddle.distributed.auto_parallel.utils import set_var_dist_attr
+from paddle.distributed.auto_parallel.static.partitioner import Partitioner
+from paddle.distributed.auto_parallel.static.utils import set_var_dist_attr
 from paddle.distributed.fleet import auto
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.incubate.autograd import enable_prim
 
diff --git a/test/auto_parallel/test_process_mesh.py b/test/auto_parallel/test_process_mesh.py
index 07da754e797..d4b91a5dcc3 100644
--- a/test/auto_parallel/test_process_mesh.py
+++ b/test/auto_parallel/test_process_mesh.py
@@ -19,14 +19,14 @@ import numpy as np
 
 import paddle
 import paddle.nn.functional as F
 from paddle import nn, static
-from paddle.distributed.auto_parallel.dist_context import (
-    get_default_distributed_context,
-)
 from paddle.distributed.auto_parallel.process_mesh import (
     ProcessMesh,
     compute_compatible_process_mesh,
     merge_process_meshes,
 )
+from paddle.distributed.auto_parallel.static.dist_context import (
+    get_default_distributed_context,
+)
 
 paddle.enable_static()
diff --git a/test/auto_parallel/test_process_mesh_v2.py b/test/auto_parallel/test_process_mesh_v2.py
index 03ec95c7187..0d98caad3a7 100644
--- a/test/auto_parallel/test_process_mesh_v2.py
+++ b/test/auto_parallel/test_process_mesh_v2.py
@@ -14,7 +14,7 @@
 
 import unittest
 
-from paddle.distributed.auto_parallel.process_mesh_v2 import (
+from paddle.distributed.auto_parallel.static.process_mesh_v2 import (
     ProcessMesh,
     compute_compatible_process_mesh,
     merge_process_mesh,
diff --git a/test/auto_parallel/test_recorder.py b/test/auto_parallel/test_recorder.py
index eaaefcbe073..185d3d3ef3d 100644
--- a/test/auto_parallel/test_recorder.py
+++ b/test/auto_parallel/test_recorder.py
@@ -16,7 +16,7 @@ import unittest
 
 import numpy as np
 
-from paddle.distributed.auto_parallel.tuner import recorder as rd
+from paddle.distributed.auto_parallel.static.tuner import recorder as rd
 
 
 class TestRecorder(unittest.TestCase):
diff --git a/test/auto_parallel/test_rule_based_tuner.py b/test/auto_parallel/test_rule_based_tuner.py
index a3ef694b5c3..7c4c980fd99 100644
--- a/test/auto_parallel/test_rule_based_tuner.py
+++ b/test/auto_parallel/test_rule_based_tuner.py
@@ -112,11 +112,11 @@ class TestRuleBasedTuner(unittest.TestCase):
             sequence_len,
             vocab_size,
         )
-        from paddle.distributed.auto_parallel.cluster import Cluster
-        from paddle.distributed.auto_parallel.dist_context import (
+        from paddle.distributed.auto_parallel.static.cluster import Cluster
+        from paddle.distributed.auto_parallel.static.dist_context import (
             DistributedContext,
         )
-        from paddle.distributed.auto_parallel.tuner.rule_based_tuner import (
+        from paddle.distributed.auto_parallel.static.tuner.rule_based_tuner import (
             RuleBasedTuner,
         )
 
diff --git a/test/auto_parallel/test_rule_based_tuner_o2.py b/test/auto_parallel/test_rule_based_tuner_o2.py
index 999535d7204..5fdb1fc83e9 100644
--- a/test/auto_parallel/test_rule_based_tuner_o2.py
+++ b/test/auto_parallel/test_rule_based_tuner_o2.py
@@ -112,11 +112,11 @@ class TestRuleBasedTuner(unittest.TestCase):
             sequence_len,
             vocab_size,
         )
-        from paddle.distributed.auto_parallel.cluster import Cluster
-        from paddle.distributed.auto_parallel.dist_context import (
+        from paddle.distributed.auto_parallel.static.cluster import Cluster
+        from paddle.distributed.auto_parallel.static.dist_context import (
             DistributedContext,
         )
-        from paddle.distributed.auto_parallel.tuner.rule_based_tuner import (
+        from paddle.distributed.auto_parallel.static.tuner.rule_based_tuner import (
             RuleBasedTuner,
         )
 
diff --git a/test/auto_parallel/test_serialization.py b/test/auto_parallel/test_serialization.py
index d89c9596f4c..495f3adf620 100644
--- a/test/auto_parallel/test_serialization.py
+++ b/test/auto_parallel/test_serialization.py
@@ -20,11 +20,11 @@ import paddle
 import paddle.nn.functional as F
 from paddle import nn, static
 from paddle.distributed import fleet
-from paddle.distributed.auto_parallel.dist_context import (
+from paddle.distributed.auto_parallel.static.dist_context import (
     DistributedContext,
     set_default_distributed_context,
 )
-from paddle.distributed.auto_parallel.process_mesh_v2 import ProcessMesh
+from paddle.distributed.auto_parallel.static.process_mesh_v2 import ProcessMesh
 from paddle.distributed.fleet import auto
 from paddle.fluid.core import TensorDistAttr
 from paddle.fluid.framework import Program
diff --git a/test/auto_parallel/test_to_static.py b/test/auto_parallel/test_to_static.py
index 2057d509ad1..1550c2d2669 100644
--- a/test/auto_parallel/test_to_static.py
+++ b/test/auto_parallel/test_to_static.py
@@ -19,7 +19,10 @@ import numpy as np
 import paddle
 import paddle.nn.functional as F
 from paddle import LazyGuard, nn
-from paddle.distributed.auto_parallel.helper import ProgramHelper, ProxyLayer
+from paddle.distributed.auto_parallel.static.helper import (
+    ProgramHelper,
+    ProxyLayer,
+)
 from paddle.distributed.fleet import auto
 from paddle.framework import in_dynamic_mode
 from paddle.io import Dataset
diff --git a/test/auto_parallel/test_topology.py b/test/auto_parallel/test_topology.py
index 6807d22ffc3..0119821532e 100644
--- a/test/auto_parallel/test_topology.py
+++ b/test/auto_parallel/test_topology.py
@@ -14,7 +14,7 @@
 
 import unittest
 
-from paddle.distributed.auto_parallel.topo import SingleNodeTopology
+from paddle.distributed.auto_parallel.static.topo import SingleNodeTopology
 
 
 def check_empty_json_object(json_object):
diff --git a/test/auto_parallel/test_trial.py b/test/auto_parallel/test_trial.py
index 5fcf38b2e65..7861ab82f8f 100644
--- a/test/auto_parallel/test_trial.py
+++ b/test/auto_parallel/test_trial.py
@@ -14,8 +14,8 @@
 
 import unittest
 
-from paddle.distributed.auto_parallel.tuner import trial as tr
-from paddle.distributed.auto_parallel.tuner import tunable_space as ts
+from paddle.distributed.auto_parallel.static.tuner import trial as tr
+from paddle.distributed.auto_parallel.static.tuner import tunable_space as ts
 
 
 class TestTiral(unittest.TestCase):
diff --git a/test/auto_parallel/test_tunable_space.py b/test/auto_parallel/test_tunable_space.py
index badc90275fd..b32e96107b5 100644
--- a/test/auto_parallel/test_tunable_space.py
+++ b/test/auto_parallel/test_tunable_space.py
@@ -14,7 +14,7 @@
 
 import unittest
 
-from paddle.distributed.auto_parallel.tuner import tunable_space as ts
+from paddle.distributed.auto_parallel.static.tuner import tunable_space as ts
 
 
 class TestTunableSpace(unittest.TestCase):
diff --git a/test/auto_parallel/test_tunable_variable.py b/test/auto_parallel/test_tunable_variable.py
index 641f7b4347e..208ecf7238f 100644
--- a/test/auto_parallel/test_tunable_variable.py
+++ b/test/auto_parallel/test_tunable_variable.py
@@ -14,7 +14,7 @@
 
 import unittest
 
-from paddle.distributed.auto_parallel.tuner import tunable_variable as tv
+from paddle.distributed.auto_parallel.static.tuner import tunable_variable as tv
 
 
 class TestTunableVariable(unittest.TestCase):
diff --git a/test/auto_parallel/test_while_op_completion.py b/test/auto_parallel/test_while_op_completion.py
index 3f9b5b151ab..67887916c66 100644
--- a/test/auto_parallel/test_while_op_completion.py
+++ b/test/auto_parallel/test_while_op_completion.py
@@ -20,8 +20,10 @@ import paddle
 import paddle.nn.functional as F
 from paddle import nn, static
 from paddle.distributed import fleet
-from paddle.distributed.auto_parallel.completion import Completer
-from paddle.distributed.auto_parallel.dist_context import DistributedContext
+from paddle.distributed.auto_parallel.static.completion import Completer
+from paddle.distributed.auto_parallel.static.dist_context import (
+    DistributedContext,
+)
 from paddle.distributed.fleet import auto
 
 paddle.enable_static()
diff --git a/test/auto_parallel/test_while_op_partition.py b/test/auto_parallel/test_while_op_partition.py
index 00f3a70bbcf..ef3189542cb 100644
--- a/test/auto_parallel/test_while_op_partition.py
+++ b/test/auto_parallel/test_while_op_partition.py
@@ -20,12 +20,12 @@ import paddle
 import paddle.nn.functional as F
 from paddle import fluid, nn, static
 from paddle.distributed import fleet
-from paddle.distributed.auto_parallel.completion import Completer
-from paddle.distributed.auto_parallel.dist_context import (
+from paddle.distributed.auto_parallel.static.completion import Completer
+from paddle.distributed.auto_parallel.static.dist_context import (
     get_default_distributed_context,
 )
-from paddle.distributed.auto_parallel.partitioner import Partitioner
-from paddle.distributed.auto_parallel.utils import make_data_unshard
+from paddle.distributed.auto_parallel.static.partitioner import Partitioner
+from paddle.distributed.auto_parallel.static.utils import make_data_unshard
 from paddle.distributed.fleet import auto
 
 paddle.enable_static()
diff --git a/test/distributed_passes/test_auto_parallel_data_parallel_optimization_pass.py b/test/distributed_passes/test_auto_parallel_data_parallel_optimization_pass.py
index aa989df7025..33672c3fa7f 100644
--- a/test/distributed_passes/test_auto_parallel_data_parallel_optimization_pass.py
+++ b/test/distributed_passes/test_auto_parallel_data_parallel_optimization_pass.py
@@ -23,10 +23,10 @@ from auto_parallel_pass_test_base import AutoPallelPassTestBase
 
 import paddle
 from paddle.distributed import fleet
-from paddle.distributed.auto_parallel.dist_context import (
+from paddle.distributed.auto_parallel.static.dist_context import (
     get_default_distributed_context,
 )
-from paddle.distributed.auto_parallel.operators.common import (
+from paddle.distributed.auto_parallel.static.operators.common import (
     is_data_parallel_reduce_op,
 )
 from paddle.distributed.passes import PassContext, new_pass
-- 
GitLab
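
Every hunk in this patch applies the same mechanical rule: modules that implement
the static-graph pipeline (completion, dist_context, partitioner, cluster, cost,
tuner, helper, and so on) now live in the `static` subpackage of
`paddle.distributed.auto_parallel`, while modules such as `process_mesh` and
`strategy` stay at the package root. A minimal sketch of the migration for
downstream code, assuming a Paddle build that includes this patch (all import
targets below are taken from the hunks above):

    # Old layout (before this patch):
    #   from paddle.distributed.auto_parallel.completion import Completer
    # New layout: the static-graph machinery sits under the `static` subpackage.
    from paddle.distributed.auto_parallel.static.completion import Completer
    from paddle.distributed.auto_parallel.static.dist_context import (
        DistributedContext,
    )
    from paddle.distributed.auto_parallel.static.partitioner import Partitioner

    # Unmoved in this patch: these modules remain at the package root.
    from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
    from paddle.distributed.auto_parallel.strategy import Strategy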