diff --git a/test/collective/fleet/hybrid_parallel_mp_amp.py b/test/collective/fleet/hybrid_parallel_mp_amp.py
index 2ae5fcf70291c85e579036985cc0991066f0117d..7b139c096647f48b278345099b5e2d8b518cca88 100644
--- a/test/collective/fleet/hybrid_parallel_mp_amp.py
+++ b/test/collective/fleet/hybrid_parallel_mp_amp.py
@@ -14,13 +14,13 @@
 import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 import paddle
 from paddle.distributed import fleet
-class TestMPClipGrad(TestDistMPTraning):
+class TestMPClipGrad(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
diff --git a/test/collective/fleet/hybrid_parallel_mp_bf16.py b/test/collective/fleet/hybrid_parallel_mp_bf16.py
index ae977f9891768b4567021452be629d32a8bb8bb6..2ddf1868dd0e51cdd4a4b140165fbb754ffa1144 100644
--- a/test/collective/fleet/hybrid_parallel_mp_bf16.py
+++ b/test/collective/fleet/hybrid_parallel_mp_bf16.py
@@ -14,14 +14,14 @@
 import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 import paddle
 from paddle.distributed import fleet
 from paddle.distributed.utils.nccl_utils import check_nccl_version_for_bf16
-class TestMPFP16(TestDistMPTraning):
+class TestMPFP16(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
diff --git a/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py b/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py
index b376afa1a9913a9617ec13162a7895c112470d80..9872bbf20e9275971d4f4b069aeeedd1bfae6d70 100644
--- a/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py
+++ b/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py
@@ -19,7 +19,7 @@ import numpy as np
 from hybrid_parallel_mp_model import (
     SimpleDPNet,
     SimpleMPNet,
-    TestDistMPTraning,
+    TestDistMPTraining,
     parallel_matmul,
     set_random_seed,
 )
@@ -58,7 +58,7 @@ class SimpleDPMultimodalNet(SimpleDPNet):
         return x
-class TestMPBroadcastObj(TestDistMPTraning):
+class TestMPBroadcastObj(TestDistMPTraining):
     def build_model_optimizer(self):
         hcg = fleet.get_hybrid_communicate_group()
         word_size = hcg.get_model_parallel_world_size()
diff --git a/test/collective/fleet/hybrid_parallel_mp_clip_grad.py b/test/collective/fleet/hybrid_parallel_mp_clip_grad.py
index 1f674675ecb8c58f757160410302c89b5b94726b..dab3998f9814de5742d8857b43d560c68e615214 100644
--- a/test/collective/fleet/hybrid_parallel_mp_clip_grad.py
+++ b/test/collective/fleet/hybrid_parallel_mp_clip_grad.py
@@ -14,7 +14,7 @@
 import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 import paddle
@@ -22,7 +22,7 @@ import paddle
 # log.setLevel(logging.WARNING)
-class TestMPClipGrad(TestDistMPTraning):
+class TestMPClipGrad(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
diff --git a/test/collective/fleet/hybrid_parallel_mp_fp16.py b/test/collective/fleet/hybrid_parallel_mp_fp16.py
index 5d10ac0a7aebcf96afbee7478cdf277cb0d42088..ca5fdc3a4062e096d2fba95cf60c012dd2b3e702 100644
--- a/test/collective/fleet/hybrid_parallel_mp_fp16.py
+++ b/test/collective/fleet/hybrid_parallel_mp_fp16.py
@@ -14,13 +14,13 @@
 import unittest
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 import paddle
 from paddle.distributed import fleet
-class TestMPFP16(TestDistMPTraning):
+class TestMPFP16(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
diff --git a/test/collective/fleet/hybrid_parallel_mp_layers.py b/test/collective/fleet/hybrid_parallel_mp_layers.py
index 751bc9255c100a5bbdb14a991fe59776edff7a92..4a7b223c6879a4f9d3cb10ab95be16d88457415d 100644
--- a/test/collective/fleet/hybrid_parallel_mp_layers.py
+++ b/test/collective/fleet/hybrid_parallel_mp_layers.py
@@ -115,7 +115,7 @@ class SimpleEmbedding(paddle.nn.Layer):
         return output
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_mp_model.py b/test/collective/fleet/hybrid_parallel_mp_model.py
index 44ceb368b9b67920a47a31bcc2ad54ac6fa2cc48..08ae8f51e47f139db456d08d041528c4345cc690 100644
--- a/test/collective/fleet/hybrid_parallel_mp_model.py
+++ b/test/collective/fleet/hybrid_parallel_mp_model.py
@@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
-class TestDistMPSyncTraning(unittest.TestCase):
+class TestDistMPSyncTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -348,7 +348,7 @@ class TestDistMPSyncTraning(unittest.TestCase):
         )
-class TestDistMPSyncModelTraning(TestDistMPSyncTraning):
+class TestDistMPSyncModelTraining(TestDistMPSyncTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -368,7 +368,7 @@ class TestDistMPSyncModelTraning(TestDistMPSyncTraning):
         fleet.init(is_collective=True, strategy=strategy)
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py b/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py
index fa78482601f4035a3ca8e7d32fa62b03077b4a31..a4f11294f3815be33275f0d87c33c48bb9d84266 100644
--- a/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py
+++ b/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py
@@ -200,7 +200,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
-class TestDistSPSyncTraning(unittest.TestCase):
+class TestDistSPSyncTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -370,7 +370,7 @@ class TestDistSPSyncTraning(unittest.TestCase):
         )
-class TestDistSPSyncModelTraning(TestDistSPSyncTraning):
+class TestDistSPSyncModelTraining(TestDistSPSyncTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -390,7 +390,7 @@ class TestDistSPSyncModelTraning(TestDistSPSyncTraning):
         fleet.init(is_collective=True, strategy=strategy)
-class TestDistSPTraning(unittest.TestCase):
+class TestDistSPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_mp_random.py b/test/collective/fleet/hybrid_parallel_mp_random.py
index 717e983e262596790aa98cf4f438fe7e1fc191c9..00877818b1e7ecb5c2169fea075c1dd13b933934 100644
--- a/test/collective/fleet/hybrid_parallel_mp_random.py
+++ b/test/collective/fleet/hybrid_parallel_mp_random.py
@@ -20,7 +20,7 @@ import paddle
 from paddle.distributed import fleet
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_pp_amp.py b/test/collective/fleet/hybrid_parallel_pp_amp.py
index 7c9f973ee1302c616996ef6b400fbc36e2f3395c..f3fe88a9161cfc5f2a11912ed12acfa134558af8 100644
--- a/test/collective/fleet/hybrid_parallel_pp_amp.py
+++ b/test/collective/fleet/hybrid_parallel_pp_amp.py
@@ -34,7 +34,7 @@ batch_size = 4
 micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_bf16.py b/test/collective/fleet/hybrid_parallel_pp_bf16.py
index 70b3aec1515a84867602c98c12374a1a7fc95583..f260cd88f2f20b10d510cd7c064f2b8e3aaaa4a4 100644
--- a/test/collective/fleet/hybrid_parallel_pp_bf16.py
+++ b/test/collective/fleet/hybrid_parallel_pp_bf16.py
@@ -35,7 +35,7 @@ batch_size = 4
 micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_clip_grad.py b/test/collective/fleet/hybrid_parallel_pp_clip_grad.py
index 695e3ee90ef6590bdc6cb33f79cf8b495aadf349..76b0fa90c05aab2a06311def6956ef5f8ab46583 100644
--- a/test/collective/fleet/hybrid_parallel_pp_clip_grad.py
+++ b/test/collective/fleet/hybrid_parallel_pp_clip_grad.py
@@ -17,12 +17,12 @@ import unittest
 sys.path.append("../../legacy_test")
-from hybrid_parallel_pp_alexnet import TestDistPPTraning
+from hybrid_parallel_pp_alexnet import TestDistPPTraining
 import paddle
-class TestPPClipGrad(TestDistPPTraning):
+class TestPPClipGrad(TestDistPPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
         scheduler = paddle.optimizer.lr.PiecewiseDecay(
@@ -36,7 +36,7 @@ class TestPPClipGrad(TestDistPPTraning):
         return scheduler, optimizer
-class TestPPClipGradParamGroup(TestDistPPTraning):
+class TestPPClipGradParamGroup(TestDistPPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
         scheduler = paddle.optimizer.lr.PiecewiseDecay(
diff --git a/test/collective/fleet/hybrid_parallel_pp_embedding.py b/test/collective/fleet/hybrid_parallel_pp_embedding.py
index 2dd335eb0efec6e979c470976d97c1f2ee311934..d485e77a79972857d5012a77c6c5b2c4aa0d384b 100644
--- a/test/collective/fleet/hybrid_parallel_pp_embedding.py
+++ b/test/collective/fleet/hybrid_parallel_pp_embedding.py
@@ -120,7 +120,7 @@ class SimpleNetPipe(Layer):
         return feat
-class TestDistEmbeddingTraning(unittest.TestCase):
+class TestDistEmbeddingTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_fp16.py b/test/collective/fleet/hybrid_parallel_pp_fp16.py
index 84430f4be3730360e79177405d1950705a3f68e0..c6c107a852a222340f99b49860431a8a1589cefc 100644
--- a/test/collective/fleet/hybrid_parallel_pp_fp16.py
+++ b/test/collective/fleet/hybrid_parallel_pp_fp16.py
@@ -38,7 +38,7 @@ batch_size = 4
 micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_recompute.py b/test/collective/fleet/hybrid_parallel_pp_recompute.py
index 793f2effd09bc9f4c70b62d24cb8f3f585d31ebb..fd03b562f25a36acb375baeb5f0969000cccd804 100644
--- a/test/collective/fleet/hybrid_parallel_pp_recompute.py
+++ b/test/collective/fleet/hybrid_parallel_pp_recompute.py
@@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer):
         )
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_save_load.py b/test/collective/fleet/hybrid_parallel_pp_save_load.py
index a2ab6a0654d5a44fd3587100213ec8d0a5998f75..cfa7c6961aef15e28e07d288fef7df84729b7454 100644
--- a/test/collective/fleet/hybrid_parallel_pp_save_load.py
+++ b/test/collective/fleet/hybrid_parallel_pp_save_load.py
@@ -30,7 +30,7 @@ micro_batch_size = 2
 vocab_size = 128
-class TestDistPPSaveLoadTraning(unittest.TestCase):
+class TestDistPPSaveLoadTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py
index 5d6152f4e9f05c964f9fb32ce811e59dbb0ab8a3..8e922792cdb8a4c04e73b868b42674b737c297d7 100644
--- a/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py
+++ b/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py
@@ -33,7 +33,7 @@ micro_batch_size = 2
 vocab_size = 128
-class TestDistPPSaveLoadTraning(unittest.TestCase):
+class TestDistPPSaveLoadTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer.py b/test/collective/fleet/hybrid_parallel_pp_transformer.py
index 216f37796daf08dd904117b9b720ff86b5432a94..18986e3df34fbe976f0fca8c821bdf9152fbb9cc 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer.py
@@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer):
         )
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_save.py b/test/collective/fleet/hybrid_parallel_pp_transformer_save.py
index a8cf970f73d229cd68af1831df477022ad9359ac..1bd865e107ec4ed3a5b5e95de47efb7aaf3812d1 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer_save.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer_save.py
@@ -29,7 +29,7 @@ vocab_size = 128
 transformer_layer_num = 8
-class TestDistPPSaveTraning(unittest.TestCase):
+class TestDistPPSaveTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py
index 372cbe7f48d93a4295d921561b9302b32c9f71be..fea23b62661b9118e928c68ad504c8f01acc0c0f 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py
@@ -33,7 +33,7 @@ vocab_size = 128
 transformer_layer_num = 8
-class TestDistPPSaveTraning(unittest.TestCase):
+class TestDistPPSaveTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py b/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py
index 60b1e1052bda3d613a87e3be286078381afe5fdd..bc43f514ea9a62cff8672e12e0492c65cdc82122 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py
@@ -17,7 +17,7 @@ import unittest
 import numpy as np
 from hybrid_parallel_pp_transformer import (
     ModelPipe,
-    TestDistPPTraning,
+    TestDistPPTraining,
     batch_size,
     length,
     micro_batch_size,
@@ -30,7 +30,7 @@ import paddle.distributed as dist
 from paddle.distributed import fleet
-class TestDistPPTraningUnbalancedData(TestDistPPTraning):
+class TestDistPPTrainingUnbalancedData(TestDistPPTraining):
     def test_pp_model(self):
         hcg = fleet.get_hybrid_communicate_group()
         word_size = hcg.get_model_parallel_world_size()
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py
index 3d7991727ee83599e94d5fef25f09a104953b10a..35a17f17acea249cdfc32f4f6564043247f5a85b 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py
@@ -137,7 +137,7 @@ class ModelPipe(PipelineLayer):
         )
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_qat.py b/test/collective/fleet/hybrid_parallel_qat.py
index 11662d849eb4d1d0bfecbfe2497e227f53807d89..00bc0f746e761b34f30c6a35b245c55a35da5fc7 100644
--- a/test/collective/fleet/hybrid_parallel_qat.py
+++ b/test/collective/fleet/hybrid_parallel_qat.py
@@ -235,7 +235,7 @@ class SimpleDPNet(nn.Layer):
         return x
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_sharding_model.py b/test/collective/fleet/hybrid_parallel_sharding_model.py
index bb1ace6a6a47653f272174dc456cdea3e06bb768..41343d2dbda9edabb868ed889dee2f225baa3f81 100644
--- a/test/collective/fleet/hybrid_parallel_sharding_model.py
+++ b/test/collective/fleet/hybrid_parallel_sharding_model.py
@@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         random.seed(2021)
         np.random.seed(2021)
diff --git a/test/collective/fleet/hybrid_parallel_shared_weight.py b/test/collective/fleet/hybrid_parallel_shared_weight.py
index c110d5d00a4a91f5694ff5f9b12588dfcf33f6ff..f54b994768740ad040923d4d4354a97aa9ae9939 100644
--- a/test/collective/fleet/hybrid_parallel_shared_weight.py
+++ b/test/collective/fleet/hybrid_parallel_shared_weight.py
@@ -152,7 +152,7 @@ class SimpleNetPipe(PipelineLayer):
         super().__init__(layers=self.descs, loss_fn=LossNet(), **kwargs)
-class TestDistEmbeddingTraning(unittest.TestCase):
+class TestDistEmbeddingTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py b/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py
index a3f4d5f0b1610dd2910bb324fc269173b9823c27..85f24f61d794dcae7c06fef10f54b66a2dbc2409 100644
--- a/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py
+++ b/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
diff --git a/test/collective/fleet/test_fleet_static_mp_layers.py b/test/collective/fleet/test_fleet_static_mp_layers.py
index 6f20943a12837194f23a062fcaec271d03f80a5e..3cd07943d114d306ad66a609766ee2b599ee33a8 100644
--- a/test/collective/fleet/test_fleet_static_mp_layers.py
+++ b/test/collective/fleet/test_fleet_static_mp_layers.py
@@ -66,7 +66,7 @@ class EmbeddingNet(paddle.nn.Layer):
         return output
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "2"
         os.environ[
diff --git a/test/legacy_test/auto_parallel_gpt_model.py b/test/legacy_test/auto_parallel_gpt_model.py
index 1be27f9bc803ab43a2296f58cefb392d156ecede..4d5e1955e23f118b2bd5131ec3a5805ec67a95f3 100644
--- a/test/legacy_test/auto_parallel_gpt_model.py
+++ b/test/legacy_test/auto_parallel_gpt_model.py
@@ -35,7 +35,7 @@ def init_global():
 class MultiHeadAttention(nn.Layer):
     """
-    Attention mapps queries and a set of key-value pairs to outputs, and
+    Attention maps queries and a set of key-value pairs to outputs, and
     Multi-Head Attention performs multiple parallel attention to jointly
     attending to information from different representation subspaces.
     """
@@ -114,7 +114,7 @@ class MultiHeadAttention(nn.Layer):
     def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
         """
-        Prapares linear projected queries, keys and values for usage of subsequnt
+        Prepares linear projected queries, keys and values for usage of subsequent
         multiple parallel attention. If `cache` is not None, using cached results
         to reduce redundant calculations.
         """
@@ -203,7 +203,7 @@ class MultiHeadAttention(nn.Layer):
     def gen_cache(self, key, value=None, type=Cache):
         """
-        Generates cache for `forward` usage in inference accroding to arguments.
+        Generates cache for `forward` usage in inference according to arguments.
         The generated cache is an instance of `MultiHeadAttention.Cache` or an
         instance of `MultiHeadAttention.StaticCache`.
         """
@@ -573,7 +573,7 @@ class GPTEmbeddings(nn.Layer):
             ones = paddle.ones_like(input_ids, dtype="int64")
             seq_length = paddle.cumsum(ones, axis=-1)
             position_ids = seq_length - ones
-        input_embedings = self.word_embeddings(input_ids)
+        input_embeddings = self.word_embeddings(input_ids)
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
                 self.word_embeddings.weight, _global_process_mesh, ["x", None]
@@ -592,7 +592,7 @@ class GPTEmbeddings(nn.Layer):
             )
         position_embeddings = self.position_embeddings(position_ids)
-        embeddings = input_embedings + position_embeddings
+        embeddings = input_embeddings + position_embeddings
         embeddings = self.dropout(embeddings)
         return embeddings
diff --git a/test/legacy_test/benchmark_sum_op.py b/test/legacy_test/benchmark_sum_op.py
index 6854fd7f208c5edeffb78f4941027c93f8fa150d..7bc15957e1cdabdf109c0ac4de076f6a2072125e 100644
--- a/test/legacy_test/benchmark_sum_op.py
+++ b/test/legacy_test/benchmark_sum_op.py
@@ -59,7 +59,7 @@ class TestSumOp(BenchmarkSuite):
     def test_timeit_output(self):
         """
-        perf the op, time cost will be averged in iters.
+        perf the op, time cost will be averaged in iters.
         output example
         >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
         >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
@@ -68,7 +68,7 @@ class TestSumOp(BenchmarkSuite):
     def test_timeit_grad(self):
         """
-        perf the op gradient, time cost will be averged in iters.
+        perf the op gradient, time cost will be averaged in iters.
         output example
         >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
         >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
diff --git a/test/legacy_test/dist_fleet_ctr.py b/test/legacy_test/dist_fleet_ctr.py
index a5634a0cfba28fd23ccc1a23ead9858a536861a8..419344edfae5de7df40a4b1b8b775090ce93d21f 100644
--- a/test/legacy_test/dist_fleet_ctr.py
+++ b/test/legacy_test/dist_fleet_ctr.py
@@ -129,7 +129,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
             dnn_out = fc
         # build lr model
-        lr_embbding = paddle.static.nn.embedding(
+        lr_embedding = paddle.static.nn.embedding(
             is_distributed=False,
             input=lr_data,
             size=[lr_input_dim, 1],
@@ -141,7 +141,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
             padding_idx=0,
         )
         lr_pool = paddle.static.nn.sequence_lod.sequence_pool(
-            input=lr_embbding.squeeze(-2), pool_type="sum"
+            input=lr_embedding.squeeze(-2), pool_type="sum"
         )
         merge_layer = paddle.concat([dnn_out, lr_pool], axis=1)
diff --git a/test/legacy_test/dist_hapi_mnist_dynamic.py b/test/legacy_test/dist_hapi_mnist_dynamic.py
index 7fa896cf3ddc270e8144e87885cab9c93eb838d4..66b5f66119b4f8e5eea03477aaba55134a66b13f 100644
--- a/test/legacy_test/dist_hapi_mnist_dynamic.py
+++ b/test/legacy_test/dist_hapi_mnist_dynamic.py
@@ -52,7 +52,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_dynamic_multiple_gpus(self):
         device = set_device('gpu')
diff --git a/test/legacy_test/dist_hapi_mnist_static.py b/test/legacy_test/dist_hapi_mnist_static.py
index 9d9b488f907e6f238afdba8b120c57cf50b667e6..c465ef7fe85a368e4226bdb920ade3b950fb4f3a 100644
--- a/test/legacy_test/dist_hapi_mnist_static.py
+++ b/test/legacy_test/dist_hapi_mnist_static.py
@@ -52,7 +52,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_static_multiple_gpus(self):
         paddle.enable_static()
         device = set_device('gpu')
diff --git a/test/legacy_test/dist_hapi_pure_fp16_static.py b/test/legacy_test/dist_hapi_pure_fp16_static.py
index f63866991fe267640893b85efa4d7690ab184842..3506b22d31f3003765c015444940206f5a488428 100644
--- a/test/legacy_test/dist_hapi_pure_fp16_static.py
+++ b/test/legacy_test/dist_hapi_pure_fp16_static.py
@@ -26,10 +26,10 @@ from paddle.vision.models import LeNet
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraningWithPureFP16(unittest.TestCase):
+class TestDistTrainingWithPureFP16(unittest.TestCase):
     def test_amp_training_purefp16(self):
         if not fluid.is_compiled_with_cuda():
-            self.skipTest('module not tested when ONLY_CPU compling')
+            self.skipTest('module not tested when ONLY_CPU compiling')
         data = np.random.random(size=(4, 1, 28, 28)).astype(np.float32)
         label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64)
diff --git a/test/legacy_test/gradient_checker.py b/test/legacy_test/gradient_checker.py
index 23e69c6b8ae62d5313b6edc71c4913d246cb0542..c24691eda1503057a48f16e9fdb789b35dd76f85 100644
--- a/test/legacy_test/gradient_checker.py
+++ b/test/legacy_test/gradient_checker.py
@@ -269,7 +269,7 @@ def grad_check(
     if program is None:
         program = fluid.default_main_program()
-    # init variable in strtup program
+    # init variable in startup program
     scope = fluid.executor.global_scope()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -595,7 +595,7 @@ def get_static_double_grad(
     if program is None:
         program = fluid.default_main_program()
-    # init variable in strtup program
+    # init variable in startup program
     scope = fluid.executor.global_scope()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -657,7 +657,7 @@ def get_eager_double_grad(
         the second order derivative and the inputs of second order derivative's calculation
         will be returned for higher order derivative's calculation.
         If 'return_mid_result' set False.
-        A list of numpy array that stores second derivative result calulated by dygraph.
+        A list of numpy array that stores second derivative result calculated by dygraph.
     """
     if isinstance(place, fluid.CPUPlace):
         paddle.set_device("cpu")
@@ -684,7 +684,7 @@ def get_eager_double_grad(
     )
     d_inputs = [d_input for d_input in d_inputs if d_input is not None]
-    # calcluate second derivative
+    # calculate second derivative
     inputs = inputs + dys
     ddys = []
     if return_mid_result:
@@ -808,7 +808,7 @@ def get_static_triple_grad(
         program (Program|None): a Program with forward pass.
             If None, use fluid.default_main_program().
     Returns:
-        A list of numpy array that stores third derivative result calulated by static graph.
+        A list of numpy array that stores third derivative result calculated by static graph.
     """
     if program is None:
         program = fluid.default_main_program()
@@ -858,13 +858,13 @@ def get_eager_triple_grad(
         place (fluid.CPUPlace or fluid.CUDAPlace): the device.
         return_mid_result (list[Tensor], list[Tensor]): If set True, the
     Returns:
-        A list of numpy array that stores second derivative result calulated by dygraph
+        A list of numpy array that stores second derivative result calculated by dygraph
     """
     dd_y, dd_x = get_eager_double_grad(
         func, x_init, dy_init, place, return_mid_result=True
     )
-    # calcluate third derivative
+    # calculate third derivative
     dddys = []
     for dd_yi in dd_y:
         dd_yi.stop_gradient = False
diff --git a/test/legacy_test/hybrid_parallel_pp_alexnet.py b/test/legacy_test/hybrid_parallel_pp_alexnet.py
index d8e433456d1250ab7280ca9a67448c8a07d323d4..dc461176da1679ac2798012721f73545e4a8b441 100644
--- a/test/legacy_test/hybrid_parallel_pp_alexnet.py
+++ b/test/legacy_test/hybrid_parallel_pp_alexnet.py
@@ -41,7 +41,7 @@ batch_size = 4
 micro_batch_size = 2
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -136,7 +136,7 @@ class TestDistPPTraning(unittest.TestCase):
         )
-class TestDistPPDelayScaleLoss(TestDistPPTraning):
+class TestDistPPDelayScaleLoss(TestDistPPTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -158,7 +158,7 @@ class TestDistPPDelayScaleLoss(TestDistPPTraning):
         fleet.init(is_collective=True, strategy=strategy)
-class TestDistPPMainGrad(TestDistPPTraning):
+class TestDistPPMainGrad(TestDistPPTraining):
     def wrapper_mix_precision(self, model, optimizer):
         model = MixPrecisionLayer(model, dtype="float16")
         optimizer = MixPrecisionOptimizer(optimizer)
diff --git a/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py b/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py
index febf4b5f87d4ac52adc863ad071869723aff22d2..87167fd7318ab5fffa3ef70f111f456bf70b0570 100644
--- a/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py
+++ b/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py
@@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.linear(inputs)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
diff --git a/test/legacy_test/parallel_dygraph_gradient_check.py b/test/legacy_test/parallel_dygraph_gradient_check.py
index a6c47b65d8f5a2f22ad6aed2843627a87bbbc7ed..5566b74f9aa65454636d833a6d2019c6a0a87e52 100644
--- a/test/legacy_test/parallel_dygraph_gradient_check.py
+++ b/test/legacy_test/parallel_dygraph_gradient_check.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         dist.init_parallel_env()
         self.trainer_id = dist.get_rank()
diff --git a/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py b/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py
index df66ff7616a08d372fb04d49ae08a4fcf2c6db05..7cb8c326a3edef36dc17d1c55e813a08e24d7beb 100644
--- a/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py
+++ b/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         self.pg = dist.init_parallel_env()
diff --git a/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py b/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py
index 6db6e9e62a5f94650b4f8cb85d99dbc0829e4784..805bf251acce6f51c9a150bbfc5b2dae5fcfcfcd 100644
--- a/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py
+++ b/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py
@@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.linear(inputs)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
diff --git a/test/xpu/parallel_dygraph_gradient_check.py b/test/xpu/parallel_dygraph_gradient_check.py
index 9de687f524aae6a03b0e398478d952870731184f..4f9ac5e3b98c78977e74d434993627efb2e9d3da 100644
--- a/test/xpu/parallel_dygraph_gradient_check.py
+++ b/test/xpu/parallel_dygraph_gradient_check.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         dist.init_parallel_env()
         self.trainer_id = dist.get_rank()
diff --git a/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py b/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
index f0e46b2db27c21e27b735e48e9acb9fb876b35eb..497b4b025a2bb839655f289060653bd848075e66 100644
--- a/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
+++ b/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         self.trainer_id = dist.get_rank()
         self.pg = dist.init_parallel_env()